diff --git a/.gitignore b/.gitignore
index 4b098b6bd..741704db8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 *.autosave
 *.pyc
 *.user
+*~
 .*.swp
 .DS_Store
 .sw[a-z]
diff --git a/.tgitconfig b/.tgitconfig
new file mode 100644
index 000000000..5fa522d23
--- /dev/null
+++ b/.tgitconfig
@@ -0,0 +1,2 @@
+[tgit]
+    icon = doc/opencv.ico
diff --git a/3rdparty/include/MultiMon.h b/3rdparty/include/MultiMon.h
deleted file mode 100644
index 8e9cd5726..000000000
--- a/3rdparty/include/MultiMon.h
+++ /dev/null
@@ -1,502 +0,0 @@
-//=============================================================================
-//
-// multimon.h -- Stub module that fakes multiple monitor apis on Win32 OSes
-//               without them.
-//
-// By using this header your code will get back default values from
-// GetSystemMetrics() for new metrics, and the new multimonitor APIs
-// will act like only one display is present on a Win32 OS without
-// multimonitor APIs.
-//
-// Exactly one source must include this with COMPILE_MULTIMON_STUBS defined.
-//
-// Copyright (c) Microsoft Corporation. All rights reserved.
-//
-//=============================================================================
-
-#ifdef __cplusplus
-extern "C" {            // Assume C declarations for C++
-#endif // __cplusplus
-
-//
-// If we are building with Win95/NT4 headers, we need to declare
-// the multimonitor-related metrics and APIs ourselves.
-//
-#ifndef SM_CMONITORS
-
-#define SM_XVIRTUALSCREEN       76
-#define SM_YVIRTUALSCREEN       77
-#define SM_CXVIRTUALSCREEN      78
-#define SM_CYVIRTUALSCREEN      79
-#define SM_CMONITORS            80
-#define SM_SAMEDISPLAYFORMAT    81
-
-// HMONITOR is already declared if WINVER >= 0x0500 in windef.h
-// This is for components built with an older version number.
-//
-#if !defined(HMONITOR_DECLARED) && (WINVER < 0x0500)
-DECLARE_HANDLE(HMONITOR);
-#define HMONITOR_DECLARED
-#endif
-
-#define MONITOR_DEFAULTTONULL       0x00000000
-#define MONITOR_DEFAULTTOPRIMARY    0x00000001
-#define MONITOR_DEFAULTTONEAREST    0x00000002
-
-#define MONITORINFOF_PRIMARY        0x00000001
-
-typedef struct tagMONITORINFO
-{
-    DWORD   cbSize;
-    RECT    rcMonitor;
-    RECT    rcWork;
-    DWORD   dwFlags;
-} MONITORINFO, *LPMONITORINFO;
-
-#ifndef CCHDEVICENAME
-#define CCHDEVICENAME 32
-#endif
-
-#ifdef __cplusplus
-typedef struct tagMONITORINFOEXA : public tagMONITORINFO
-{
-    CHAR        szDevice[CCHDEVICENAME];
-} MONITORINFOEXA, *LPMONITORINFOEXA;
-typedef struct tagMONITORINFOEXW : public tagMONITORINFO
-{
-    WCHAR       szDevice[CCHDEVICENAME];
-} MONITORINFOEXW, *LPMONITORINFOEXW;
-#ifdef UNICODE
-typedef MONITORINFOEXW MONITORINFOEX;
-typedef LPMONITORINFOEXW LPMONITORINFOEX;
-#else
-typedef MONITORINFOEXA MONITORINFOEX;
-typedef LPMONITORINFOEXA LPMONITORINFOEX;
-#endif // UNICODE
-#else // ndef __cplusplus
-typedef struct tagMONITORINFOEXA
-{
-    MONITORINFO;
-    CHAR        szDevice[CCHDEVICENAME];
-} MONITORINFOEXA, *LPMONITORINFOEXA;
-typedef struct tagMONITORINFOEXW
-{
-    MONITORINFO;
-    WCHAR       szDevice[CCHDEVICENAME];
-} MONITORINFOEXW, *LPMONITORINFOEXW;
-#ifdef UNICODE
-typedef MONITORINFOEXW MONITORINFOEX;
-typedef LPMONITORINFOEXW LPMONITORINFOEX;
-#else
-typedef MONITORINFOEXA MONITORINFOEX;
-typedef LPMONITORINFOEXA LPMONITORINFOEX;
-#endif // UNICODE
-#endif
-
-typedef BOOL (CALLBACK* MONITORENUMPROC)(HMONITOR, HDC, LPRECT, LPARAM);
-
-#ifndef DISPLAY_DEVICE_ATTACHED_TO_DESKTOP
-typedef struct _DISPLAY_DEVICEA {
-    DWORD  cb;
-    CHAR   DeviceName[32];
-    CHAR   DeviceString[128];
-    DWORD  StateFlags;
-    CHAR   DeviceID[128];
-    CHAR   DeviceKey[128];
-} DISPLAY_DEVICEA, *PDISPLAY_DEVICEA, *LPDISPLAY_DEVICEA;
-typedef struct _DISPLAY_DEVICEW {
-    DWORD  cb;
-    WCHAR  DeviceName[32];
-    WCHAR  DeviceString[128];
-    DWORD  StateFlags;
-    WCHAR  DeviceID[128];
-    WCHAR  DeviceKey[128];
-} DISPLAY_DEVICEW, *PDISPLAY_DEVICEW, *LPDISPLAY_DEVICEW;
-#ifdef UNICODE
-typedef DISPLAY_DEVICEW DISPLAY_DEVICE;
-typedef PDISPLAY_DEVICEW PDISPLAY_DEVICE;
-typedef LPDISPLAY_DEVICEW LPDISPLAY_DEVICE;
-#else
-typedef DISPLAY_DEVICEA DISPLAY_DEVICE;
-typedef PDISPLAY_DEVICEA PDISPLAY_DEVICE;
-typedef LPDISPLAY_DEVICEA LPDISPLAY_DEVICE;
-#endif // UNICODE
-
-#define DISPLAY_DEVICE_ATTACHED_TO_DESKTOP 0x00000001
-#define DISPLAY_DEVICE_MULTI_DRIVER        0x00000002
-#define DISPLAY_DEVICE_PRIMARY_DEVICE      0x00000004
-#define DISPLAY_DEVICE_MIRRORING_DRIVER    0x00000008
-#define DISPLAY_DEVICE_VGA_COMPATIBLE      0x00000010
-#endif
-
-#endif  // SM_CMONITORS
-
-#undef GetMonitorInfo
-#undef GetSystemMetrics
-#undef MonitorFromWindow
-#undef MonitorFromRect
-#undef MonitorFromPoint
-#undef EnumDisplayMonitors
-#undef EnumDisplayDevices
-
-//
-// Define COMPILE_MULTIMON_STUBS to compile the stubs;
-// otherwise, you get the declarations.
-//
-#ifdef COMPILE_MULTIMON_STUBS
-
-//-----------------------------------------------------------------------------
-//
-// Implement the API stubs.
-//
-//-----------------------------------------------------------------------------
-
-#ifndef _MULTIMON_USE_SECURE_CRT
-#if defined(__GOT_SECURE_LIB__) && __GOT_SECURE_LIB__ >= 200402L
-#define _MULTIMON_USE_SECURE_CRT 1
-#else
-#define _MULTIMON_USE_SECURE_CRT 0
-#endif
-#endif
-
-#ifndef MULTIMON_FNS_DEFINED
-
-int      (WINAPI* g_pfnGetSystemMetrics)(int) = NULL;
-HMONITOR (WINAPI* g_pfnMonitorFromWindow)(HWND, DWORD) = NULL;
-HMONITOR (WINAPI* g_pfnMonitorFromRect)(LPCRECT, DWORD) = NULL;
-HMONITOR (WINAPI* g_pfnMonitorFromPoint)(POINT, DWORD) = NULL;
-BOOL     (WINAPI* g_pfnGetMonitorInfo)(HMONITOR, LPMONITORINFO) = NULL;
-BOOL     (WINAPI* g_pfnEnumDisplayMonitors)(HDC, LPCRECT, MONITORENUMPROC, LPARAM) = NULL;
-BOOL     (WINAPI* g_pfnEnumDisplayDevices)(PVOID, DWORD, PDISPLAY_DEVICE,DWORD) = NULL;
-BOOL     g_fMultiMonInitDone = FALSE;
-BOOL     g_fMultimonPlatformNT = FALSE;
-
-#endif
-
-BOOL IsPlatformNT()
-{
-    OSVERSIONINFOA osvi = {0};
-    osvi.dwOSVersionInfoSize = sizeof(osvi);
-    GetVersionExA((OSVERSIONINFOA*)&osvi);
-    return (VER_PLATFORM_WIN32_NT == osvi.dwPlatformId);
-}
-
-BOOL InitMultipleMonitorStubs(void)
-{
-    HMODULE hUser32;
-    if (g_fMultiMonInitDone)
-    {
-        return g_pfnGetMonitorInfo != NULL;
-    }
-
-    g_fMultimonPlatformNT = IsPlatformNT();
-    hUser32 = GetModuleHandle(TEXT("USER32"));
-    if (hUser32 &&
-        (*(FARPROC*)&g_pfnGetSystemMetrics    = GetProcAddress(hUser32,"GetSystemMetrics")) != NULL &&
-        (*(FARPROC*)&g_pfnMonitorFromWindow   = GetProcAddress(hUser32,"MonitorFromWindow")) != NULL &&
-        (*(FARPROC*)&g_pfnMonitorFromRect     = GetProcAddress(hUser32,"MonitorFromRect")) != NULL &&
-        (*(FARPROC*)&g_pfnMonitorFromPoint    = GetProcAddress(hUser32,"MonitorFromPoint")) != NULL &&
-        (*(FARPROC*)&g_pfnEnumDisplayMonitors = GetProcAddress(hUser32,"EnumDisplayMonitors")) != NULL &&
-#ifdef UNICODE
-        (*(FARPROC*)&g_pfnEnumDisplayDevices  = GetProcAddress(hUser32,"EnumDisplayDevicesW")) != NULL &&
-        (*(FARPROC*)&g_pfnGetMonitorInfo      = g_fMultimonPlatformNT ? GetProcAddress(hUser32,"GetMonitorInfoW") :
-                                                GetProcAddress(hUser32,"GetMonitorInfoA")) != NULL
-#else
-        (*(FARPROC*)&g_pfnGetMonitorInfo      = GetProcAddress(hUser32,"GetMonitorInfoA")) != NULL &&
-        (*(FARPROC*)&g_pfnEnumDisplayDevices  = GetProcAddress(hUser32,"EnumDisplayDevicesA")) != NULL
-#endif
-    ) {
-        g_fMultiMonInitDone = TRUE;
-        return TRUE;
-    }
-    else
-    {
-        g_pfnGetSystemMetrics    = NULL;
-        g_pfnMonitorFromWindow   = NULL;
-        g_pfnMonitorFromRect     = NULL;
-        g_pfnMonitorFromPoint    = NULL;
-        g_pfnGetMonitorInfo      = NULL;
-        g_pfnEnumDisplayMonitors = NULL;
-        g_pfnEnumDisplayDevices  = NULL;
-
-        g_fMultiMonInitDone = TRUE;
-        return FALSE;
-    }
-}
-
-//-----------------------------------------------------------------------------
-//
-// fake implementations of Monitor APIs that work with the primary display
-// no special parameter validation is made since these run in client code
-//
-//-----------------------------------------------------------------------------
-
-int WINAPI
-xGetSystemMetrics(int nIndex)
-{
-    if (InitMultipleMonitorStubs())
-        return g_pfnGetSystemMetrics(nIndex);
-
-    switch (nIndex)
-    {
-    case SM_CMONITORS:
-    case SM_SAMEDISPLAYFORMAT:
-        return 1;
-
-    case SM_XVIRTUALSCREEN:
-    case SM_YVIRTUALSCREEN:
-        return 0;
-
-    case SM_CXVIRTUALSCREEN:
-        nIndex = SM_CXSCREEN;
-        break;
-
-    case SM_CYVIRTUALSCREEN:
-        nIndex = SM_CYSCREEN;
-        break;
-    }
-
-    return GetSystemMetrics(nIndex);
-}
-
-#define xPRIMARY_MONITOR ((HMONITOR)0x12340042)
-
-HMONITOR WINAPI
-xMonitorFromPoint(POINT ptScreenCoords, DWORD dwFlags)
-{
-    if (InitMultipleMonitorStubs())
-        return g_pfnMonitorFromPoint(ptScreenCoords, dwFlags);
-
-    if ((dwFlags & (MONITOR_DEFAULTTOPRIMARY | MONITOR_DEFAULTTONEAREST)) ||
-        ((ptScreenCoords.x >= 0) &&
-        (ptScreenCoords.x < GetSystemMetrics(SM_CXSCREEN)) &&
-        (ptScreenCoords.y >= 0) &&
-        (ptScreenCoords.y < GetSystemMetrics(SM_CYSCREEN))))
-    {
-        return xPRIMARY_MONITOR;
-    }
-
-    return NULL;
-}
-
-HMONITOR WINAPI
-xMonitorFromRect(LPCRECT lprcScreenCoords, DWORD dwFlags)
-{
-    if (InitMultipleMonitorStubs())
-        return g_pfnMonitorFromRect(lprcScreenCoords, dwFlags);
-
-    if ((dwFlags & (MONITOR_DEFAULTTOPRIMARY | MONITOR_DEFAULTTONEAREST)) ||
-        ((lprcScreenCoords->right > 0) &&
-        (lprcScreenCoords->bottom > 0) &&
-        (lprcScreenCoords->left < GetSystemMetrics(SM_CXSCREEN)) &&
-        (lprcScreenCoords->top < GetSystemMetrics(SM_CYSCREEN))))
-    {
-        return xPRIMARY_MONITOR;
-    }
-
-    return NULL;
-}
-
-HMONITOR WINAPI
-xMonitorFromWindow(HWND hWnd, DWORD dwFlags)
-{
-    WINDOWPLACEMENT wp;
-
-    if (InitMultipleMonitorStubs())
-        return g_pfnMonitorFromWindow(hWnd, dwFlags);
-
-    if (dwFlags & (MONITOR_DEFAULTTOPRIMARY | MONITOR_DEFAULTTONEAREST))
-        return xPRIMARY_MONITOR;
-
-    if (IsIconic(hWnd) ?
-            GetWindowPlacement(hWnd, &wp) :
-            GetWindowRect(hWnd, &wp.rcNormalPosition)) {
-
-        return xMonitorFromRect(&wp.rcNormalPosition, dwFlags);
-    }
-
-    return NULL;
-}
-
-BOOL WINAPI
-xGetMonitorInfo(HMONITOR hMonitor, __inout LPMONITORINFO lpMonitorInfo)
-{
-    RECT rcWork;
-
-    if (InitMultipleMonitorStubs())
-    {
-        BOOL f = g_pfnGetMonitorInfo(hMonitor, lpMonitorInfo);
-#ifdef UNICODE
-        if (f && !g_fMultimonPlatformNT && (lpMonitorInfo->cbSize >= sizeof(MONITORINFOEX)))
-        {
-            MultiByteToWideChar(CP_ACP, 0,
-                (LPSTR)((MONITORINFOEX*)lpMonitorInfo)->szDevice, -1,
-                ((MONITORINFOEX*)lpMonitorInfo)->szDevice, (sizeof(((MONITORINFOEX*)lpMonitorInfo)->szDevice)/sizeof(TCHAR)));
-        }
-#endif
-        return f;
-    }
-
-    if ((hMonitor == xPRIMARY_MONITOR) &&
-        lpMonitorInfo &&
-        (lpMonitorInfo->cbSize >= sizeof(MONITORINFO)) &&
-        SystemParametersInfoA(SPI_GETWORKAREA, 0, &rcWork, 0))
-    {
-        lpMonitorInfo->rcMonitor.left = 0;
-        lpMonitorInfo->rcMonitor.top  = 0;
-        lpMonitorInfo->rcMonitor.right  = GetSystemMetrics(SM_CXSCREEN);
-        lpMonitorInfo->rcMonitor.bottom = GetSystemMetrics(SM_CYSCREEN);
-        lpMonitorInfo->rcWork = rcWork;
-        lpMonitorInfo->dwFlags = MONITORINFOF_PRIMARY;
-
-        if (lpMonitorInfo->cbSize >= sizeof(MONITORINFOEX))
-        {
-#ifdef UNICODE
-            MultiByteToWideChar(CP_ACP, 0, "DISPLAY", -1, ((MONITORINFOEX*)lpMonitorInfo)->szDevice, (sizeof(((MONITORINFOEX*)lpMonitorInfo)->szDevice)/sizeof(TCHAR)));
-#else // UNICODE
-#if _MULTIMON_USE_SECURE_CRT
-            strncpy_s(((MONITORINFOEX*)lpMonitorInfo)->szDevice, (sizeof(((MONITORINFOEX*)lpMonitorInfo)->szDevice)/sizeof(TCHAR)), TEXT("DISPLAY"), (sizeof(((MONITORINFOEX*)lpMonitorInfo)->szDevice)/sizeof(TCHAR)) - 1);
-#else
-            lstrcpyn(((MONITORINFOEX*)lpMonitorInfo)->szDevice, TEXT("DISPLAY"), (sizeof(((MONITORINFOEX*)lpMonitorInfo)->szDevice)/sizeof(TCHAR)));
-#endif // _MULTIMON_USE_SECURE_CRT
-#endif // UNICODE
-        }
-
-        return TRUE;
-    }
-
-    return FALSE;
-}
-
-BOOL WINAPI
-xEnumDisplayMonitors(
-        HDC             hdcOptionalForPainting,
-        LPCRECT         lprcEnumMonitorsThatIntersect,
-        MONITORENUMPROC lpfnEnumProc,
-        LPARAM          dwData)
-{
-    RECT rcLimit;
-
-    if (InitMultipleMonitorStubs()) {
-        return g_pfnEnumDisplayMonitors(
-                hdcOptionalForPainting,
-                lprcEnumMonitorsThatIntersect,
-                lpfnEnumProc,
-                dwData);
-    }
-
-    if (!lpfnEnumProc)
-        return FALSE;
-
-    rcLimit.left   = 0;
-    rcLimit.top    = 0;
-    rcLimit.right  = GetSystemMetrics(SM_CXSCREEN);
-    rcLimit.bottom = GetSystemMetrics(SM_CYSCREEN);
-
-    if (hdcOptionalForPainting)
-    {
-        RECT    rcClip;
-        POINT   ptOrg;
-
-        switch (GetClipBox(hdcOptionalForPainting, &rcClip))
-        {
-        default:
-            if (!GetDCOrgEx(hdcOptionalForPainting, &ptOrg))
-                return FALSE;
-
-            OffsetRect(&rcLimit, -ptOrg.x, -ptOrg.y);
-            if (IntersectRect(&rcLimit, &rcLimit, &rcClip) &&
-                (!lprcEnumMonitorsThatIntersect ||
-                     IntersectRect(&rcLimit, &rcLimit, lprcEnumMonitorsThatIntersect))) {
-
-                break;
-            }
-            //fall thru
-        case NULLREGION:
-             return TRUE;
-        case ERROR:
-             return FALSE;
-        }
-    } else {
-        if (    lprcEnumMonitorsThatIntersect &&
-                !IntersectRect(&rcLimit, &rcLimit, lprcEnumMonitorsThatIntersect)) {
-
-            return TRUE;
-        }
-    }
-
-    return lpfnEnumProc(
-            xPRIMARY_MONITOR,
-            hdcOptionalForPainting,
-            &rcLimit,
-            dwData);
-}
-
-BOOL WINAPI
-xEnumDisplayDevices(
-    PVOID Unused,
-    DWORD iDevNum,
-    __inout PDISPLAY_DEVICE lpDisplayDevice,
-    DWORD dwFlags)
-{
-    if (InitMultipleMonitorStubs())
-        return g_pfnEnumDisplayDevices(Unused, iDevNum, lpDisplayDevice, dwFlags);
-
-    if (Unused != NULL)
-        return FALSE;
-
-    if (iDevNum != 0)
-        return FALSE;
-
-    if (lpDisplayDevice == NULL || lpDisplayDevice->cb < sizeof(DISPLAY_DEVICE))
-        return FALSE;
-
-#ifdef UNICODE
-    MultiByteToWideChar(CP_ACP, 0, "DISPLAY", -1, lpDisplayDevice->DeviceName, (sizeof(lpDisplayDevice->DeviceName)/sizeof(TCHAR)));
-    MultiByteToWideChar(CP_ACP, 0, "DISPLAY", -1, lpDisplayDevice->DeviceString, (sizeof(lpDisplayDevice->DeviceString)/sizeof(TCHAR)));
-#else // UNICODE
-#if _MULTIMON_USE_SECURE_CRT
-    strncpy_s((LPTSTR)lpDisplayDevice->DeviceName, (sizeof(lpDisplayDevice->DeviceName)/sizeof(TCHAR)), TEXT("DISPLAY"), (sizeof(lpDisplayDevice->DeviceName)/sizeof(TCHAR)) - 1);
-    strncpy_s((LPTSTR)lpDisplayDevice->DeviceString, (sizeof(lpDisplayDevice->DeviceString)/sizeof(TCHAR)), TEXT("DISPLAY"), (sizeof(lpDisplayDevice->DeviceName)/sizeof(TCHAR)) - 1);
-#else
-    lstrcpyn((LPTSTR)lpDisplayDevice->DeviceName,   TEXT("DISPLAY"), (sizeof(lpDisplayDevice->DeviceName)/sizeof(TCHAR)));
-    lstrcpyn((LPTSTR)lpDisplayDevice->DeviceString, TEXT("DISPLAY"), (sizeof(lpDisplayDevice->DeviceString)/sizeof(TCHAR)));
-#endif // _MULTIMON_USE_SECURE_CRT
-#endif // UNICODE
-
-    lpDisplayDevice->StateFlags = DISPLAY_DEVICE_ATTACHED_TO_DESKTOP | DISPLAY_DEVICE_PRIMARY_DEVICE;
-
-    return TRUE;
-}
-
-#undef xPRIMARY_MONITOR
-#undef COMPILE_MULTIMON_STUBS
-
-#else   // COMPILE_MULTIMON_STUBS
-
-extern int  WINAPI xGetSystemMetrics(int);
-extern HMONITOR WINAPI xMonitorFromWindow(HWND, DWORD);
-extern HMONITOR WINAPI xMonitorFromRect(LPCRECT, DWORD);
-extern HMONITOR WINAPI xMonitorFromPoint(POINT, DWORD);
-extern BOOL WINAPI xGetMonitorInfo(HMONITOR, LPMONITORINFO);
-extern BOOL WINAPI xEnumDisplayMonitors(HDC, LPCRECT, MONITORENUMPROC, LPARAM);
-extern BOOL WINAPI xEnumDisplayDevices(PVOID, DWORD, PDISPLAY_DEVICE, DWORD);
-
-#endif  // COMPILE_MULTIMON_STUBS
-
-//
-// build defines that replace the regular APIs with our versions
-//
-#define GetSystemMetrics    xGetSystemMetrics
-#define MonitorFromWindow   xMonitorFromWindow
-#define MonitorFromRect     xMonitorFromRect
-#define MonitorFromPoint    xMonitorFromPoint
-#define GetMonitorInfo      xGetMonitorInfo
-#define EnumDisplayMonitors xEnumDisplayMonitors
-#define EnumDisplayDevices  xEnumDisplayDevices
-
-#ifdef __cplusplus
-}
-#endif  // __cplusplus
-
-
diff --git a/3rdparty/include/opencl/1.2/CL/cl.hpp b/3rdparty/include/opencl/1.2/CL/cl.hpp
index 0480e3116..2502d4c52 100644
--- a/3rdparty/include/opencl/1.2/CL/cl.hpp
+++ b/3rdparty/include/opencl/1.2/CL/cl.hpp
@@ -210,7 +210,7 @@
 #include <string>
 #endif 
 
-#if defined(linux) || defined(__APPLE__) || defined(__MACOSX)
+#if defined(__linux__) || defined(__APPLE__) || defined(__MACOSX)
 #include <alloca.h>
 
 #include <emmintrin.h>
diff --git a/3rdparty/include/opencl/1.2/CL/cl_platform.h b/3rdparty/include/opencl/1.2/CL/cl_platform.h
index e94949a31..42c35d5ce 100644
--- a/3rdparty/include/opencl/1.2/CL/cl_platform.h
+++ b/3rdparty/include/opencl/1.2/CL/cl_platform.h
@@ -332,13 +332,13 @@ typedef unsigned int cl_GLenum;
 /* Define basic vector types */
 #if defined( __VEC__ )
    #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
-   typedef vector unsigned char     __cl_uchar16;
-   typedef vector signed char       __cl_char16;
-   typedef vector unsigned short    __cl_ushort8;
-   typedef vector signed short      __cl_short8;
-   typedef vector unsigned int      __cl_uint4;
-   typedef vector signed int        __cl_int4;
-   typedef vector float             __cl_float4;
+   typedef __vector unsigned char     __cl_uchar16;
+   typedef __vector signed char       __cl_char16;
+   typedef __vector unsigned short    __cl_ushort8;
+   typedef __vector signed short      __cl_short8;
+   typedef __vector unsigned int      __cl_uint4;
+   typedef __vector signed int        __cl_int4;
+   typedef __vector float             __cl_float4;
    #define  __CL_UCHAR16__  1
    #define  __CL_CHAR16__   1
    #define  __CL_USHORT8__  1
diff --git a/3rdparty/libjasper/CMakeLists.txt b/3rdparty/libjasper/CMakeLists.txt
index 4e6aa45a7..c1a822612 100644
--- a/3rdparty/libjasper/CMakeLists.txt
+++ b/3rdparty/libjasper/CMakeLists.txt
@@ -47,5 +47,5 @@ if(ENABLE_SOLUTION_FOLDERS)
 endif()
 
 if(NOT BUILD_SHARED_LIBS)
-  ocv_install_target(${JASPER_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
+  ocv_install_target(${JASPER_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
 endif()
diff --git a/3rdparty/libjpeg/CMakeLists.txt b/3rdparty/libjpeg/CMakeLists.txt
index 49730edf8..028a583cf 100644
--- a/3rdparty/libjpeg/CMakeLists.txt
+++ b/3rdparty/libjpeg/CMakeLists.txt
@@ -46,5 +46,5 @@ if(ENABLE_SOLUTION_FOLDERS)
 endif()
 
 if(NOT BUILD_SHARED_LIBS)
-  ocv_install_target(${JPEG_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
+  ocv_install_target(${JPEG_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
 endif()
diff --git a/3rdparty/libpng/CMakeLists.txt b/3rdparty/libpng/CMakeLists.txt
index 2ecbe3f83..7dd06dfd4 100644
--- a/3rdparty/libpng/CMakeLists.txt
+++ b/3rdparty/libpng/CMakeLists.txt
@@ -55,5 +55,5 @@ if(ENABLE_SOLUTION_FOLDERS)
 endif()
 
 if(NOT BUILD_SHARED_LIBS)
-  ocv_install_target(${PNG_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
+  ocv_install_target(${PNG_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
 endif()
diff --git a/3rdparty/libtiff/CMakeLists.txt b/3rdparty/libtiff/CMakeLists.txt
index 16d23c132..7a974dbbc 100644
--- a/3rdparty/libtiff/CMakeLists.txt
+++ b/3rdparty/libtiff/CMakeLists.txt
@@ -115,5 +115,5 @@ if(ENABLE_SOLUTION_FOLDERS)
 endif()
 
 if(NOT BUILD_SHARED_LIBS)
-  ocv_install_target(${TIFF_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
+  ocv_install_target(${TIFF_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
 endif()
diff --git a/3rdparty/libtiff/tif_config.h.cmakein b/3rdparty/libtiff/tif_config.h.cmakein
index 182f2833d..24f58119b 100644
--- a/3rdparty/libtiff/tif_config.h.cmakein
+++ b/3rdparty/libtiff/tif_config.h.cmakein
@@ -54,7 +54,7 @@
 
 /* Native cpu byte order: 1 if big-endian (Motorola) or 0 if little-endian
    (Intel) */
-#define HOST_BIGENDIAN 0
+#define HOST_BIGENDIAN @WORDS_BIGENDIAN@
 
 /* Set the native cpu bit order (FILLORDER_LSB2MSB or FILLORDER_MSB2LSB) */
 #define HOST_FILLORDER FILLORDER_LSB2MSB
@@ -156,15 +156,7 @@
 
 /* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
    significant byte first (like Motorola and SPARC, unlike Intel). */
-#if defined AC_APPLE_UNIVERSAL_BUILD
-# if defined __BIG_ENDIAN__
-#  define WORDS_BIGENDIAN 1
-# endif
-#else
-# ifndef WORDS_BIGENDIAN
-/* #  undef WORDS_BIGENDIAN */
-# endif
-#endif
+#cmakedefine WORDS_BIGENDIAN 1
 
 /* Support Deflate compression */
 #define ZIP_SUPPORT 1
diff --git a/3rdparty/openexr/CMakeLists.txt b/3rdparty/openexr/CMakeLists.txt
index 1d48c7c7d..e15bc5270 100644
--- a/3rdparty/openexr/CMakeLists.txt
+++ b/3rdparty/openexr/CMakeLists.txt
@@ -64,7 +64,7 @@ if(ENABLE_SOLUTION_FOLDERS)
 endif()
 
 if(NOT BUILD_SHARED_LIBS)
-  ocv_install_target(IlmImf EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
+  ocv_install_target(IlmImf EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
 endif()
 
 set(OPENEXR_INCLUDE_PATHS ${OPENEXR_INCLUDE_PATHS} PARENT_SCOPE)
diff --git a/3rdparty/tbb/CMakeLists.txt b/3rdparty/tbb/CMakeLists.txt
index f91303d47..06de24981 100644
--- a/3rdparty/tbb/CMakeLists.txt
+++ b/3rdparty/tbb/CMakeLists.txt
@@ -232,9 +232,9 @@ if(ENABLE_SOLUTION_FOLDERS)
 endif()
 
 ocv_install_target(tbb EXPORT OpenCVModules
-    RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT main
-    LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main
-    ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main
+    RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs
+    LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT libs
+    ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev
     )
 
 # get TBB version
diff --git a/3rdparty/zlib/CMakeLists.txt b/3rdparty/zlib/CMakeLists.txt
index f1b28fd39..410f2420b 100644
--- a/3rdparty/zlib/CMakeLists.txt
+++ b/3rdparty/zlib/CMakeLists.txt
@@ -95,5 +95,5 @@ if(ENABLE_SOLUTION_FOLDERS)
 endif()
 
 if(NOT BUILD_SHARED_LIBS)
-  ocv_install_target(${ZLIB_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
+  ocv_install_target(${ZLIB_LIBRARY} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
 endif()
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4fff2d614..2d4704779 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,7 +116,7 @@ endif()
 OCV_OPTION(WITH_1394           "Include IEEE1394 support"                    ON   IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_AVFOUNDATION   "Use AVFoundation for Video I/O"              ON   IF IOS)
 OCV_OPTION(WITH_CARBON         "Use Carbon for UI instead of Cocoa"          OFF  IF APPLE )
-OCV_OPTION(WITH_VTK            "Include VTK library support (and build opencv_viz module eiher)"             OFF IF (NOT ANDROID AND NOT IOS) )
+OCV_OPTION(WITH_VTK            "Include VTK library support (and build opencv_viz module eiher)"             ON  IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_CUDA           "Include NVidia Cuda Runtime support"                                         ON  IF (NOT IOS) )
 OCV_OPTION(WITH_CUFFT          "Include NVidia Cuda Fast Fourier Transform (FFT) library support"            ON  IF (NOT IOS) )
 OCV_OPTION(WITH_CUBLAS         "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (NOT IOS) )
@@ -132,7 +132,7 @@ OCV_OPTION(WITH_JASPER         "Include JPEG2K support"                      ON
 OCV_OPTION(WITH_JPEG           "Include JPEG support"                        ON)
 OCV_OPTION(WITH_WEBP           "Include WebP support"                        ON   IF (NOT IOS) )
 OCV_OPTION(WITH_OPENEXR        "Include ILM support via OpenEXR"             ON   IF (NOT IOS) )
-OCV_OPTION(WITH_OPENGL         "Include OpenGL support"                      OFF  IF (NOT ANDROID AND NOT APPLE) )
+OCV_OPTION(WITH_OPENGL         "Include OpenGL support"                      OFF  IF (NOT ANDROID) )
 OCV_OPTION(WITH_OPENNI         "Include OpenNI support"                      OFF  IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_PNG            "Include PNG support"                         ON)
 OCV_OPTION(WITH_PVAPI          "Include Prosilica GigE support"              ON   IF (NOT ANDROID AND NOT IOS) )
@@ -156,6 +156,7 @@ OCV_OPTION(WITH_OPENCL         "Include OpenCL Runtime support"              ON
 OCV_OPTION(WITH_OPENCLAMDFFT   "Include AMD OpenCL FFT library support"      ON   IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_OPENCLAMDBLAS  "Include AMD OpenCL BLAS library support"     ON   IF (NOT ANDROID AND NOT IOS) )
 OCV_OPTION(WITH_DIRECTX        "Include DirectX support"                     ON   IF WIN32 )
+OCV_OPTION(WITH_INTELPERC      "Include Intel Perceptual Computing support"  OFF  IF WIN32 )
 
 
 # OpenCV build components
@@ -190,13 +191,14 @@ OCV_OPTION(INSTALL_C_EXAMPLES       "Install C examples"        OFF )
 OCV_OPTION(INSTALL_PYTHON_EXAMPLES  "Install Python examples"   OFF )
 OCV_OPTION(INSTALL_ANDROID_EXAMPLES "Install Android examples"  OFF IF ANDROID )
 OCV_OPTION(INSTALL_TO_MANGLED_PATHS "Enables mangled install paths, that help with side by side installs." OFF IF (UNIX AND NOT ANDROID AND NOT IOS AND BUILD_SHARED_LIBS) )
-
+OCV_OPTION(INSTALL_TESTS            "Install accuracy and performance test binaries and test data" OFF)
 
 # OpenCV build options
 # ===================================================
 OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers"                                  ON   IF (NOT IOS) )
 OCV_OPTION(ENABLE_SOLUTION_FOLDERS    "Solution folder in Visual Studio or in other IDEs"        (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) )
 OCV_OPTION(ENABLE_PROFILING           "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF  IF CMAKE_COMPILER_IS_GNUCXX )
+OCV_OPTION(ENABLE_COVERAGE            "Enable coverage collection with  GCov"                    OFF  IF CMAKE_COMPILER_IS_GNUCXX )
 OCV_OPTION(ENABLE_OMIT_FRAME_POINTER  "Enable -fomit-frame-pointer for GCC"                      ON   IF CMAKE_COMPILER_IS_GNUCXX AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX) )
 OCV_OPTION(ENABLE_POWERPC             "Enable PowerPC for GCC"                                   ON   IF (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
 OCV_OPTION(ENABLE_FAST_MATH           "Enable -ffast-math (not recommended for GCC 4.6.x)"       OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
@@ -207,10 +209,12 @@ OCV_OPTION(ENABLE_SSSE3               "Enable SSSE3 instructions"
 OCV_OPTION(ENABLE_SSE41               "Enable SSE4.1 instructions"                               OFF  IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE42               "Enable SSE4.2 instructions"                               OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_AVX                 "Enable AVX instructions"                                  OFF  IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_NEON                "Enable NEON instructions"                                 OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND ARM) )
+OCV_OPTION(ENABLE_NEON                "Enable NEON instructions"                                 OFF  IF CMAKE_COMPILER_IS_GNUCXX AND ARM )
+OCV_OPTION(ENABLE_VFPV3               "Enable VFPv3-D32 instructions"                            OFF  IF CMAKE_COMPILER_IS_GNUCXX AND ARM )
 OCV_OPTION(ENABLE_NOISY_WARNINGS      "Show all warnings even if they are too noisy"             OFF )
 OCV_OPTION(OPENCV_WARNINGS_ARE_ERRORS "Treat warnings as errors"                                 OFF )
 OCV_OPTION(ENABLE_WINRT_MODE          "Build with Windows Runtime support"                       OFF  IF WIN32 )
+OCV_OPTION(ENABLE_WINRT_MODE_NATIVE   "Build with Windows Runtime native C++ support"            OFF  IF WIN32 )
 
 
 # ----------------------------------------------------------------------------
@@ -226,6 +230,15 @@ include(cmake/OpenCVVersion.cmake)
 # Save libs and executables in the same place
 set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications" )
 
+if (ANDROID)
+  if (ANDROID_ABI MATCHES "NEON")
+    set(ENABLE_NEON ON)
+  endif()
+  if (ANDROID_ABI MATCHES "VFPV3")
+    set(ENABLE_VFPV3 ON)
+  endif()
+endif()
+
 if(ANDROID OR WIN32)
   set(OPENCV_DOC_INSTALL_PATH doc)
 elseif(INSTALL_TO_MANGLED_PATHS)
@@ -241,13 +254,27 @@ if(WIN32)
     message(STATUS "Can't detect runtime and/or arch")
     set(OpenCV_INSTALL_BINARIES_PREFIX "")
   endif()
+elseif(ANDROID)
+  set(OpenCV_INSTALL_BINARIES_PREFIX "sdk/native/")
 else()
   set(OpenCV_INSTALL_BINARIES_PREFIX "")
 endif()
 
-set(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples")
+if(ANDROID)
+  set(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples/${ANDROID_NDK_ABI_NAME}")
+else()
+  set(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples")
+endif()
 
-set(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin")
+if(ANDROID)
+  set(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin/${ANDROID_NDK_ABI_NAME}")
+else()
+  set(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin")
+endif()
+
+if(NOT OPENCV_TEST_INSTALL_PATH)
+  set(OPENCV_TEST_INSTALL_PATH "${OPENCV_BIN_INSTALL_PATH}")
+endif()
 
 if(ANDROID)
   set(LIBRARY_OUTPUT_PATH         "${OpenCV_BINARY_DIR}/lib/${ANDROID_NDK_ABI_NAME}")
@@ -256,6 +283,7 @@ if(ANDROID)
   set(OPENCV_3P_LIB_INSTALL_PATH  sdk/native/3rdparty/libs/${ANDROID_NDK_ABI_NAME})
   set(OPENCV_CONFIG_INSTALL_PATH  sdk/native/jni)
   set(OPENCV_INCLUDE_INSTALL_PATH sdk/native/jni/include)
+  set(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native)
 else()
   set(LIBRARY_OUTPUT_PATH         "${OpenCV_BINARY_DIR}/lib")
   set(3P_LIBRARY_OUTPUT_PATH      "${OpenCV_BINARY_DIR}/3rdparty/lib${LIB_SUFFIX}")
@@ -266,9 +294,11 @@ else()
       set(OPENCV_LIB_INSTALL_PATH   "${OpenCV_INSTALL_BINARIES_PREFIX}lib${LIB_SUFFIX}")
     endif()
     set(OPENCV_3P_LIB_INSTALL_PATH  "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}")
+    set(OPENCV_SAMPLES_SRC_INSTALL_PATH    samples/native)
   else()
     set(OPENCV_LIB_INSTALL_PATH     lib${LIB_SUFFIX})
     set(OPENCV_3P_LIB_INSTALL_PATH  share/OpenCV/3rdparty/${OPENCV_LIB_INSTALL_PATH})
+    set(OPENCV_SAMPLES_SRC_INSTALL_PATH    share/OpenCV/samples)
   endif()
   set(OPENCV_INCLUDE_INSTALL_PATH "include")
 
@@ -373,6 +403,8 @@ if(UNIX)
       set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} dl m log)
     elseif(${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|NetBSD|DragonFly")
       set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} m pthread)
+    elseif(EMSCRIPTEN)
+      # no need to link to system libs with emscripten
     else()
       set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} dl m pthread rt)
     endif()
@@ -384,6 +416,19 @@ endif()
 include(cmake/OpenCVPCHSupport.cmake)
 include(cmake/OpenCVModule.cmake)
 
+# ----------------------------------------------------------------------------
+#  Detect endianness of build platform
+# ----------------------------------------------------------------------------
+
+if(CMAKE_SYSTEM_NAME STREQUAL iOS)
+  # test_big_endian needs try_compile, which doesn't work for iOS
+  # http://public.kitware.com/Bug/view.php?id=12288
+  set(WORDS_BIGENDIAN 0)
+else()
+  include(TestBigEndian)
+  test_big_endian(WORDS_BIGENDIAN)
+endif()
+
 # ----------------------------------------------------------------------------
 #  Detect 3rd-party libraries
 # ----------------------------------------------------------------------------
@@ -521,6 +566,49 @@ include(cmake/OpenCVGenConfig.cmake)
 # Generate Info.plist for the IOS framework
 include(cmake/OpenCVGenInfoPlist.cmake)
 
+# Generate environment setup file
+if(INSTALL_TESTS AND OPENCV_TEST_DATA_PATH AND UNIX)
+  if(ANDROID)
+    get_filename_component(TEST_PATH ${OPENCV_TEST_INSTALL_PATH} DIRECTORY)
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/opencv_run_all_tests_android.sh.in"
+                   "${CMAKE_BINARY_DIR}/unix-install/opencv_run_all_tests.sh" @ONLY)
+    install(PROGRAMS "${CMAKE_BINARY_DIR}/unix-install/opencv_run_all_tests.sh"
+            DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT tests)
+  else()
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/opencv_testing.sh.in"
+                   "${CMAKE_BINARY_DIR}/unix-install/opencv_testing.sh" @ONLY)
+    install(FILES "${CMAKE_BINARY_DIR}/unix-install/opencv_testing.sh"
+            DESTINATION /etc/profile.d/ COMPONENT tests)
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/templates/opencv_run_all_tests_unix.sh.in"
+                   "${CMAKE_BINARY_DIR}/unix-install/opencv_run_all_tests.sh" @ONLY)
+    install(PROGRAMS "${CMAKE_BINARY_DIR}/unix-install/opencv_run_all_tests.sh"
+            DESTINATION ${OPENCV_TEST_INSTALL_PATH} COMPONENT tests)
+
+  endif()
+endif()
+
+if(NOT OPENCV_README_FILE)
+  if(ANDROID)
+    set(OPENCV_README_FILE ${CMAKE_CURRENT_SOURCE_DIR}/platforms/android/README.android)
+  endif()
+endif()
+
+if(NOT OPENCV_LICENSE_FILE)
+  set(OPENCV_LICENSE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE)
+endif()
+
+# for UNIX it does not make sense as LICENSE and readme will be part of the package automatically
+if(ANDROID OR NOT UNIX)
+  install(FILES ${OPENCV_LICENSE_FILE}
+        PERMISSIONS OWNER_READ GROUP_READ WORLD_READ
+        DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT libs)
+  if(OPENCV_README_FILE)
+    install(FILES ${OPENCV_README_FILE}
+            PERMISSIONS OWNER_READ GROUP_READ WORLD_READ
+            DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT libs)
+  endif()
+endif()
+
 # ----------------------------------------------------------------------------
 # Summary:
 # ----------------------------------------------------------------------------
@@ -630,7 +718,7 @@ endif()
 if(WIN32)
 status("")
     status("  Windows RT support:" HAVE_WINRT THEN YES ELSE NO)
-    if (ENABLE_WINRT_MODE)
+    if (ENABLE_WINRT_MODE OR ENABLE_WINRT_MODE_NATIVE)
       status("    Windows SDK v8.0:" ${WINDOWS_SDK_PATH})
       status("    Visual Studio 2012:" ${VISUAL_STUDIO_PATH})
     endif()
@@ -820,6 +908,11 @@ if(DEFINED WITH_XINE)
   status("    Xine:"           HAVE_XINE           THEN "YES (ver ${ALIASOF_libxine_VERSION})"     ELSE NO)
 endif(DEFINED WITH_XINE)
 
+if(DEFINED WITH_INTELPERC)
+  status("    Intel PerC:"     HAVE_INTELPERC      THEN "YES"                                 ELSE NO)
+endif(DEFINED WITH_INTELPERC)
+
+
 # ========================== Other third-party libraries ==========================
 status("")
 status("  Other third-party libraries:")
@@ -952,3 +1045,9 @@ ocv_finalize_status()
 if("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_BINARY_DIR}")
   message(WARNING "The source directory is the same as binary directory. \"make clean\" may damage the source tree")
 endif()
+
+# ----------------------------------------------------------------------------
+# CPack stuff
+# ----------------------------------------------------------------------------
+
+include(cmake/OpenCVPackaging.cmake)
diff --git a/README.md b/README.md
index 403f118ee..3a26ad855 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 ### OpenCV: Open Source Computer Vision Library
 
+[![Gittip](http://img.shields.io/gittip/OpenCV.png)](https://www.gittip.com/OpenCV/)
+
 #### Resources
 
 * Homepage: <http://opencv.org>
@@ -18,6 +20,3 @@ Summary of guidelines:
 * Include tests and documentation;
 * Clean up "oops" commits before submitting;
 * Follow the coding style guide.
-
-[![Donate OpenCV project](http://opencv.org/wp-content/uploads/2013/07/gittip1.png)](https://www.gittip.com/OpenCV/)
-[![Donate OpenCV project](http://opencv.org/wp-content/uploads/2013/07/paypal-donate-button.png)](https://www.paypal.com/cgi-bin/webscr?item_name=Donation+to+OpenCV&cmd=_donations&business=accountant%40opencv.org)
\ No newline at end of file
diff --git a/apps/haartraining/CMakeLists.txt b/apps/haartraining/CMakeLists.txt
index 92fdf914b..63bbff635 100644
--- a/apps/haartraining/CMakeLists.txt
+++ b/apps/haartraining/CMakeLists.txt
@@ -71,14 +71,14 @@ set_target_properties(opencv_performance PROPERTIES
 
 if(INSTALL_CREATE_DISTRIB)
   if(BUILD_SHARED_LIBS)
-    install(TARGETS opencv_haartraining RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT main)
-    install(TARGETS opencv_createsamples RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT main)
-    install(TARGETS opencv_performance RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT main)
+    install(TARGETS opencv_haartraining RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
+    install(TARGETS opencv_createsamples RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
+    install(TARGETS opencv_performance RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
   endif()
 else()
-  install(TARGETS opencv_haartraining RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT main)
-  install(TARGETS opencv_createsamples RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT main)
-  install(TARGETS opencv_performance RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT main)
+  install(TARGETS opencv_haartraining RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
+  install(TARGETS opencv_createsamples RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
+  install(TARGETS opencv_performance RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
 endif()
 
 if(ENABLE_SOLUTION_FOLDERS)
diff --git a/apps/haartraining/cvclassifier.h b/apps/haartraining/cvclassifier.h
index 9a4344118..c1ae7f5ae 100644
--- a/apps/haartraining/cvclassifier.h
+++ b/apps/haartraining/cvclassifier.h
@@ -340,7 +340,7 @@ typedef enum CvBoostType
     CV_LKCLASS  = 5, /* classification (K class problem)    */
     CV_LSREG    = 6, /* least squares regression            */
     CV_LADREG   = 7, /* least absolute deviation regression */
-    CV_MREG     = 8, /* M-regression (Huber loss)           */
+    CV_MREG     = 8  /* M-regression (Huber loss)           */
 } CvBoostType;
 
 /****************************************************************************************\
diff --git a/apps/traincascade/CMakeLists.txt b/apps/traincascade/CMakeLists.txt
index 2d1162175..f36e4b247 100644
--- a/apps/traincascade/CMakeLists.txt
+++ b/apps/traincascade/CMakeLists.txt
@@ -35,8 +35,8 @@ endif()
 
 if(INSTALL_CREATE_DISTRIB)
   if(BUILD_SHARED_LIBS)
-    install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT main)
+    install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} CONFIGURATIONS Release COMPONENT dev)
   endif()
 else()
-  install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT main)
+  install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT dev)
 endif()
diff --git a/cmake/OpenCVCRTLinkage.cmake b/cmake/OpenCVCRTLinkage.cmake
index 8a297c685..5265e3e8a 100644
--- a/cmake/OpenCVCRTLinkage.cmake
+++ b/cmake/OpenCVCRTLinkage.cmake
@@ -9,7 +9,7 @@ set(HAVE_WINRT FALSE)
 # search Windows Platform SDK
 message(STATUS "Checking for Windows Platform SDK")
 GET_FILENAME_COMPONENT(WINDOWS_SDK_PATH  "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Microsoft SDKs\\Windows\\v8.0;InstallationFolder]" ABSOLUTE CACHE)
-if (WINDOWS_SDK_PATH STREQUAL "")
+if(WINDOWS_SDK_PATH STREQUAL "")
   set(HAVE_MSPDK FALSE)
   message(STATUS "Windows Platform SDK 8.0 was not found")
 else()
@@ -19,7 +19,7 @@ endif()
 #search for Visual Studio 11.0 install directory
 message(STATUS "Checking for Visual Studio 2012")
 GET_FILENAME_COMPONENT(VISUAL_STUDIO_PATH [HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\11.0\\Setup\\VS;ProductDir] REALPATH CACHE)
-if (VISUAL_STUDIO_PATH STREQUAL "")
+if(VISUAL_STUDIO_PATH STREQUAL "")
   set(HAVE_MSVC2012 FALSE)
   message(STATUS "Visual Studio 2012 was not found")
 else()
@@ -30,11 +30,15 @@ try_compile(HAVE_WINRT_SDK
   "${OpenCV_BINARY_DIR}"
   "${OpenCV_SOURCE_DIR}/cmake/checks/winrttest.cpp")
 
-if (ENABLE_WINRT_MODE AND HAVE_WINRT_SDK AND HAVE_MSVC2012 AND HAVE_MSPDK)
+if(ENABLE_WINRT_MODE AND HAVE_WINRT_SDK AND HAVE_MSVC2012 AND HAVE_MSPDK)
   set(HAVE_WINRT TRUE)
+  set(HAVE_WINRT_CX TRUE)
+elseif(ENABLE_WINRT_MODE_NATIVE AND HAVE_WINRT_SDK AND HAVE_MSVC2012 AND HAVE_MSPDK)
+  set(HAVE_WINRT TRUE)
+  set(HAVE_WINRT_CX FALSE)
 endif()
 
-if (HAVE_WINRT)
+if(HAVE_WINRT)
   add_definitions(/DWINVER=0x0602 /DNTDDI_VERSION=NTDDI_WIN8 /D_WIN32_WINNT=0x0602)
   set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /appcontainer")
   set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} /appcontainer")
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index fd36a45c6..ba74ebe4a 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -124,6 +124,12 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   if(ENABLE_SSE2)
     add_extra_compiler_option(-msse2)
   endif()
+  if (ENABLE_NEON)
+    add_extra_compiler_option("-mfpu=neon")
+  endif()
+  if (ENABLE_VFPV3 AND NOT ENABLE_NEON)
+    add_extra_compiler_option("-mfpu=vfpv3")
+  endif()
 
   # SSE3 and further should be disabled under MingW because it generates compiler errors
   if(NOT MINGW)
@@ -179,6 +185,11 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     add_extra_compiler_option(-ffunction-sections)
   endif()
 
+  if(ENABLE_COVERAGE)
+    set(OPENCV_EXTRA_C_FLAGS "${OPENCV_EXTRA_C_FLAGS} --coverage")
+    set(OPENCV_EXTRA_CXX_FLAGS "${OPENCV_EXTRA_CXX_FLAGS} --coverage")
+  endif()
+
   set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} -DNDEBUG")
   set(OPENCV_EXTRA_FLAGS_DEBUG "${OPENCV_EXTRA_FLAGS_DEBUG} -O0 -DDEBUG -D_DEBUG")
 endif()
diff --git a/cmake/OpenCVDetectAndroidSDK.cmake b/cmake/OpenCVDetectAndroidSDK.cmake
index 0173223d4..7fc45108c 100644
--- a/cmake/OpenCVDetectAndroidSDK.cmake
+++ b/cmake/OpenCVDetectAndroidSDK.cmake
@@ -344,20 +344,20 @@ macro(add_android_project target path)
     add_custom_command(TARGET ${target} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "${android_proj_bin_dir}/bin/${target}-debug.apk" "${OpenCV_BINARY_DIR}/bin/${target}.apk")
     if(INSTALL_ANDROID_EXAMPLES AND "${target}" MATCHES "^example-")
       #apk
-      install(FILES "${OpenCV_BINARY_DIR}/bin/${target}.apk" DESTINATION "samples" COMPONENT main)
+      install(FILES "${OpenCV_BINARY_DIR}/bin/${target}.apk" DESTINATION "samples" COMPONENT samples)
       get_filename_component(sample_dir "${path}" NAME)
       #java part
       list(REMOVE_ITEM android_proj_files ${ANDROID_MANIFEST_FILE})
       foreach(f ${android_proj_files} ${ANDROID_MANIFEST_FILE})
         get_filename_component(install_subdir "${f}" PATH)
-        install(FILES "${android_proj_bin_dir}/${f}" DESTINATION "samples/${sample_dir}/${install_subdir}" COMPONENT main)
+        install(FILES "${android_proj_bin_dir}/${f}" DESTINATION "samples/${sample_dir}/${install_subdir}" COMPONENT samples)
       endforeach()
       #jni part + eclipse files
       file(GLOB_RECURSE jni_files RELATIVE "${path}" "${path}/jni/*" "${path}/.cproject")
       ocv_list_filterout(jni_files "\\\\.svn")
       foreach(f ${jni_files} ".classpath" ".project" ".settings/org.eclipse.jdt.core.prefs")
         get_filename_component(install_subdir "${f}" PATH)
-        install(FILES "${path}/${f}" DESTINATION "samples/${sample_dir}/${install_subdir}" COMPONENT main)
+        install(FILES "${path}/${f}" DESTINATION "samples/${sample_dir}/${install_subdir}" COMPONENT samples)
       endforeach()
       #update proj
       if(android_proj_lib_deps_commands)
@@ -365,9 +365,9 @@ macro(add_android_project target path)
       endif()
       install(CODE "EXECUTE_PROCESS(COMMAND ${ANDROID_EXECUTABLE} --silent update project --path . --target \"${android_proj_sdk_target}\" --name \"${target}\" ${inst_lib_opt}
                                     WORKING_DIRECTORY \"\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/samples/${sample_dir}\"
-                                   )"  COMPONENT main)
+                                   )"  COMPONENT samples)
       #empty 'gen'
-      install(CODE "MAKE_DIRECTORY(\"\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/samples/${sample_dir}/gen\")" COMPONENT main)
+      install(CODE "MAKE_DIRECTORY(\"\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/samples/${sample_dir}/gen\")" COMPONENT samples)
     endif()
   endif()
 endmacro()
diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake
index f655ce8cd..89602acaa 100644
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -178,9 +178,8 @@ if(CUDA_FOUND)
       # we remove -Wsign-promo as it generates warnings under linux
       string(REPLACE "-Wsign-promo" "" ${var} "${${var}}")
 
-      # we remove -fvisibility-inlines-hidden because it's used for C++ compiler
-      # but NVCC uses C compiler by default
-      string(REPLACE "-fvisibility-inlines-hidden" "" ${var} "${${var}}")
+      # we remove -Wno-sign-promo as it generates warnings under linux
+      string(REPLACE "-Wno-sign-promo" "" ${var} "${${var}}")
 
       # we remove -Wno-delete-non-virtual-dtor because it's used for C++ compiler
       # but NVCC uses C compiler by default
diff --git a/cmake/OpenCVDetectPython.cmake b/cmake/OpenCVDetectPython.cmake
index 7f258dc25..95a26dbf3 100644
--- a/cmake/OpenCVDetectPython.cmake
+++ b/cmake/OpenCVDetectPython.cmake
@@ -24,7 +24,8 @@ if(PYTHONINTERP_FOUND)
 
   if(NOT ANDROID AND NOT IOS)
     ocv_check_environment_variables(PYTHON_LIBRARY PYTHON_INCLUDE_DIR)
-    find_host_package(PythonLibs "${PYTHON_VERSION_STRING}" EXACT)
+    # not using PYTHON_VERSION_STRING here, because it might not conform to the CMake version format
+    find_host_package(PythonLibs "${PYTHON_VERSION_MAJOR_MINOR}.${PYTHON_VERSION_PATCH}" EXACT)
   endif()
 
   if(NOT ANDROID AND NOT IOS)
@@ -59,23 +60,38 @@ if(PYTHONINTERP_FOUND)
     SET(PYTHON_PACKAGES_PATH "${_PYTHON_PACKAGES_PATH}" CACHE PATH "Where to install the python packages.")
 
     if(NOT PYTHON_NUMPY_INCLUDE_DIRS)
-      # Attempt to discover the NumPy include directory. If this succeeds, then build python API with NumPy
-      execute_process(COMMAND "${PYTHON_EXECUTABLE}" -c
-                        "import os; os.environ['DISTUTILS_USE_SDK']='1'; import numpy.distutils; print(os.pathsep.join(numpy.distutils.misc_util.get_numpy_include_dirs()))"
-                      RESULT_VARIABLE PYTHON_NUMPY_PROCESS
-                      OUTPUT_VARIABLE PYTHON_NUMPY_INCLUDE_DIRS
-                      OUTPUT_STRIP_TRAILING_WHITESPACE)
+      if(CMAKE_CROSSCOMPILING)
+        message(STATUS "Cannot probe for Python/Numpy support (because we are cross-compiling OpenCV)")
+        message(STATUS "If you want to enable Python/Numpy support, set the following variables:")
+        message(STATUS "  PYTHON_INCLUDE_PATH")
+        message(STATUS "  PYTHON_LIBRARIES")
+        message(STATUS "  PYTHON_NUMPY_INCLUDE_DIRS")
+      else()
+        # Attempt to discover the NumPy include directory. If this succeeds, then build python API with NumPy
+        execute_process(COMMAND "${PYTHON_EXECUTABLE}" -c "import os; os.environ['DISTUTILS_USE_SDK']='1'; import numpy.distutils; print(os.pathsep.join(numpy.distutils.misc_util.get_numpy_include_dirs()))"
+                        RESULT_VARIABLE PYTHON_NUMPY_PROCESS
+                        OUTPUT_VARIABLE PYTHON_NUMPY_INCLUDE_DIRS
+                        OUTPUT_STRIP_TRAILING_WHITESPACE)
 
-      if(PYTHON_NUMPY_PROCESS EQUAL 0)
-        file(TO_CMAKE_PATH "${PYTHON_NUMPY_INCLUDE_DIRS}" _PYTHON_NUMPY_INCLUDE_DIRS)
-        set(PYTHON_NUMPY_INCLUDE_DIRS "${_PYTHON_NUMPY_INCLUDE_DIRS}" CACHE PATH "Path to numpy headers")
+        if(NOT PYTHON_NUMPY_PROCESS EQUAL 0)
+          unset(PYTHON_NUMPY_INCLUDE_DIRS)
+        endif()
       endif()
     endif()
 
     if(PYTHON_NUMPY_INCLUDE_DIRS)
-      execute_process(COMMAND "${PYTHON_EXECUTABLE}" -c "import numpy; print(numpy.version.version)"
-                      OUTPUT_VARIABLE PYTHON_NUMPY_VERSION
-                      OUTPUT_STRIP_TRAILING_WHITESPACE)
+      file(TO_CMAKE_PATH "${PYTHON_NUMPY_INCLUDE_DIRS}" _PYTHON_NUMPY_INCLUDE_DIRS)
+      set(PYTHON_NUMPY_INCLUDE_DIRS ${_PYTHON_NUMPY_INCLUDE_DIRS} CACHE PATH "Path to numpy headers")
+      if(CMAKE_CROSSCOMPILING)
+        if(NOT PYTHON_NUMPY_VERSION)
+          set(PYTHON_NUMPY_VERSION "undefined - cannot be probed because of the cross-compilation")
+        endif()
+      else()
+        execute_process(COMMAND "${PYTHON_EXECUTABLE}" -c "import numpy; print(numpy.version.version)"
+                        RESULT_VARIABLE PYTHON_NUMPY_PROCESS
+                        OUTPUT_VARIABLE PYTHON_NUMPY_VERSION
+                        OUTPUT_STRIP_TRAILING_WHITESPACE)
+      endif()
     endif()
   endif(NOT ANDROID AND NOT IOS)
 endif()
diff --git a/cmake/OpenCVDetectVTK.cmake b/cmake/OpenCVDetectVTK.cmake
index f0d28d552..78d1a73b6 100644
--- a/cmake/OpenCVDetectVTK.cmake
+++ b/cmake/OpenCVDetectVTK.cmake
@@ -2,7 +2,12 @@ if(NOT WITH_VTK OR ANDROID OR IOS)
   return()
 endif()
 
-find_package(VTK 6.0 QUIET COMPONENTS vtkRenderingCore vtkInteractionWidgets vtkInteractionStyle vtkIOLegacy vtkIOPLY vtkRenderingFreeType vtkRenderingLOD vtkFiltersTexture NO_MODULE)
+if (HAVE_QT5)
+  message(STATUS "VTK is disabled because OpenCV is linked with Q5. Some VTK disributives are compiled with Q4 and therefore can't be linked together Qt5.")
+  return()
+endif()
+
+find_package(VTK 6.0 QUIET COMPONENTS vtkRenderingCore vtkInteractionWidgets vtkInteractionStyle vtkIOLegacy vtkIOPLY vtkRenderingFreeType vtkRenderingLOD vtkFiltersTexture vtkIOExport NO_MODULE)
 
 if(NOT DEFINED VTK_FOUND OR NOT VTK_FOUND)
   find_package(VTK 5.10 QUIET COMPONENTS vtkCommon vtkFiltering vtkRendering vtkWidgets vtkImaging NO_MODULE)
@@ -18,4 +23,4 @@ if(VTK_FOUND)
 else()
   set(HAVE_VTK OFF)
   message(STATUS "VTK is not found. Please set -DVTK_DIR in CMake to VTK build directory, or set $VTK_DIR enviroment variable to VTK install subdirectory with VTKConfig.cmake file (for windows)")
-endif()
\ No newline at end of file
+endif()
diff --git a/cmake/OpenCVExtraTargets.cmake b/cmake/OpenCVExtraTargets.cmake
index b4d339155..ecb2a3b36 100644
--- a/cmake/OpenCVExtraTargets.cmake
+++ b/cmake/OpenCVExtraTargets.cmake
@@ -4,7 +4,7 @@
 CONFIGURE_FILE(
   "${OpenCV_SOURCE_DIR}/cmake/templates/cmake_uninstall.cmake.in"
   "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake"
-  IMMEDIATE @ONLY)
+  @ONLY)
 
 ADD_CUSTOM_TARGET(uninstall "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake")
 if(ENABLE_SOLUTION_FOLDERS)
diff --git a/cmake/OpenCVFindIPP.cmake b/cmake/OpenCVFindIPP.cmake
index 772cae886..db02e6acb 100644
--- a/cmake/OpenCVFindIPP.cmake
+++ b/cmake/OpenCVFindIPP.cmake
@@ -163,9 +163,16 @@ function(set_ipp_new_libraries _LATEST_VERSION)
         ${IPP_LIB_PREFIX}${IPP_PREFIX}${IPPCV}${IPP_SUFFIX}${IPP_LIB_SUFFIX}
         ${IPP_LIB_PREFIX}${IPP_PREFIX}${IPPI}${IPP_SUFFIX}${IPP_LIB_SUFFIX}
         ${IPP_LIB_PREFIX}${IPP_PREFIX}${IPPS}${IPP_SUFFIX}${IPP_LIB_SUFFIX}
-        ${IPP_LIB_PREFIX}${IPP_PREFIX}${IPPCORE}${IPP_SUFFIX}${IPP_LIB_SUFFIX}
-        PARENT_SCOPE)
+        ${IPP_LIB_PREFIX}${IPP_PREFIX}${IPPCORE}${IPP_SUFFIX}${IPP_LIB_SUFFIX})
 
+    if (UNIX)
+        set(IPP_LIBRARIES
+            ${IPP_LIBRARIES}
+            ${IPP_LIB_PREFIX}irc${CMAKE_SHARED_LIBRARY_SUFFIX}
+            ${IPP_LIB_PREFIX}imf${CMAKE_SHARED_LIBRARY_SUFFIX}
+            ${IPP_LIB_PREFIX}svml${CMAKE_SHARED_LIBRARY_SUFFIX})
+    endif()
+    set(IPP_LIBRARIES ${IPP_LIBRARIES} PARENT_SCOPE)
     return()
 
 endfunction()
@@ -208,19 +215,39 @@ function(set_ipp_variables _LATEST_VERSION)
         set(IPP_INCLUDE_DIRS ${IPP_ROOT_DIR}/include PARENT_SCOPE)
 
         if (APPLE)
-            set(IPP_LIBRARY_DIRS ${IPP_ROOT_DIR}/lib PARENT_SCOPE)
+            set(IPP_LIBRARY_DIRS ${IPP_ROOT_DIR}/lib)
         elseif (IPP_X64)
             if(NOT EXISTS ${IPP_ROOT_DIR}/lib/intel64)
                 message(SEND_ERROR "IPP EM64T libraries not found")
             endif()
-            set(IPP_LIBRARY_DIRS ${IPP_ROOT_DIR}/lib/intel64 PARENT_SCOPE)
+            set(IPP_LIBRARY_DIRS ${IPP_ROOT_DIR}/lib/intel64)
         else()
             if(NOT EXISTS ${IPP_ROOT_DIR}/lib/ia32)
                 message(SEND_ERROR "IPP IA32 libraries not found")
             endif()
-            set(IPP_LIBRARY_DIRS ${IPP_ROOT_DIR}/lib/ia32 PARENT_SCOPE)
+            set(IPP_LIBRARY_DIRS ${IPP_ROOT_DIR}/lib/ia32)
         endif()
 
+        if (UNIX)
+            get_filename_component(INTEL_COMPILER_LIBRARY_DIR ${IPP_ROOT_DIR}/../lib REALPATH)
+            if (IPP_X64)
+                if(NOT EXISTS ${INTEL_COMPILER_LIBRARY_DIR}/intel64)
+                    message(SEND_ERROR "Intel compiler EM64T libraries not found")
+                endif()
+                set(IPP_LIBRARY_DIRS
+                    ${IPP_LIBRARY_DIRS}
+                    ${INTEL_COMPILER_LIBRARY_DIR}/intel64)
+            else()
+                if(NOT EXISTS ${INTEL_COMPILER_LIBRARY_DIR}/ia32)
+                    message(SEND_ERROR "Intel compiler IA32 libraries not found")
+                endif()
+                set(IPP_LIBRARY_DIRS
+                    ${IPP_LIBRARY_DIRS}
+                    ${INTEL_COMPILER_LIBRARY_DIR}/ia32)
+            endif()
+        endif()
+        set(IPP_LIBRARY_DIRS ${IPP_LIBRARY_DIRS} PARENT_SCOPE)
+
         # set IPP_LIBRARIES variable (7.x or 8.x lib names)
         set_ipp_new_libraries(${_LATEST_VERSION})
         set(IPP_LIBRARIES ${IPP_LIBRARIES} PARENT_SCOPE)
diff --git a/cmake/OpenCVFindIntelPerCSDK.cmake b/cmake/OpenCVFindIntelPerCSDK.cmake
new file mode 100644
index 000000000..724310560
--- /dev/null
+++ b/cmake/OpenCVFindIntelPerCSDK.cmake
@@ -0,0 +1,20 @@
+# Main variables:
+# INTELPERC_LIBRARIES and INTELPERC_INCLUDE to link Intel Perceptial Computing SDK modules
+# HAVE_INTELPERC for conditional compilation OpenCV with/without Intel Perceptial Computing SDK
+
+if(X86_64)
+    find_path(INTELPERC_INCLUDE_DIR "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Path to Intel Perceptual Computing SDK interface headers")
+    find_file(INTELPERC_LIBRARIES "libpxc.lib" PATHS "$ENV{PCSDK_DIR}lib/x64" DOC "Path to Intel Perceptual Computing SDK interface libraries")
+else()
+    find_path(INTELPERC_INCLUDE_DIR "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Path to Intel Perceptual Computing SDK interface headers")
+    find_file(INTELPERC_LIBRARIES "libpxc.lib" PATHS "$ENV{PCSDK_DIR}lib/Win32" DOC "Path to Intel Perceptual Computing SDK interface libraries")
+endif()
+
+if(INTELPERC_INCLUDE_DIR AND INTELPERC_LIBRARIES)
+    set(HAVE_INTELPERC TRUE)
+else()
+    set(HAVE_INTELPERC FALSE)
+    message(WARNING "Intel Perceptual Computing SDK library directory (set by INTELPERC_LIB_DIR variable) is not found or does not have Intel Perceptual Computing SDK libraries.")
+endif() #if(INTELPERC_INCLUDE_DIR AND INTELPERC_LIBRARIES)
+
+mark_as_advanced(FORCE INTELPERC_LIBRARIES INTELPERC_INCLUDE_DIR)
\ No newline at end of file
diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake
index 807f4fbbf..93cce2b7a 100644
--- a/cmake/OpenCVFindLibsVideo.cmake
+++ b/cmake/OpenCVFindLibsVideo.cmake
@@ -277,3 +277,8 @@ if (NOT IOS)
     set(HAVE_QTKIT YES)
   endif()
 endif()
+
+# --- Intel Perceptual Computing SDK ---
+if(WITH_INTELPERC)
+  include("${OpenCV_SOURCE_DIR}/cmake/OpenCVFindIntelPerCSDK.cmake")
+endif(WITH_INTELPERC)
diff --git a/cmake/OpenCVGenAndroidMK.cmake b/cmake/OpenCVGenAndroidMK.cmake
index ba67f4189..318c802ff 100644
--- a/cmake/OpenCVGenAndroidMK.cmake
+++ b/cmake/OpenCVGenAndroidMK.cmake
@@ -91,7 +91,7 @@ if(ANDROID)
   set(OPENCV_LIBS_DIR_CONFIGCMAKE "\$(OPENCV_THIS_DIR)/lib/\$(OPENCV_TARGET_ARCH_ABI)")
   set(OPENCV_3RDPARTY_LIBS_DIR_CONFIGCMAKE "\$(OPENCV_THIS_DIR)/3rdparty/lib/\$(OPENCV_TARGET_ARCH_ABI)")
 
-  configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCV.mk.in" "${CMAKE_BINARY_DIR}/OpenCV.mk" IMMEDIATE @ONLY)
+  configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCV.mk.in" "${CMAKE_BINARY_DIR}/OpenCV.mk" @ONLY)
 
   # -------------------------------------------------------------------------------------------
   #  Part 2/2: ${BIN_DIR}/unix-install/OpenCV.mk -> For use with "make install"
@@ -101,6 +101,6 @@ if(ANDROID)
   set(OPENCV_LIBS_DIR_CONFIGCMAKE "\$(OPENCV_THIS_DIR)/../libs/\$(OPENCV_TARGET_ARCH_ABI)")
   set(OPENCV_3RDPARTY_LIBS_DIR_CONFIGCMAKE "\$(OPENCV_THIS_DIR)/../3rdparty/libs/\$(OPENCV_TARGET_ARCH_ABI)")
 
-  configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCV.mk.in" "${CMAKE_BINARY_DIR}/unix-install/OpenCV.mk" IMMEDIATE @ONLY)
-  install(FILES ${CMAKE_BINARY_DIR}/unix-install/OpenCV.mk DESTINATION ${OPENCV_CONFIG_INSTALL_PATH})
+  configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCV.mk.in" "${CMAKE_BINARY_DIR}/unix-install/OpenCV.mk" @ONLY)
+  install(FILES ${CMAKE_BINARY_DIR}/unix-install/OpenCV.mk DESTINATION ${OPENCV_CONFIG_INSTALL_PATH} COMPONENT dev)
 endif(ANDROID)
diff --git a/cmake/OpenCVGenConfig.cmake b/cmake/OpenCVGenConfig.cmake
index 411d22582..cdf418ec8 100644
--- a/cmake/OpenCVGenConfig.cmake
+++ b/cmake/OpenCVGenConfig.cmake
@@ -83,9 +83,9 @@ endif()
 
 export(TARGETS ${OpenCVModules_TARGETS} FILE "${CMAKE_BINARY_DIR}/OpenCVModules${modules_file_suffix}.cmake")
 
-configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig.cmake.in" "${CMAKE_BINARY_DIR}/OpenCVConfig.cmake" IMMEDIATE @ONLY)
+configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig.cmake.in" "${CMAKE_BINARY_DIR}/OpenCVConfig.cmake" @ONLY)
 #support for version checking when finding opencv. find_package(OpenCV 2.3.1 EXACT) should now work.
-configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig-version.cmake.in" "${CMAKE_BINARY_DIR}/OpenCVConfig-version.cmake" IMMEDIATE @ONLY)
+configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig-version.cmake.in" "${CMAKE_BINARY_DIR}/OpenCVConfig-version.cmake" @ONLY)
 
 # --------------------------------------------------------------------------------------------
 #  Part 2/3: ${BIN_DIR}/unix-install/OpenCVConfig.cmake -> For use *with* "make install"
@@ -98,8 +98,8 @@ if(INSTALL_TO_MANGLED_PATHS)
   set(OpenCV_3RDPARTY_LIB_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/${OpenCV_3RDPARTY_LIB_DIRS_CONFIGCMAKE}\"")
 endif()
 
-configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig.cmake.in" "${CMAKE_BINARY_DIR}/unix-install/OpenCVConfig.cmake" IMMEDIATE @ONLY)
-configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig-version.cmake.in" "${CMAKE_BINARY_DIR}/unix-install/OpenCVConfig-version.cmake" IMMEDIATE @ONLY)
+configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig.cmake.in" "${CMAKE_BINARY_DIR}/unix-install/OpenCVConfig.cmake" @ONLY)
+configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig-version.cmake.in" "${CMAKE_BINARY_DIR}/unix-install/OpenCVConfig-version.cmake" @ONLY)
 
 if(UNIX) # ANDROID configuration is created here also
   #http://www.vtk.org/Wiki/CMake/Tutorials/Packaging reference
@@ -109,18 +109,18 @@ if(UNIX) # ANDROID configuration is created here also
   #                <prefix>/(share|lib)/<name>*/                           (U)
   #                <prefix>/(share|lib)/<name>*/(cmake|CMake)/             (U)
   if(INSTALL_TO_MANGLED_PATHS)
-    install(FILES ${CMAKE_BINARY_DIR}/unix-install/OpenCVConfig.cmake DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}-${OPENCV_VERSION}/)
-    install(FILES ${CMAKE_BINARY_DIR}/unix-install/OpenCVConfig-version.cmake DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}-${OPENCV_VERSION}/)
-    install(EXPORT OpenCVModules DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}-${OPENCV_VERSION}/ FILE OpenCVModules${modules_file_suffix}.cmake)
+    install(FILES ${CMAKE_BINARY_DIR}/unix-install/OpenCVConfig.cmake DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}-${OPENCV_VERSION}/ COMPONENT dev)
+    install(FILES ${CMAKE_BINARY_DIR}/unix-install/OpenCVConfig-version.cmake DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}-${OPENCV_VERSION}/ COMPONENT dev)
+    install(EXPORT OpenCVModules DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}-${OPENCV_VERSION}/ FILE OpenCVModules${modules_file_suffix}.cmake COMPONENT dev)
   else()
-    install(FILES "${CMAKE_BINARY_DIR}/unix-install/OpenCVConfig.cmake" DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}/)
-    install(FILES ${CMAKE_BINARY_DIR}/unix-install/OpenCVConfig-version.cmake DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}/)
-    install(EXPORT OpenCVModules DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}/ FILE OpenCVModules${modules_file_suffix}.cmake)
+    install(FILES "${CMAKE_BINARY_DIR}/unix-install/OpenCVConfig.cmake" DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}/ COMPONENT dev)
+    install(FILES ${CMAKE_BINARY_DIR}/unix-install/OpenCVConfig-version.cmake DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}/ COMPONENT dev)
+    install(EXPORT OpenCVModules DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}/ FILE OpenCVModules${modules_file_suffix}.cmake COMPONENT dev)
   endif()
 endif()
 
 if(ANDROID)
-  install(FILES "${OpenCV_SOURCE_DIR}/platforms/android/android.toolchain.cmake" DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}/)
+  install(FILES "${OpenCV_SOURCE_DIR}/platforms/android/android.toolchain.cmake" DESTINATION ${OPENCV_CONFIG_INSTALL_PATH}/ COMPONENT dev)
 endif()
 
 # --------------------------------------------------------------------------------------------
@@ -131,15 +131,15 @@ if(WIN32)
   set(OpenCV2_INCLUDE_DIRS_CONFIGCMAKE "\"\"")
 
   exec_program(mkdir ARGS "-p \"${CMAKE_BINARY_DIR}/win-install/\"" OUTPUT_VARIABLE RET_VAL)
-  configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig.cmake.in" "${CMAKE_BINARY_DIR}/win-install/OpenCVConfig.cmake" IMMEDIATE @ONLY)
-  configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig-version.cmake.in" "${CMAKE_BINARY_DIR}/win-install/OpenCVConfig-version.cmake" IMMEDIATE @ONLY)
+  configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig.cmake.in" "${CMAKE_BINARY_DIR}/win-install/OpenCVConfig.cmake" @ONLY)
+  configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig-version.cmake.in" "${CMAKE_BINARY_DIR}/win-install/OpenCVConfig-version.cmake" @ONLY)
   if(BUILD_SHARED_LIBS)
-    install(FILES "${CMAKE_BINARY_DIR}/win-install/OpenCVConfig.cmake" DESTINATION "${OpenCV_INSTALL_BINARIES_PREFIX}lib")
-    install(EXPORT OpenCVModules DESTINATION "${OpenCV_INSTALL_BINARIES_PREFIX}lib" FILE OpenCVModules${modules_file_suffix}.cmake)
+    install(FILES "${CMAKE_BINARY_DIR}/win-install/OpenCVConfig.cmake" DESTINATION "${OpenCV_INSTALL_BINARIES_PREFIX}lib" COMPONENT dev)
+    install(EXPORT OpenCVModules DESTINATION "${OpenCV_INSTALL_BINARIES_PREFIX}lib" FILE OpenCVModules${modules_file_suffix}.cmake COMPONENT dev)
   else()
-    install(FILES "${CMAKE_BINARY_DIR}/win-install/OpenCVConfig.cmake" DESTINATION "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib")
-    install(EXPORT OpenCVModules DESTINATION "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib" FILE OpenCVModules${modules_file_suffix}.cmake)
+    install(FILES "${CMAKE_BINARY_DIR}/win-install/OpenCVConfig.cmake" DESTINATION "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib" COMPONENT dev)
+    install(EXPORT OpenCVModules DESTINATION "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib" FILE OpenCVModules${modules_file_suffix}.cmake COMPONENT dev)
   endif()
-  install(FILES "${CMAKE_BINARY_DIR}/win-install/OpenCVConfig-version.cmake" DESTINATION "${CMAKE_INSTALL_PREFIX}")
-  install(FILES "${OpenCV_SOURCE_DIR}/cmake/OpenCVConfig.cmake" DESTINATION "${CMAKE_INSTALL_PREFIX}/")
+  install(FILES "${CMAKE_BINARY_DIR}/win-install/OpenCVConfig-version.cmake" DESTINATION "${CMAKE_INSTALL_PREFIX}" COMPONENT dev)
+  install(FILES "${OpenCV_SOURCE_DIR}/cmake/OpenCVConfig.cmake" DESTINATION "${CMAKE_INSTALL_PREFIX}/" COMPONENT dev)
 endif()
diff --git a/cmake/OpenCVGenHeaders.cmake b/cmake/OpenCVGenHeaders.cmake
index 35da0fb4b..c892a929c 100644
--- a/cmake/OpenCVGenHeaders.cmake
+++ b/cmake/OpenCVGenHeaders.cmake
@@ -23,4 +23,4 @@ set(OPENCV_MODULE_DEFINITIONS_CONFIGMAKE "${OPENCV_MODULE_DEFINITIONS_CONFIGMAKE
 #endforeach()
 
 configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/opencv_modules.hpp.in" "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/opencv2/opencv_modules.hpp")
-install(FILES "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/opencv2/opencv_modules.hpp" DESTINATION ${OPENCV_INCLUDE_INSTALL_PATH}/opencv2 COMPONENT main)
+install(FILES "${OPENCV_CONFIG_FILE_INCLUDE_DIR}/opencv2/opencv_modules.hpp" DESTINATION ${OPENCV_INCLUDE_INSTALL_PATH}/opencv2 COMPONENT dev)
diff --git a/cmake/OpenCVGenPkgconfig.cmake b/cmake/OpenCVGenPkgconfig.cmake
index cd54f11bf..fa57db9d3 100644
--- a/cmake/OpenCVGenPkgconfig.cmake
+++ b/cmake/OpenCVGenPkgconfig.cmake
@@ -78,8 +78,8 @@ else()
 endif()
 configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/opencv-XXX.pc.in"
                "${CMAKE_BINARY_DIR}/unix-install/${OPENCV_PC_FILE_NAME}"
-               @ONLY IMMEDIATE)
+               @ONLY)
 
 if(UNIX AND NOT ANDROID)
-  install(FILES ${CMAKE_BINARY_DIR}/unix-install/${OPENCV_PC_FILE_NAME} DESTINATION ${OPENCV_LIB_INSTALL_PATH}/pkgconfig)
+  install(FILES ${CMAKE_BINARY_DIR}/unix-install/${OPENCV_PC_FILE_NAME} DESTINATION ${OPENCV_LIB_INSTALL_PATH}/pkgconfig COMPONENT dev)
 endif()
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index c445e3fe5..19c285794 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -135,13 +135,13 @@ macro(ocv_add_module _name)
 
     # parse list of dependencies
     if("${ARGV1}" STREQUAL "INTERNAL" OR "${ARGV1}" STREQUAL "BINDINGS")
-      set(OPENCV_MODULE_${the_module}_CLASS "${ARGV1}" CACHE INTERNAL "The cathegory of the module")
+      set(OPENCV_MODULE_${the_module}_CLASS "${ARGV1}" CACHE INTERNAL "The category of the module")
       set(__ocv_argn__ ${ARGN})
       list(REMOVE_AT __ocv_argn__ 0)
       ocv_add_dependencies(${the_module} ${__ocv_argn__})
       unset(__ocv_argn__)
     else()
-      set(OPENCV_MODULE_${the_module}_CLASS "PUBLIC" CACHE INTERNAL "The cathegory of the module")
+      set(OPENCV_MODULE_${the_module}_CLASS "PUBLIC" CACHE INTERNAL "The category of the module")
       ocv_add_dependencies(${the_module} ${ARGN})
       if(BUILD_${the_module})
         set(OPENCV_MODULES_PUBLIC ${OPENCV_MODULES_PUBLIC} "${the_module}" CACHE INTERNAL "List of OpenCV modules marked for export")
@@ -583,9 +583,9 @@ macro(ocv_create_module)
   endif()
 
   ocv_install_target(${the_module} EXPORT OpenCVModules
-    RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT main
-    LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main
-    ARCHIVE DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main
+    RUNTIME DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs
+    LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT libs
+    ARCHIVE DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT dev
     )
 
   # only "public" headers need to be installed
@@ -593,7 +593,7 @@ macro(ocv_create_module)
     foreach(hdr ${OPENCV_MODULE_${the_module}_HEADERS})
       string(REGEX REPLACE "^.*opencv2/" "opencv2/" hdr2 "${hdr}")
       if(NOT hdr2 MATCHES "opencv2/${the_module}/private.*" AND hdr2 MATCHES "^(opencv2/?.*)/[^/]+.h(..)?$" )
-        install(FILES ${hdr} DESTINATION "${OPENCV_INCLUDE_INSTALL_PATH}/${CMAKE_MATCH_1}" COMPONENT main)
+        install(FILES ${hdr} DESTINATION "${OPENCV_INCLUDE_INSTALL_PATH}/${CMAKE_MATCH_1}" COMPONENT dev)
       endif()
     endforeach()
   endif()
@@ -717,6 +717,9 @@ function(ocv_add_perf_tests)
     else(OCV_DEPENDENCIES_FOUND)
       # TODO: warn about unsatisfied dependencies
     endif(OCV_DEPENDENCIES_FOUND)
+    if(INSTALL_TESTS)
+      install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_TEST_INSTALL_PATH} COMPONENT tests)
+    endif()
   endif()
 endfunction()
 
@@ -770,6 +773,10 @@ function(ocv_add_accuracy_tests)
     else(OCV_DEPENDENCIES_FOUND)
       # TODO: warn about unsatisfied dependencies
     endif(OCV_DEPENDENCIES_FOUND)
+
+    if(INSTALL_TESTS)
+      install(TARGETS ${the_target} RUNTIME DESTINATION ${OPENCV_TEST_INSTALL_PATH} COMPONENT tests)
+    endif()
   endif()
 endfunction()
 
@@ -801,7 +808,7 @@ function(ocv_add_samples)
         endif()
 
         if(WIN32)
-          install(TARGETS ${the_target} RUNTIME DESTINATION "samples/${module_id}" COMPONENT main)
+          install(TARGETS ${the_target} RUNTIME DESTINATION "samples/${module_id}" COMPONENT samples)
         endif()
       endforeach()
     endif()
@@ -810,8 +817,8 @@ function(ocv_add_samples)
   if(INSTALL_C_EXAMPLES AND NOT WIN32 AND EXISTS "${samples_path}")
     file(GLOB sample_files "${samples_path}/*")
     install(FILES ${sample_files}
-            DESTINATION share/OpenCV/samples/${module_id}
-            PERMISSIONS OWNER_READ GROUP_READ WORLD_READ)
+            DESTINATION ${OPENCV_SAMPLES_SRC_INSTALL_PATH}/${module_id}
+            PERMISSIONS OWNER_READ GROUP_READ WORLD_READ COMPONENT samples)
   endif()
 endfunction()
 
diff --git a/cmake/OpenCVPackaging.cmake b/cmake/OpenCVPackaging.cmake
new file mode 100644
index 000000000..91f594096
--- /dev/null
+++ b/cmake/OpenCVPackaging.cmake
@@ -0,0 +1,110 @@
+if(EXISTS "${CMAKE_ROOT}/Modules/CPack.cmake")
+set(CPACK_set_DESTDIR "on")
+
+if(NOT OPENCV_CUSTOM_PACKAGE_INFO)
+  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Open Computer Vision Library")
+  set(CPACK_PACKAGE_DESCRIPTION
+"OpenCV (Open Source Computer Vision Library) is an open source computer vision
+and machine learning software library. OpenCV was built to provide a common
+infrastructure for computer vision applications and to accelerate the use of
+machine perception in the commercial products. Being a BSD-licensed product,
+OpenCV makes it easy for businesses to utilize and modify the code.")
+  set(CPACK_PACKAGE_VENDOR "OpenCV Foundation")
+  set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
+  set(CPACK_PACKAGE_CONTACT "admin@opencv.org")
+  set(CPACK_PACKAGE_VERSION_MAJOR "${OPENCV_VERSION_MAJOR}")
+  set(CPACK_PACKAGE_VERSION_MINOR "${OPENCV_VERSION_MINOR}")
+  set(CPACK_PACKAGE_VERSION_PATCH "${OPENCV_VERSION_PATCH}")
+  set(CPACK_PACKAGE_VERSION "${OPENCV_VCSVERSION}")
+endif(NOT OPENCV_CUSTOM_PACKAGE_INFO)
+
+#arch
+if(X86)
+  set(CPACK_DEBIAN_ARCHITECTURE "i386")
+  set(CPACK_RPM_PACKAGE_ARCHITECTURE "i686")
+elseif(X86_64)
+  set(CPACK_DEBIAN_ARCHITECTURE "amd64")
+  set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
+elseif(ARM)
+  set(CPACK_DEBIAN_ARCHITECTURE "armhf")
+  set(CPACK_RPM_PACKAGE_ARCHITECTURE "armhf")
+else()
+  set(CPACK_DEBIAN_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR})
+  set(CPACK_RPM_PACKAGE_ARCHITECTURE ${CMAKE_SYSTEM_PROCESSOR})
+endif()
+
+if(CPACK_GENERATOR STREQUAL "DEB")
+  set(OPENCV_PACKAGE_ARCH_SUFFIX ${CPACK_DEBIAN_ARCHITECTURE})
+elseif(CPACK_GENERATOR STREQUAL "RPM")
+  set(OPENCV_PACKAGE_ARCH_SUFFIX ${CPACK_RPM_PACKAGE_ARCHITECTURE})
+else()
+  set(OPENCV_PACKAGE_ARCH_SUFFIX ${CMAKE_SYSTEM_PROCESSOR})
+endif()
+
+set(CPACK_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${OPENCV_VCSVERSION}-${OPENCV_PACKAGE_ARCH_SUFFIX}")
+set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${OPENCV_VCSVERSION}-${OPENCV_PACKAGE_ARCH_SUFFIX}")
+
+#rpm options
+set(CPACK_RPM_COMPONENT_INSTALL TRUE)
+set(CPACK_RPM_PACKAGE_SUMMARY ${CPACK_PACKAGE_DESCRIPTION_SUMMARY})
+set(CPACK_RPM_PACKAGE_DESCRIPTION ${CPACK_PACKAGE_DESCRIPTION})
+set(CPACK_RPM_PACKAGE_URL "http://opencv.org")
+set(CPACK_RPM_PACKAGE_LICENSE "BSD")
+
+#deb options
+set(CPACK_DEB_COMPONENT_INSTALL TRUE)
+set(CPACK_DEBIAN_PACKAGE_PRIORITY "optional")
+set(CPACK_DEBIAN_PACKAGE_SECTION "libs")
+set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "http://opencv.org")
+
+#depencencies
+set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS TRUE)
+set(CPACK_COMPONENT_samples_DEPENDS libs)
+set(CPACK_COMPONENT_dev_DEPENDS libs)
+set(CPACK_COMPONENT_docs_DEPENDS libs)
+set(CPACK_COMPONENT_java_DEPENDS libs)
+set(CPACK_COMPONENT_python_DEPENDS libs)
+set(CPACK_COMPONENT_tests_DEPENDS libs)
+
+if(HAVE_CUDA)
+  string(REPLACE "." "-" cuda_version_suffix ${CUDA_VERSION})
+  set(CPACK_DEB_libs_PACKAGE_DEPENDS "cuda-core-libs-${cuda_version_suffix}, cuda-extra-libs-${cuda_version_suffix}")
+  set(CPACK_COMPONENT_dev_DEPENDS libs)
+  set(CPACK_DEB_dev_PACKAGE_DEPENDS "cuda-headers-${cuda_version_suffix}")
+endif()
+
+if(NOT OPENCV_CUSTOM_PACKAGE_INFO)
+  set(CPACK_COMPONENT_libs_DISPLAY_NAME "lib${CMAKE_PROJECT_NAME}")
+  set(CPACK_COMPONENT_libs_DESCRIPTION "Open Computer Vision Library")
+
+  set(CPACK_COMPONENT_python_DISPLAY_NAME "lib${CMAKE_PROJECT_NAME}-python")
+  set(CPACK_COMPONENT_python_DESCRIPTION "Python bindings for Open Source Computer Vision Library")
+
+  set(CPACK_COMPONENT_java_DISPLAY_NAME "lib${CMAKE_PROJECT_NAME}-java")
+  set(CPACK_COMPONENT_java_DESCRIPTION "Java bindings for Open Source Computer Vision Library")
+
+  set(CPACK_COMPONENT_dev_DISPLAY_NAME "lib${CMAKE_PROJECT_NAME}-dev")
+  set(CPACK_COMPONENT_dev_DESCRIPTION "Development files for Open Source Computer Vision Library")
+
+  set(CPACK_COMPONENT_docs_DISPLAY_NAME "lib${CMAKE_PROJECT_NAME}-docs")
+  set(CPACK_COMPONENT_docs_DESCRIPTION "Documentation for Open Source Computer Vision Library")
+
+  set(CPACK_COMPONENT_samples_DISPLAY_NAME "lib${CMAKE_PROJECT_NAME}-samples")
+  set(CPACK_COMPONENT_samples_DESCRIPTION "Samples for Open Source Computer Vision Library")
+
+  set(CPACK_COMPONENT_tests_DISPLAY_NAME "lib${CMAKE_PROJECT_NAME}-tests")
+  set(CPACK_COMPONENT_tests_DESCRIPTION "Accuracy and performance tests for Open Source Computer Vision Library")
+endif(NOT OPENCV_CUSTOM_PACKAGE_INFO)
+
+if(NOT OPENCV_CUSTOM_PACKAGE_LAYOUT)
+  set(CPACK_libs_COMPONENT_INSTALL TRUE)
+  set(CPACK_dev_COMPONENT_INSTALL TRUE)
+  set(CPACK_docs_COMPONENT_INSTALL TRUE)
+  set(CPACK_python_COMPONENT_INSTALL TRUE)
+  set(CPACK_java_COMPONENT_INSTALL TRUE)
+  set(CPACK_samples_COMPONENT_INSTALL TRUE)
+endif(NOT OPENCV_CUSTOM_PACKAGE_LAYOUT)
+
+include(CPack)
+
+ENDif(EXISTS "${CMAKE_ROOT}/Modules/CPack.cmake")
\ No newline at end of file
diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake
index 677d7f5d5..f2a0197f8 100644
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@@ -467,6 +467,20 @@ macro(ocv_convert_to_full_paths VAR)
 endmacro()
 
 
+# convert list of paths to libraries names without lib prefix
+macro(ocv_convert_to_lib_name var)
+  set(__tmp "")
+  foreach(path ${ARGN})
+    get_filename_component(__tmp_name "${path}" NAME_WE)
+    string(REGEX REPLACE "^lib" "" __tmp_name ${__tmp_name})
+    list(APPEND __tmp "${__tmp_name}")
+  endforeach()
+  set(${var} ${__tmp})
+  unset(__tmp)
+  unset(__tmp_name)
+endmacro()
+
+
 # add install command
 function(ocv_install_target)
   install(TARGETS ${ARGN})
diff --git a/cmake/cl2cpp.cmake b/cmake/cl2cpp.cmake
index 24d3eb2dc..09cac6c4c 100644
--- a/cmake/cl2cpp.cmake
+++ b/cmake/cl2cpp.cmake
@@ -29,6 +29,7 @@ ${nested_namespace_start}
 set(STR_HPP "// This file is auto-generated. Do not edit!
 
 #include \"opencv2/core/ocl_genbase.hpp\"
+#include \"opencv2/core/opencl/ocl_defs.hpp\"
 
 namespace cv
 {
@@ -64,8 +65,8 @@ foreach(cl ${cl_list})
   set(STR_CPP_DECL "const struct ProgramEntry ${cl_filename}={\"${cl_filename}\",\n\"${lines}, \"${hash}\"};\n")
   set(STR_HPP_DECL "extern const struct ProgramEntry ${cl_filename};\n")
   if(new_mode)
-    set(STR_CPP_DECL "${STR_CPP_DECL}ProgramSource2 ${cl_filename}_oclsrc(${cl_filename}.programStr);\n")
-    set(STR_HPP_DECL "${STR_HPP_DECL}extern ProgramSource2 ${cl_filename}_oclsrc;\n")
+    set(STR_CPP_DECL "${STR_CPP_DECL}ProgramSource ${cl_filename}_oclsrc(${cl_filename}.programStr);\n")
+    set(STR_HPP_DECL "${STR_HPP_DECL}extern ProgramSource ${cl_filename}_oclsrc;\n")
   endif()
 
   set(STR_CPP "${STR_CPP}${STR_CPP_DECL}")
diff --git a/cmake/templates/OpenCV.mk.in b/cmake/templates/OpenCV.mk.in
index 078e02039..33d36601a 100644
--- a/cmake/templates/OpenCV.mk.in
+++ b/cmake/templates/OpenCV.mk.in
@@ -2,6 +2,13 @@
 # you might need to define NDK_USE_CYGPATH=1 before calling the ndk-build
 
 USER_LOCAL_PATH:=$(LOCAL_PATH)
+
+USER_LOCAL_C_INCLUDES:=$(LOCAL_C_INCLUDES)
+USER_LOCAL_CFLAGS:=$(LOCAL_CFLAGS)
+USER_LOCAL_STATIC_LIBRARIES:=$(LOCAL_STATIC_LIBRARIES)
+USER_LOCAL_SHARED_LIBRARIES:=$(LOCAL_SHARED_LIBRARIES)
+USER_LOCAL_LDLIBS:=$(LOCAL_LDLIBS)
+
 LOCAL_PATH:=$(subst ?,,$(firstword ?$(subst \, ,$(subst /, ,$(call my-dir)))))
 
 OPENCV_TARGET_ARCH_ABI:=$(TARGET_ARCH_ABI)
@@ -47,7 +54,7 @@ else
     endif
 endif
 
-ifeq (${OPENCV_CAMERA_MODULES},on)
+ifeq ($(OPENCV_CAMERA_MODULES),on)
     ifeq ($(TARGET_ARCH_ABI),armeabi)
         OPENCV_CAMERA_MODULES:=@OPENCV_CAMERA_LIBS_ARMEABI_CONFIGCMAKE@
     endif
@@ -113,6 +120,13 @@ ifeq ($(OPENCV_LOCAL_CFLAGS),)
 endif
 
 include $(CLEAR_VARS)
+
+LOCAL_C_INCLUDES:=$(USER_LOCAL_C_INCLUDES)
+LOCAL_CFLAGS:=$(USER_LOCAL_CFLAGS)
+LOCAL_STATIC_LIBRARIES:=$(USER_LOCAL_STATIC_LIBRARIES)
+LOCAL_SHARED_LIBRARIES:=$(USER_LOCAL_SHARED_LIBRARIES)
+LOCAL_LDLIBS:=$(USER_LOCAL_LDLIBS)
+
 LOCAL_C_INCLUDES += $(OPENCV_LOCAL_C_INCLUDES)
 LOCAL_CFLAGS     += $(OPENCV_LOCAL_CFLAGS)
 
diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in
index 554b91cef..3f316da46 100644
--- a/cmake/templates/cvconfig.h.in
+++ b/cmake/templates/cvconfig.h.in
@@ -88,6 +88,9 @@
 /* Define to 1 if you have the <inttypes.h> header file. */
 #cmakedefine HAVE_INTTYPES_H 1
 
+/* Intel Perceptual Computing SDK library */
+#cmakedefine HAVE_INTELPERC
+
 /* Intel Integrated Performance Primitives */
 #cmakedefine HAVE_IPP
 
@@ -164,6 +167,6 @@
 /* Xine video library */
 #cmakedefine HAVE_XINE
 
-/* Define to 1 if your processor stores words with the most significant byte
+/* Define if your processor stores words with the most significant byte
    first (like Motorola and SPARC, unlike Intel and VAX). */
 #cmakedefine WORDS_BIGENDIAN
diff --git a/cmake/templates/opencv_run_all_tests_android.sh.in b/cmake/templates/opencv_run_all_tests_android.sh.in
new file mode 100644
index 000000000..93373fa96
--- /dev/null
+++ b/cmake/templates/opencv_run_all_tests_android.sh.in
@@ -0,0 +1,51 @@
+#!/bin/sh
+
+BASE_DIR=`dirname $0`
+OPENCV_TEST_PATH=$BASE_DIR/@TEST_PATH@
+OPENCV_TEST_DATA_PATH=$BASE_DIR/sdk/etc/testdata/
+
+if [ $# -ne 1 ]; then
+  echo "Device architecture is not preset in command line"
+  echo "Tests are available for architectures: `ls -m ${OPENCV_TEST_PATH}`"
+  echo "Usage: $0 <target_device_arch>"
+  return 1
+else
+  TARGET_ARCH=$1
+fi
+
+if [ -z `which adb` ]; then
+  echo "adb command was not found in PATH"
+  return 1
+fi
+
+adb push $OPENCV_TEST_DATA_PATH /sdcard/opencv_testdata
+
+adb shell "mkdir -p /data/local/tmp/opencv_test"
+SUMMARY_STATUS=0
+for t in "$OPENCV_TEST_PATH/$TARGET_ARCH/"opencv_test_* "$OPENCV_TEST_PATH/$TARGET_ARCH/"opencv_perf_*;
+do
+  test_name=`basename "$t"`
+  report="$test_name-`date --rfc-3339=date`.xml"
+  adb push $t /data/local/tmp/opencv_test/
+  adb shell "export OPENCV_TEST_DATA_PATH=/sdcard/opencv_testdata && /data/local/tmp/opencv_test/$test_name --perf_min_samples=1 --perf_force_samples=1 --gtest_output=xml:/data/local/tmp/opencv_test/$report"
+  adb pull "/data/local/tmp/opencv_test/$report" $report
+  TEST_STATUS=0
+  if [ -e $report ]; then
+    if [ `grep -c "<fail" $report` -ne 0 ]; then
+      TEST_STATUS=2
+    fi
+  else
+    TEST_STATUS=3
+  fi
+  if [ $TEST_STATUS -ne 0 ]; then
+    SUMMARY_STATUS=$TEST_STATUS
+  fi
+done
+
+if [ $SUMMARY_STATUS -eq 0 ]; then
+  echo "All OpenCV tests finished successfully"
+else
+  echo "OpenCV tests finished with status $SUMMARY_STATUS"
+fi
+
+return $SUMMARY_STATUS
\ No newline at end of file
diff --git a/cmake/templates/opencv_run_all_tests_unix.sh.in b/cmake/templates/opencv_run_all_tests_unix.sh.in
new file mode 100644
index 000000000..77dc1191a
--- /dev/null
+++ b/cmake/templates/opencv_run_all_tests_unix.sh.in
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+OPENCV_TEST_PATH=@CMAKE_INSTALL_PREFIX@/@OPENCV_TEST_INSTALL_PATH@
+export OPENCV_TEST_DATA_PATH=@CMAKE_INSTALL_PREFIX@/share/OpenCV/testdata
+
+SUMMARY_STATUS=0
+for t in "$OPENCV_TEST_PATH/"opencv_test_* "$OPENCV_TEST_PATH/"opencv_perf_*;
+do
+  report="`basename "$t"`-`date --rfc-3339=date`.xml"
+  "$t" --perf_min_samples=1 --perf_force_samples=1 --gtest_output=xml:"$report"
+  TEST_STATUS=$?
+  if [ $TEST_STATUS -ne 0 ]; then
+    SUMMARY_STATUS=$TEST_STATUS
+  fi
+done
+
+rm -f /tmp/__opencv_temp.*
+
+if [ $SUMMARY_STATUS -eq 0 ]; then
+  echo "All OpenCV tests finished successfully"
+else
+  echo "OpenCV tests finished with status $SUMMARY_STATUS"
+fi
+
+return $SUMMARY_STATUS
\ No newline at end of file
diff --git a/cmake/templates/opencv_testing.sh.in b/cmake/templates/opencv_testing.sh.in
new file mode 100644
index 000000000..3140136eb
--- /dev/null
+++ b/cmake/templates/opencv_testing.sh.in
@@ -0,0 +1,2 @@
+# Environment setup for OpenCV testing
+export OPENCV_TEST_DATA_PATH=@CMAKE_INSTALL_PREFIX@/share/OpenCV/testdata
\ No newline at end of file
diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt
index 70efd6fd0..998e78520 100644
--- a/data/CMakeLists.txt
+++ b/data/CMakeLists.txt
@@ -2,9 +2,21 @@ file(GLOB HAAR_CASCADES haarcascades/*.xml)
 file(GLOB LBP_CASCADES lbpcascades/*.xml)
 
 if(ANDROID)
-  install(FILES ${HAAR_CASCADES} DESTINATION sdk/etc/haarcascades COMPONENT main)
-  install(FILES ${LBP_CASCADES}  DESTINATION sdk/etc/lbpcascades  COMPONENT main)
+  install(FILES ${HAAR_CASCADES} DESTINATION sdk/etc/haarcascades COMPONENT libs)
+  install(FILES ${LBP_CASCADES}  DESTINATION sdk/etc/lbpcascades  COMPONENT libs)
 elseif(NOT WIN32)
-  install(FILES ${HAAR_CASCADES} DESTINATION share/OpenCV/haarcascades COMPONENT main)
-  install(FILES ${LBP_CASCADES}  DESTINATION share/OpenCV/lbpcascades  COMPONENT main)
+  install(FILES ${HAAR_CASCADES} DESTINATION share/OpenCV/haarcascades COMPONENT libs)
+  install(FILES ${LBP_CASCADES}  DESTINATION share/OpenCV/lbpcascades  COMPONENT libs)
 endif()
+
+if(INSTALL_TESTS AND OPENCV_TEST_DATA_PATH)
+  if(ANDROID)
+    install(DIRECTORY ${OPENCV_TEST_DATA_PATH} DESTINATION sdk/etc/testdata COMPONENT tests)
+  elseif(NOT WIN32)
+    # CPack does not set correct permissions by default, so we do it explicitly.
+    install(DIRECTORY ${OPENCV_TEST_DATA_PATH}
+            DIRECTORY_PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+              GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
+            DESTINATION share/OpenCV/testdata COMPONENT tests)
+  endif()
+endif()
\ No newline at end of file
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 217e4b6f6..1c284539e 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -33,7 +33,7 @@ if(BUILD_DOCS AND HAVE_SPHINX)
     endif()
   endforeach()
 
-  set(FIXED_ORDER_MODULES core imgproc highgui video calib3d features2d objdetect ml flann photo stitching nonfree contrib legacy bioinspired)
+  set(FIXED_ORDER_MODULES core imgproc highgui video calib3d features2d objdetect ml flann photo stitching nonfree contrib legacy)
 
   list(REMOVE_ITEM BASE_MODULES ${FIXED_ORDER_MODULES})
 
@@ -148,11 +148,11 @@ if(BUILD_DOCS AND HAVE_SPHINX)
   endif()
 
   foreach(f ${DOC_LIST})
-    install(FILES "${f}" DESTINATION "${OPENCV_DOC_INSTALL_PATH}" COMPONENT main)
+    install(FILES "${f}" DESTINATION "${OPENCV_DOC_INSTALL_PATH}" COMPONENT docs)
   endforeach()
 
   foreach(f ${OPTIONAL_DOC_LIST})
-    install(FILES "${f}" DESTINATION "${OPENCV_DOC_INSTALL_PATH}" OPTIONAL)
+    install(FILES "${f}" DESTINATION "${OPENCV_DOC_INSTALL_PATH}" OPTIONAL COMPONENT docs)
   endforeach()
 
 endif()
diff --git a/doc/conf.py b/doc/conf.py
index 7d5908fc7..0112725b9 100755
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -54,7 +54,7 @@ master_doc = 'index'
 
 # General information about the project.
 project = u'OpenCV'
-copyright = u'2011-2013, opencv dev team'
+copyright = u'2011-2014, opencv dev team'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
diff --git a/doc/haartraining.htm b/doc/haartraining.htm
index c8c3a0e92..9a0767d26 100644
--- a/doc/haartraining.htm
+++ b/doc/haartraining.htm
@@ -492,7 +492,7 @@ class=Typewch><span lang=EN-US>- weighttrimming &lt;weight_trimming&gt;</span></
 <p class=MsoNormal style='margin-left:17.1pt;text-indent:-17.1pt'><span
 class=Typewch><span lang=EN-US>� </span></span><span class=Typewch><span
 lang=EN-US style='font-family:"Times New Roman";font-weight:normal'>Specifies
-wheter and how much weight trimming should be used. A decent choice is 0.90.</span></span></p>
+whether and how much weight trimming should be used. A decent choice is 0.90.</span></span></p>
 
 <p class=MsoNormal style='margin-left:17.1pt;text-indent:-17.1pt'><span
 class=Typewch><span lang=EN-US>- eqw</span></span></p>
diff --git a/doc/license.txt b/doc/license.txt
deleted file mode 100644
index 8824228d0..000000000
--- a/doc/license.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-
- By downloading, copying, installing or using the software you agree to this license.
- If you do not agree to this license, do not download, install,
- copy or use the software.
-
-
-                          License Agreement
-               For Open Source Computer Vision Library
-
-Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-Third party copyrights are property of their respective owners.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-  * Redistributions of source code must retain the above copyright notice,
-    this list of conditions and the following disclaimer.
-
-  * Redistributions in binary form must reproduce the above copyright notice,
-    this list of conditions and the following disclaimer in the documentation
-    and/or other materials provided with the distribution.
-
-  * The name of the copyright holders may not be used to endorse or promote products
-    derived from this software without specific prior written permission.
-
-This software is provided by the copyright holders and contributors "as is" and
-any express or implied warranties, including, but not limited to, the implied
-warranties of merchantability and fitness for a particular purpose are disclaimed.
-In no event shall the Intel Corporation or contributors be liable for any direct,
-indirect, incidental, special, exemplary, or consequential damages
-(including, but not limited to, procurement of substitute goods or services;
-loss of use, data, or profits; or business interruption) however caused
-and on any theory of liability, whether in contract, strict liability,
-or tort (including negligence or otherwise) arising in any way out of
-the use of this software, even if advised of the possibility of such damage.
diff --git a/doc/py_tutorials/py_calib3d/py_calibration/py_calibration.rst b/doc/py_tutorials/py_calib3d/py_calibration/py_calibration.rst
index abf041177..4f0e7cc06 100644
--- a/doc/py_tutorials/py_calib3d/py_calibration/py_calibration.rst
+++ b/doc/py_tutorials/py_calib3d/py_calibration/py_calibration.rst
@@ -110,11 +110,11 @@ Once we find the corners, we can increase their accuracy using **cv2.cornerSubPi
         if ret == True:
             objpoints.append(objp)
 
-            corners2 = cv2.cornerSubPix(gray,corners,(11,11),(-1,-1),criteria)
-            imgpoints.append(corners2)
+            cv2.cornerSubPix(gray,corners,(11,11),(-1,-1),criteria)
+            imgpoints.append(corners)
 
             # Draw and display the corners
-            img = cv2.drawChessboardCorners(img, (7,6), corners2,ret)
+            cv2.drawChessboardCorners(img, (7,6), corners2,ret)
             cv2.imshow('img',img)
             cv2.waitKey(500)
 
diff --git a/doc/py_tutorials/py_gui/py_drawing_functions/py_drawing_functions.rst b/doc/py_tutorials/py_gui/py_drawing_functions/py_drawing_functions.rst
index 86f448e8a..55b1eec91 100644
--- a/doc/py_tutorials/py_gui/py_drawing_functions/py_drawing_functions.rst
+++ b/doc/py_tutorials/py_gui/py_drawing_functions/py_drawing_functions.rst
@@ -33,21 +33,21 @@ To draw a line, you need to pass starting and ending coordinates of line. We wil
     img = np.zeros((512,512,3), np.uint8)
 
     # Draw a diagonal blue line with thickness of 5 px
-    img = cv2.line(img,(0,0),(511,511),(255,0,0),5)
+    cv2.line(img,(0,0),(511,511),(255,0,0),5)
 
 Drawing Rectangle
 -------------------
 To draw a rectangle, you need top-left corner and bottom-right corner of rectangle. This time we will draw a green rectangle at the top-right corner of image.
 ::
 
-    img = cv2.rectangle(img,(384,0),(510,128),(0,255,0),3)
+    cv2.rectangle(img,(384,0),(510,128),(0,255,0),3)
 
 Drawing Circle
 ----------------
 To draw a circle, you need its center coordinates and radius. We will draw a circle inside the rectangle drawn above.
 ::
 
-    img = cv2.circle(img,(447,63), 63, (0,0,255), -1)
+    cv2.circle(img,(447,63), 63, (0,0,255), -1)
 
 Drawing Ellipse
 --------------------
@@ -55,7 +55,7 @@ Drawing Ellipse
 To draw the ellipse, we need to pass several arguments. One argument is the center location (x,y). Next argument is axes lengths (major axis length, minor axis length). ``angle`` is the angle of rotation of ellipse in anti-clockwise direction. ``startAngle`` and ``endAngle`` denotes the starting and ending of ellipse arc measured in clockwise direction from major axis. i.e. giving values 0 and 360 gives the full ellipse. For more details, check the documentation of **cv2.ellipse()**. Below example draws a half ellipse at the center of the image.
 ::
 
-    img = cv2.ellipse(img,(256,256),(100,50),0,0,180,255,-1)
+    cv2.ellipse(img,(256,256),(100,50),0,0,180,255,-1)
 
 
 Drawing Polygon
@@ -65,7 +65,7 @@ To draw a polygon, first you need coordinates of vertices. Make those points int
 
     pts = np.array([[10,5],[20,30],[70,20],[50,10]], np.int32)
     pts = pts.reshape((-1,1,2))
-    img = cv2.polylines(img,[pts],True,(0,255,255))
+    cv2.polylines(img,[pts],True,(0,255,255))
 
 .. Note:: If third argument is ``False``, you will get a polylines joining all the points, not a closed shape.
 
@@ -103,4 +103,4 @@ Additional Resources
 
 Exercises
 ==============
-#. Try to create the logo of OpenCV using drawing functions available in OpenCV
+#. Try to create the logo of OpenCV using drawing functions available in OpenCV.
diff --git a/doc/py_tutorials/py_gui/py_image_display/py_image_display.rst b/doc/py_tutorials/py_gui/py_image_display/py_image_display.rst
index 75f674a50..a05db370c 100644
--- a/doc/py_tutorials/py_gui/py_image_display/py_image_display.rst
+++ b/doc/py_tutorials/py_gui/py_image_display/py_image_display.rst
@@ -59,6 +59,8 @@ A screenshot of the window will look like this (in Fedora-Gnome machine):
 
 **cv2.waitKey()** is a keyboard binding function. Its argument is the time in milliseconds. The function waits for specified milliseconds for any keyboard event. If you press any key in that time, the program continues. If **0** is passed, it waits indefinitely for a key stroke. It can also be set to detect specific key strokes like, if key `a` is pressed etc which we will discuss below.
 
+.. note:: Besides binding keyboard events this function also processes many other GUI events, so you MUST use it to actually display the image.
+
 **cv2.destroyAllWindows()** simply destroys all the windows we created. If you want to destroy any specific window, use the function **cv2.destroyWindow()** where you pass the exact window name as the argument.
 
 .. note:: There is a special case where you can already create a window and load image to it later. In that case, you can specify whether window is resizable or not. It is done with the function **cv2.namedWindow()**. By default, the flag is ``cv2.WINDOW_AUTOSIZE``. But if you specify flag to be ``cv2.WINDOW_NORMAL``, you can resize window. It will be helpful when image is too large in dimension and adding track bar to windows.
diff --git a/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.rst b/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.rst
index 53eaa64b4..6b7c661cc 100644
--- a/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.rst
+++ b/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.rst
@@ -119,7 +119,7 @@ Let (x,y) be the top-left coordinate of the rectangle and (w,h) be its width and
 ::
 
     x,y,w,h = cv2.boundingRect(cnt)
-    img = cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
+    cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
 
 7.b. Rotated Rectangle
 -----------------------
@@ -129,7 +129,7 @@ Here, bounding rectangle is drawn with minimum area, so it considers the rotatio
     rect = cv2.minAreaRect(cnt)
     box = cv2.boxPoints(rect)
     box = np.int0(box)
-    im = cv2.drawContours(im,[box],0,(0,0,255),2)
+    cv2.drawContours(img,[box],0,(0,0,255),2)
 
 Both the rectangles are shown in a single image. Green rectangle shows the normal bounding rect. Red rectangle is the rotated rect.
 
@@ -145,7 +145,7 @@ Next we find the circumcircle of an object using the function **cv2.minEnclosing
     (x,y),radius = cv2.minEnclosingCircle(cnt)
     center = (int(x),int(y))
     radius = int(radius)
-    img = cv2.circle(img,center,radius,(0,255,0),2)
+    cv2.circle(img,center,radius,(0,255,0),2)
 
 .. image:: images/circumcircle.png
         :alt: Minimum Enclosing Circle
@@ -158,7 +158,7 @@ Next one is to fit an ellipse to an object. It returns the rotated rectangle in
 ::
 
     ellipse = cv2.fitEllipse(cnt)
-    im = cv2.ellipse(im,ellipse,(0,255,0),2)
+    cv2.ellipse(img,ellipse,(0,255,0),2)
 
 .. image:: images/fitellipse.png
         :alt: Fitting an Ellipse
@@ -175,7 +175,7 @@ Similarly we can fit a line to a set of points. Below image contains a set of wh
     [vx,vy,x,y] = cv2.fitLine(cnt, cv2.DIST_L2,0,0.01,0.01)
     lefty = int((-x*vy/vx) + y)
     righty = int(((cols-x)*vy/vx)+y)
-    img = cv2.line(img,(cols-1,righty),(0,lefty),(0,255,0),2)
+    cv2.line(img,(cols-1,righty),(0,lefty),(0,255,0),2)
 
 .. image:: images/fitline.jpg
         :alt: Fitting a Line
diff --git a/doc/py_tutorials/py_imgproc/py_contours/py_contours_begin/py_contours_begin.rst b/doc/py_tutorials/py_imgproc/py_contours/py_contours_begin/py_contours_begin.rst
index d0a9e8ca0..494123676 100644
--- a/doc/py_tutorials/py_imgproc/py_contours/py_contours_begin/py_contours_begin.rst
+++ b/doc/py_tutorials/py_imgproc/py_contours/py_contours_begin/py_contours_begin.rst
@@ -28,9 +28,9 @@ Let's see how to find contours of a binary image:
     im = cv2.imread('test.jpg')
     imgray = cv2.cvtColor(im,cv2.COLOR_BGR2GRAY)
     ret,thresh = cv2.threshold(imgray,127,255,0)
-    image, contours, hierarchy = cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
+    contours, hierarchy = cv2.findContours(thresh,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
 
-See, there are three arguments in **cv2.findContours()** function, first one is source image, second is contour retrieval mode, third is contour approximation method. And it outputs the image, contours and hierarchy. ``contours`` is a Python list of all the contours in the image. Each individual contour is a Numpy array of (x,y) coordinates of boundary points of the object.
+See, there are three arguments in **cv2.findContours()** function, first one is source image, second is contour retrieval mode, third is contour approximation method. And it outputs the contours and hierarchy. ``contours`` is a Python list of all the contours in the image. Each individual contour is a Numpy array of (x,y) coordinates of boundary points of the object.
 
 .. note:: We will discuss second and third arguments and about hierarchy in details later. Until then, the values given to them in code sample will work fine for all images.
 
@@ -43,18 +43,18 @@ To draw the contours, ``cv2.drawContours`` function is used. It can also be used
 To draw all the contours in an image:
 ::
 
-    img = cv2.drawContour(img, contours, -1, (0,255,0), 3)
+    cv2.drawContours(img, contours, -1, (0,255,0), 3)
 
 To draw an individual contour, say 4th contour:
 ::
 
-    img = cv2.drawContours(img, contours, 3, (0,255,0), 3)
+    cv2.drawContours(img, contours, 3, (0,255,0), 3)
 
 But most of the time, below method will be useful:
 ::
 
     cnt = contours[4]
-    img = cv2.drawContours(img, [cnt], 0, (0,255,0), 3)
+    cv2.drawContours(img, [cnt], 0, (0,255,0), 3)
 
 .. note:: Last two methods are same, but when you go forward, you will see last one is more useful.
 
diff --git a/doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.rst b/doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.rst
index a10d5907c..e1716eb4c 100644
--- a/doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.rst
+++ b/doc/py_tutorials/py_objdetect/py_face_detection/py_face_detection.rst
@@ -73,7 +73,7 @@ Now we find the faces in the image. If faces are found, it returns the positions
 
     faces = face_cascade.detectMultiScale(gray, 1.3, 5)
     for (x,y,w,h) in faces:
-        img = cv2.rectangle(img,(x,y),(x+w,y+h),(255,0,0),2)
+        cv2.rectangle(img,(x,y),(x+w,y+h),(255,0,0),2)
         roi_gray = gray[y:y+h, x:x+w]
         roi_color = img[y:y+h, x:x+w]
         eyes = eye_cascade.detectMultiScale(roi_gray)
diff --git a/doc/tutorials/bioinspired/retina_model/images/retina_TreeHdr_retina.jpg b/doc/tutorials/bioinspired/retina_model/images/retina_TreeHdr_retina.jpg
deleted file mode 100644
index 251557e83..000000000
Binary files a/doc/tutorials/bioinspired/retina_model/images/retina_TreeHdr_retina.jpg and /dev/null differ
diff --git a/doc/tutorials/bioinspired/retina_model/images/retina_TreeHdr_small.jpg b/doc/tutorials/bioinspired/retina_model/images/retina_TreeHdr_small.jpg
deleted file mode 100644
index 86d9f1eea..000000000
Binary files a/doc/tutorials/bioinspired/retina_model/images/retina_TreeHdr_small.jpg and /dev/null differ
diff --git a/doc/tutorials/bioinspired/retina_model/images/studentsSample_input.jpg b/doc/tutorials/bioinspired/retina_model/images/studentsSample_input.jpg
deleted file mode 100644
index a0ac96dde..000000000
Binary files a/doc/tutorials/bioinspired/retina_model/images/studentsSample_input.jpg and /dev/null differ
diff --git a/doc/tutorials/bioinspired/retina_model/images/studentsSample_magno.jpg b/doc/tutorials/bioinspired/retina_model/images/studentsSample_magno.jpg
deleted file mode 100644
index 935397869..000000000
Binary files a/doc/tutorials/bioinspired/retina_model/images/studentsSample_magno.jpg and /dev/null differ
diff --git a/doc/tutorials/bioinspired/retina_model/images/studentsSample_parvo.jpg b/doc/tutorials/bioinspired/retina_model/images/studentsSample_parvo.jpg
deleted file mode 100644
index 3babfefc3..000000000
Binary files a/doc/tutorials/bioinspired/retina_model/images/studentsSample_parvo.jpg and /dev/null differ
diff --git a/doc/tutorials/bioinspired/retina_model/retina_model.rst b/doc/tutorials/bioinspired/retina_model/retina_model.rst
deleted file mode 100644
index e8527ee8b..000000000
--- a/doc/tutorials/bioinspired/retina_model/retina_model.rst
+++ /dev/null
@@ -1,418 +0,0 @@
-.. _Retina_Model:
-
-Discovering the human retina and its use for image processing
-*************************************************************
-
-Goal
-=====
-
-I present here a model of human retina that shows some interesting properties for image preprocessing and enhancement.
-In this tutorial you will learn how to:
-
-.. container:: enumeratevisibleitemswithsquare
-
-   + discover the main two channels outing from your retina
-
-   + see the basics to use the retina model
-
-   + discover some parameters tweaks
-
-
-General overview
-================
-
-The proposed model originates from Jeanny Herault's research [herault2010]_ at `Gipsa <http://www.gipsa-lab.inpg.fr>`_. It is involved in image processing applications with `Listic <http://www.listic.univ-savoie.fr>`_ (code maintainer and user) lab. This is not a complete model but it already present interesting properties that can be involved for enhanced image processing experience. The model allows the following human retina properties to be used :
-
-* spectral whitening that has 3 important effects: high spatio-temporal frequency signals canceling (noise), mid-frequencies details enhancement and low frequencies luminance energy reduction. This *all in one* property directly allows visual signals cleaning of classical undesired distortions introduced by image sensors and input luminance range.
-
-* local logarithmic luminance compression allows details to be enhanced even in low light conditions.
-
-* decorrelation of the details information (Parvocellular output channel) and transient information (events, motion made available at the Magnocellular output channel).
-
-The first two points are illustrated below :
-
-In the figure below, the OpenEXR image sample *CrissyField.exr*, a High Dynamic Range image is shown. In order to make it visible on this web-page, the original input image is linearly rescaled to the classical image luminance range [0-255] and is converted to 8bit/channel format. Such strong conversion hides many details because of too strong local contrasts. Furthermore, noise energy is also strong and pollutes visual information.
-
-.. image:: images/retina_TreeHdr_small.jpg
-   :alt: A High dynamic range image linearly rescaled within range [0-255].
-   :align: center
-
-In the following image, applying the ideas proposed in [benoit2010]_, as your retina does, local luminance adaptation, spatial noise removal and spectral whitening work together and transmit accurate information on lower range 8bit data channels. On this picture, noise in significantly removed, local details hidden by strong luminance contrasts are enhanced. Output image keeps its naturalness and visual content is enhanced. Color processing is based on the color multiplexing/demultiplexing method proposed in [chaix2007]_.
-
-.. image:: images/retina_TreeHdr_retina.jpg
-   :alt: A High dynamic range image compressed within range [0-255] using the retina.
-   :align: center
-
-
-*Note :* image sample can be downloaded from the `OpenEXR website <http://www.openexr.com>`_. Regarding this demonstration, before retina processing, input image has been linearly rescaled within 0-255 keeping its channels float format. 5% of its histogram ends has been cut (mostly removes wrong HDR pixels). Check out the sample *opencv/samples/cpp/OpenEXRimages_HighDynamicRange_Retina_toneMapping.cpp* for similar processing. The following demonstration will only consider classical 8bit/channel images.
-
-The retina model output channels
-================================
-
-The retina model presents two outputs that benefit from the above cited behaviors.
-
-* The first one is called the Parvocellular channel. It is mainly active in the foveal retina area (high resolution central vision with color sensitive photo-receptors), its aim is to provide accurate color vision for visual details remaining static on the retina. On the other hand objects moving on the retina projection are blurred.
-
-* The second well known channel is the Magnocellular channel. It is mainly active in the retina peripheral vision and send signals related to change events (motion, transient events, etc.). These outing signals also help visual system to focus/center retina on 'transient'/moving areas for more detailed analysis thus improving visual scene context and object classification.
-
-**NOTE :** regarding the proposed model, contrary to the real retina, we apply these two channels on the entire input images using the same resolution. This allows enhanced visual details and motion information to be extracted on all the considered images... but remember, that these two channels are complementary. For example, if Magnocellular channel gives strong energy in an area, then, the Parvocellular channel is certainly blurred there since there is a transient event.
-
-As an illustration, we apply in the following the retina model on a webcam video stream of a dark visual scene. In this visual scene, captured in an amphitheater of the university, some students are moving while talking to the teacher.
-
-In this video sequence, because of the dark ambiance, signal to noise ratio is low and color artifacts are present on visual features edges because of the low quality image capture tool-chain.
-
-.. image:: images/studentsSample_input.jpg
-   :alt: an input video stream extract sample
-   :align: center
-
-Below is shown the retina foveal vision applied on the entire image. In the used retina configuration, global luminance is preserved and local contrasts are enhanced. Also, signal to noise ratio is improved : since high frequency spatio-temporal noise is reduced, enhanced details are not corrupted by any enhanced noise.
-
-.. image:: images/studentsSample_parvo.jpg
-   :alt: the retina Parvocellular output. Enhanced details, luminance adaptation and noise removal. A processing tool for image analysis.
-   :align: center
-
-Below is the output of the Magnocellular output of the retina model. Its signals are strong where transient events occur. Here, a student is moving at the bottom of the image thus generating high energy. The remaining of the image is static however, it is corrupted by a strong noise. Here, the retina filters out most of the noise thus generating low false motion area 'alarms'. This channel can be used as a transient/moving areas detector : it would provide relevant information for a low cost segmentation tool that would highlight areas in which an event is occurring.
-
-.. image:: images/studentsSample_magno.jpg
-   :alt: the retina Magnocellular output. Enhanced transient signals (motion, etc.). A preprocessing tool for event detection.
-   :align: center
-
-Retina use case
-===============
-
-This model can be used basically for spatio-temporal video effects but also in the aim of :
-
-* performing texture analysis with enhanced signal to noise ratio and enhanced details robust against input images luminance ranges (check out the Parvocellular retina channel output)
-
-* performing motion analysis also taking benefit of the previously cited properties.
-
-Literature
-==========
-For more information, refer to the following papers :
-
-.. [benoit2010] Benoit A., Caplier A., Durette B., Herault, J., "Using Human Visual System Modeling For Bio-Inspired Low Level Image Processing", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773. DOI <http://dx.doi.org/10.1016/j.cviu.2010.01.011>
-
-* Please have a look at the reference work of Jeanny Herault that you can read in his book :
-
-.. [herault2010] Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-
-This retina filter code includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-
-* take a look at the *retinacolor.hpp* module to discover Brice Chaix de Lavarene phD color mosaicing/demosaicing and his reference paper:
-
-.. [chaix2007] B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-
-* take a look at *imagelogpolprojection.hpp* to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions. More informations in the above cited Jeanny Heraults's book.
-
-Code tutorial
-=============
-
-Please refer to the original tutorial source code in file *opencv_folder/samples/cpp/tutorial_code/bioinspired/retina_tutorial.cpp*.
-
-**Note :** do not forget that the retina model is included in the following namespace : *cv::bioinspired*.
-
-To compile it, assuming OpenCV is correctly installed, use the following command. It requires the opencv_core *(cv::Mat and friends objects management)*, opencv_highgui *(display and image/video read)* and opencv_bioinspired *(Retina description)* libraries to compile.
-
-.. code-block:: cpp
-
-   // compile
-   gcc retina_tutorial.cpp -o Retina_tuto -lopencv_core -lopencv_highgui -lopencv_bioinspired
-
-   // Run commands : add 'log' as a last parameter to apply a spatial log sampling (simulates retina sampling)
-   // run on webcam
-   ./Retina_tuto -video
-   // run on video file
-   ./Retina_tuto -video myVideo.avi
-   // run on an image
-   ./Retina_tuto -image myPicture.jpg
-   // run on an image with log sampling
-   ./Retina_tuto -image myPicture.jpg log
-
-Here is a code explanation :
-
-Retina definition is present in the bioinspired package and a simple include allows to use it. You can rather use the specific header : *opencv2/bioinspired.hpp* if you prefer but then include the other required openv modules : *opencv2/core.hpp* and *opencv2/highgui.hpp*
-
-.. code-block:: cpp
-
-   #include "opencv2/opencv.hpp"
-
-Provide user some hints to run the program with a help function
-
-.. code-block:: cpp
-
-   // the help procedure
-   static void help(std::string errorMessage)
-   {
-    std::cout<<"Program init error : "<<errorMessage<<std::endl;
-    std::cout<<"\nProgram call procedure : retinaDemo [processing mode] [Optional : media target] [Optional LAST parameter: \"log\" to activate retina log sampling]"<<std::endl;
-    std::cout<<"\t[processing mode] :"<<std::endl;
-    std::cout<<"\t -image : for still image processing"<<std::endl;
-    std::cout<<"\t -video : for video stream processing"<<std::endl;
-    std::cout<<"\t[Optional : media target] :"<<std::endl;
-    std::cout<<"\t if processing an image or video file, then, specify the path and filename of the target to process"<<std::endl;
-    std::cout<<"\t leave empty if processing video stream coming from a connected video device"<<std::endl;
-    std::cout<<"\t[Optional : activate retina log sampling] : an optional last parameter can be specified for retina spatial log sampling"<<std::endl;
-    std::cout<<"\t set \"log\" without quotes to activate this sampling, output frame size will be divided by 4"<<std::endl;
-    std::cout<<"\nExamples:"<<std::endl;
-    std::cout<<"\t-Image processing : ./retinaDemo -image lena.jpg"<<std::endl;
-    std::cout<<"\t-Image processing with log sampling : ./retinaDemo -image lena.jpg log"<<std::endl;
-    std::cout<<"\t-Video processing : ./retinaDemo -video myMovie.mp4"<<std::endl;
-    std::cout<<"\t-Live video processing : ./retinaDemo -video"<<std::endl;
-    std::cout<<"\nPlease start again with new parameters"<<std::endl;
-    std::cout<<"****************************************************"<<std::endl;
-    std::cout<<" NOTE : this program generates the default retina parameters file 'RetinaDefaultParameters.xml'"<<std::endl;
-    std::cout<<" => you can use this to fine tune parameters and load them if you save to file 'RetinaSpecificParameters.xml'"<<std::endl;
-   }
-
-Then, start the main program and first declare a *cv::Mat* matrix in which input images will be loaded. Also allocate a *cv::VideoCapture* object ready to load video streams (if necessary)
-
-.. code-block:: cpp
-
-  int main(int argc, char* argv[]) {
-    // declare the retina input buffer... that will be fed differently in regard of the input media
-    cv::Mat inputFrame;
-    cv::VideoCapture videoCapture; // in case a video media is used, its manager is declared here
-
-
-In the main program, before processing, first check input command parameters. Here it loads a first input image coming from a single loaded image (if user chose command *-image*) or from a video stream (if user chose command *-video*). Also, if the user added *log* command at the end of its program call, the spatial logarithmic image sampling performed by the retina is taken into account by the Boolean flag *useLogSampling*.
-
-.. code-block:: cpp
-
-  // welcome message
-    std::cout<<"****************************************************"<<std::endl;
-    std::cout<<"* Retina demonstration : demonstrates the use of is a wrapper class of the Gipsa/Listic Labs retina model."<<std::endl;
-    std::cout<<"* This demo will try to load the file 'RetinaSpecificParameters.xml' (if exists).\nTo create it, copy the autogenerated template 'RetinaDefaultParameters.xml'.\nThen twaek it with your own retina parameters."<<std::endl;
-    // basic input arguments checking
-    if (argc<2)
-    {
-        help("bad number of parameter");
-        return -1;
-    }
-
-    bool useLogSampling = !strcmp(argv[argc-1], "log"); // check if user wants retina log sampling processing
-
-    std::string inputMediaType=argv[1];
-
-    //////////////////////////////////////////////////////////////////////////////
-    // checking input media type (still image, video file, live video acquisition)
-    if (!strcmp(inputMediaType.c_str(), "-image") && argc >= 3)
-    {
-        std::cout<<"RetinaDemo: processing image "<<argv[2]<<std::endl;
-        // image processing case
-        inputFrame = cv::imread(std::string(argv[2]), 1); // load image in RGB mode
-    }else
-        if (!strcmp(inputMediaType.c_str(), "-video"))
-        {
-            if (argc == 2 || (argc == 3 && useLogSampling)) // attempt to grab images from a video capture device
-            {
-                videoCapture.open(0);
-            }else// attempt to grab images from a video filestream
-            {
-                std::cout<<"RetinaDemo: processing video stream "<<argv[2]<<std::endl;
-                videoCapture.open(argv[2]);
-            }
-
-            // grab a first frame to check if everything is ok
-            videoCapture>>inputFrame;
-        }else
-        {
-            // bad command parameter
-            help("bad command parameter");
-            return -1;
-        }
-
-Once all input parameters are processed, a first image should have been loaded, if not, display error and stop program :
-
-.. code-block:: cpp
-
-    if (inputFrame.empty())
-    {
-        help("Input media could not be loaded, aborting");
-        return -1;
-    }
-
-Now, everything is ready to run the retina model. I propose here to allocate a retina instance and to manage the eventual log sampling option. The Retina constructor expects at least a cv::Size object that shows the input data size that will have to be managed. One can activate other options such as color and its related color multiplexing strategy (here Bayer multiplexing is chosen using *enum cv::bioinspired::RETINA_COLOR_BAYER*). If using log sampling, the image reduction factor (smaller output images) and log sampling strengh can be adjusted.
-
-.. code-block:: cpp
-
-    // pointer to a retina object
-    cv::Ptr<cv::bioinspired::Retina> myRetina;
-
-    // if the last parameter is 'log', then activate log sampling (favour foveal vision and subsamples peripheral vision)
-    if (useLogSampling)
-    {
-        myRetina = cv::bioinspired::createRetina(inputFrame.size(), true, cv::bioinspired::RETINA_COLOR_BAYER, true, 2.0, 10.0);
-    }
-    else// -> else allocate "classical" retina :
-        myRetina = cv::bioinspired::createRetina(inputFrame.size());
-
-Once done, the proposed code writes a default xml file that contains the default parameters of the retina. This is useful to make your own config using this template. Here generated template xml file is called *RetinaDefaultParameters.xml*.
-
-.. code-block:: cpp
-
-    // save default retina parameters file in order to let you see this and maybe modify it and reload using method "setup"
-    myRetina->write("RetinaDefaultParameters.xml");
-
-In the following line, the retina attempts to load another xml file called *RetinaSpecificParameters.xml*. If you created it and introduced your own setup, it will be loaded, in the other case, default retina parameters are used.
-
-.. code-block:: cpp
-
-    // load parameters if file exists
-    myRetina->setup("RetinaSpecificParameters.xml");
-
-It is not required here but just to show it is possible, you can reset the retina buffers to zero to force it to forget past events.
-
-.. code-block:: cpp
-
-    // reset all retina buffers (imagine you close your eyes for a long time)
-    myRetina->clearBuffers();
-
-Now, it is time to run the retina ! First create some output buffers ready to receive the two retina channels outputs
-
-.. code-block:: cpp
-
-    // declare retina output buffers
-    cv::Mat retinaOutput_parvo;
-    cv::Mat retinaOutput_magno;
-
-Then, run retina in a loop, load new frames from video sequence if necessary and get retina outputs back to dedicated buffers.
-
-.. code-block:: cpp
-
-    // processing loop with no stop condition
-    while(true)
-    {
-        // if using video stream, then, grabbing a new frame, else, input remains the same
-        if (videoCapture.isOpened())
-            videoCapture>>inputFrame;
-
-        // run retina filter on the loaded input frame
-        myRetina->run(inputFrame);
-        // Retrieve and display retina output
-        myRetina->getParvo(retinaOutput_parvo);
-        myRetina->getMagno(retinaOutput_magno);
-        cv::imshow("retina input", inputFrame);
-        cv::imshow("Retina Parvo", retinaOutput_parvo);
-        cv::imshow("Retina Magno", retinaOutput_magno);
-        cv::waitKey(10);
-    }
-
-That's done ! But if you want to secure the system, take care and manage Exceptions. The retina can throw some when it sees irrelevant data (no input frame, wrong setup, etc.).
-Then, i recommend to surround all the retina code by a try/catch system like this :
-
-.. code-block:: cpp
-
-    try{
-         // pointer to a retina object
-         cv::Ptr<cv::Retina> myRetina;
-         [---]
-         // processing loop with no stop condition
-         while(true)
-         {
-             [---]
-         }
-
-    }catch(cv::Exception e)
-    {
-        std::cerr<<"Error using Retina : "<<e.what()<<std::endl;
-    }
-
-Retina parameters, what to do ?
-===============================
-
-First, it is recommended to read the reference paper :
-
-* Benoit A., Caplier A., Durette B., Herault, J., *"Using Human Visual System Modeling For Bio-Inspired Low Level Image Processing"*, Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773. DOI <http://dx.doi.org/10.1016/j.cviu.2010.01.011>
-
-Once done open the configuration file *RetinaDefaultParameters.xml* generated by the demo and let's have a look at it.
-
-.. code-block:: cpp
-
-    <?xml version="1.0"?>
-    <opencv_storage>
-    <OPLandIPLparvo>
-        <colorMode>1</colorMode>
-        <normaliseOutput>1</normaliseOutput>
-        <photoreceptorsLocalAdaptationSensitivity>7.5e-01</photoreceptorsLocalAdaptationSensitivity>
-        <photoreceptorsTemporalConstant>9.0e-01</photoreceptorsTemporalConstant>
-        <photoreceptorsSpatialConstant>5.7e-01</photoreceptorsSpatialConstant>
-        <horizontalCellsGain>0.01</horizontalCellsGain>
-        <hcellsTemporalConstant>0.5</hcellsTemporalConstant>
-        <hcellsSpatialConstant>7.</hcellsSpatialConstant>
-        <ganglionCellsSensitivity>7.5e-01</ganglionCellsSensitivity></OPLandIPLparvo>
-    <IPLmagno>
-        <normaliseOutput>1</normaliseOutput>
-        <parasolCells_beta>0.</parasolCells_beta>
-        <parasolCells_tau>0.</parasolCells_tau>
-        <parasolCells_k>7.</parasolCells_k>
-        <amacrinCellsTemporalCutFrequency>2.0e+00</amacrinCellsTemporalCutFrequency>
-        <V0CompressionParameter>9.5e-01</V0CompressionParameter>
-        <localAdaptintegration_tau>0.</localAdaptintegration_tau>
-        <localAdaptintegration_k>7.</localAdaptintegration_k></IPLmagno>
-    </opencv_storage>
-
-Here are some hints but actually, the best parameter setup depends more on what you want to do with the retina rather than the images input that you give to retina. Apart from the more specific case of High Dynamic Range images (HDR) that require more specific setup for specific luminance compression objective, the retina behaviors should be rather stable from content to content. Note that OpenCV is able to manage such HDR format thanks to the OpenEXR images compatibility.
-
-Then, if the application target requires details enhancement prior to specific image processing, you need to know if mean luminance information is required or not. If not, the the retina can cancel or significantly reduce its energy thus giving more visibility to higher spatial frequency details.
-
-
-Basic parameters
-----------------
-
-The most simple parameters are the following :
-
-* **colorMode** : let the retina process color information (if 1) or gray scale images (if 0). In this last case, only the first channel of the input will be processed.
-
-* **normaliseOutput** : each channel has this parameter, if value is 1, then the considered channel output is rescaled between 0 and 255. Take care in this case at the Magnocellular output level (motion/transient channel detection). Residual noise will also be rescaled !
-
-**Note :** using color requires color channels multiplexing/demultipexing which requires more processing. You can expect much faster processing using gray levels : it would require around 30 product per pixel for all the retina processes and it has recently been parallelized for multicore architectures.
-
-Photo-receptors parameters
---------------------------
-
-The following parameters act on the entry point of the retina - photo-receptors - and impact all the following processes. These sensors are low pass spatio-temporal filters that smooth temporal and spatial data and also adjust there sensitivity to local luminance thus improving details extraction and high frequency noise canceling.
-
-* **photoreceptorsLocalAdaptationSensitivity** between 0 and 1. Values close to 1 allow high luminance log compression effect at the photo-receptors level. Values closer to 0 give a more linear sensitivity. Increased alone, it can burn the *Parvo (details channel)* output image. If adjusted in collaboration with **ganglionCellsSensitivity** images can be very contrasted whatever the local luminance there is... at the price of a naturalness decrease.
-
-* **photoreceptorsTemporalConstant** this setups the temporal constant of the low pass filter effect at the entry of the retina. High value lead to strong temporal smoothing effect : moving objects are blurred and can disappear while static object are favored. But when starting the retina processing, stable state is reached lately.
-
-* **photoreceptorsSpatialConstant** specifies the spatial constant related to photo-receptors low pass filter effect. This parameters specify the minimum allowed spatial signal period allowed in the following. Typically, this filter should cut high frequency noise. Then a 0 value doesn't cut anything noise while higher values start to cut high spatial frequencies and more and more lower frequencies... Then, do not go to high if you wanna see some details of the input images ! A good compromise for color images is 0.53 since this won't affect too much the color spectrum. Higher values would lead to gray and blurred output images.
-
-Horizontal cells parameters
----------------------------
-
-This parameter set tunes the neural network connected to the photo-receptors, the horizontal cells. It modulates photo-receptors sensitivity and completes the processing for final spectral whitening (part of the spatial band pass effect thus favoring visual details enhancement).
-
-* **horizontalCellsGain** here is a critical parameter ! If you are not interested by the mean luminance and focus on details enhancement, then, set to zero. But if you want to keep some environment luminance data, let some low spatial frequencies pass into the system and set a higher value (<1).
-
-* **hcellsTemporalConstant** similar to photo-receptors, this acts on the temporal constant of a low pass temporal filter that smooths input data. Here, a high value generates a high retina after effect while a lower value makes the retina more reactive. This value should be lower than **photoreceptorsTemporalConstant** to limit strong retina after effects.
-
-* **hcellsSpatialConstant** is the spatial constant of the low pass filter of these cells filter. It specifies the lowest spatial frequency allowed in the following. Visually, a high value leads to very low spatial frequencies processing and leads to salient halo effects. Lower values reduce this effect but the limit is : do not go lower than the value of **photoreceptorsSpatialConstant**. Those 2 parameters actually specify the spatial band-pass of the retina.
-
-**NOTE** after the processing managed by the previous parameters, input data is cleaned from noise and luminance in already partly enhanced. The following parameters act on the last processing stages of the two outing retina signals.
-
-Parvo (details channel) dedicated parameter
--------------------------------------------
-
-* **ganglionCellsSensitivity** specifies the strength of the final local adaptation occurring at the output of this details dedicated channel. Parameter values remain between 0 and 1. Low value tend to give a linear response while higher values enforces the remaining low contrasted areas.
-
-**Note :** this parameter can correct eventual burned images by favoring low energetic details of the visual scene, even in bright areas.
-
-IPL Magno (motion/transient channel) parameters
------------------------------------------------
-
-Once image information is cleaned, this channel acts as a high pass temporal filter that only selects signals related to transient signals (events, motion, etc.). A low pass spatial filter smooths extracted transient data and a final logarithmic compression enhances low transient events thus enhancing event sensitivity.
-
-* **parasolCells_beta** generally set to zero, can be considered as an amplifier gain at the entry point of this processing stage. Generally set to 0.
-
-* **parasolCells_tau** the temporal smoothing effect that can be added
-
-* **parasolCells_k** the spatial constant of the spatial filtering effect, set it at a high value to favor low spatial frequency signals that are lower subject to residual noise.
-
-* **amacrinCellsTemporalCutFrequency** specifies the temporal constant of the high pass filter. High values let slow transient events to be selected.
-
-* **V0CompressionParameter** specifies the strength of the log compression. Similar behaviors to previous description but here it enforces sensitivity of transient events.
-
-* **localAdaptintegration_tau** generally set to 0, no real use here actually
-
-* **localAdaptintegration_k** specifies the size of the area on which local adaptation is performed. Low values lead to short range local adaptation (higher sensitivity to noise), high values secure log compression.
diff --git a/doc/tutorials/bioinspired/table_of_content_bioinspired/images/retina_TreeHdr_small.jpg b/doc/tutorials/bioinspired/table_of_content_bioinspired/images/retina_TreeHdr_small.jpg
deleted file mode 100644
index 4ffb43ec4..000000000
Binary files a/doc/tutorials/bioinspired/table_of_content_bioinspired/images/retina_TreeHdr_small.jpg and /dev/null differ
diff --git a/doc/tutorials/bioinspired/table_of_content_bioinspired/table_of_content_bioinspired.rst b/doc/tutorials/bioinspired/table_of_content_bioinspired/table_of_content_bioinspired.rst
deleted file mode 100644
index 88869e98f..000000000
--- a/doc/tutorials/bioinspired/table_of_content_bioinspired/table_of_content_bioinspired.rst
+++ /dev/null
@@ -1,36 +0,0 @@
-.. _Table-Of-Content-Bioinspired:
-
-*bioinspired* module. Algorithms inspired from biological models
-----------------------------------------------------------------
-
-Here you will learn how to use additional modules of OpenCV defined in the "bioinspired" module.
-
- .. include:: ../../definitions/tocDefinitions.rst
-
-+
-  .. tabularcolumns:: m{100pt} m{300pt}
-  .. cssclass:: toctableopencv
-
-  =============== ======================================================
-  |RetinaDemoImg| **Title:** :ref:`Retina_Model`
-
-                  *Compatibility:* > OpenCV 2.4
-
-                  *Author:* |Author_AlexB|
-
-                  You will learn how to process images and video streams with a model of retina filter for details enhancement, spatio-temporal noise removal, luminance correction and spatio-temporal events detection.
-
-  =============== ======================================================
-
-  .. |RetinaDemoImg| image:: images/retina_TreeHdr_small.jpg
-                   :height: 90pt
-                   :width:  90pt
-
- .. raw:: latex
-
-    \pagebreak
-
-.. toctree::
-   :hidden:
-
-   ../retina_model/retina_model
diff --git a/doc/tutorials/bioinspired/table_of_content_bioinspired/table_of_content_bioinspired.rst~ b/doc/tutorials/bioinspired/table_of_content_bioinspired/table_of_content_bioinspired.rst~
deleted file mode 100644
index 61fe06b7c..000000000
--- a/doc/tutorials/bioinspired/table_of_content_bioinspired/table_of_content_bioinspired.rst~
+++ /dev/null
@@ -1,36 +0,0 @@
-.. _Table-Of-Content-Contrib:
-
-*contrib* module. The additional contributions made available !
-----------------------------------------------------------------
-
-Here you will learn how to use additional modules of OpenCV defined in the "contrib" module.
-
- .. include:: ../../definitions/tocDefinitions.rst
-
-+
-  .. tabularcolumns:: m{100pt} m{300pt}
-  .. cssclass:: toctableopencv
-
-  =============== ======================================================
-  |RetinaDemoImg| **Title:** :ref:`Retina_Model`
-
-                  *Compatibility:* > OpenCV 2.4
-
-                  *Author:* |Author_AlexB|
-
-                  You will learn how to process images and video streams with a model of retina filter for details enhancement, spatio-temporal noise removal, luminance correction and spatio-temporal events detection.
-
-  =============== ======================================================
-
-  .. |RetinaDemoImg| image:: images/retina_TreeHdr_small.jpg
-                   :height: 90pt
-                   :width:  90pt
-
- .. raw:: latex
-
-    \pagebreak
-
-.. toctree::
-   :hidden:
-
-   ../retina_model/retina_model
diff --git a/doc/tutorials/imgproc/histograms/template_matching/template_matching.rst b/doc/tutorials/imgproc/histograms/template_matching/template_matching.rst
index f4b30ef89..eb90369ff 100644
--- a/doc/tutorials/imgproc/histograms/template_matching/template_matching.rst
+++ b/doc/tutorials/imgproc/histograms/template_matching/template_matching.rst
@@ -85,7 +85,7 @@ d. **method=CV\_TM\_CCORR\_NORMED**
 
    .. math::
 
-      R(x,y)= \frac{\sum_{x',y'} (T(x',y') \cdot I'(x+x',y+y'))}{\sqrt{\sum_{x',y'}T(x',y')^2 \cdot \sum_{x',y'} I(x+x',y+y')^2}}
+      R(x,y)= \frac{\sum_{x',y'} (T(x',y') \cdot I(x+x',y+y'))}{\sqrt{\sum_{x',y'}T(x',y')^2 \cdot \sum_{x',y'} I(x+x',y+y')^2}}
 
 
 e. **method=CV\_TM\_CCOEFF**
diff --git a/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.rst b/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.rst
index 34ddde30d..6aae4bb9c 100644
--- a/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.rst
+++ b/doc/tutorials/imgproc/imgtrans/hough_circle/hough_circle.rst
@@ -22,13 +22,13 @@ Hough Circle Transform
 
      C : ( x_{center}, y_{center}, r )
 
-  where :math:`(x_{center}, y_{center})` define the center position (gree point) and :math:`r` is the radius, which allows us to completely define a circle, as it can be seen below:
+  where :math:`(x_{center}, y_{center})` define the center position (green point) and :math:`r` is the radius, which allows us to completely define a circle, as it can be seen below:
 
   .. image:: images/Hough_Circle_Tutorial_Theory_0.jpg
           :alt: Result of detecting circles with Hough Transform
           :align: center
 
-* For sake of efficiency, OpenCV implements a detection method slightly trickier than the standard Hough Transform: *The Hough gradient method*. For more details, please check the book *Learning OpenCV* or your favorite Computer Vision bibliography
+* For sake of efficiency, OpenCV implements a detection method slightly trickier than the standard Hough Transform: *The Hough gradient method*, which is made up of two main stages. The first stage involves edge detection and finding the possible circle centers and the second stage finds the best radius for each candidate center. For more details, please check the book *Learning OpenCV* or your favorite Computer Vision bibliography
 
 Code
 ======
@@ -44,7 +44,7 @@ Code
    .. |TutorialHoughCirclesFancyDownload| replace:: here
    .. _TutorialHoughCirclesFancyDownload: https://github.com/Itseez/opencv/tree/master/samples/cpp/tutorial_code/ImgTrans/HoughCircle_Demo.cpp
 
-#. The sample code that we will explain can be downloaded from |TutorialHoughCirclesSimpleDownload|_. A slightly fancier version (which shows both Hough standard and probabilistic with trackbars for changing the threshold values) can be found |TutorialHoughCirclesFancyDownload|_.
+#. The sample code that we will explain can be downloaded from |TutorialHoughCirclesSimpleDownload|_. A slightly fancier version (which shows trackbars for changing the threshold values) can be found |TutorialHoughCirclesFancyDownload|_.
 
 .. code-block:: cpp
 
@@ -132,15 +132,15 @@ Explanation
 
    with the arguments:
 
-   * *src_gray*: Input image (grayscale)
+   * *src_gray*: Input image (grayscale).
    * *circles*: A vector that stores sets of 3 values: :math:`x_{c}, y_{c}, r` for each detected circle.
-   * *CV_HOUGH_GRADIENT*: Define the detection method. Currently this is the only one available in OpenCV
-   * *dp = 1*: The inverse ratio of resolution
-   * *min_dist = src_gray.rows/8*: Minimum distance between detected centers
-   * *param_1 = 200*: Upper threshold for the internal Canny edge detector
+   * *CV_HOUGH_GRADIENT*: Define the detection method. Currently this is the only one available in OpenCV.
+   * *dp = 1*: The inverse ratio of resolution.
+   * *min_dist = src_gray.rows/8*: Minimum distance between detected centers.
+   * *param_1 = 200*: Upper threshold for the internal Canny edge detector.
    * *param_2* = 100*: Threshold for center detection.
    * *min_radius = 0*: Minimum radio to be detected. If unknown, put zero as default.
-   * *max_radius = 0*: Maximum radius to be detected. If unknown, put zero as default
+   * *max_radius = 0*: Maximum radius to be detected. If unknown, put zero as default.
 
 #. Draw the detected circles:
 
diff --git a/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst b/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
index 27dd81581..ef9337aae 100644
--- a/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
+++ b/doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
@@ -48,10 +48,10 @@ The structure of package contents looks as follows:
 
 ::
 
-    OpenCV-2.4.7-android-sdk
+    OpenCV-2.4.8-android-sdk
     |_ apk
-    |   |_ OpenCV_2.4.7_binary_pack_armv7a.apk
-    |   |_ OpenCV_2.4.7_Manager_2.14_XXX.apk
+    |   |_ OpenCV_2.4.8_binary_pack_armv7a.apk
+    |   |_ OpenCV_2.4.8_Manager_2.16_XXX.apk
     |
     |_ doc
     |_ samples
@@ -66,7 +66,7 @@ The structure of package contents looks as follows:
     |               |_ armeabi-v7a
     |               |_ x86
     |
-    |_ license.txt
+    |_ LICENSE
     |_ README.android
 
 * :file:`sdk` folder contains OpenCV API and libraries for Android:
@@ -157,10 +157,10 @@ Get the OpenCV4Android SDK
 
    .. code-block:: bash
 
-      unzip ~/Downloads/OpenCV-2.4.7-android-sdk.zip
+      unzip ~/Downloads/OpenCV-2.4.8-android-sdk.zip
 
-.. |opencv_android_bin_pack| replace:: :file:`OpenCV-2.4.7-android-sdk.zip`
-.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.7/OpenCV-2.4.7-android-sdk.zip/download
+.. |opencv_android_bin_pack| replace:: :file:`OpenCV-2.4.8-android-sdk.zip`
+.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.8/OpenCV-2.4.8-android-sdk.zip/download
 .. |opencv_android_bin_pack_url| replace:: |opencv_android_bin_pack|
 .. |seven_zip| replace:: 7-Zip
 .. _seven_zip: http://www.7-zip.org/
@@ -295,7 +295,7 @@ Well, running samples from Eclipse is very simple:
   .. code-block:: sh
     :linenos:
 
-    <Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.7_Manager_2.14_armv7a-neon.apk
+    <Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.8_Manager_2.16_armv7a-neon.apk
 
   .. note:: ``armeabi``, ``armv7a-neon``, ``arm7a-neon-android8``, ``mips`` and ``x86`` stand for
             platform targets:
diff --git a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
index 12b602ceb..bc9ff7a4a 100644
--- a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
+++ b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
@@ -55,14 +55,14 @@ Manager to access OpenCV libraries externally installed in the target system.
    :guilabel:`File -> Import -> Existing project in your workspace`.
 
    Press :guilabel:`Browse`  button and locate OpenCV4Android SDK
-   (:file:`OpenCV-2.4.7-android-sdk/sdk`).
+   (:file:`OpenCV-2.4.8-android-sdk/sdk`).
 
    .. image:: images/eclipse_opencv_dependency0.png
         :alt: Add dependency from OpenCV library
         :align: center
 
 #. In application project add a reference to the OpenCV Java SDK in
-   :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.7``.
+   :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.8``.
 
    .. image:: images/eclipse_opencv_dependency1.png
         :alt: Add dependency from OpenCV library
@@ -128,27 +128,27 @@ described above.
 #. Add the OpenCV library project to your workspace the same way as for the async initialization
    above. Use menu :guilabel:`File -> Import -> Existing project in your workspace`,
    press :guilabel:`Browse` button and select OpenCV SDK path
-   (:file:`OpenCV-2.4.7-android-sdk/sdk`).
+   (:file:`OpenCV-2.4.8-android-sdk/sdk`).
 
    .. image:: images/eclipse_opencv_dependency0.png
         :alt: Add dependency from OpenCV library
         :align: center
 
 #. In the application project add a reference to the OpenCV4Android SDK in
-   :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.7``;
+   :guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.8``;
 
    .. image:: images/eclipse_opencv_dependency1.png
        :alt: Add dependency from OpenCV library
        :align: center
 
 #. If your application project **doesn't have a JNI part**, just copy the corresponding OpenCV
-   native libs from :file:`<OpenCV-2.4.7-android-sdk>/sdk/native/libs/<target_arch>` to your
+   native libs from :file:`<OpenCV-2.4.8-android-sdk>/sdk/native/libs/<target_arch>` to your
    project directory to folder :file:`libs/<target_arch>`.
 
    In case of the application project **with a JNI part**, instead of manual libraries copying you
    need to modify your ``Android.mk`` file:
    add the following two code lines after the ``"include $(CLEAR_VARS)"`` and before
-   ``"include path_to_OpenCV-2.4.7-android-sdk/sdk/native/jni/OpenCV.mk"``
+   ``"include path_to_OpenCV-2.4.8-android-sdk/sdk/native/jni/OpenCV.mk"``
 
    .. code-block:: make
       :linenos:
@@ -221,7 +221,7 @@ taken:
 
    .. code-block:: make
 
-      include C:\Work\OpenCV4Android\OpenCV-2.4.7-android-sdk\sdk\native\jni\OpenCV.mk
+      include C:\Work\OpenCV4Android\OpenCV-2.4.8-android-sdk\sdk\native\jni\OpenCV.mk
 
    Should be inserted into the :file:`jni/Android.mk` file **after** this line:
 
@@ -382,7 +382,7 @@ result.
            OpenCVLoader.initAsync(OpenCVLoader.OPENCV_VERSION_2_4_6, this, mLoaderCallback);
        }
 
-#. Defines that your activity implements ``CvViewFrameListener2`` interface and fix activity related
+#. Defines that your activity implements ``CvCameraViewListener2`` interface and fix activity related
    errors by defining missed methods. For this activity define ``onCreate``, ``onDestroy`` and
    ``onPause`` and implement them according code snippet bellow. Fix errors by adding requited
    imports.
@@ -432,7 +432,7 @@ result.
 Lets discuss some most important steps. Every Android application with UI must implement Activity
 and View. By the first steps we create blank activity and default view layout. The simplest
 OpenCV-centric application must implement OpenCV initialization, create its own view to show
-preview from camera and implements ``CvViewFrameListener2`` interface to get frames from camera and
+preview from camera and implements ``CvCameraViewListener2`` interface to get frames from camera and
 process it.
 
 First of all we create our application view using xml layout. Our layout consists of the only
diff --git a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst
index 0b2253ace..87f6d9d4d 100644
--- a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst
+++ b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst
@@ -106,8 +106,8 @@ Enable hardware optimizations
 -----------------------------
 
 Depending on target platform architecture different instruction sets can be used. By default
-compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DUSE_VFPV3=ON``
-to cmake command line to enable code generation for VFPv3 and ``-DUSE_NEON=ON`` for using
+compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DENABLE_VFPV3=ON``
+to cmake command line to enable code generation for VFPv3 and ``-DENABLE_NEON=ON`` for using
 NEON SIMD extensions.
 
 TBB is supported on multi core ARM SoCs also.
diff --git a/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.rst b/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.rst
index b1c96c4c8..f1f820557 100644
--- a/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.rst
+++ b/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.rst
@@ -25,29 +25,34 @@ Let's use a simple program such as DisplayImage.cpp shown below.
 
 .. code-block:: cpp
 
-   #include <stdio.h>
-   #include <opencv2/opencv.hpp>
+  #include <stdio.h>
+  #include <opencv2/opencv.hpp>
 
-   using namespace cv;
+  using namespace cv;
 
-   int main( int argc, char** argv )
-   {
-     Mat image;
-     image = imread( argv[1], 1 );
+  int main(int argc, char** argv )
+  {
+      if ( argc != 2 )
+      {
+          printf("usage: DisplayImage.out <Image_Path>\n");
+          return -1;
+      }
 
-     if( argc != 2 || !image.data )
-       {
-         printf( "No image data \n" );
-         return -1;
-       }
+      Mat image;
+      image = imread( argv[1], 1 );
 
-     namedWindow( "Display Image", CV_WINDOW_AUTOSIZE );
-     imshow( "Display Image", image );
+      if ( !image.data )
+      {
+          printf("No image data \n");
+          return -1;
+      }
+      namedWindow("Display Image", WINDOW_AUTOSIZE );
+      imshow("Display Image", image);
 
-     waitKey(0);
+      waitKey(0);
 
-     return 0;
-   }
+      return 0;
+  }
 
 Create a CMake file
 ---------------------
diff --git a/doc/tutorials/introduction/linux_install/linux_install.rst b/doc/tutorials/introduction/linux_install/linux_install.rst
index 1e02b64c9..d31c68a88 100644
--- a/doc/tutorials/introduction/linux_install/linux_install.rst
+++ b/doc/tutorials/introduction/linux_install/linux_install.rst
@@ -13,10 +13,10 @@ Required Packages
 
        sudo apt-get install build-essential
 
-  * CMake 2.6 or higher;
+  * CMake 2.8.7 or higher;
   * Git;
   * GTK+2.x or higher, including headers (libgtk2.0-dev);
-  * pkgconfig;
+  * pkg-config;
   * Python 2.6 or later and Numpy 1.5 or later with developer packages (python-dev, python-numpy);
   * ffmpeg or libav development packages: libavcodec-dev, libavformat-dev, libswscale-dev;
   * [optional] libdc1394 2.x;
@@ -74,7 +74,8 @@ Building OpenCV from Source Using CMake, Using the Command Line
 
    .. code-block:: bash
 
-      make
+      make -j8 # -j8 runs 8 jobs in parallel.
+               # Change 8 to number of hardware threads available.
       sudo make install
 
 .. note::
diff --git a/doc/tutorials/introduction/load_save_image/load_save_image.rst b/doc/tutorials/introduction/load_save_image/load_save_image.rst
index ac0ee02e5..57d55d3a9 100644
--- a/doc/tutorials/introduction/load_save_image/load_save_image.rst
+++ b/doc/tutorials/introduction/load_save_image/load_save_image.rst
@@ -99,7 +99,7 @@ Explanation
       imshow( imageName, image );
       imshow( "Gray image", gray_image );
 
-#. Add add the *waitKey(0)* function call for the program to wait forever for an user key press.
+#. Add the *waitKey(0)* function call for the program to wait forever for an user key press.
 
 
 Result
diff --git a/doc/tutorials/introduction/windows_install/windows_install.rst b/doc/tutorials/introduction/windows_install/windows_install.rst
index c29c13aed..dd91027ba 100644
--- a/doc/tutorials/introduction/windows_install/windows_install.rst
+++ b/doc/tutorials/introduction/windows_install/windows_install.rst
@@ -81,7 +81,7 @@ Building the OpenCV library from scratch requires a couple of tools installed be
 
    + An IDE of choice (preferably), or just a C\C++ compiler that will actually make the binary files. Here we will use the `Microsoft Visual Studio <https://www.microsoft.com/visualstudio/en-us>`_. However, you can use any other IDE that has a valid C\C++ compiler.
 
-   + |CMake|_, which is a neat tool to make the project files (for your choosen IDE) from the OpenCV source files. It will also allow an easy configuration of the OpenCV build files, in order to make binary files that fits exactly to your needs.
+   + |CMake|_, which is a neat tool to make the project files (for your chosen IDE) from the OpenCV source files. It will also allow an easy configuration of the OpenCV build files, in order to make binary files that fits exactly to your needs.
 
    + Git to acquire the OpenCV source files. A good tool for this is |TortoiseGit|_. Alternatively, you can just download an archived version of the source files from our `page on Sourceforge <http://sourceforge.net/projects/opencvlibrary/files/opencv-win/>`_
 
@@ -320,7 +320,7 @@ First we set an enviroment variable to make easier our work. This will hold the
 
 Here the directory is where you have your OpenCV binaries (*extracted* or *built*). You can have different platform (e.g. x64 instead of x86) or compiler type, so substitute appropriate value. Inside this you should have two folders called *lib* and *bin*. The -m should be added if you wish to make the settings computer wise, instead of user wise.
 
-If you built static libraries then you are done. Otherwise, you need to add the *bin* folders path to the systems path. This is cause you will use the OpenCV library in form of *\"Dynamic-link libraries\"* (also known as **DLL**). Inside these are stored all the algorithms and information the OpenCV library contains. The operating system will load them only on demand, during runtime. However, to do this he needs to know where they are. The systems **PATH** contains a list of folders where DLLs can be found. Add the OpenCV library path to this and the OS will know where to look if he ever needs the OpenCV binaries. Otherwise, you will need to copy the used DLLs right beside the applications executable file (*exe*) for the OS to find it, which is highly unpleasent if you work on many projects. To do this start up again the |PathEditor|_ and add the following new entry (right click in the application to bring up the menu):
+If you built static libraries then you are done. Otherwise, you need to add the *bin* folders path to the systems path. This is because you will use the OpenCV library in form of *\"Dynamic-link libraries\"* (also known as **DLL**). Inside these are stored all the algorithms and information the OpenCV library contains. The operating system will load them only on demand, during runtime. However, to do this the operating system needs to know where they are. The systems **PATH** contains a list of folders where DLLs can be found. Add the OpenCV library path to this and the OS will know where to look if he ever needs the OpenCV binaries. Otherwise, you will need to copy the used DLLs right beside the applications executable file (*exe*) for the OS to find it, which is highly unpleasent if you work on many projects. To do this start up again the |PathEditor|_ and add the following new entry (right click in the application to bring up the menu):
 
 ::
 
diff --git a/doc/tutorials/tutorials.rst b/doc/tutorials/tutorials.rst
index 4ce7491ec..cd58c44e9 100644
--- a/doc/tutorials/tutorials.rst
+++ b/doc/tutorials/tutorials.rst
@@ -171,21 +171,6 @@ As always, we would be happy to hear your comments and receive your contribution
                  :width:  80pt
                  :alt: gpu icon
 
-*  :ref:`Table-Of-Content-Bioinspired`
-
-   .. tabularcolumns:: m{100pt} m{300pt}
-   .. cssclass:: toctableopencv
-
-   ============= =======================================================
-   |Bioinspired|       Algorithms inspired from biological models.
-
-   ============= =======================================================
-
-   .. |Bioinspired| image:: images/retina.jpg
-                 :height: 80pt
-                 :width:  80pt
-                 :alt: gpu icon
-
 *  :ref:`Table-Of-Content-iOS`
 
    .. tabularcolumns:: m{100pt} m{300pt}
@@ -250,7 +235,6 @@ As always, we would be happy to hear your comments and receive your contribution
    ml/table_of_content_ml/table_of_content_ml
    photo/table_of_content_photo/table_of_content_photo
    gpu/table_of_content_gpu/table_of_content_gpu
-   bioinspired/table_of_content_bioinspired/table_of_content_bioinspired
    ios/table_of_content_ios/table_of_content_ios
    viz/table_of_content_viz/table_of_content_viz
    general/table_of_content_general/table_of_content_general
diff --git a/doc/tutorials/viz/launching_viz/launching_viz.rst b/doc/tutorials/viz/launching_viz/launching_viz.rst
index 0bf31977d..a507a7f27 100644
--- a/doc/tutorials/viz/launching_viz/launching_viz.rst
+++ b/doc/tutorials/viz/launching_viz/launching_viz.rst
@@ -43,7 +43,7 @@ You can download the code from :download:`here <../../../../samples/cpp/tutorial
         cout << "First event loop is over" << endl;
 
         /// Access window via its name
-        viz::Viz3d sameWindow = viz::get("Viz Demo");
+        viz::Viz3d sameWindow = viz::getWindowByName("Viz Demo");
 
         /// Start event loop
         sameWindow.spin();
diff --git a/doc/user_guide/ug_intelperc.rst b/doc/user_guide/ug_intelperc.rst
new file mode 100644
index 000000000..bae5f7014
--- /dev/null
+++ b/doc/user_guide/ug_intelperc.rst
@@ -0,0 +1,79 @@
+*******
+HighGUI
+*******
+
+.. highlight:: cpp
+
+Using Creative Senz3D and other Intel Perceptual Computing SDK compatible depth sensors
+=======================================================================================
+
+Depth sensors compatible with Intel Perceptual Computing SDK are supported through ``VideoCapture`` class. Depth map, RGB image and some other formats of output can be retrieved by using familiar interface of ``VideoCapture``.
+
+In order to use depth sensor with OpenCV you should do the following preliminary steps:
+
+#.
+    Install Intel Perceptual Computing SDK (from here http://www.intel.com/software/perceptual).
+
+#.
+    Configure OpenCV with Intel Perceptual Computing SDK support by setting ``WITH_INTELPERC`` flag in CMake. If Intel Perceptual Computing SDK is found in install folders OpenCV will be built with Intel Perceptual Computing SDK library (see a status ``INTELPERC`` in CMake log). If CMake process doesn't find Intel Perceptual Computing SDK installation folder automatically, the user should change corresponding CMake variables ``INTELPERC_LIB_DIR`` and ``INTELPERC_INCLUDE_DIR`` to the proper value.
+
+#.
+    Build OpenCV.
+
+VideoCapture can retrieve the following data:
+
+#.
+    data given from depth generator:
+      * ``CV_CAP_INTELPERC_DEPTH_MAP``       - each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth. (CV_16UC1)
+      * ``CV_CAP_INTELPERC_UVDEPTH_MAP``     - each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates. (CV_32FC2)
+      * ``CV_CAP_INTELPERC_IR_MAP``          - each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam. (CV_16UC1)
+#.
+    data given from RGB image generator:
+      * ``CV_CAP_INTELPERC_IMAGE``           - color image. (CV_8UC3)
+
+In order to get depth map from depth sensor use ``VideoCapture::operator >>``, e. g. ::
+
+    VideoCapture capture( CV_CAP_INTELPERC );
+    for(;;)
+    {
+        Mat depthMap;
+        capture >> depthMap;
+
+        if( waitKey( 30 ) >= 0 )
+            break;
+    }
+
+For getting several data maps use ``VideoCapture::grab`` and ``VideoCapture::retrieve``, e.g. ::
+
+    VideoCapture capture(CV_CAP_INTELPERC);
+    for(;;)
+    {
+        Mat depthMap;
+        Mat image;
+        Mat irImage;
+
+        capture.grab();
+
+        capture.retrieve( depthMap, CV_CAP_INTELPERC_DEPTH_MAP );
+        capture.retrieve(    image, CV_CAP_INTELPERC_IMAGE );
+        capture.retrieve(  irImage, CV_CAP_INTELPERC_IR_MAP);
+
+        if( waitKey( 30 ) >= 0 )
+            break;
+    }
+
+For setting and getting some property of sensor` data generators use ``VideoCapture::set`` and ``VideoCapture::get`` methods respectively, e.g. ::
+
+    VideoCapture capture( CV_CAP_INTELPERC );
+    capture.set( CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0 );
+    cout << "FPS    " << capture.get( CV_CAP_INTELPERC_DEPTH_GENERATOR+CV_CAP_PROP_FPS ) << endl;
+
+Since two types of sensor's data generators are supported (image generator and depth generator), there are two flags that should be used to set/get property of the needed generator:
+
+* CV_CAP_INTELPERC_IMAGE_GENERATOR -- a flag for access to the image generator properties.
+
+* CV_CAP_INTELPERC_DEPTH_GENERATOR -- a flag for access to the depth generator properties. This flag value is assumed by default if neither of the two possible values of the property is set.
+
+For more information please refer to the example of usage intelperc_capture.cpp_ in ``opencv/samples/cpp`` folder.
+
+.. _intelperc_capture.cpp: https://github.com/Itseez/opencv/tree/master/samples/cpp/intelperc_capture.cpp
\ No newline at end of file
diff --git a/doc/user_guide/user_guide.rst b/doc/user_guide/user_guide.rst
index de9edcb68..76cf756f8 100644
--- a/doc/user_guide/user_guide.rst
+++ b/doc/user_guide/user_guide.rst
@@ -9,3 +9,4 @@ OpenCV User Guide
    ug_features2d.rst
    ug_highgui.rst
    ug_traincascade.rst
+   ug_intelperc.rst
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index ed3b85a8f..b4e48e6fa 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -1,7 +1,7 @@
 file(GLOB old_hdrs "opencv/*.h*")
 install(FILES ${old_hdrs}
     DESTINATION ${OPENCV_INCLUDE_INSTALL_PATH}/opencv
-    COMPONENT main)
+    COMPONENT dev)
 install(FILES "opencv2/opencv.hpp"
     DESTINATION ${OPENCV_INCLUDE_INSTALL_PATH}/opencv2
-    COMPONENT main)
+    COMPONENT dev)
diff --git a/include/opencv2/opencv.hpp b/include/opencv2/opencv.hpp
index 3b96bdd36..020a45373 100644
--- a/include/opencv2/opencv.hpp
+++ b/include/opencv2/opencv.hpp
@@ -52,7 +52,6 @@
 #include "opencv2/calib3d.hpp"
 #include "opencv2/highgui.hpp"
 #include "opencv2/contrib.hpp"
-#include "opencv2/bioinspired.hpp"
 #include "opencv2/ml.hpp"
 
 #endif
diff --git a/modules/androidcamera/CMakeLists.txt b/modules/androidcamera/CMakeLists.txt
index 8ac8ced88..3858ba9f6 100644
--- a/modules/androidcamera/CMakeLists.txt
+++ b/modules/androidcamera/CMakeLists.txt
@@ -40,6 +40,6 @@ else()
     get_filename_component(wrapper_name "${wrapper}" NAME)
     install(FILES "${LIBRARY_OUTPUT_PATH}/${wrapper_name}"
             DESTINATION ${OPENCV_LIB_INSTALL_PATH}
-            COMPONENT main)
+            COMPONENT libs)
   endforeach()
 endif()
diff --git a/modules/androidcamera/camera_wrapper/CMakeLists.txt b/modules/androidcamera/camera_wrapper/CMakeLists.txt
index 21b9ee1ad..bc5585a7a 100644
--- a/modules/androidcamera/camera_wrapper/CMakeLists.txt
+++ b/modules/androidcamera/camera_wrapper/CMakeLists.txt
@@ -63,4 +63,4 @@ if (NOT (CMAKE_BUILD_TYPE MATCHES "debug"))
 endif()
 
 
-install(TARGETS ${the_target} LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main)
+install(TARGETS ${the_target} LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT libs)
diff --git a/modules/bioinspired/CMakeLists.txt b/modules/bioinspired/CMakeLists.txt
deleted file mode 100644
index c800d33ff..000000000
--- a/modules/bioinspired/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-set(the_description "Biologically inspired algorithms")
-ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
-ocv_define_module(bioinspired opencv_core OPTIONAL opencv_highgui opencv_ocl)
diff --git a/modules/bioinspired/doc/bioinspired.rst b/modules/bioinspired/doc/bioinspired.rst
deleted file mode 100644
index 6bffcdcf2..000000000
--- a/modules/bioinspired/doc/bioinspired.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-********************************************************************
-bioinspired. Biologically inspired vision models and derivated tools
-********************************************************************
-
-The module provides biological visual systems models (human visual system and others). It also provides derivated objects that take advantage of those bio-inspired models.
-
-.. toctree::
-    :maxdepth: 2
-
-    Human retina documentation <retina/index>
diff --git a/modules/bioinspired/doc/retina/images/retinaInput.jpg b/modules/bioinspired/doc/retina/images/retinaInput.jpg
deleted file mode 100644
index d3cdeeecb..000000000
Binary files a/modules/bioinspired/doc/retina/images/retinaInput.jpg and /dev/null differ
diff --git a/modules/bioinspired/doc/retina/images/retinaOutput_default.jpg b/modules/bioinspired/doc/retina/images/retinaOutput_default.jpg
deleted file mode 100644
index 0b14a5308..000000000
Binary files a/modules/bioinspired/doc/retina/images/retinaOutput_default.jpg and /dev/null differ
diff --git a/modules/bioinspired/doc/retina/images/retinaOutput_realistic.jpg b/modules/bioinspired/doc/retina/images/retinaOutput_realistic.jpg
deleted file mode 100644
index 1bd60f80c..000000000
Binary files a/modules/bioinspired/doc/retina/images/retinaOutput_realistic.jpg and /dev/null differ
diff --git a/modules/bioinspired/doc/retina/index.rst b/modules/bioinspired/doc/retina/index.rst
deleted file mode 100644
index fd487b7f9..000000000
--- a/modules/bioinspired/doc/retina/index.rst
+++ /dev/null
@@ -1,493 +0,0 @@
-Retina : a Bio mimetic human retina model
-*****************************************
-
-.. highlight:: cpp
-
-Retina
-======
-.. ocv:class:: Retina : public Algorithm
-
-**Note** : do not forget that the retina model is included in the following namespace : *cv::bioinspired*.
-
-Introduction
-++++++++++++
-
-Class which provides the main controls to the Gipsa/Listic labs human  retina model. This is a non separable spatio-temporal filter modelling the two main retina information channels :
-
-* foveal vision for detailled color vision : the parvocellular pathway.
-
-* peripheral vision for sensitive transient signals detection (motion and events) : the magnocellular pathway.
-
-From a general point of view, this filter whitens the image spectrum and corrects luminance thanks to local adaptation. An other important property is its hability to filter out spatio-temporal noise while enhancing details.
-This model originates from Jeanny Herault work [Herault2010]_. It has been involved in Alexandre Benoit phd and his current research [Benoit2010]_, [Strat2013]_ (he currently maintains this module within OpenCV). It includes the work of other Jeanny's phd student such as [Chaix2007]_ and the log polar transformations of Barthelemy Durette described in Jeanny's book.
-
-**NOTES :**
-
-* For ease of use in computer vision applications, the two retina channels are applied homogeneously on all the input images. This does not follow the real retina topology but this can still be done using the log sampling capabilities proposed within the class.
-
-* Extend the retina description and code use in the tutorial/contrib section for complementary explanations.
-
-Preliminary illustration
-++++++++++++++++++++++++
-
-As a preliminary presentation, let's start with a visual example. We propose to apply the filter on a low quality color jpeg image with backlight problems. Here is the considered input... *"Well, my eyes were able to see more that this strange black shadow..."*
-
-.. image:: images/retinaInput.jpg
-   :alt: a low quality color jpeg image with backlight problems.
-   :align: center
-
-Below, the retina foveal model applied on the entire image with default parameters. Here contours are enforced, halo effects are voluntary visible with this configuration. See parameters discussion below and increase horizontalCellsGain near 1 to remove them.
-
-.. image:: images/retinaOutput_default.jpg
-   :alt: the retina foveal model applied on the entire image with default parameters. Here contours are enforced, luminance is corrected and halo effects are voluntary visible with this configuration, increase horizontalCellsGain near 1 to remove them.
-   :align: center
-
-Below, a second retina foveal model output applied on the entire image with a parameters setup focused on naturalness perception. *"Hey, i now recognize my cat, looking at the mountains at the end of the day !"*. Here contours are enforced, luminance is corrected but halos are avoided with this configuration. The backlight effect is corrected and highlight details are still preserved. Then, even on a low quality jpeg image, if some luminance information remains, the retina is able to reconstruct a proper visual signal. Such configuration is also usefull for High Dynamic Range (*HDR*) images compression to 8bit images as discussed in [benoit2010]_ and in the demonstration codes discussed below.
-As shown at the end of the page, parameters change from defaults are :
-
-* horizontalCellsGain=0.3
-
-* photoreceptorsLocalAdaptationSensitivity=ganglioncellsSensitivity=0.89.
-
-.. image:: images/retinaOutput_realistic.jpg
-   :alt: the retina foveal model applied on the entire image with 'naturalness' parameters. Here contours are enforced but are avoided with this configuration, horizontalCellsGain is 0.3 and photoreceptorsLocalAdaptationSensitivity=ganglioncellsSensitivity=0.89.
-   :align: center
-
-As observed in this preliminary demo, the retina can be settled up with various parameters, by default, as shown on the figure above, the retina strongly reduces mean luminance energy and enforces all details of the visual scene. Luminance energy and halo effects can be modulated (exagerated to cancelled as shown on the two examples). In order to use your own parameters, you can use at least one time the *write(String fs)* method which will write a proper XML file with all default parameters. Then, tweak it on your own and reload them at any time using method *setup(String fs)*. These methods update a *Retina::RetinaParameters* member structure that is described hereafter. XML parameters file samples are shown at the end of the page.
-
-Here is an overview of the abstract Retina interface, allocate one instance with the *createRetina* functions.::
-
-  namespace cv{namespace bioinspired{
-
-  class Retina : public Algorithm
-  {
-  public:
-    // parameters setup instance
-    struct RetinaParameters; // this class is detailled later
-
-    // main method for input frame processing (all use method, can also perform High Dynamic Range tone mapping)
-    void run (InputArray inputImage);
-
-    // specific method aiming at correcting luminance only (faster High Dynamic Range tone mapping)
-    void applyFastToneMapping(InputArray inputImage, OutputArray outputToneMappedImage)
-
-    // output buffers retreival methods
-    // -> foveal color vision details channel with luminance and noise correction
-    void getParvo (OutputArray retinaOutput_parvo);
-    void getParvoRAW (OutputArray retinaOutput_parvo);// retreive original output buffers without any normalisation
-    const Mat getParvoRAW () const;// retreive original output buffers without any normalisation
-    // -> peripheral monochrome motion and events (transient information) channel
-    void getMagno (OutputArray retinaOutput_magno);
-    void getMagnoRAW (OutputArray retinaOutput_magno); // retreive original output buffers without any normalisation
-    const Mat getMagnoRAW () const;// retreive original output buffers without any normalisation
-
-    // reset retina buffers... equivalent to closing your eyes for some seconds
-    void clearBuffers ();
-
-    // retreive input and output buffers sizes
-    Size getInputSize ();
-    Size getOutputSize ();
-
-    // setup methods with specific parameters specification of global xml config file loading/write
-    void setup (String retinaParameterFile="", const bool applyDefaultSetupOnFailure=true);
-    void setup (FileStorage &fs, const bool applyDefaultSetupOnFailure=true);
-    void setup (RetinaParameters newParameters);
-    struct Retina::RetinaParameters getParameters ();
-    const String printSetup ();
-    virtual void write (String fs) const;
-    virtual void write (FileStorage &fs) const;
-    void setupOPLandIPLParvoChannel (const bool colorMode=true, const bool normaliseOutput=true, const float photoreceptorsLocalAdaptationSensitivity=0.7, const float photoreceptorsTemporalConstant=0.5, const float photoreceptorsSpatialConstant=0.53, const float horizontalCellsGain=0, const float HcellsTemporalConstant=1, const float HcellsSpatialConstant=7, const float ganglionCellsSensitivity=0.7);
-    void setupIPLMagnoChannel (const bool normaliseOutput=true, const float parasolCells_beta=0, const float parasolCells_tau=0, const float parasolCells_k=7, const float amacrinCellsTemporalCutFrequency=1.2, const float V0CompressionParameter=0.95, const float localAdaptintegration_tau=0, const float localAdaptintegration_k=7);
-    void setColorSaturation (const bool saturateColors=true, const float colorSaturationValue=4.0);
-    void activateMovingContoursProcessing (const bool activate);
-    void activateContoursProcessing (const bool activate);
-  };
-
-    // Allocators
-    cv::Ptr<Retina> createRetina (Size inputSize);
-    cv::Ptr<Retina> createRetina (Size inputSize, const bool colorMode, RETINA_COLORSAMPLINGMETHOD colorSamplingMethod=RETINA_COLOR_BAYER, const bool useRetinaLogSampling=false, const double reductionFactor=1.0, const double samplingStrenght=10.0);
-    }} // cv and bioinspired namespaces end
-
-.. Sample code::
-
-   * An example on retina tone mapping can be found at opencv_source_code/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp
-   * An example on retina tone mapping on video input can be found at opencv_source_code/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp
-   * A complete example illustrating the retina interface can be found at opencv_source_code/samples/cpp/retinaDemo.cpp
-
-Description
-+++++++++++
-
-Class which allows the `Gipsa <http://www.gipsa-lab.inpg.fr>`_ (preliminary work) / `Listic <http://www.listic.univ-savoie.fr>`_ (code maintainer and user) labs retina model to be used. This class allows human retina spatio-temporal image processing to be applied on still images, images sequences and video sequences. Briefly, here are the main human retina model properties:
-
-* spectral whithening (mid-frequency details enhancement)
-
-* high frequency spatio-temporal noise reduction (temporal noise and high frequency spatial noise are minimized)
-
-* low frequency luminance reduction (luminance range compression) : high luminance regions do not hide details in darker regions anymore
-
-* local logarithmic luminance compression allows details to be enhanced even in low light conditions
-
-Use : this model can be used basically for spatio-temporal video effects but also in the aim of :
-
-* performing texture analysis with enhanced signal to noise ratio and enhanced details robust against input images luminance ranges (check out the parvocellular retina channel output, by using the provided **getParvo** methods)
-
-* performing motion analysis also taking benefit of the previously cited properties  (check out the magnocellular retina channel output, by using the provided **getMagno** methods)
-
-* general image/video sequence description using either one or both channels. An example of the use of Retina in a Bag of Words approach is given in [Strat2013]_.
-
-Literature
-==========
-For more information, refer to the following papers :
-
-* Model description :
-
-.. [Benoit2010] Benoit A., Caplier A., Durette B., Herault, J., "Using Human Visual System Modeling For Bio-Inspired Low Level Image Processing", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773. DOI <http://dx.doi.org/10.1016/j.cviu.2010.01.011>
-
-* Model use in a Bag of Words approach :
-
-.. [Strat2013] Strat S., Benoit A., Lambert P., "Retina enhanced SIFT descriptors for video indexing", CBMI2013, Veszprém, Hungary, 2013.
-
-* Please have a look at the reference work of Jeanny Herault that you can read in his book :
-
-.. [Herault2010] Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-
-This retina filter code includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-
-* take a look at the *retinacolor.hpp* module to discover Brice Chaix de Lavarene phD color mosaicing/demosaicing and his reference paper:
-
-.. [Chaix2007] B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-
-* take a look at *imagelogpolprojection.hpp* to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions. More informations in the above cited Jeanny Heraults's book.
-
-* Meylan&al work on HDR tone mapping that is implemented as a specific method within the model :
-
-.. [Meylan2007] L. Meylan , D. Alleysson, S. Susstrunk, "A Model of Retinal Local Adaptation for the Tone Mapping of Color Filter Array Images", Journal of Optical Society of America, A, Vol. 24, N 9, September, 1st, 2007, pp. 2807-2816
-
-Demos and experiments !
-=======================
-
-**NOTE : Complementary to the following examples, have a look at the Retina tutorial in the tutorial/contrib section for complementary explanations.**
-
-Take a look at the provided C++ examples provided with OpenCV :
-
-* **samples/cpp/retinademo.cpp** shows how to use the retina module for details enhancement (Parvo channel output) and transient maps observation (Magno channel output). You can play with images, video sequences and webcam video.
-    Typical uses are (provided your OpenCV installation is situated in folder *OpenCVReleaseFolder*)
-
-    * image processing : **OpenCVReleaseFolder/bin/retinademo -image myPicture.jpg**
-
-    * video processing : **OpenCVReleaseFolder/bin/retinademo -video myMovie.avi**
-
-    * webcam processing: **OpenCVReleaseFolder/bin/retinademo -video**
-
-   **Note :** This demo generates the file *RetinaDefaultParameters.xml* which contains the default parameters of the retina. Then, rename this as *RetinaSpecificParameters.xml*, adjust the parameters the way you want and reload the program to check the effect.
-
-
-* **samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp** shows how to use the retina to perform High Dynamic Range (HDR) luminance compression
-
-   Then, take a HDR image using bracketing with your camera and generate an OpenEXR image and then process it using the demo.
-
-   Typical use, supposing that you have the OpenEXR image such as *memorial.exr* (present in the samples/cpp/ folder)
-
-   **OpenCVReleaseFolder/bin/OpenEXRimages_HDR_Retina_toneMapping memorial.exr [optional: 'fast']**
-
-      Note that some sliders are made available to allow you to play with luminance compression.
-
-      If not using the 'fast' option, then, tone mapping is performed using the full retina model [Benoit2010]_. It includes spectral whitening that allows luminance energy to be reduced. When using the 'fast' option, then, a simpler method is used, it is an adaptation of the algorithm presented in [Meylan2007]_. This method gives also good results and is faster to process but it sometimes requires some more parameters adjustement.
-
-
-Methods description
-===================
-
-Here are detailled the main methods to control the retina model
-
-Ptr<Retina>::createRetina
-+++++++++++++++++++++++++
-
-.. ocv:function:: Ptr<cv::bioinspired::Retina> createRetina(Size inputSize)
-.. ocv:function:: Ptr<cv::bioinspired::Retina> createRetina(Size inputSize, const bool colorMode, cv::bioinspired::RETINA_COLORSAMPLINGMETHOD colorSamplingMethod = cv::bioinspired::RETINA_COLOR_BAYER, const bool useRetinaLogSampling = false, const double reductionFactor = 1.0, const double samplingStrenght = 10.0 )
-
-    Constructors from standardized interfaces : retreive a smart pointer to a Retina instance
-
-    :param inputSize: the input frame size
-    :param colorMode: the chosen processing mode : with or without color processing
-    :param colorSamplingMethod: specifies which kind of color sampling will be used :
-
-        * cv::bioinspired::RETINA_COLOR_RANDOM: each pixel position is either R, G or B in a random choice
-
-        * cv::bioinspired::RETINA_COLOR_DIAGONAL: color sampling is RGBRGBRGB..., line 2 BRGBRGBRG..., line 3, GBRGBRGBR...
-
-        * cv::bioinspired::RETINA_COLOR_BAYER: standard bayer sampling
-
-    :param useRetinaLogSampling: activate retina log sampling, if true, the 2 following parameters can be used
-    :param reductionFactor: only usefull if param useRetinaLogSampling=true, specifies the reduction factor of the output frame (as the center (fovea) is high resolution and corners can be underscaled, then a reduction of the output is allowed without precision leak
-    :param samplingStrenght: only usefull if param useRetinaLogSampling=true, specifies the strenght of the log scale that is applied
-
-Retina::activateContoursProcessing
-++++++++++++++++++++++++++++++++++
-
-.. ocv:function:: void Retina::activateContoursProcessing(const bool activate)
-
-    Activate/desactivate the Parvocellular pathway processing (contours information extraction), by default, it is activated
-
-    :param activate: true if Parvocellular (contours information extraction) output should be activated, false if not... if activated, the Parvocellular output can be retrieved using the **getParvo** methods
-
-Retina::activateMovingContoursProcessing
-++++++++++++++++++++++++++++++++++++++++
-
-.. ocv:function:: void Retina::activateMovingContoursProcessing(const bool activate)
-
-    Activate/desactivate the Magnocellular pathway processing (motion information extraction), by default, it is activated
-
-    :param activate: true if Magnocellular output should be activated, false if not... if activated, the Magnocellular output can be retrieved using the **getMagno** methods
-
-Retina::clearBuffers
-++++++++++++++++++++
-
-.. ocv:function:: void Retina::clearBuffers()
-
-    Clears all retina buffers (equivalent to opening the eyes after a long period of eye close ;o) whatchout the temporal transition occuring just after this method call.
-
-Retina::getParvo
-++++++++++++++++
-
-.. ocv:function:: void Retina::getParvo( OutputArray retinaOutput_parvo )
-.. ocv:function:: void Retina::getParvoRAW( OutputArray retinaOutput_parvo )
-.. ocv:function:: const Mat Retina::getParvoRAW() const
-
-    Accessor of the details channel of the retina (models foveal vision). Warning, getParvoRAW methods return buffers that are not rescaled within range [0;255] while the non RAW method allows a normalized matrix to be retrieved.
-
-    :param retinaOutput_parvo: the output buffer (reallocated if necessary), format can be :
-
-        * a Mat, this output is rescaled for standard 8bits image processing use in OpenCV
-
-        * RAW methods actually return a 1D matrix (encoding is R1, R2, ... Rn, G1, G2, ..., Gn, B1, B2, ...Bn), this output is the original retina filter model output, without any quantification or rescaling.
-
-Retina::getMagno
-++++++++++++++++
-
-.. ocv:function:: void Retina::getMagno( OutputArray retinaOutput_magno )
-.. ocv:function:: void Retina::getMagnoRAW( OutputArray retinaOutput_magno )
-.. ocv:function:: const Mat Retina::getMagnoRAW() const
-
-    Accessor of the motion channel of the retina (models peripheral vision). Warning, getMagnoRAW methods return buffers that are not rescaled within range [0;255] while the non RAW method allows a normalized matrix to be retrieved.
-
-    :param retinaOutput_magno: the output buffer (reallocated if necessary), format can be :
-
-        * a Mat, this output is rescaled for standard 8bits image processing use in OpenCV
-
-        * RAW methods actually return a 1D matrix (encoding is M1, M2,... Mn), this output is the original retina filter model output, without any quantification or rescaling.
-
-Retina::getInputSize
-++++++++++++++++++++
-
-.. ocv:function:: Size Retina::getInputSize()
-
-    Retreive retina input buffer size
-
-    :return: the retina input buffer size
-
-Retina::getOutputSize
-+++++++++++++++++++++
-
-.. ocv:function:: Size Retina::getOutputSize()
-
-    Retreive retina output buffer size that can be different from the input if a spatial log transformation is applied
-
-    :return: the retina output buffer size
-
-Retina::printSetup
-++++++++++++++++++
-
-.. ocv:function:: const String Retina::printSetup()
-
-    Outputs a string showing the used parameters setup
-
-    :return: a string which contains formated parameters information
-
-Retina::run
-+++++++++++
-
-.. ocv:function:: void Retina::run(InputArray inputImage)
-
-    Method which allows retina to be applied on an input image, after run, encapsulated retina module is ready to deliver its outputs using dedicated acccessors, see getParvo and getMagno methods
-
-    :param inputImage: the input Mat image to be processed, can be gray level or BGR coded in any format (from 8bit to 16bits)
-
-Retina::applyFastToneMapping
-++++++++++++++++++++++++++++
-
-.. ocv:function:: void Retina::applyFastToneMapping(InputArray inputImage, OutputArray outputToneMappedImage)
-
-    Method which processes an image in the aim to correct its luminance : correct backlight problems, enhance details in shadows. This method is designed to perform High Dynamic Range image tone mapping (compress >8bit/pixel images to 8bit/pixel). This is a simplified version of the Retina Parvocellular model (simplified version of the run/getParvo methods call) since it does not include the spatio-temporal filter modelling the Outer Plexiform Layer of the retina that performs spectral whitening and many other stuff. However, it works great for tone mapping and in a faster way.
-
-    Check the demos and experiments section to see examples and the way to perform tone mapping using the original retina model and the method.
-
-    :param inputImage: the input image to process (should be coded in float format : CV_32F, CV_32FC1, CV_32F_C3, CV_32F_C4, the 4th channel won't be considered).
-    :param outputToneMappedImage: the output 8bit/channel tone mapped image (CV_8U or CV_8UC3 format).
-
-Retina::setColorSaturation
-++++++++++++++++++++++++++
-
-.. ocv:function:: void Retina::setColorSaturation(const bool saturateColors = true, const float colorSaturationValue = 4.0 )
-
-    Activate color saturation as the final step of the color demultiplexing process -> this saturation is a sigmoide function applied to each channel of the demultiplexed image.
-
-    :param saturateColors: boolean that activates color saturation (if true) or desactivate (if false)
-    :param colorSaturationValue: the saturation factor : a simple factor applied on the chrominance buffers
-
-
-Retina::setup
-+++++++++++++
-
-.. ocv:function:: void Retina::setup(String retinaParameterFile = "", const bool applyDefaultSetupOnFailure = true )
-.. ocv:function:: void Retina::setup(FileStorage & fs, const bool applyDefaultSetupOnFailure = true )
-.. ocv:function:: void Retina::setup(RetinaParameters newParameters)
-
-    Try to open an XML retina parameters file to adjust current retina instance setup => if the xml file does not exist, then default setup is applied => warning, Exceptions are thrown if read XML file is not valid
-
-    :param retinaParameterFile: the parameters filename
-    :param applyDefaultSetupOnFailure: set to true if an error must be thrown on error
-    :param fs: the open Filestorage which contains retina parameters
-    :param newParameters: a parameters structures updated with the new target configuration. You can retreive the current parameers structure using method *Retina::RetinaParameters Retina::getParameters()* and update it before running method *setup*.
-
-Retina::write
-+++++++++++++
-
-.. ocv:function:: void Retina::write( String fs ) const
-.. ocv:function:: void Retina::write( FileStorage& fs ) const
-
-    Write xml/yml formated parameters information
-
-    :param fs: the filename of the xml file that will be open and writen with formatted parameters information
-
-Retina::setupIPLMagnoChannel
-++++++++++++++++++++++++++++
-
-.. ocv:function:: void Retina::setupIPLMagnoChannel(const bool normaliseOutput = true, const float parasolCells_beta = 0, const float parasolCells_tau = 0, const float parasolCells_k = 7, const float amacrinCellsTemporalCutFrequency = 1.2, const float V0CompressionParameter = 0.95, const float localAdaptintegration_tau = 0, const float localAdaptintegration_k = 7 )
-
-    Set parameters values for the Inner Plexiform Layer (IPL) magnocellular channel this channel processes signals output from OPL processing stage in peripheral vision, it allows motion information enhancement. It is decorrelated from the details channel. See reference papers for more details.
-
-    :param normaliseOutput: specifies if (true) output is rescaled between 0 and 255 of not (false)
-    :param parasolCells_beta: the low pass filter gain used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), typical value is 0
-    :param parasolCells_tau: the low pass filter time constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is frame, typical value is 0 (immediate response)
-    :param parasolCells_k: the low pass filter spatial constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is pixels, typical value is 5
-    :param amacrinCellsTemporalCutFrequency: the time constant of the first order high pass fiter of the magnocellular way (motion information channel), unit is frames, typical value is 1.2
-    :param V0CompressionParameter: the compression strengh of the ganglion cells local adaptation output, set a value between 0.6 and 1 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 0.95
-    :param localAdaptintegration_tau: specifies the temporal constant of the low pas filter involved in the computation of the local "motion mean" for the local adaptation computation
-    :param localAdaptintegration_k: specifies the spatial constant of the low pas filter involved in the computation of the local "motion mean" for the local adaptation computation
-
-Retina::setupOPLandIPLParvoChannel
-++++++++++++++++++++++++++++++++++
-
-.. ocv:function:: void Retina::setupOPLandIPLParvoChannel(const bool colorMode = true, const bool normaliseOutput = true, const float photoreceptorsLocalAdaptationSensitivity = 0.7, const float photoreceptorsTemporalConstant = 0.5, const float photoreceptorsSpatialConstant = 0.53, const float horizontalCellsGain = 0, const float HcellsTemporalConstant = 1, const float HcellsSpatialConstant = 7, const float ganglionCellsSensitivity = 0.7 )
-
-    Setup the OPL and IPL parvo channels (see biologocal model) OPL is referred as Outer Plexiform Layer of the retina, it allows the spatio-temporal filtering which withens the spectrum and reduces spatio-temporal noise while attenuating global luminance (low frequency energy) IPL parvo is the OPL next processing stage, it refers to a part of the Inner Plexiform layer of the retina, it allows high contours sensitivity in foveal vision. See reference papers for more informations.
-
-    :param colorMode: specifies if (true) color is processed of not (false) to then processing gray level image
-    :param normaliseOutput: specifies if (true) output is rescaled between 0 and 255 of not (false)
-    :param photoreceptorsLocalAdaptationSensitivity: the photoreceptors sensitivity renage is 0-1 (more log compression effect when value increases)
-    :param photoreceptorsTemporalConstant: the time constant of the first order low pass filter of the photoreceptors, use it to cut high temporal frequencies (noise or fast motion), unit is frames, typical value is 1 frame
-    :param photoreceptorsSpatialConstant: the spatial constant of the first order low pass filter of the photoreceptors, use it to cut high spatial frequencies (noise or thick contours), unit is pixels, typical value is 1 pixel
-    :param horizontalCellsGain: gain of the horizontal cells network, if 0, then the mean value of the output is zero, if the parameter is near 1, then, the luminance is not filtered and is still reachable at the output, typicall value is 0
-    :param HcellsTemporalConstant: the time constant of the first order low pass filter of the horizontal cells, use it to cut low temporal frequencies (local luminance variations), unit is frames, typical value is 1 frame, as the photoreceptors
-    :param HcellsSpatialConstant: the spatial constant of the first order low pass filter of the horizontal cells, use it to cut low spatial frequencies (local luminance), unit is pixels, typical value is 5 pixel, this value is also used for local contrast computing when computing the local contrast adaptation at the ganglion cells level (Inner Plexiform Layer parvocellular channel model)
-    :param ganglionCellsSensitivity: the compression strengh of the ganglion cells local adaptation output, set a value between 0.6 and 1 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 0.7
-
-
-Retina::RetinaParameters
-========================
-
-.. ocv:struct:: Retina::RetinaParameters
-
-  This structure merges all the parameters that can be adjusted threw the **Retina::setup()**, **Retina::setupOPLandIPLParvoChannel** and **Retina::setupIPLMagnoChannel** setup methods
-  Parameters structure for better clarity, check explenations on the comments of methods : setupOPLandIPLParvoChannel and setupIPLMagnoChannel. ::
-
-    class RetinaParameters{
-        struct OPLandIplParvoParameters{ // Outer Plexiform Layer (OPL) and Inner Plexiform Layer Parvocellular (IplParvo) parameters
-               OPLandIplParvoParameters():colorMode(true),
-                  normaliseOutput(true), // specifies if (true) output is rescaled between 0 and 255 of not (false)
-                  photoreceptorsLocalAdaptationSensitivity(0.7f), // the photoreceptors sensitivity renage is 0-1 (more log compression effect when value increases)
-                  photoreceptorsTemporalConstant(0.5f),// the time constant of the first order low pass filter of the photoreceptors, use it to cut high temporal frequencies (noise or fast motion), unit is frames, typical value is 1 frame
-                  photoreceptorsSpatialConstant(0.53f),// the spatial constant of the first order low pass filter of the photoreceptors, use it to cut high spatial frequencies (noise or thick contours), unit is pixels, typical value is 1 pixel
-                  horizontalCellsGain(0.0f),//gain of the horizontal cells network, if 0, then the mean value of the output is zero, if the parameter is near 1, then, the luminance is not filtered and is still reachable at the output, typicall value is 0
-                  hcellsTemporalConstant(1.f),// the time constant of the first order low pass filter of the horizontal cells, use it to cut low temporal frequencies (local luminance variations), unit is frames, typical value is 1 frame, as the photoreceptors. Reduce to 0.5 to limit retina after effects.
-                  hcellsSpatialConstant(7.f),//the spatial constant of the first order low pass filter of the horizontal cells, use it to cut low spatial frequencies (local luminance), unit is pixels, typical value is 5 pixel, this value is also used for local contrast computing when computing the local contrast adaptation at the ganglion cells level (Inner Plexiform Layer parvocellular channel model)
-                  ganglionCellsSensitivity(0.7f)//the compression strengh of the ganglion cells local adaptation output, set a value between 0.6 and 1 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 0.7
-                  {};// default setup
-               bool colorMode, normaliseOutput;
-               float photoreceptorsLocalAdaptationSensitivity, photoreceptorsTemporalConstant, photoreceptorsSpatialConstant, horizontalCellsGain, hcellsTemporalConstant, hcellsSpatialConstant, ganglionCellsSensitivity;
-           };
-           struct IplMagnoParameters{ // Inner Plexiform Layer Magnocellular channel (IplMagno)
-               IplMagnoParameters():
-                  normaliseOutput(true), //specifies if (true) output is rescaled between 0 and 255 of not (false)
-                  parasolCells_beta(0.f), // the low pass filter gain used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), typical value is 0
-                  parasolCells_tau(0.f), //the low pass filter time constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is frame, typical value is 0 (immediate response)
-                  parasolCells_k(7.f), //the low pass filter spatial constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is pixels, typical value is 5
-                  amacrinCellsTemporalCutFrequency(1.2f), //the time constant of the first order high pass fiter of the magnocellular way (motion information channel), unit is frames, typical value is 1.2
-                  V0CompressionParameter(0.95f), the compression strengh of the ganglion cells local adaptation output, set a value between 0.6 and 1 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 0.95
-                  localAdaptintegration_tau(0.f), // specifies the temporal constant of the low pas filter involved in the computation of the local "motion mean" for the local adaptation computation
-                  localAdaptintegration_k(7.f) // specifies the spatial constant of the low pas filter involved in the computation of the local "motion mean" for the local adaptation computation
-                  {};// default setup
-               bool normaliseOutput;
-               float parasolCells_beta, parasolCells_tau, parasolCells_k, amacrinCellsTemporalCutFrequency, V0CompressionParameter, localAdaptintegration_tau, localAdaptintegration_k;
-           };
-            struct OPLandIplParvoParameters OPLandIplParvo;
-            struct IplMagnoParameters IplMagno;
-    };
-
-Retina parameters files examples
-++++++++++++++++++++++++++++++++
-
-Here is the default configuration file of the retina module. It gives results such as the first retina output shown on the top of this page.
-
-.. code-block:: cpp
-
-    <?xml version="1.0"?>
-    <opencv_storage>
-    <OPLandIPLparvo>
-        <colorMode>1</colorMode>
-        <normaliseOutput>1</normaliseOutput>
-        <photoreceptorsLocalAdaptationSensitivity>7.5e-01</photoreceptorsLocalAdaptationSensitivity>
-        <photoreceptorsTemporalConstant>9.0e-01</photoreceptorsTemporalConstant>
-        <photoreceptorsSpatialConstant>5.3e-01</photoreceptorsSpatialConstant>
-        <horizontalCellsGain>0.01</horizontalCellsGain>
-        <hcellsTemporalConstant>0.5</hcellsTemporalConstant>
-        <hcellsSpatialConstant>7.</hcellsSpatialConstant>
-        <ganglionCellsSensitivity>7.5e-01</ganglionCellsSensitivity></OPLandIPLparvo>
-    <IPLmagno>
-        <normaliseOutput>1</normaliseOutput>
-        <parasolCells_beta>0.</parasolCells_beta>
-        <parasolCells_tau>0.</parasolCells_tau>
-        <parasolCells_k>7.</parasolCells_k>
-        <amacrinCellsTemporalCutFrequency>2.0e+00</amacrinCellsTemporalCutFrequency>
-        <V0CompressionParameter>9.5e-01</V0CompressionParameter>
-        <localAdaptintegration_tau>0.</localAdaptintegration_tau>
-        <localAdaptintegration_k>7.</localAdaptintegration_k></IPLmagno>
-    </opencv_storage>
-
-Here is the 'realistic" setup used to obtain the second retina output shown on the top of this page.
-
-.. code-block:: cpp
-
-    <?xml version="1.0"?>
-    <opencv_storage>
-    <OPLandIPLparvo>
-      <colorMode>1</colorMode>
-      <normaliseOutput>1</normaliseOutput>
-      <photoreceptorsLocalAdaptationSensitivity>8.9e-01</photoreceptorsLocalAdaptationSensitivity>
-      <photoreceptorsTemporalConstant>9.0e-01</photoreceptorsTemporalConstant>
-      <photoreceptorsSpatialConstant>5.3e-01</photoreceptorsSpatialConstant>
-      <horizontalCellsGain>0.3</horizontalCellsGain>
-      <hcellsTemporalConstant>0.5</hcellsTemporalConstant>
-      <hcellsSpatialConstant>7.</hcellsSpatialConstant>
-      <ganglionCellsSensitivity>8.9e-01</ganglionCellsSensitivity></OPLandIPLparvo>
-    <IPLmagno>
-      <normaliseOutput>1</normaliseOutput>
-      <parasolCells_beta>0.</parasolCells_beta>
-      <parasolCells_tau>0.</parasolCells_tau>
-      <parasolCells_k>7.</parasolCells_k>
-      <amacrinCellsTemporalCutFrequency>2.0e+00</amacrinCellsTemporalCutFrequency>
-      <V0CompressionParameter>9.5e-01</V0CompressionParameter>
-      <localAdaptintegration_tau>0.</localAdaptintegration_tau>
-      <localAdaptintegration_k>7.</localAdaptintegration_k></IPLmagno>
-    </opencv_storage>
diff --git a/modules/bioinspired/include/opencv2/bioinspired.hpp b/modules/bioinspired/include/opencv2/bioinspired.hpp
deleted file mode 100644
index 5f2f8644d..000000000
--- a/modules/bioinspired/include/opencv2/bioinspired.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_BIOINSPIRED_HPP__
-#define __OPENCV_BIOINSPIRED_HPP__
-
-#include "opencv2/core.hpp"
-#include "opencv2/bioinspired/retina.hpp"
-#include "opencv2/bioinspired/retinafasttonemapping.hpp"
-
-#endif
diff --git a/modules/bioinspired/include/opencv2/bioinspired/retina.hpp b/modules/bioinspired/include/opencv2/bioinspired/retina.hpp
deleted file mode 100644
index b4fda7038..000000000
--- a/modules/bioinspired/include/opencv2/bioinspired/retina.hpp
+++ /dev/null
@@ -1,311 +0,0 @@
-/*#******************************************************************************
- ** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
- **
- ** By downloading, copying, installing or using the software you agree to this license.
- ** If you do not agree to this license, do not download, install,
- ** copy or use the software.
- **
- **
- ** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
- ** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
- **
- ** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
- **
- **  Creation - enhancement process 2007-2013
- **      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
- **
- ** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
- ** Refer to the following research paper for more information:
- ** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
- ** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
- ** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
- **
- ** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
- ** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
- ** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
- ** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
- ** ====> more informations in the above cited Jeanny Heraults's book.
- **
- **                          License Agreement
- **               For Open Source Computer Vision Library
- **
- ** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
- ** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
- **
- **               For Human Visual System tools (bioinspired)
- ** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
- **
- ** Third party copyrights are property of their respective owners.
- **
- ** Redistribution and use in source and binary forms, with or without modification,
- ** are permitted provided that the following conditions are met:
- **
- ** * Redistributions of source code must retain the above copyright notice,
- **    this list of conditions and the following disclaimer.
- **
- ** * Redistributions in binary form must reproduce the above copyright notice,
- **    this list of conditions and the following disclaimer in the documentation
- **    and/or other materials provided with the distribution.
- **
- ** * The name of the copyright holders may not be used to endorse or promote products
- **    derived from this software without specific prior written permission.
- **
- ** This software is provided by the copyright holders and contributors "as is" and
- ** any express or implied warranties, including, but not limited to, the implied
- ** warranties of merchantability and fitness for a particular purpose are disclaimed.
- ** In no event shall the Intel Corporation or contributors be liable for any direct,
- ** indirect, incidental, special, exemplary, or consequential damages
- ** (including, but not limited to, procurement of substitute goods or services;
- ** loss of use, data, or profits; or business interruption) however caused
- ** and on any theory of liability, whether in contract, strict liability,
- ** or tort (including negligence or otherwise) arising in any way out of
- ** the use of this software, even if advised of the possibility of such damage.
- *******************************************************************************/
-
-#ifndef __OPENCV_BIOINSPIRED_RETINA_HPP__
-#define __OPENCV_BIOINSPIRED_RETINA_HPP__
-
-/*
- * Retina.hpp
- *
- *  Created on: Jul 19, 2011
- *      Author: Alexandre Benoit
- */
-
-#include "opencv2/core.hpp" // for all OpenCV core functionalities access, including cv::Exception support
-
-
-namespace cv{
-namespace bioinspired{
-
-enum {
-    RETINA_COLOR_RANDOM, //!< each pixel position is either R, G or B in a random choice
-    RETINA_COLOR_DIAGONAL,//!< color sampling is RGBRGBRGB..., line 2 BRGBRGBRG..., line 3, GBRGBRGBR...
-    RETINA_COLOR_BAYER//!< standard bayer sampling
-};
-
-/**
- * @class Retina a wrapper class which allows the Gipsa/Listic Labs model to be used with OpenCV.
- * This retina model allows spatio-temporal image processing (applied on still images, video sequences).
- * As a summary, these are the retina model properties:
- * => It applies a spectral whithening (mid-frequency details enhancement)
- * => high frequency spatio-temporal noise reduction
- * => low frequency luminance to be reduced (luminance range compression)
- * => local logarithmic luminance compression allows details to be enhanced in low light conditions
- *
- * USE : this model can be used basically for spatio-temporal video effects but also for :
- *      _using the getParvo method output matrix : texture analysiswith enhanced signal to noise ratio and enhanced details robust against input images luminance ranges
- *      _using the getMagno method output matrix : motion analysis also with the previously cited properties
- *
- * for more information, reer to the following papers :
- * Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
- * Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
- *
- * The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
- * _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
- * ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
- * _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
- * ====> more informations in the above cited Jeanny Heraults's book.
- */
-class CV_EXPORTS Retina : public Algorithm {
-
-public:
-
-    // parameters structure for better clarity, check explenations on the comments of methods : setupOPLandIPLParvoChannel and setupIPLMagnoChannel
-    struct RetinaParameters{
-        struct OPLandIplParvoParameters{ // Outer Plexiform Layer (OPL) and Inner Plexiform Layer Parvocellular (IplParvo) parameters
-               OPLandIplParvoParameters():colorMode(true),
-                                 normaliseOutput(true),
-                                 photoreceptorsLocalAdaptationSensitivity(0.75f),
-                                 photoreceptorsTemporalConstant(0.9f),
-                                 photoreceptorsSpatialConstant(0.53f),
-                                 horizontalCellsGain(0.01f),
-                                 hcellsTemporalConstant(0.5f),
-                                 hcellsSpatialConstant(7.f),
-                                 ganglionCellsSensitivity(0.75f){};// default setup
-               bool colorMode, normaliseOutput;
-               float photoreceptorsLocalAdaptationSensitivity, photoreceptorsTemporalConstant, photoreceptorsSpatialConstant, horizontalCellsGain, hcellsTemporalConstant, hcellsSpatialConstant, ganglionCellsSensitivity;
-           };
-           struct IplMagnoParameters{ // Inner Plexiform Layer Magnocellular channel (IplMagno)
-               IplMagnoParameters():
-                          normaliseOutput(true),
-                          parasolCells_beta(0.f),
-                          parasolCells_tau(0.f),
-                          parasolCells_k(7.f),
-                          amacrinCellsTemporalCutFrequency(2.0f),
-                          V0CompressionParameter(0.95f),
-                          localAdaptintegration_tau(0.f),
-                          localAdaptintegration_k(7.f){};// default setup
-               bool normaliseOutput;
-               float parasolCells_beta, parasolCells_tau, parasolCells_k, amacrinCellsTemporalCutFrequency, V0CompressionParameter, localAdaptintegration_tau, localAdaptintegration_k;
-           };
-            struct OPLandIplParvoParameters OPLandIplParvo;
-            struct IplMagnoParameters IplMagno;
-    };
-
-    /**
-    * retreive retina input buffer size
-    */
-    virtual Size getInputSize()=0;
-
-    /**
-    * retreive retina output buffer size
-    */
-    virtual Size getOutputSize()=0;
-
-    /**
-     * try to open an XML retina parameters file to adjust current retina instance setup
-     * => if the xml file does not exist, then default setup is applied
-     * => warning, Exceptions are thrown if read XML file is not valid
-     * @param retinaParameterFile : the parameters filename
-         * @param applyDefaultSetupOnFailure : set to true if an error must be thrown on error
-     */
-    virtual void setup(String retinaParameterFile="", const bool applyDefaultSetupOnFailure=true)=0;
-
-    /**
-     * try to open an XML retina parameters file to adjust current retina instance setup
-     * => if the xml file does not exist, then default setup is applied
-     * => warning, Exceptions are thrown if read XML file is not valid
-     * @param fs : the open Filestorage which contains retina parameters
-     * @param applyDefaultSetupOnFailure : set to true if an error must be thrown on error
-     */
-    virtual void setup(cv::FileStorage &fs, const bool applyDefaultSetupOnFailure=true)=0;
-
-    /**
-     * try to open an XML retina parameters file to adjust current retina instance setup
-     * => if the xml file does not exist, then default setup is applied
-     * => warning, Exceptions are thrown if read XML file is not valid
-     * @param newParameters : a parameters structures updated with the new target configuration
-         * @param applyDefaultSetupOnFailure : set to true if an error must be thrown on error
-     */
-    virtual void setup(RetinaParameters newParameters)=0;
-
-    /**
-    * @return the current parameters setup
-    */
-    virtual struct Retina::RetinaParameters getParameters()=0;
-
-    /**
-     * parameters setup display method
-     * @return a string which contains formatted parameters information
-     */
-    virtual const String printSetup()=0;
-
-    /**
-     * write xml/yml formated parameters information
-     * @rparam fs : the filename of the xml file that will be open and writen with formatted parameters information
-     */
-    virtual void write( String fs ) const=0;
-
-    /**
-     * write xml/yml formated parameters information
-     * @param fs : a cv::Filestorage object ready to be filled
-         */
-    virtual void write( FileStorage& fs ) const=0;
-
-    /**
-     * setup the OPL and IPL parvo channels (see biologocal model)
-     * OPL is referred as Outer Plexiform Layer of the retina, it allows the spatio-temporal filtering which withens the spectrum and reduces spatio-temporal noise while attenuating global luminance (low frequency energy)
-     * IPL parvo is the OPL next processing stage, it refers to Inner Plexiform layer of the retina, it allows high contours sensitivity in foveal vision.
-     * for more informations, please have a look at the paper Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-     * @param colorMode : specifies if (true) color is processed of not (false) to then processing gray level image
-     * @param normaliseOutput : specifies if (true) output is rescaled between 0 and 255 of not (false)
-     * @param photoreceptorsLocalAdaptationSensitivity: the photoreceptors sensitivity renage is 0-1 (more log compression effect when value increases)
-     * @param photoreceptorsTemporalConstant: the time constant of the first order low pass filter of the photoreceptors, use it to cut high temporal frequencies (noise or fast motion), unit is frames, typical value is 1 frame
-     * @param photoreceptorsSpatialConstant: the spatial constant of the first order low pass filter of the photoreceptors, use it to cut high spatial frequencies (noise or thick contours), unit is pixels, typical value is 1 pixel
-     * @param horizontalCellsGain: gain of the horizontal cells network, if 0, then the mean value of the output is zero, if the parameter is near 1, then, the luminance is not filtered and is still reachable at the output, typicall value is 0
-     * @param HcellsTemporalConstant: the time constant of the first order low pass filter of the horizontal cells, use it to cut low temporal frequencies (local luminance variations), unit is frames, typical value is 1 frame, as the photoreceptors
-     * @param HcellsSpatialConstant: the spatial constant of the first order low pass filter of the horizontal cells, use it to cut low spatial frequencies (local luminance), unit is pixels, typical value is 5 pixel, this value is also used for local contrast computing when computing the local contrast adaptation at the ganglion cells level (Inner Plexiform Layer parvocellular channel model)
-     * @param ganglionCellsSensitivity: the compression strengh of the ganglion cells local adaptation output, set a value between 160 and 250 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 230
-     */
-    virtual void setupOPLandIPLParvoChannel(const bool colorMode=true, const bool normaliseOutput = true, const float photoreceptorsLocalAdaptationSensitivity=0.7, const float photoreceptorsTemporalConstant=0.5, const float photoreceptorsSpatialConstant=0.53, const float horizontalCellsGain=0, const float HcellsTemporalConstant=1, const float HcellsSpatialConstant=7, const float ganglionCellsSensitivity=0.7)=0;
-
-    /**
-     * set parameters values for the Inner Plexiform Layer (IPL) magnocellular channel
-     * this channel processes signals outpint from OPL processing stage in peripheral vision, it allows motion information enhancement. It is decorrelated from the details channel. See reference paper for more details.
-     * @param normaliseOutput : specifies if (true) output is rescaled between 0 and 255 of not (false)
-     * @param parasolCells_beta: the low pass filter gain used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), typical value is 0
-     * @param parasolCells_tau: the low pass filter time constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is frame, typical value is 0 (immediate response)
-     * @param parasolCells_k: the low pass filter spatial constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is pixels, typical value is 5
-     * @param amacrinCellsTemporalCutFrequency: the time constant of the first order high pass fiter of the magnocellular way (motion information channel), unit is frames, tipicall value is 5
-     * @param V0CompressionParameter: the compression strengh of the ganglion cells local adaptation output, set a value between 160 and 250 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 200
-     * @param localAdaptintegration_tau: specifies the temporal constant of the low pas filter involved in the computation of the local "motion mean" for the local adaptation computation
-     * @param localAdaptintegration_k: specifies the spatial constant of the low pas filter involved in the computation of the local "motion mean" for the local adaptation computation
-     */
-    virtual void setupIPLMagnoChannel(const bool normaliseOutput = true, const float parasolCells_beta=0, const float parasolCells_tau=0, const float parasolCells_k=7, const float amacrinCellsTemporalCutFrequency=1.2, const float V0CompressionParameter=0.95, const float localAdaptintegration_tau=0, const float localAdaptintegration_k=7)=0;
-
-    /**
-     * method which allows retina to be applied on an input image, after run, encapsulated retina module is ready to deliver its outputs using dedicated acccessors, see getParvo and getMagno methods
-     * @param inputImage : the input cv::Mat image to be processed, can be gray level or BGR coded in any format (from 8bit to 16bits)
-     */
-    virtual void run(InputArray inputImage)=0;
-
-    /**
-     * method that applies a luminance correction (initially High Dynamic Range (HDR) tone mapping) using only the 2 local adaptation stages of the retina parvo channel : photoreceptors level and ganlion cells level. Spatio temporal filtering is applied but limited to temporal smoothing and eventually high frequencies attenuation. This is a lighter method than the one available using the regular run method. It is then faster but it does not include complete temporal filtering nor retina spectral whitening. Then, it can have a more limited effect on images with a very high dynamic range. This is an adptation of the original still image HDR tone mapping algorithm of David Alleyson, Sabine Susstruck and Laurence Meylan's work, please cite:
-    * -> Meylan L., Alleysson D., and Susstrunk S., A Model of Retinal Local Adaptation for the Tone Mapping of Color Filter Array Images, Journal of Optical Society of America, A, Vol. 24, N 9, September, 1st, 2007, pp. 2807-2816
-     @param inputImage the input image to process RGB or gray levels
-     @param outputToneMappedImage the output tone mapped image
-     */
-    virtual void applyFastToneMapping(InputArray inputImage, OutputArray outputToneMappedImage)=0;
-
-    /**
-     * accessor of the details channel of the retina (models foveal vision)
-     * @param retinaOutput_parvo : the output buffer (reallocated if necessary), this output is rescaled for standard 8bits image processing use in OpenCV
-     */
-    virtual void getParvo(OutputArray retinaOutput_parvo)=0;
-
-    /**
-     * accessor of the details channel of the retina (models foveal vision)
-     * @param retinaOutput_parvo : a cv::Mat header filled with the internal parvo buffer of the retina module. This output is the original retina filter model output, without any quantification or rescaling
-     */
-    virtual void getParvoRAW(OutputArray retinaOutput_parvo)=0;
-
-    /**
-     * accessor of the motion channel of the retina (models peripheral vision)
-     * @param retinaOutput_magno : the output buffer (reallocated if necessary), this output is rescaled for standard 8bits image processing use in OpenCV
-     */
-    virtual void getMagno(OutputArray retinaOutput_magno)=0;
-
-    /**
-     * accessor of the motion channel of the retina (models peripheral vision)
-     * @param retinaOutput_magno : a cv::Mat header filled with the internal retina magno buffer of the retina module. This output is the original retina filter model output, without any quantification or rescaling
-     */
-    virtual void getMagnoRAW(OutputArray retinaOutput_magno)=0;
-
-    // original API level data accessors : get buffers addresses from a Mat header, similar to getParvoRAW and getMagnoRAW...
-    virtual const Mat getMagnoRAW() const=0;
-    virtual const Mat getParvoRAW() const=0;
-
-    /**
-     * activate color saturation as the final step of the color demultiplexing process
-     * -> this saturation is a sigmoide function applied to each channel of the demultiplexed image.
-     * @param saturateColors: boolean that activates color saturation (if true) or desactivate (if false)
-     * @param colorSaturationValue: the saturation factor
-     */
-    virtual void setColorSaturation(const bool saturateColors=true, const float colorSaturationValue=4.0)=0;
-
-    /**
-     * clear all retina buffers (equivalent to opening the eyes after a long period of eye close ;o)
-     */
-    virtual void clearBuffers()=0;
-
-    /**
-    * Activate/desactivate the Magnocellular pathway processing (motion information extraction), by default, it is activated
-    * @param activate: true if Magnocellular output should be activated, false if not
-    */
-    virtual void activateMovingContoursProcessing(const bool activate)=0;
-
-    /**
-    * Activate/desactivate the Parvocellular pathway processing (contours information extraction), by default, it is activated
-    * @param activate: true if Parvocellular (contours information extraction) output should be activated, false if not
-    */
-    virtual void activateContoursProcessing(const bool activate)=0;
-};
-CV_EXPORTS Ptr<Retina> createRetina(Size inputSize);
-CV_EXPORTS Ptr<Retina> createRetina(Size inputSize, const bool colorMode, int colorSamplingMethod=RETINA_COLOR_BAYER, const bool useRetinaLogSampling=false, const double reductionFactor=1.0, const double samplingStrenght=10.0);
-
-CV_EXPORTS Ptr<Retina> createRetina_OCL(Size inputSize);
-CV_EXPORTS Ptr<Retina> createRetina_OCL(Size inputSize, const bool colorMode, int colorSamplingMethod=RETINA_COLOR_BAYER, const bool useRetinaLogSampling=false, const double reductionFactor=1.0, const double samplingStrenght=10.0);
-}
-}
-#endif /* __OPENCV_BIOINSPIRED_RETINA_HPP__ */
diff --git a/modules/bioinspired/include/opencv2/bioinspired/retinafasttonemapping.hpp b/modules/bioinspired/include/opencv2/bioinspired/retinafasttonemapping.hpp
deleted file mode 100644
index 6c83f885c..000000000
--- a/modules/bioinspired/include/opencv2/bioinspired/retinafasttonemapping.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-
-/*#******************************************************************************
- ** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
- **
- ** By downloading, copying, installing or using the software you agree to this license.
- ** If you do not agree to this license, do not download, install,
- ** copy or use the software.
- **
- **
- ** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
- **
- ** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
- **
- **  Creation - enhancement process 2007-2013
- **      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
- **
- ** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
- ** Refer to the following research paper for more information:
- ** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
- ** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
- ** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
- **
- **
- **
- **
- **
- ** This class is based on image processing tools of the author and already used within the Retina class (this is the same code as method retina::applyFastToneMapping, but in an independent class, it is ligth from a memory requirement point of view). It implements an adaptation of the efficient tone mapping algorithm propose by David Alleyson, Sabine Susstruck and Laurence Meylan's work, please cite:
- ** -> Meylan L., Alleysson D., and Susstrunk S., A Model of Retinal Local Adaptation for the Tone Mapping of Color Filter Array Images, Journal of Optical Society of America, A, Vol. 24, N 9, September, 1st, 2007, pp. 2807-2816
- **
- **
- **                          License Agreement
- **               For Open Source Computer Vision Library
- **
- ** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
- ** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
- **
- **               For Human Visual System tools (bioinspired)
- ** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
- **
- ** Third party copyrights are property of their respective owners.
- **
- ** Redistribution and use in source and binary forms, with or without modification,
- ** are permitted provided that the following conditions are met:
- **
- ** * Redistributions of source code must retain the above copyright notice,
- **    this list of conditions and the following disclaimer.
- **
- ** * Redistributions in binary form must reproduce the above copyright notice,
- **    this list of conditions and the following disclaimer in the documentation
- **    and/or other materials provided with the distribution.
- **
- ** * The name of the copyright holders may not be used to endorse or promote products
- **    derived from this software without specific prior written permission.
- **
- ** This software is provided by the copyright holders and contributors "as is" and
- ** any express or implied warranties, including, but not limited to, the implied
- ** warranties of merchantability and fitness for a particular purpose are disclaimed.
- ** In no event shall the Intel Corporation or contributors be liable for any direct,
- ** indirect, incidental, special, exemplary, or consequential damages
- ** (including, but not limited to, procurement of substitute goods or services;
- ** loss of use, data, or profits; or business interruption) however caused
- ** and on any theory of liability, whether in contract, strict liability,
- ** or tort (including negligence or otherwise) arising in any way out of
- ** the use of this software, even if advised of the possibility of such damage.
- *******************************************************************************/
-
-#ifndef __OPENCV_BIOINSPIRED_RETINAFASTTONEMAPPING_HPP__
-#define __OPENCV_BIOINSPIRED_RETINAFASTTONEMAPPING_HPP__
-
-/*
- * retinafasttonemapping.hpp
- *
- *  Created on: May 26, 2013
- *      Author: Alexandre Benoit
- */
-
-#include "opencv2/core.hpp" // for all OpenCV core functionalities access, including cv::Exception support
-
-namespace cv{
-namespace bioinspired{
-
-/**
- * @class RetinaFastToneMappingImpl a wrapper class which allows the tone mapping algorithm of Meylan&al(2007) to be used with OpenCV.
- * This algorithm is already implemented in thre Retina class (retina::applyFastToneMapping) but used it does not require all the retina model to be allocated. This allows a light memory use for low memory devices (smartphones, etc.
- * As a summary, these are the model properties:
- * => 2 stages of local luminance adaptation with a different local neighborhood for each.
- * => first stage models the retina photorecetors local luminance adaptation
- * => second stage models th ganglion cells local information adaptation
- * => compared to the initial publication, this class uses spatio-temporal low pass filters instead of spatial only filters.
- * ====> this can help noise robustness and temporal stability for video sequence use cases.
- * for more information, read to the following papers :
- *  Meylan L., Alleysson D., and Susstrunk S., A Model of Retinal Local Adaptation for the Tone Mapping of Color Filter Array Images, Journal of Optical Society of America, A, Vol. 24, N 9, September, 1st, 2007, pp. 2807-2816Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
- * regarding spatio-temporal filter and the bigger retina model :
- * Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
- */
-class CV_EXPORTS RetinaFastToneMapping : public Algorithm
-{
-public:
-
-    /**
-     * method that applies a luminance correction (initially High Dynamic Range (HDR) tone mapping) using only the 2 local adaptation stages of the retina parvocellular channel : photoreceptors level and ganlion cells level. Spatio temporal filtering is applied but limited to temporal smoothing and eventually high frequencies attenuation. This is a lighter method than the one available using the regular retina::run method. It is then faster but it does not include complete temporal filtering nor retina spectral whitening. Then, it can have a more limited effect on images with a very high dynamic range. This is an adptation of the original still image HDR tone mapping algorithm of David Alleyson, Sabine Susstruck and Laurence Meylan's work, please cite:
-    * -> Meylan L., Alleysson D., and Susstrunk S., A Model of Retinal Local Adaptation for the Tone Mapping of Color Filter Array Images, Journal of Optical Society of America, A, Vol. 24, N 9, September, 1st, 2007, pp. 2807-2816
-     @param inputImage the input image to process RGB or gray levels
-     @param outputToneMappedImage the output tone mapped image
-     */
-    virtual void applyFastToneMapping(InputArray inputImage, OutputArray outputToneMappedImage)=0;
-
-    /**
-     * setup method that updates tone mapping behaviors by adjusing the local luminance computation area
-     * @param photoreceptorsNeighborhoodRadius the first stage local adaptation area
-     * @param ganglioncellsNeighborhoodRadius the second stage local adaptation area
-     * @param meanLuminanceModulatorK the factor applied to modulate the meanLuminance information (default is 1, see reference paper)
-     */
-    virtual void setup(const float photoreceptorsNeighborhoodRadius=3.f, const float ganglioncellsNeighborhoodRadius=1.f, const float meanLuminanceModulatorK=1.f)=0;
-};
-
-CV_EXPORTS Ptr<RetinaFastToneMapping> createRetinaFastToneMapping(Size inputSize);
-
-}
-}
-#endif /* __OPENCV_BIOINSPIRED_RETINAFASTTONEMAPPING_HPP__ */
diff --git a/modules/bioinspired/src/basicretinafilter.cpp b/modules/bioinspired/src/basicretinafilter.cpp
deleted file mode 100644
index 7e7b467fa..000000000
--- a/modules/bioinspired/src/basicretinafilter.cpp
+++ /dev/null
@@ -1,888 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-#include "precomp.hpp"
-
-#include <iostream>
-#include <cstdlib>
-#include "basicretinafilter.hpp"
-#include <cmath>
-
-
-namespace cv
-{
-namespace bioinspired
-{
-// @author Alexandre BENOIT, benoit.alexandre.vision@gmail.com, LISTIC : www.listic.univ-savoie.fr Gipsa-Lab, France: www.gipsa-lab.inpg.fr/
-
-//////////////////////////////////////////////////////////
-//                 BASIC RETINA FILTER
-//////////////////////////////////////////////////////////
-
-// Constructor and Desctructor of the basic retina filter
-BasicRetinaFilter::BasicRetinaFilter(const unsigned int NBrows, const unsigned int NBcolumns, const unsigned int parametersListSize, const bool useProgressiveFilter)
-:_filterOutput(NBrows, NBcolumns),
- _localBuffer(NBrows*NBcolumns),
- _filteringCoeficientsTable(3*parametersListSize),
- _progressiveSpatialConstant(0),// pointer to a local table containing local spatial constant (allocated with the object)
- _progressiveGain(0)
-{
-#ifdef T_BASIC_RETINA_ELEMENT_DEBUG
-    std::cout<<"BasicRetinaFilter::BasicRetinaFilter: new filter, size="<<NBrows<<", "<<NBcolumns<<std::endl;
-#endif
-    _halfNBrows=_filterOutput.getNBrows()/2;
-    _halfNBcolumns=_filterOutput.getNBcolumns()/2;
-
-    if (useProgressiveFilter)
-    {
-#ifdef T_BASIC_RETINA_ELEMENT_DEBUG
-        std::cout<<"BasicRetinaFilter::BasicRetinaFilter: _progressiveSpatialConstant_Tbuffer"<<std::endl;
-#endif
-        _progressiveSpatialConstant.resize(_filterOutput.size());
-#ifdef T_BASIC_RETINA_ELEMENT_DEBUG
-        std::cout<<"BasicRetinaFilter::BasicRetinaFilter: new _progressiveGain_Tbuffer"<<NBrows<<", "<<NBcolumns<<std::endl;
-#endif
-        _progressiveGain.resize(_filterOutput.size());
-    }
-#ifdef T_BASIC_RETINA_ELEMENT_DEBUG
-    std::cout<<"BasicRetinaFilter::BasicRetinaFilter: new filter, size="<<NBrows<<", "<<NBcolumns<<std::endl;
-#endif
-
-    // set default values
-    _maxInputValue=256.0;
-
-    // reset all buffers
-    clearAllBuffers();
-
-#ifdef T_BASIC_RETINA_ELEMENT_DEBUG
-    std::cout<<"BasicRetinaFilter::Init BasicRetinaElement at specified frame size OK, size="<<this->size()<<std::endl;
-#endif
-
-}
-
-BasicRetinaFilter::~BasicRetinaFilter()
-{
-
-#ifdef BASIC_RETINA_ELEMENT_DEBUG
-    std::cout<<"BasicRetinaFilter::BasicRetinaElement Deleted OK"<<std::endl;
-#endif
-
-}
-
-////////////////////////////////////
-// functions of the basic filter
-////////////////////////////////////
-
-
-// resize all allocated buffers
-void BasicRetinaFilter::resize(const unsigned int NBrows, const unsigned int NBcolumns)
-{
-
-    std::cout<<"BasicRetinaFilter::resize( "<<NBrows<<", "<<NBcolumns<<")"<<std::endl;
-
-    // resizing buffers
-    _filterOutput.resizeBuffer(NBrows, NBcolumns);
-
-    // updating variables
-    _halfNBrows=_filterOutput.getNBrows()/2;
-    _halfNBcolumns=_filterOutput.getNBcolumns()/2;
-
-    _localBuffer.resize(_filterOutput.size());
-    // in case of spatial adapted filter
-    if (_progressiveSpatialConstant.size()>0)
-    {
-        _progressiveSpatialConstant.resize(_filterOutput.size());
-        _progressiveGain.resize(_filterOutput.size());
-    }
-    // reset buffers
-    clearAllBuffers();
-}
-
-// Change coefficients table
-void BasicRetinaFilter::setLPfilterParameters(const float beta, const float tau, const float desired_k, const unsigned int filterIndex)
-{
-    float _beta = beta+tau;
-    float k=desired_k;
-    // check if the spatial constant is correct (avoid 0 value to avoid division by 0)
-    if (desired_k<=0)
-    {
-        k=0.001f;
-        std::cerr<<"BasicRetinaFilter::spatial constant of the low pass filter must be superior to zero !!! correcting parameter setting to 0,001"<<std::endl;
-    }
-
-    float _alpha = k*k;
-    float _mu = 0.8f;
-    unsigned int tableOffset=filterIndex*3;
-    if (k<=0)
-    {
-        std::cerr<<"BasicRetinaFilter::spatial filtering coefficient must be superior to zero, correcting value to 0.01"<<std::endl;
-        _alpha=0.0001f;
-    }
-
-    float _temp =  (1.0f+_beta)/(2.0f*_mu*_alpha);
-    float a = _filteringCoeficientsTable[tableOffset] = 1.0f + _temp - (float)std::sqrt( (1.0f+_temp)*(1.0f+_temp) - 1.0f);
-    _filteringCoeficientsTable[1+tableOffset]=(1.0f-a)*(1.0f-a)*(1.0f-a)*(1.0f-a)/(1.0f+_beta);
-    _filteringCoeficientsTable[2+tableOffset] =tau;
-
-    //std::cout<<"BasicRetinaFilter::normal:"<<(1.0-a)*(1.0-a)*(1.0-a)*(1.0-a)/(1.0+_beta)<<" -> old:"<<(1-a)*(1-a)*(1-a)*(1-a)/(1+_beta)<<std::endl;
-
-    //std::cout<<"BasicRetinaFilter::a="<<a<<", gain="<<_filteringCoeficientsTable[1+tableOffset]<<", tau="<<tau<<std::endl;
-}
-
-void BasicRetinaFilter::setProgressiveFilterConstants_CentredAccuracy(const float beta, const float tau, const float alpha0, const unsigned int filterIndex)
-{
-    // check if dedicated buffers are already allocated, if not create them
-    if (_progressiveSpatialConstant.size()!=_filterOutput.size())
-    {
-        _progressiveSpatialConstant.resize(_filterOutput.size());
-        _progressiveGain.resize(_filterOutput.size());
-    }
-
-    float _beta = beta+tau;
-    float _mu=0.8f;
-    if (alpha0<=0)
-    {
-        std::cerr<<"BasicRetinaFilter::spatial filtering coefficient must be superior to zero, correcting value to 0.01"<<std::endl;
-        //alpha0=0.0001;
-    }
-
-    unsigned int tableOffset=filterIndex*3;
-
-    float _alpha=0.8f;
-    float _temp =  (1.0f+_beta)/(2.0f*_mu*_alpha);
-    float a=_filteringCoeficientsTable[tableOffset] = 1.0f + _temp - (float)std::sqrt( (1.0f+_temp)*(1.0f+_temp) - 1.0f);
-    _filteringCoeficientsTable[tableOffset+1]=(1.0f-a)*(1.0f-a)*(1.0f-a)*(1.0f-a)/(1.0f+_beta);
-    _filteringCoeficientsTable[tableOffset+2] =tau;
-
-    float commonFactor=alpha0/(float)std::sqrt(_halfNBcolumns*_halfNBcolumns+_halfNBrows*_halfNBrows+1.0f);
-    //memset(_progressiveSpatialConstant, 255, _filterOutput.getNBpixels());
-    for (unsigned int idColumn=0;idColumn<_halfNBcolumns; ++idColumn)
-        for (unsigned int idRow=0;idRow<_halfNBrows; ++idRow)
-        {
-            // computing local spatial constant
-            float localSpatialConstantValue=commonFactor*std::sqrt((float)(idColumn*idColumn)+(float)(idRow*idRow));
-            if (localSpatialConstantValue>1.0f)
-                localSpatialConstantValue=1.0f;
-
-            _progressiveSpatialConstant[_halfNBcolumns-1+idColumn+_filterOutput.getNBcolumns()*(_halfNBrows-1+idRow)]=localSpatialConstantValue;
-            _progressiveSpatialConstant[_halfNBcolumns-1-idColumn+_filterOutput.getNBcolumns()*(_halfNBrows-1+idRow)]=localSpatialConstantValue;
-            _progressiveSpatialConstant[_halfNBcolumns-1+idColumn+_filterOutput.getNBcolumns()*(_halfNBrows-1-idRow)]=localSpatialConstantValue;
-            _progressiveSpatialConstant[_halfNBcolumns-1-idColumn+_filterOutput.getNBcolumns()*(_halfNBrows-1-idRow)]=localSpatialConstantValue;
-
-            // computing local gain
-            float localGain=(1-localSpatialConstantValue)*(1-localSpatialConstantValue)*(1-localSpatialConstantValue)*(1-localSpatialConstantValue)/(1+_beta);
-            _progressiveGain[_halfNBcolumns-1+idColumn+_filterOutput.getNBcolumns()*(_halfNBrows-1+idRow)]=localGain;
-            _progressiveGain[_halfNBcolumns-1-idColumn+_filterOutput.getNBcolumns()*(_halfNBrows-1+idRow)]=localGain;
-            _progressiveGain[_halfNBcolumns-1+idColumn+_filterOutput.getNBcolumns()*(_halfNBrows-1-idRow)]=localGain;
-            _progressiveGain[_halfNBcolumns-1-idColumn+_filterOutput.getNBcolumns()*(_halfNBrows-1-idRow)]=localGain;
-
-            //std::cout<<commonFactor<<", "<<std::sqrt((_halfNBcolumns-1-idColumn)+(_halfNBrows-idRow-1))<<", "<<(_halfNBcolumns-1-idColumn)<<", "<<(_halfNBrows-idRow-1)<<", "<<localSpatialConstantValue<<std::endl;
-        }
-}
-
-void BasicRetinaFilter::setProgressiveFilterConstants_CustomAccuracy(const float beta, const float tau, const float k, const std::valarray<float> &accuracyMap, const unsigned int filterIndex)
-{
-
-    if (accuracyMap.size()!=_filterOutput.size())
-    {
-        std::cerr<<"BasicRetinaFilter::setProgressiveFilterConstants_CustomAccuracy: error: input accuracy map does not match filter size, init skept"<<std::endl;
-        return ;
-    }
-
-    // check if dedicated buffers are already allocated, if not create them
-    if (_progressiveSpatialConstant.size()!=_filterOutput.size())
-    {
-        _progressiveSpatialConstant.resize(accuracyMap.size());
-        _progressiveGain.resize(accuracyMap.size());
-    }
-
-    float _beta = beta+tau;
-    float _alpha=k*k;
-    float _mu=0.8f;
-    if (k<=0)
-    {
-        std::cerr<<"BasicRetinaFilter::spatial filtering coefficient must be superior to zero, correcting value to 0.01"<<std::endl;
-        //alpha0=0.0001;
-    }
-    unsigned int tableOffset=filterIndex*3;
-    float _temp =  (1.0f+_beta)/(2.0f*_mu*_alpha);
-    float a=_filteringCoeficientsTable[tableOffset] = 1.0f + _temp - (float)std::sqrt( (1.0f+_temp)*(1.0f+_temp) - 1.0f);
-    _filteringCoeficientsTable[tableOffset+1]=(1.0f-a)*(1.0f-a)*(1.0f-a)*(1.0f-a)/(1.0f+_beta);
-    _filteringCoeficientsTable[tableOffset+2] =tau;
-
-    //memset(_progressiveSpatialConstant, 255, _filterOutput.getNBpixels());
-    for (unsigned int idColumn=0;idColumn<_filterOutput.getNBcolumns(); ++idColumn)
-        for (unsigned int idRow=0;idRow<_filterOutput.getNBrows(); ++idRow)
-        {
-            // computing local spatial constant
-            unsigned int index=idColumn+idRow*_filterOutput.getNBcolumns();
-            float localSpatialConstantValue=_a*accuracyMap[index];
-            if (localSpatialConstantValue>1)
-                localSpatialConstantValue=1;
-
-            _progressiveSpatialConstant[index]=localSpatialConstantValue;
-
-            // computing local gain
-            float localGain=(1.0f-localSpatialConstantValue)*(1.0f-localSpatialConstantValue)*(1.0f-localSpatialConstantValue)*(1.0f-localSpatialConstantValue)/(1.0f+_beta);
-            _progressiveGain[index]=localGain;
-
-            //std::cout<<commonFactor<<", "<<std::sqrt((_halfNBcolumns-1-idColumn)+(_halfNBrows-idRow-1))<<", "<<(_halfNBcolumns-1-idColumn)<<", "<<(_halfNBrows-idRow-1)<<", "<<localSpatialConstantValue<<std::endl;
-        }
-}
-
-///////////////////////////////////////////////////////////////////////
-/// Local luminance adaptation functions
-// run local adaptation filter and save result in _filterOutput
-const std::valarray<float> &BasicRetinaFilter::runFilter_LocalAdapdation(const std::valarray<float> &inputFrame, const std::valarray<float> &localLuminance)
-{
-    _localLuminanceAdaptation(get_data(inputFrame), get_data(localLuminance), &_filterOutput[0]);
-    return _filterOutput;
-}
-// run local adaptation filter at a specific output adress
-void BasicRetinaFilter::runFilter_LocalAdapdation(const std::valarray<float> &inputFrame, const std::valarray<float> &localLuminance, std::valarray<float> &outputFrame)
-{
-    _localLuminanceAdaptation(get_data(inputFrame), get_data(localLuminance), &outputFrame[0]);
-}
-// run local adaptation filter and save result in _filterOutput with autonomous low pass filtering before adaptation
-const std::valarray<float> &BasicRetinaFilter::runFilter_LocalAdapdation_autonomous(const std::valarray<float> &inputFrame)
-{
-    _spatiotemporalLPfilter(get_data(inputFrame), &_filterOutput[0]);
-    _localLuminanceAdaptation(get_data(inputFrame), &_filterOutput[0], &_filterOutput[0]);
-    return _filterOutput;
-}
-// run local adaptation filter at a specific output adress with autonomous low pass filtering before adaptation
-void BasicRetinaFilter::runFilter_LocalAdapdation_autonomous(const std::valarray<float> &inputFrame, std::valarray<float> &outputFrame)
-{
-    _spatiotemporalLPfilter(get_data(inputFrame), &_filterOutput[0]);
-    _localLuminanceAdaptation(get_data(inputFrame), &_filterOutput[0], &outputFrame[0]);
-}
-
-// local luminance adaptation of the input in regard of localLuminance buffer, the input is rewrited and becomes the output
-void BasicRetinaFilter::_localLuminanceAdaptation(float *inputOutputFrame, const float *localLuminance)
-{
-    _localLuminanceAdaptation(inputOutputFrame, localLuminance, inputOutputFrame, false);
-
-    /*    const float *localLuminancePTR=localLuminance;
-    float *inputOutputFramePTR=inputOutputFrame;
-
-    for (register unsigned int IDpixel=0 ; IDpixel<_filterOutput.getNBpixels() ; ++IDpixel, ++inputOutputFramePTR)
-    {
-        float X0=*(localLuminancePTR++)*_localLuminanceFactor+_localLuminanceAddon;
-        *(inputOutputFramePTR) = (_maxInputValue+X0)**inputOutputFramePTR/(*inputOutputFramePTR +X0+0.00000000001);
-    }
-      */
-}
-
-// local luminance adaptation of the input in regard of localLuminance buffer
-void BasicRetinaFilter::_localLuminanceAdaptation(const float *inputFrame, const float *localLuminance, float *outputFrame, const bool updateLuminanceMean)
-{
-    if (updateLuminanceMean)
-    {	float meanLuminance=0;
-        const float *luminancePTR=inputFrame;
-        for (unsigned int i=0;i<_filterOutput.getNBpixels();++i)
-            meanLuminance+=*(luminancePTR++);
-        meanLuminance/=_filterOutput.getNBpixels();
-        //float tempMeanValue=meanLuminance+_meanInputValue*_tau;
-        updateCompressionParameter(meanLuminance);
-    }
-#ifdef MAKE_PARALLEL
-        cv::parallel_for_(cv::Range(0,_filterOutput.getNBpixels()), Parallel_localAdaptation(localLuminance, inputFrame, outputFrame, _localLuminanceFactor, _localLuminanceAddon, _maxInputValue));
-#else
-    //std::cout<<meanLuminance<<std::endl;
-    const float *localLuminancePTR=localLuminance;
-    const float *inputFramePTR=inputFrame;
-    float *outputFramePTR=outputFrame;
-    for (register unsigned int IDpixel=0 ; IDpixel<_filterOutput.getNBpixels() ; ++IDpixel, ++inputFramePTR, ++outputFramePTR)
-    {
-        float X0=*(localLuminancePTR++)*_localLuminanceFactor+_localLuminanceAddon;
-        // TODO : the following line can lead to a divide by zero ! A small offset is added, take care if the offset is too large in case of High Dynamic Range images which can use very small values...
-        *(outputFramePTR) = (_maxInputValue+X0)**inputFramePTR/(*inputFramePTR +X0+0.00000000001);
-        //std::cout<<"BasicRetinaFilter::inputFrame[IDpixel]=%f, X0=%f, outputFrame[IDpixel]=%f\n", inputFrame[IDpixel], X0, outputFrame[IDpixel]);
-    }
-#endif
-}
-
-// local adaptation applied on a range of values which can be positive and negative
-void BasicRetinaFilter::_localLuminanceAdaptationPosNegValues(const float *inputFrame, const float *localLuminance, float *outputFrame)
-{
-    const float *localLuminancePTR=localLuminance;
-    const float *inputFramePTR=inputFrame;
-    float *outputFramePTR=outputFrame;
-    float factor=_maxInputValue*2.0f/(float)CV_PI;
-    for (register unsigned int IDpixel=0 ; IDpixel<_filterOutput.getNBpixels() ; ++IDpixel, ++inputFramePTR)
-    {
-        float X0=*(localLuminancePTR++)*_localLuminanceFactor+_localLuminanceAddon;
-        *(outputFramePTR++) = factor*atan(*inputFramePTR/X0);//(_maxInputValue+X0)**inputFramePTR/(*inputFramePTR +X0);
-        //std::cout<<"BasicRetinaFilter::inputFrame[IDpixel]=%f, X0=%f, outputFrame[IDpixel]=%f\n", inputFrame[IDpixel], X0, outputFrame[IDpixel]);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////
-/// Spatio temporal Low Pass filter functions
-// run LP filter and save result in the basic retina element buffer
-const std::valarray<float> &BasicRetinaFilter::runFilter_LPfilter(const std::valarray<float> &inputFrame, const unsigned int filterIndex)
-{
-    _spatiotemporalLPfilter(get_data(inputFrame), &_filterOutput[0], filterIndex);
-    return _filterOutput;
-}
-
-// run LP filter for a new frame input and save result at a specific output adress
-void BasicRetinaFilter::runFilter_LPfilter(const std::valarray<float> &inputFrame, std::valarray<float> &outputFrame, const unsigned int filterIndex)
-{
-    _spatiotemporalLPfilter(get_data(inputFrame), &outputFrame[0], filterIndex);
-}
-
-// run LP filter on the input data and rewrite it
-void BasicRetinaFilter::runFilter_LPfilter_Autonomous(std::valarray<float> &inputOutputFrame, const unsigned int filterIndex)
-{
-    unsigned int coefTableOffset=filterIndex*3;
-
-    /**********/
-    _a=_filteringCoeficientsTable[coefTableOffset];
-    _gain=_filteringCoeficientsTable[1+coefTableOffset];
-    _tau=_filteringCoeficientsTable[2+coefTableOffset];
-
-    // launch the serie of 1D directional filters in order to compute the 2D low pass filter
-    _horizontalCausalFilter(&inputOutputFrame[0], 0, _filterOutput.getNBrows());
-    _horizontalAnticausalFilter(&inputOutputFrame[0], 0, _filterOutput.getNBrows());
-    _verticalCausalFilter(&inputOutputFrame[0], 0, _filterOutput.getNBcolumns());
-    _verticalAnticausalFilter_multGain(&inputOutputFrame[0], 0, _filterOutput.getNBcolumns());
-
-}
-// run LP filter for a new frame input and save result at a specific output adress
-void BasicRetinaFilter::_spatiotemporalLPfilter(const float *inputFrame, float *outputFrame, const unsigned int filterIndex)
-{
-    unsigned int coefTableOffset=filterIndex*3;
-    /**********/
-    _a=_filteringCoeficientsTable[coefTableOffset];
-    _gain=_filteringCoeficientsTable[1+coefTableOffset];
-    _tau=_filteringCoeficientsTable[2+coefTableOffset];
-
-    // launch the serie of 1D directional filters in order to compute the 2D low pass filter
-    _horizontalCausalFilter_addInput(inputFrame, outputFrame, 0,_filterOutput.getNBrows());
-    _horizontalAnticausalFilter(outputFrame, 0, _filterOutput.getNBrows());
-    _verticalCausalFilter(outputFrame, 0, _filterOutput.getNBcolumns());
-    _verticalAnticausalFilter_multGain(outputFrame, 0, _filterOutput.getNBcolumns());
-
-}
-
-// run SQUARING LP filter for a new frame input and save result at a specific output adress
-float BasicRetinaFilter::_squaringSpatiotemporalLPfilter(const float *inputFrame, float *outputFrame, const unsigned int filterIndex)
-{
-    unsigned int coefTableOffset=filterIndex*3;
-    /**********/
-    _a=_filteringCoeficientsTable[coefTableOffset];
-    _gain=_filteringCoeficientsTable[1+coefTableOffset];
-    _tau=_filteringCoeficientsTable[2+coefTableOffset];
-
-    // launch the serie of 1D directional filters in order to compute the 2D low pass filter
-
-    _squaringHorizontalCausalFilter(inputFrame, outputFrame, 0, _filterOutput.getNBrows());
-    _horizontalAnticausalFilter(outputFrame, 0, _filterOutput.getNBrows());
-    _verticalCausalFilter(outputFrame, 0, _filterOutput.getNBcolumns());
-    return _verticalAnticausalFilter_returnMeanValue(outputFrame, 0, _filterOutput.getNBcolumns());
-}
-
-/////////////////////////////////////////////////
-// standard version of the 1D low pass filters
-
-//  horizontal causal filter which adds the input inside
-void BasicRetinaFilter::_horizontalCausalFilter(float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd)
-{
-
-
-    //#pragma omp parallel for
-    for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
-    {
-        register float* outputPTR=outputFrame+(IDrowStart+IDrow)*_filterOutput.getNBcolumns();
-        register float result=0;
-        for (unsigned int index=0; index<_filterOutput.getNBcolumns(); ++index)
-        {
-            result = *(outputPTR)+  _a* result;
-            *(outputPTR++) = result;
-        }
-    }
-}
-//  horizontal causal filter which adds the input inside
-void BasicRetinaFilter::_horizontalCausalFilter_addInput(const float *inputFrame, float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd)
-{
-#ifdef MAKE_PARALLEL
-        cv::parallel_for_(cv::Range(IDrowStart,IDrowEnd), Parallel_horizontalCausalFilter_addInput(inputFrame, outputFrame, IDrowStart, _filterOutput.getNBcolumns(), _a, _tau));
-#else
-    for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
-    {
-        register float* outputPTR=outputFrame+(IDrowStart+IDrow)*_filterOutput.getNBcolumns();
-        register const float* inputPTR=inputFrame+(IDrowStart+IDrow)*_filterOutput.getNBcolumns();
-        register float result=0;
-        for (unsigned int index=0; index<_filterOutput.getNBcolumns(); ++index)
-        {
-            result = *(inputPTR++) + _tau**(outputPTR)+  _a* result;
-            *(outputPTR++) = result;
-        }
-    }
-#endif
-}
-
-//  horizontal anticausal filter  (basic way, no add on)
-void BasicRetinaFilter::_horizontalAnticausalFilter(float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd)
-{
-
-#ifdef MAKE_PARALLEL
-        cv::parallel_for_(cv::Range(IDrowStart,IDrowEnd), Parallel_horizontalAnticausalFilter(outputFrame, IDrowEnd, _filterOutput.getNBcolumns(), _a ));
-#else
-    for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
-    {
-        register float* outputPTR=outputFrame+(IDrowEnd-IDrow)*(_filterOutput.getNBcolumns())-1;
-        register float result=0;
-        for (unsigned int index=0; index<_filterOutput.getNBcolumns(); ++index)
-        {
-            result = *(outputPTR)+  _a* result;
-            *(outputPTR--) = result;
-        }
-    }
-#endif
-}
-
-//  horizontal anticausal filter which multiplies the output by _gain
-void BasicRetinaFilter::_horizontalAnticausalFilter_multGain(float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd)
-{
-
-    //#pragma omp parallel for
-    for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
-    {
-        register float* outputPTR=outputFrame+(IDrowEnd-IDrow)*(_filterOutput.getNBcolumns())-1;
-        register float result=0;
-        for (unsigned int index=0; index<_filterOutput.getNBcolumns(); ++index)
-        {
-            result = *(outputPTR)+  _a* result;
-            *(outputPTR--) = _gain*result;
-        }
-    }
-}
-
-//  vertical anticausal filter
-void BasicRetinaFilter::_verticalCausalFilter(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd)
-{
-#ifdef MAKE_PARALLEL
-        cv::parallel_for_(cv::Range(IDcolumnStart,IDcolumnEnd), Parallel_verticalCausalFilter(outputFrame, _filterOutput.getNBrows(), _filterOutput.getNBcolumns(), _a ));
-#else
-        for (unsigned int IDcolumn=IDcolumnStart; IDcolumn<IDcolumnEnd; ++IDcolumn)
-    {
-        register float result=0;
-        register float *outputPTR=outputFrame+IDcolumn;
-
-        for (unsigned int index=0; index<_filterOutput.getNBrows(); ++index)
-        {
-            result = *(outputPTR) + _a * result;
-            *(outputPTR) = result;
-            outputPTR+=_filterOutput.getNBcolumns();
-
-        }
-    }
-#endif
-}
-
-
-//  vertical anticausal filter (basic way, no add on)
-void BasicRetinaFilter::_verticalAnticausalFilter(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd)
-{
-    float* offset=outputFrame+_filterOutput.getNBpixels()-_filterOutput.getNBcolumns();
-    //#pragma omp parallel for
-    for (unsigned int IDcolumn=IDcolumnStart; IDcolumn<IDcolumnEnd; ++IDcolumn)
-    {
-        register float result=0;
-        register float *outputPTR=offset+IDcolumn;
-
-        for (unsigned int index=0; index<_filterOutput.getNBrows(); ++index)
-        {
-            result = *(outputPTR) + _a * result;
-            *(outputPTR) = result;
-            outputPTR-=_filterOutput.getNBcolumns();
-
-        }
-    }
-}
-
-//  vertical anticausal filter which multiplies the output by _gain
-void BasicRetinaFilter::_verticalAnticausalFilter_multGain(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd)
-{
-#ifdef MAKE_PARALLEL
-        cv::parallel_for_(cv::Range(IDcolumnStart,IDcolumnEnd), Parallel_verticalAnticausalFilter_multGain(outputFrame, _filterOutput.getNBrows(), _filterOutput.getNBcolumns(), _a, _gain ));
-#else
-        float* offset=outputFrame+_filterOutput.getNBpixels()-_filterOutput.getNBcolumns();
-    //#pragma omp parallel for
-    for (unsigned int IDcolumn=IDcolumnStart; IDcolumn<IDcolumnEnd; ++IDcolumn)
-    {
-        register float result=0;
-        register float *outputPTR=offset+IDcolumn;
-
-        for (unsigned int index=0; index<_filterOutput.getNBrows(); ++index)
-        {
-            result = *(outputPTR) + _a * result;
-            *(outputPTR) = _gain*result;
-            outputPTR-=_filterOutput.getNBcolumns();
-
-        }
-    }
-#endif
-}
-
-/////////////////////////////////////////
-// specific modifications of 1D filters
-
-// -> squaring horizontal causal filter
-void BasicRetinaFilter::_squaringHorizontalCausalFilter(const float *inputFrame, float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd)
-{
-    register float* outputPTR=outputFrame+IDrowStart*_filterOutput.getNBcolumns();
-    register const float* inputPTR=inputFrame+IDrowStart*_filterOutput.getNBcolumns();
-    for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
-    {
-        register float result=0;
-        for (unsigned int index=0; index<_filterOutput.getNBcolumns(); ++index)
-        {
-            result = *(inputPTR)**(inputPTR) + _tau**(outputPTR)+  _a* result;
-            *(outputPTR++) = result;
-            ++inputPTR;
-        }
-    }
-}
-
-//  vertical anticausal filter that returns the mean value of its result
-float BasicRetinaFilter::_verticalAnticausalFilter_returnMeanValue(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd)
-{
-    register float meanValue=0;
-    float* offset=outputFrame+_filterOutput.getNBpixels()-_filterOutput.getNBcolumns();
-    for (unsigned int IDcolumn=IDcolumnStart; IDcolumn<IDcolumnEnd; ++IDcolumn)
-    {
-        register float result=0;
-        register float *outputPTR=offset+IDcolumn;
-
-        for (unsigned int index=0; index<_filterOutput.getNBrows(); ++index)
-        {
-            result = *(outputPTR) + _a * result;
-            *(outputPTR) = _gain*result;
-            meanValue+=*(outputPTR);
-            outputPTR-=_filterOutput.getNBcolumns();
-
-        }
-    }
-
-    return meanValue/(float)_filterOutput.getNBpixels();
-}
-
-// LP filter with integration in specific areas (regarding true values of a binary parameters image)
-void BasicRetinaFilter::_localSquaringSpatioTemporalLPfilter(const float *inputFrame, float *LPfilterOutput, const unsigned int *integrationAreas, const unsigned int filterIndex)
-{
-    unsigned int coefTableOffset=filterIndex*3;
-    _a=_filteringCoeficientsTable[coefTableOffset+0];
-    _gain=_filteringCoeficientsTable[coefTableOffset+1];
-    _tau=_filteringCoeficientsTable[coefTableOffset+2];
-    // launch the serie of 1D directional filters in order to compute the 2D low pass filter
-
-    _local_squaringHorizontalCausalFilter(inputFrame, LPfilterOutput, 0, _filterOutput.getNBrows(), integrationAreas);
-    _local_horizontalAnticausalFilter(LPfilterOutput, 0, _filterOutput.getNBrows(), integrationAreas);
-    _local_verticalCausalFilter(LPfilterOutput, 0, _filterOutput.getNBcolumns(), integrationAreas);
-    _local_verticalAnticausalFilter_multGain(LPfilterOutput, 0, _filterOutput.getNBcolumns(), integrationAreas);
-
-}
-
-// LP filter on specific parts of the picture instead of all the image
-// same functions (some of them) but take a binary flag to allow integration, false flag means, no data change at the output...
-
-// this function take an image in input and squares it befor computing
-void BasicRetinaFilter::_local_squaringHorizontalCausalFilter(const float *inputFrame, float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd, const unsigned int *integrationAreas)
-{
-    register float* outputPTR=outputFrame+IDrowStart*_filterOutput.getNBcolumns();
-    register const float* inputPTR=inputFrame+IDrowStart*_filterOutput.getNBcolumns();
-    const unsigned int *integrationAreasPTR=integrationAreas;
-    for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
-    {
-        register float result=0;
-        for (unsigned int index=0; index<_filterOutput.getNBcolumns(); ++index)
-        {
-            if (*(integrationAreasPTR++))
-                result = *(inputPTR)**(inputPTR) + _tau**(outputPTR)+  _a* result;
-            else
-                result=0;
-            *(outputPTR++) = result;
-            ++inputPTR;
-
-        }
-    }
-}
-
-void BasicRetinaFilter::_local_horizontalAnticausalFilter(float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd, const unsigned int *integrationAreas)
-{
-
-    register float* outputPTR=outputFrame+IDrowEnd*(_filterOutput.getNBcolumns())-1;
-    const unsigned int *integrationAreasPTR=integrationAreas;
-
-    for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
-    {
-        register float result=0;
-        for (unsigned int index=0; index<_filterOutput.getNBcolumns(); ++index)
-        {
-            if (*(integrationAreasPTR++))
-                result = *(outputPTR)+  _a* result;
-            else
-                result=0;
-            *(outputPTR--) = result;
-        }
-    }
-
-}
-
-void BasicRetinaFilter::_local_verticalCausalFilter(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd, const unsigned int *integrationAreas)
-{
-    const unsigned int *integrationAreasPTR=integrationAreas;
-
-    for (unsigned int IDcolumn=IDcolumnStart; IDcolumn<IDcolumnEnd; ++IDcolumn)
-    {
-        register float result=0;
-        register float *outputPTR=outputFrame+IDcolumn;
-
-        for (unsigned int index=0; index<_filterOutput.getNBrows(); ++index)
-        {
-            if (*(integrationAreasPTR++))
-                result = *(outputPTR)+  _a* result;
-            else
-                result=0;
-            *(outputPTR) = result;
-            outputPTR+=_filterOutput.getNBcolumns();
-
-        }
-    }
-}
-// this functions affects _gain at the output
-void BasicRetinaFilter::_local_verticalAnticausalFilter_multGain(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd, const unsigned int *integrationAreas)
-{
-    const unsigned int *integrationAreasPTR=integrationAreas;
-    float* offset=outputFrame+_filterOutput.getNBpixels()-_filterOutput.getNBcolumns();
-
-    for (unsigned int IDcolumn=IDcolumnStart; IDcolumn<IDcolumnEnd; ++IDcolumn)
-    {
-        register float result=0;
-        register float *outputPTR=offset+IDcolumn;
-
-        for (unsigned int index=0; index<_filterOutput.getNBrows(); ++index)
-        {
-            if (*(integrationAreasPTR++))
-                result = *(outputPTR)+  _a* result;
-            else
-                result=0;
-            *(outputPTR) = _gain*result;
-            outputPTR-=_filterOutput.getNBcolumns();
-
-        }
-    }
-}
-
-////////////////////////////////////////////////////
-// run LP filter for a new frame input and save result at a specific output adress
-// -> USE IRREGULAR SPATIAL CONSTANT
-
-// irregular filter computed from a buffer and rewrites it
-void BasicRetinaFilter::_spatiotemporalLPfilter_Irregular(float *inputOutputFrame, const unsigned int filterIndex)
-{
-    if (_progressiveGain.size()==0)
-    {
-        std::cerr<<"BasicRetinaFilter::runProgressiveFilter: cannot perform filtering, no progressive filter settled up"<<std::endl;
-        return;
-    }
-    unsigned int coefTableOffset=filterIndex*3;
-    /**********/
-    //_a=_filteringCoeficientsTable[coefTableOffset];
-    _tau=_filteringCoeficientsTable[2+coefTableOffset];
-
-    // launch the serie of 1D directional filters in order to compute the 2D low pass filter
-    _horizontalCausalFilter_Irregular(inputOutputFrame, 0, (int)_filterOutput.getNBrows());
-    _horizontalAnticausalFilter_Irregular(inputOutputFrame, 0, (int)_filterOutput.getNBrows(), &_progressiveSpatialConstant[0]);
-    _verticalCausalFilter_Irregular(inputOutputFrame, 0, (int)_filterOutput.getNBcolumns(), &_progressiveSpatialConstant[0]);
-    _verticalAnticausalFilter_Irregular_multGain(inputOutputFrame, 0, (int)_filterOutput.getNBcolumns());
-
-}
-// irregular filter computed from a buffer and puts result on another
-void BasicRetinaFilter::_spatiotemporalLPfilter_Irregular(const float *inputFrame, float *outputFrame, const unsigned int filterIndex)
-{
-    if (_progressiveGain.size()==0)
-    {
-        std::cerr<<"BasicRetinaFilter::runProgressiveFilter: cannot perform filtering, no progressive filter settled up"<<std::endl;
-        return;
-    }
-    unsigned int coefTableOffset=filterIndex*3;
-    /**********/
-    //_a=_filteringCoeficientsTable[coefTableOffset];
-    _tau=_filteringCoeficientsTable[2+coefTableOffset];
-
-    // launch the serie of 1D directional filters in order to compute the 2D low pass filter
-    _horizontalCausalFilter_Irregular_addInput(inputFrame, outputFrame, 0, (int)_filterOutput.getNBrows());
-    _horizontalAnticausalFilter_Irregular(outputFrame, 0, (int)_filterOutput.getNBrows(), &_progressiveSpatialConstant[0]);
-    _verticalCausalFilter_Irregular(outputFrame, 0, (int)_filterOutput.getNBcolumns(), &_progressiveSpatialConstant[0]);
-    _verticalAnticausalFilter_Irregular_multGain(outputFrame, 0, (int)_filterOutput.getNBcolumns());
-
-}
-// 1D filters with irregular spatial constant
-//  horizontal causal filter wich runs on its input buffer
-void BasicRetinaFilter::_horizontalCausalFilter_Irregular(float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd)
-{
-    register float* outputPTR=outputFrame+IDrowStart*_filterOutput.getNBcolumns();
-    register const float* spatialConstantPTR=&_progressiveSpatialConstant[0]+IDrowStart*_filterOutput.getNBcolumns();
-    for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
-    {
-        register float result=0;
-        for (unsigned int index=0; index<_filterOutput.getNBcolumns(); ++index)
-        {
-            result = *(outputPTR)+  *(spatialConstantPTR++)* result;
-            *(outputPTR++) = result;
-        }
-    }
-}
-
-// horizontal causal filter with add input
-void BasicRetinaFilter::_horizontalCausalFilter_Irregular_addInput(const float *inputFrame, float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd)
-{
-    register float* outputPTR=outputFrame+IDrowStart*_filterOutput.getNBcolumns();
-    register const float* inputPTR=inputFrame+IDrowStart*_filterOutput.getNBcolumns();
-    register const float* spatialConstantPTR=&_progressiveSpatialConstant[0]+IDrowStart*_filterOutput.getNBcolumns();
-    for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
-    {
-        register float result=0;
-        for (unsigned int index=0; index<_filterOutput.getNBcolumns(); ++index)
-        {
-            result = *(inputPTR++) + _tau**(outputPTR)+  *(spatialConstantPTR++)* result;
-            *(outputPTR++) = result;
-        }
-    }
-
-}
-
-//  horizontal anticausal filter  (basic way, no add on)
-void BasicRetinaFilter::_horizontalAnticausalFilter_Irregular(float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd, const float *spatialConstantBuffer)
-{
-#ifdef MAKE_PARALLEL
-        cv::parallel_for_(cv::Range(IDrowStart,IDrowEnd), Parallel_horizontalAnticausalFilter_Irregular(outputFrame, spatialConstantBuffer, IDrowEnd, _filterOutput.getNBcolumns()));
-#else
-    register float* outputPTR=outputFrame+IDrowEnd*(_filterOutput.getNBcolumns())-1;
-    register const float* spatialConstantPTR=spatialConstantBuffer+IDrowEnd*(_filterOutput.getNBcolumns())-1;
-
-    for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
-    {
-        register float result=0;
-        for (unsigned int index=0; index<_filterOutput.getNBcolumns(); ++index)
-        {
-            result = *(outputPTR)+  *(spatialConstantPTR--)* result;
-            *(outputPTR--) = result;
-        }
-    }
-#endif
-
-}
-
-//  vertical anticausal filter
-void BasicRetinaFilter::_verticalCausalFilter_Irregular(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd, const float *spatialConstantBuffer)
-{
-#ifdef MAKE_PARALLEL
-        cv::parallel_for_(cv::Range(IDcolumnStart,IDcolumnEnd), Parallel_verticalCausalFilter_Irregular(outputFrame, spatialConstantBuffer, _filterOutput.getNBrows(), _filterOutput.getNBcolumns()));
-#else
-    for (unsigned int IDcolumn=IDcolumnStart; IDcolumn<IDcolumnEnd; ++IDcolumn)
-    {
-        register float result=0;
-        register float *outputPTR=outputFrame+IDcolumn;
-        register const float *spatialConstantPTR=spatialConstantBuffer+IDcolumn;
-        for (unsigned int index=0; index<_filterOutput.getNBrows(); ++index)
-        {
-            result = *(outputPTR) + *(spatialConstantPTR) * result;
-            *(outputPTR) = result;
-            outputPTR+=_filterOutput.getNBcolumns();
-            spatialConstantPTR+=_filterOutput.getNBcolumns();
-        }
-    }
-#endif
-}
-
-//  vertical anticausal filter which multiplies the output by _gain
-void BasicRetinaFilter::_verticalAnticausalFilter_Irregular_multGain(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd)
-{
-    float* outputOffset=outputFrame+_filterOutput.getNBpixels()-_filterOutput.getNBcolumns();
-    const float* constantOffset=&_progressiveSpatialConstant[0]+_filterOutput.getNBpixels()-_filterOutput.getNBcolumns();
-    const float* gainOffset=&_progressiveGain[0]+_filterOutput.getNBpixels()-_filterOutput.getNBcolumns();
-    for (unsigned int IDcolumn=IDcolumnStart; IDcolumn<IDcolumnEnd; ++IDcolumn)
-    {
-        register float result=0;
-        register float *outputPTR=outputOffset+IDcolumn;
-        register const float *spatialConstantPTR=constantOffset+IDcolumn;
-        register const float *progressiveGainPTR=gainOffset+IDcolumn;
-        for (unsigned int index=0; index<_filterOutput.getNBrows(); ++index)
-        {
-            result = *(outputPTR) + *(spatialConstantPTR) * result;
-            *(outputPTR) = *(progressiveGainPTR)*result;
-            outputPTR-=_filterOutput.getNBcolumns();
-            spatialConstantPTR-=_filterOutput.getNBcolumns();
-            progressiveGainPTR-=_filterOutput.getNBcolumns();
-        }
-    }
-
-}
-}// end of namespace bioinspired
-}// end of namespace cv
diff --git a/modules/bioinspired/src/basicretinafilter.hpp b/modules/bioinspired/src/basicretinafilter.hpp
deleted file mode 100644
index 323bff940..000000000
--- a/modules/bioinspired/src/basicretinafilter.hpp
+++ /dev/null
@@ -1,657 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-#ifndef BASICRETINAELEMENT_HPP_
-#define BASICRETINAELEMENT_HPP_
-
-#include <cstring>
-
-
-/**
-* @class BasicRetinaFilter
-* @brief Brief overview, this class provides tools for low level image processing:
-* --> this class is able to perform:
-* -> first order Low pass optimized filtering
-* -> local luminance adaptation (able to correct back light problems and contrast enhancement)
-* -> progressive low pass filter filtering (higher filtering on the borders than on the center)
-* -> image data between 0 and 255 resampling with different options, linear rescaling, sigmoide)
-*
-* NOTE : initially the retina model was based on double format scalar values but
-* a good memory/precision compromise is float...
-* also the double format precision does not make so much sense from a biological point of view (neurons value coding is not so precise)
-*
-* TYPICAL USE:
-*
-* // create object at a specified picture size
-* BasicRetinaFilter *_photoreceptorsPrefilter;
-* _photoreceptorsPrefilter =new BasicRetinaFilter(sizeRows, sizeWindows);
-*
-* // init gain, spatial and temporal parameters:
-* _photoreceptorsPrefilter->setCoefficientsTable(gain,temporalConstant, spatialConstant);
-*
-* // during program execution, call the filter for local luminance correction or low pass filtering for an input picture called "FrameBuffer":
-* _photoreceptorsPrefilter->runFilter_LocalAdapdation(FrameBuffer);
-* // or (Low pass first order filter)
-* _photoreceptorsPrefilter->runFilter_LPfilter(FrameBuffer);
-* // get output frame and its size:
-* const unsigned int output_nbRows=_photoreceptorsPrefilter->getNBrows();
-* const unsigned int output_nbColumns=_photoreceptorsPrefilter->getNBcolumns();
-* const double *outputFrame=_photoreceptorsPrefilter->getOutput();
-*
-* // at the end of the program, destroy object:
-* delete _photoreceptorsPrefilter;
-
-* @author Alexandre BENOIT, benoit.alexandre.vision@gmail.com, LISTIC : www.listic.univ-savoie.fr, Gipsa-Lab, France: www.gipsa-lab.inpg.fr/
-* Creation date 2007
-* synthesis of the work described in Alexandre BENOIT thesis: "Le systeme visuel humain au secours de la vision par ordinateur"
-*/
-
-#include <iostream>
-#include "templatebuffer.hpp"
-
-//#define __BASIC_RETINA_ELEMENT_DEBUG
-
-namespace cv
-{
-namespace bioinspired
-{
-    class BasicRetinaFilter
-    {
-    public:
-
-        /**
-        * constructor of the base bio-inspired toolbox, parameters are only linked to imae input size and number of filtering capabilities of the object
-        * @param NBrows: number of rows of the input image
-        * @param NBcolumns: number of columns of the input image
-        * @param parametersListSize: specifies the number of parameters set (each parameters set represents a specific low pass spatio-temporal filter)
-        * @param useProgressiveFilter: specifies if the filter has irreguar (progressive) filtering capabilities (this can be activated later using setProgressiveFilterConstants_xxx methods)
-        */
-        BasicRetinaFilter(const unsigned int NBrows, const unsigned int NBcolumns, const unsigned int parametersListSize=1, const bool useProgressiveFilter=false);
-
-        /**
-        * standrad destructore
-        */
-        ~BasicRetinaFilter();
-
-        /**
-        * function which clears the output buffer of the object
-        */
-        inline void clearOutputBuffer(){_filterOutput=0;};
-
-        /**
-        * function which clears the secondary buffer of the object
-        */
-        inline void clearSecondaryBuffer(){_localBuffer=0;};
-
-        /**
-        * function which clears the output and the secondary buffer of the object
-        */
-        inline void clearAllBuffers(){clearOutputBuffer();clearSecondaryBuffer();};
-
-        /**
-        * resize basic retina filter object (resize all allocated buffers
-        * @param NBrows: the new height size
-        * @param NBcolumns: the new width size
-        */
-        void resize(const unsigned int NBrows, const unsigned int NBcolumns);
-
-        /**
-        * forbiden method inherited from parent std::valarray
-        * prefer not to use this method since the filter matrix become vectors
-        */
-        void resize(const unsigned int){std::cerr<<"error, not accessible method"<<std::endl;};
-
-        /**
-        *  low pass filter call and run (models the homogeneous cells network at the retina level, for example horizontal cells or photoreceptors)
-        * @param inputFrame: the input image to be processed
-        * @param filterIndex: the offset which specifies the parameter set that should be used for the filtering
-        * @return the processed image, the output is reachable later by using function getOutput()
-        */
-        const std::valarray<float> &runFilter_LPfilter(const std::valarray<float> &inputFrame, const unsigned int filterIndex=0); // run the LP filter for a new frame input and save result in _filterOutput
-
-        /**
-        * low pass filter call and run (models the homogeneous cells network at the retina level, for example horizontal cells or photoreceptors)
-        * @param inputFrame: the input image to be processed
-        * @param outputFrame: the output buffer in which the result is writed
-        * @param filterIndex: the offset which specifies the parameter set that should be used for the filtering
-        */
-        void runFilter_LPfilter(const std::valarray<float> &inputFrame, std::valarray<float> &outputFrame, const unsigned int filterIndex=0); // run LP filter on a specific output adress
-
-        /**
-        *  low pass filter call and run (models the homogeneous cells network at the retina level, for example horizontal cells or photoreceptors)
-        * @param inputOutputFrame: the input image to be processed on which the result is rewrited
-        * @param filterIndex: the offset which specifies the parameter set that should be used for the filtering
-        */
-        void runFilter_LPfilter_Autonomous(std::valarray<float> &inputOutputFrame, const unsigned int filterIndex=0);// run LP filter on the input data and rewrite it
-
-        /**
-        *  local luminance adaptation call and run (contrast enhancement property of the photoreceptors)
-        * @param inputOutputFrame: the input image to be processed
-        * @param localLuminance: an image which represents the local luminance of the inputFrame parameter, in general, it is its low pass spatial filtering
-        * @return the processed image, the output is reachable later by using function getOutput()
-        */
-        const std::valarray<float> &runFilter_LocalAdapdation(const std::valarray<float> &inputOutputFrame, const std::valarray<float> &localLuminance);// run local adaptation filter and save result in _filterOutput
-
-        /**
-        *  local luminance adaptation call and run (contrast enhancement property of the photoreceptors)
-        * @param inputFrame: the input image to be processed
-        * @param localLuminance: an image which represents the local luminance of the inputFrame parameter, in general, it is its low pass spatial filtering
-        * @param outputFrame: the output buffer in which the result is writed
-        */
-        void runFilter_LocalAdapdation(const std::valarray<float> &inputFrame, const std::valarray<float> &localLuminance, std::valarray<float> &outputFrame); // run local adaptation filter on a specific output adress
-
-        /**
-        *  local luminance adaptation call and run (contrast enhancement property of the photoreceptors)
-        * @param inputFrame: the input image to be processed
-        * @return the processed image, the output is reachable later by using function getOutput()
-        */
-        const std::valarray<float> &runFilter_LocalAdapdation_autonomous(const std::valarray<float> &inputFrame);// run local adaptation filter and save result in _filterOutput
-
-        /**
-        *  local luminance adaptation call and run (contrast enhancement property of the photoreceptors)
-        * @param inputFrame: the input image to be processed
-        * @param outputFrame: the output buffer in which the result is writen
-        */
-        void runFilter_LocalAdapdation_autonomous(const std::valarray<float> &inputFrame, std::valarray<float> &outputFrame); // run local adaptation filter on a specific output adress
-
-        /**
-        * run low pass filtering with progressive parameters (models the retina log sampling of the photoreceptors and its low pass filtering effect consequence: more powerfull low pass filtering effect on the corners)
-        * @param inputFrame: the input image to be processed
-        * @param filterIndex: the index which specifies the parameter set that should be used for the filtering
-        * @return the processed image, the output is reachable later by using function getOutput() if outputFrame is NULL
-        */
-        inline void runProgressiveFilter(std::valarray<float> &inputFrame, const unsigned int filterIndex=0){_spatiotemporalLPfilter_Irregular(&inputFrame[0], filterIndex);};
-
-        /**
-        * run low pass filtering with progressive parameters (models the retina log sampling of the photoreceptors and its low pass filtering effect consequence: more powerfull low pass filtering effect on the corners)
-        * @param inputFrame: the input image to be processed
-        * @param outputFrame: the output buffer in which the result is writen
-        * @param filterIndex: the index which specifies the parameter set that should be used for the filtering
-        */
-        inline void runProgressiveFilter(const std::valarray<float> &inputFrame,
-            std::valarray<float> &outputFrame,
-            const unsigned int filterIndex=0)
-        {_spatiotemporalLPfilter_Irregular(get_data(inputFrame), &outputFrame[0], filterIndex);};
-
-        /**
-        * first order spatio-temporal low pass filter setup function
-        * @param beta: gain of the filter (generally set to zero)
-        * @param tau: time constant of the filter (unit is frame for video processing)
-        * @param k: spatial constant of the filter (unit is pixels)
-        * @param filterIndex: the index which specifies the parameter set that should be used for the filtering
-        */
-        void setLPfilterParameters(const float beta, const float tau, const float k, const unsigned int filterIndex=0); // change the parameters of the filter
-
-        /**
-        * first order spatio-temporal low pass filter setup function
-        * @param beta: gain of the filter (generally set to zero)
-        * @param tau: time constant of the filter (unit is frame for video processing)
-        * @param alpha0: spatial constant of the filter (unit is pixels) on the border of the image
-        * @param filterIndex: the index which specifies the parameter set that should be used for the filtering
-        */
-        void setProgressiveFilterConstants_CentredAccuracy(const float beta, const float tau, const float alpha0, const unsigned int filterIndex=0);
-
-        /**
-        * first order spatio-temporal low pass filter setup function
-        * @param beta: gain of the filter (generally set to zero)
-        * @param tau: time constant of the filter (unit is frame for video processing)
-        * @param alpha0: spatial constant of the filter (unit is pixels) on the border of the image
-        * @param accuracyMap an image (float format) which values range is between 0 and 1, where 0 means, apply no filtering and 1 means apply the filtering as specified in the parameters set, intermediate values allow to smooth variations of the filtering strenght
-        * @param filterIndex: the index which specifies the parameter set that should be used for the filtering
-        */
-        void setProgressiveFilterConstants_CustomAccuracy(const float beta, const float tau, const float alpha0, const std::valarray<float> &accuracyMap, const unsigned int filterIndex=0);
-
-        /**
-        * local luminance adaptation setup, this function should be applied for normal local adaptation (not for tone mapping operation)
-        * @param v0: compression effect for the local luminance adaptation processing, set a value between 0.6 and 0.9 for best results, a high value yields to a high compression effect
-        * @param maxInputValue: the maximum amplitude value measured after local adaptation processing (c.f. function runFilter_LocalAdapdation & runFilter_LocalAdapdation_autonomous)
-        * @param meanLuminance: the a priori meann luminance of the input data (should be 128 for 8bits images but can vary greatly in case of High Dynamic Range Images (HDRI)
-        */
-        void setV0CompressionParameter(const float v0, const float maxInputValue, const float){ _v0=v0*maxInputValue; _localLuminanceFactor=v0; _localLuminanceAddon=maxInputValue*(1.0f-v0); _maxInputValue=maxInputValue;};
-
-        /**
-        * update local luminance adaptation setup, initial maxInputValue is kept. This function should be applied for normal local adaptation (not for tone mapping operation)
-        * @param v0: compression effect for the local luminance adaptation processing, set a value between 0.6 and 0.9 for best results, a high value yields to a high compression effect
-        * @param meanLuminance: the a priori meann luminance of the input data (should be 128 for 8bits images but can vary greatly in case of High Dynamic Range Images (HDRI)
-        */
-        void setV0CompressionParameter(const float v0, const float meanLuminance){ this->setV0CompressionParameter(v0, _maxInputValue, meanLuminance);};
-
-        /**
-        * local luminance adaptation setup, this function should be applied for normal local adaptation (not for tone mapping operation)
-        * @param v0: compression effect for the local luminance adaptation processing, set a value between 0.6 and 0.9 for best results, a high value yields to a high compression effect
-        */
-        void setV0CompressionParameter(const float v0){ _v0=v0*_maxInputValue; _localLuminanceFactor=v0; _localLuminanceAddon=_maxInputValue*(1.0f-v0);};
-
-        /**
-        * local luminance adaptation setup, this function should be applied for local adaptation applied to tone mapping operation
-        * @param v0: compression effect for the local luminance adaptation processing, set a value between 0.6 and 0.9 for best results, a high value yields to a high compression effect
-        * @param maxInputValue: the maximum amplitude value measured after local adaptation processing (c.f. function runFilter_LocalAdapdation & runFilter_LocalAdapdation_autonomous)
-        * @param meanLuminance: the a priori meann luminance of the input data (should be 128 for 8bits images but can vary greatly in case of High Dynamic Range Images (HDRI)
-        */
-        void setV0CompressionParameterToneMapping(const float v0, const float maxInputValue, const float meanLuminance=128.0f){ _v0=v0*maxInputValue; _localLuminanceFactor=1.0f; _localLuminanceAddon=meanLuminance*v0; _maxInputValue=maxInputValue;};
-
-        /**
-        * update compression parameters while keeping v0 parameter value
-        * @param meanLuminance the input frame mean luminance
-        */
-        inline void updateCompressionParameter(const float meanLuminance){_localLuminanceFactor=1; _localLuminanceAddon=meanLuminance*_v0;};
-
-        /**
-        * @return the v0 compression parameter used to compute the local adaptation
-        */
-        float getV0CompressionParameter(){ return _v0/_maxInputValue;};
-
-        /**
-        * @return the output result of the object
-        */
-        inline const std::valarray<float> &getOutput() const {return _filterOutput;};
-
-        /**
-        * @return number of rows of the filter
-        */
-        inline unsigned int getNBrows(){return _filterOutput.getNBrows();};
-
-        /**
-        * @return number of columns of the filter
-        */
-        inline unsigned int getNBcolumns(){return _filterOutput.getNBcolumns();};
-
-        /**
-        * @return number of pixels of the filter
-        */
-        inline unsigned int getNBpixels(){return _filterOutput.getNBpixels();};
-
-        /**
-        * force filter output to be normalized between 0 and maxValue
-        * @param maxValue: the maximum output value that is required
-        */
-        inline void normalizeGrayOutput_0_maxOutputValue(const float maxValue){_filterOutput.normalizeGrayOutput_0_maxOutputValue(maxValue);};
-
-        /**
-        * force filter output to be normalized around 0 and rescaled with a sigmoide effect (extrem values saturation)
-        * @param maxValue: the maximum output value that is required
-        */
-        inline void normalizeGrayOutputCentredSigmoide(){_filterOutput.normalizeGrayOutputCentredSigmoide();};
-
-        /**
-        * force filter output to be normalized : data centering and std normalisation
-        * @param maxValue: the maximum output value that is required
-        */
-        inline void centerReductImageLuminance(){_filterOutput.centerReductImageLuminance();};
-
-        /**
-        * @return the maximum input buffer value
-        */
-        inline float getMaxInputValue(){return this->_maxInputValue;};
-
-        /**
-        * @return the maximum input buffer value
-        */
-        inline void setMaxInputValue(const float newMaxInputValue){this->_maxInputValue=newMaxInputValue;};
-
-    protected:
-
-        /////////////////////////
-        // data buffers
-        TemplateBuffer<float> _filterOutput; // primary buffer (contains processing outputs)
-        std::valarray<float> _localBuffer; // local secondary buffer
-        /////////////////////////
-        // PARAMETERS
-        unsigned int _halfNBrows;
-        unsigned int _halfNBcolumns;
-
-        // parameters buffers
-        std::valarray <float>_filteringCoeficientsTable;
-        std::valarray <float>_progressiveSpatialConstant;// pointer to a local table containing local spatial constant (allocated with the object)
-        std::valarray <float>_progressiveGain;// pointer to a local table containing local spatial constant (allocated with the object)
-
-        // local adaptation filtering parameters
-        float _v0; //value used for local luminance adaptation function
-        float _maxInputValue;
-        float _meanInputValue;
-        float _localLuminanceFactor;
-        float _localLuminanceAddon;
-
-        // protected data related to standard low pass filters parameters
-        float _a;
-        float _tau;
-        float _gain;
-
-        /////////////////////////
-        // FILTERS METHODS
-
-        // Basic low pass spation temporal low pass filter used by each retina filters
-        void _spatiotemporalLPfilter(const float *inputFrame, float *LPfilterOutput, const unsigned int coefTableOffset=0);
-        float _squaringSpatiotemporalLPfilter(const float *inputFrame, float *outputFrame, const unsigned int filterIndex=0);
-
-        // LP filter with an irregular spatial filtering
-
-        // -> rewrites the input buffer
-        void _spatiotemporalLPfilter_Irregular(float *inputOutputFrame, const unsigned int filterIndex=0);
-        // writes the output on another buffer
-        void _spatiotemporalLPfilter_Irregular(const float *inputFrame, float *outputFrame, const unsigned int filterIndex=0);
-        // LP filter that squares the input and computes the output ONLY on the areas where the integrationAreas map are TRUE
-        void _localSquaringSpatioTemporalLPfilter(const float *inputFrame, float *LPfilterOutput, const unsigned int *integrationAreas, const unsigned int filterIndex=0);
-
-        // local luminance adaptation of the input in regard of localLuminance buffer
-        void _localLuminanceAdaptation(const float *inputFrame, const float *localLuminance, float *outputFrame, const bool updateLuminanceMean=true);
-        // local luminance adaptation of the input in regard of localLuminance buffer, the input is rewrited and becomes the output
-        void _localLuminanceAdaptation(float *inputOutputFrame, const float *localLuminance);
-        // local adaptation applied on a range of values which can be positive and negative
-        void _localLuminanceAdaptationPosNegValues(const float *inputFrame, const float *localLuminance, float *outputFrame);
-
-
-        //////////////////////////////////////////////////////////////
-        // 1D directional filters used for the 2D low pass filtering
-
-        // 1D filters with image input
-        void _horizontalCausalFilter_addInput(const float *inputFrame, float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd);
-        // 1D filters  with image input that is squared in the function // parallelized with TBB
-        void _squaringHorizontalCausalFilter(const float *inputFrame, float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd);
-        //  vertical anticausal filter that returns the mean value of its result
-        float _verticalAnticausalFilter_returnMeanValue(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd);
-
-        // most simple functions: only perform 1D filtering with output=input (no add on)
-        void _horizontalCausalFilter(float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd);
-        void _horizontalAnticausalFilter(float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd);     // parallelized with TBB
-        void _verticalCausalFilter(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd);     // parallelized with TBB
-        void _verticalAnticausalFilter(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd);
-
-        // perform 1D filtering with output with varrying spatial coefficient
-        void _horizontalCausalFilter_Irregular(float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd);
-        void _horizontalCausalFilter_Irregular_addInput(const float *inputFrame, float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd);
-        void _horizontalAnticausalFilter_Irregular(float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd, const float *spatialConstantBuffer);   // parallelized with TBB
-        void _verticalCausalFilter_Irregular(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd, const float *spatialConstantBuffer);   // parallelized with TBB
-        void _verticalAnticausalFilter_Irregular_multGain(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd);
-
-
-        // 1D filters in which the output is multiplied by _gain
-        void _verticalAnticausalFilter_multGain(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd); // this functions affects _gain at the output // parallelized with TBB
-        void _horizontalAnticausalFilter_multGain(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd); // this functions affects _gain at the output
-
-        // LP filter on specific parts of the picture instead of all the image
-        // same functions (some of them) but take a binary flag to allow integration, false flag means, 0 at the output...
-        void _local_squaringHorizontalCausalFilter(const float *inputFrame, float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd, const unsigned int *integrationAreas);
-        void _local_horizontalAnticausalFilter(float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd, const unsigned int *integrationAreas);
-        void _local_verticalCausalFilter(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd, const unsigned int *integrationAreas);
-        void _local_verticalAnticausalFilter_multGain(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd, const unsigned int *integrationAreas); // this functions affects _gain at the output
-
-#ifdef MAKE_PARALLEL
-        /******************************************************
-        ** IF some parallelizing thread methods are available, then, main loops are parallelized using these functors
-        ** ==> main idea paralellise main filters loops, then, only the most used methods are parallelized... TODO : increase the number of parallelised methods as necessary
-        ** ==> functors names = Parallel_$$$ where $$$= the name of the serial method that is parallelised
-        ** ==> functors constructors can differ from the parameters used with their related serial functions
-        */
-
-#define _DEBUG_TBB // define DEBUG_TBB in order to display additionnal data on stdout
-        class Parallel_horizontalAnticausalFilter: public cv::ParallelLoopBody
-        {
-        private:
-            float *outputFrame;
-            unsigned int IDrowEnd, nbColumns;
-            float filterParam_a;
-        public:
-            // constructor which takes the input image pointer reference reference and limits
-            Parallel_horizontalAnticausalFilter(float *bufferToProcess, const unsigned int idEnd, const unsigned int nbCols, const float a )
-                :outputFrame(bufferToProcess), IDrowEnd(idEnd), nbColumns(nbCols), filterParam_a(a)
-            {
-#ifdef DEBUG_TBB
-                std::cout<<"Parallel_horizontalAnticausalFilter::Parallel_horizontalAnticausalFilter :"
-                    <<"\n\t idEnd="<<IDrowEnd
-                    <<"\n\t nbCols="<<nbColumns
-                    <<"\n\t filterParam="<<filterParam_a
-                    <<std::endl;
-#endif
-            }
-
-            virtual void operator()( const Range& r ) const {
-
-#ifdef DEBUG_TBB
-                std::cout<<"Parallel_horizontalAnticausalFilter::operator() :"
-                    <<"\n\t range size="<<r.size()
-                    <<"\n\t first index="<<r.start
-                    //<<"\n\t last index="<<filterParam
-                    <<std::endl;
-#endif
-                for (int IDrow=r.start; IDrow!=r.end; ++IDrow)
-                {
-                    register float* outputPTR=outputFrame+(IDrowEnd-IDrow)*(nbColumns)-1;
-                    register float result=0;
-                    for (unsigned int index=0; index<nbColumns; ++index)
-                    {
-                        result = *(outputPTR)+  filterParam_a* result;
-                        *(outputPTR--) = result;
-                    }
-                }
-            }
-        };
-
-        class Parallel_horizontalCausalFilter_addInput: public cv::ParallelLoopBody
-        {
-        private:
-            const float *inputFrame;
-            float *outputFrame;
-            unsigned int IDrowStart, nbColumns;
-            float filterParam_a, filterParam_tau;
-        public:
-            Parallel_horizontalCausalFilter_addInput(const float *bufferToAddAsInputProcess, float *bufferToProcess, const unsigned int idStart, const unsigned int nbCols,  const float a,  const float tau)
-                :inputFrame(bufferToAddAsInputProcess), outputFrame(bufferToProcess), IDrowStart(idStart), nbColumns(nbCols), filterParam_a(a), filterParam_tau(tau){}
-
-            virtual void operator()( const Range& r ) const {
-                for (int IDrow=r.start; IDrow!=r.end; ++IDrow)
-                {
-                    register float* outputPTR=outputFrame+(IDrowStart+IDrow)*nbColumns;
-                    register const float* inputPTR=inputFrame+(IDrowStart+IDrow)*nbColumns;
-                    register float result=0;
-                    for (unsigned int index=0; index<nbColumns; ++index)
-                    {
-                        result = *(inputPTR++) + filterParam_tau**(outputPTR)+  filterParam_a* result;
-                        *(outputPTR++) = result;
-                    }
-                }
-            }
-        };
-
-        class Parallel_verticalCausalFilter: public cv::ParallelLoopBody
-        {
-        private:
-            float *outputFrame;
-            unsigned int nbRows, nbColumns;
-            float filterParam_a;
-        public:
-            Parallel_verticalCausalFilter(float *bufferToProcess, const unsigned int nbRws, const unsigned int nbCols, const float a )
-                :outputFrame(bufferToProcess), nbRows(nbRws), nbColumns(nbCols), filterParam_a(a){}
-
-            virtual void operator()( const Range& r ) const {
-                for (int IDcolumn=r.start; IDcolumn!=r.end; ++IDcolumn)
-                {
-                    register float result=0;
-                    register float *outputPTR=outputFrame+IDcolumn;
-
-                    for (unsigned int index=0; index<nbRows; ++index)
-                    {
-                        result = *(outputPTR) + filterParam_a * result;
-                        *(outputPTR) = result;
-                        outputPTR+=nbColumns;
-
-                    }
-                }
-            }
-        };
-
-        class Parallel_verticalAnticausalFilter_multGain: public cv::ParallelLoopBody
-        {
-        private:
-            float *outputFrame;
-            unsigned int nbRows, nbColumns;
-            float filterParam_a, filterParam_gain;
-        public:
-            Parallel_verticalAnticausalFilter_multGain(float *bufferToProcess, const unsigned int nbRws, const unsigned int nbCols, const float a, const float  gain)
-                :outputFrame(bufferToProcess), nbRows(nbRws), nbColumns(nbCols), filterParam_a(a), filterParam_gain(gain){}
-
-            virtual void operator()( const Range& r ) const {
-                float* offset=outputFrame+nbColumns*nbRows-nbColumns;
-                for (int IDcolumn=r.start; IDcolumn!=r.end; ++IDcolumn)
-                {
-                    register float result=0;
-                    register float *outputPTR=offset+IDcolumn;
-
-                    for (unsigned int index=0; index<nbRows; ++index)
-                    {
-                        result = *(outputPTR) + filterParam_a * result;
-                        *(outputPTR) = filterParam_gain*result;
-                        outputPTR-=nbColumns;
-
-                    }
-                }
-            }
-        };
-
-        class Parallel_localAdaptation: public cv::ParallelLoopBody
-        {
-        private:
-            const float *localLuminance, *inputFrame;
-            float *outputFrame;
-            float localLuminanceFactor, localLuminanceAddon, maxInputValue;
-        public:
-            Parallel_localAdaptation(const float *localLum, const float *inputImg, float *bufferToProcess, const float localLuminanceFact, const float localLuminanceAdd, const float maxInputVal)
-                :localLuminance(localLum), inputFrame(inputImg),outputFrame(bufferToProcess), localLuminanceFactor(localLuminanceFact), localLuminanceAddon(localLuminanceAdd), maxInputValue(maxInputVal) {};
-
-            virtual void operator()( const Range& r ) const {
-                const float *localLuminancePTR=localLuminance+r.start;
-                const float *inputFramePTR=inputFrame+r.start;
-                float *outputFramePTR=outputFrame+r.start;
-                for (register int IDpixel=r.start ; IDpixel!=r.end ; ++IDpixel, ++inputFramePTR, ++outputFramePTR)
-                {
-                    float X0=*(localLuminancePTR++)*localLuminanceFactor+localLuminanceAddon;
-                    // TODO : the following line can lead to a divide by zero ! A small offset is added, take care if the offset is too large in case of High Dynamic Range images which can use very small values...
-                    *(outputFramePTR) = (maxInputValue+X0)**inputFramePTR/(*inputFramePTR +X0+0.00000000001f);
-                    //std::cout<<"BasicRetinaFilter::inputFrame[IDpixel]=%f, X0=%f, outputFrame[IDpixel]=%f\n", inputFrame[IDpixel], X0, outputFrame[IDpixel]);
-                }
-            }
-        };
-
-        //////////////////////////////////////////
-        /// Specific filtering methods which manage non const spatial filtering parameter (used By retinacolor and LogProjectors)
-        class Parallel_horizontalAnticausalFilter_Irregular: public cv::ParallelLoopBody
-        {
-        private:
-            float *outputFrame;
-            const float *spatialConstantBuffer;
-            unsigned int IDrowEnd, nbColumns;
-        public:
-            Parallel_horizontalAnticausalFilter_Irregular(float *bufferToProcess, const float *spatialConst, const unsigned int idEnd, const unsigned int nbCols)
-                :outputFrame(bufferToProcess), spatialConstantBuffer(spatialConst), IDrowEnd(idEnd), nbColumns(nbCols){}
-
-            virtual void operator()( const Range& r ) const {
-
-                for (int IDrow=r.start; IDrow!=r.end; ++IDrow)
-                {
-                    register float* outputPTR=outputFrame+(IDrowEnd-IDrow)*(nbColumns)-1;
-                    register const float* spatialConstantPTR=spatialConstantBuffer+(IDrowEnd-IDrow)*(nbColumns)-1;
-                    register float result=0;
-                    for (unsigned int index=0; index<nbColumns; ++index)
-                    {
-                        result = *(outputPTR)+  *(spatialConstantPTR--)* result;
-                        *(outputPTR--) = result;
-                    }
-                }
-            }
-        };
-
-        class Parallel_verticalCausalFilter_Irregular: public cv::ParallelLoopBody
-        {
-        private:
-            float *outputFrame;
-            const float *spatialConstantBuffer;
-            unsigned int nbRows, nbColumns;
-        public:
-            Parallel_verticalCausalFilter_Irregular(float *bufferToProcess, const float *spatialConst, const unsigned int nbRws, const unsigned int nbCols)
-                :outputFrame(bufferToProcess), spatialConstantBuffer(spatialConst), nbRows(nbRws), nbColumns(nbCols){}
-
-            virtual void operator()( const Range& r ) const {
-                for (int IDcolumn=r.start; IDcolumn!=r.end; ++IDcolumn)
-                {
-                    register float result=0;
-                    register float *outputPTR=outputFrame+IDcolumn;
-                    register const float* spatialConstantPTR=spatialConstantBuffer+IDcolumn;
-                    for (unsigned int index=0; index<nbRows; ++index)
-                    {
-                        result = *(outputPTR) +  *(spatialConstantPTR) * result;
-                        *(outputPTR) = result;
-                        outputPTR+=nbColumns;
-                        spatialConstantPTR+=nbColumns;
-                    }
-                }
-            }
-        };
-
-#endif
-
-    };
-
-}// end of namespace bioinspired
-}// end of namespace cv
-#endif
diff --git a/modules/bioinspired/src/imagelogpolprojection.cpp b/modules/bioinspired/src/imagelogpolprojection.cpp
deleted file mode 100644
index 0a4c1ed0d..000000000
--- a/modules/bioinspired/src/imagelogpolprojection.cpp
+++ /dev/null
@@ -1,451 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-#include "precomp.hpp"
-#include "imagelogpolprojection.hpp"
-
-#include <cmath>
-#include <iostream>
-
-// @author Alexandre BENOIT, benoit.alexandre.vision@gmail.com, LISTIC : www.listic.univ-savoie.fr, Gipsa-Lab, France: www.gipsa-lab.inpg.fr/
-
-namespace cv
-{
-namespace bioinspired
-{
-// constructor
-ImageLogPolProjection::ImageLogPolProjection(const unsigned int nbRows, const unsigned int nbColumns, const PROJECTIONTYPE projection, const bool colorModeCapable)
-:BasicRetinaFilter(nbRows, nbColumns),
- _sampledFrame(0),
- _tempBuffer(_localBuffer),
- _transformTable(0),
- _irregularLPfilteredFrame(_filterOutput)
-{
-    _inputDoubleNBpixels=nbRows*nbColumns*2;
-    _selectedProjection = projection;
-    _reductionFactor=0;
-    _initOK=false;
-    _usefullpixelIndex=0;
-    _colorModeCapable=colorModeCapable;
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-    std::cout<<"ImageLogPolProjection::allocating"<<std::endl;
-#endif
-    if (_colorModeCapable)
-    {
-        _tempBuffer.resize(nbRows*nbColumns*3);
-    }
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-    std::cout<<"ImageLogPolProjection::done"<<std::endl;
-#endif
-
-    clearAllBuffers();
-}
-
-// destructor
-ImageLogPolProjection::~ImageLogPolProjection()
-{
-
-}
-
-
-// reset buffers method
-void ImageLogPolProjection::clearAllBuffers()
-{
-    _sampledFrame=0;
-    _tempBuffer=0;
-    BasicRetinaFilter::clearAllBuffers();
-}
-
-/**
-* resize retina color filter object (resize all allocated buffers)
-* @param NBrows: the new height size
-* @param NBcolumns: the new width size
-*/
-void ImageLogPolProjection::resize(const unsigned int NBrows, const unsigned int NBcolumns)
-{
-    BasicRetinaFilter::resize(NBrows, NBcolumns);
-    initProjection(_reductionFactor, _samplingStrenght);
-
-    // reset buffers method
-    clearAllBuffers();
-
-}
-
-// init functions depending on the projection type
-bool ImageLogPolProjection::initProjection(const double reductionFactor, const double samplingStrenght)
-{
-    switch(_selectedProjection)
-    {
-    case RETINALOGPROJECTION:
-        return _initLogRetinaSampling(reductionFactor, samplingStrenght);
-        break;
-    case CORTEXLOGPOLARPROJECTION:
-        return _initLogPolarCortexSampling(reductionFactor, samplingStrenght);
-        break;
-    default:
-        std::cout<<"ImageLogPolProjection::no projection setted up... performing default retina projection... take care"<<std::endl;
-        return _initLogRetinaSampling(reductionFactor, samplingStrenght);
-        break;
-    }
-}
-
-// -> private init functions dedicated to each projection
-bool ImageLogPolProjection::_initLogRetinaSampling(const double reductionFactor, const double samplingStrenght)
-{
-    _initOK=false;
-
-    if (_selectedProjection!=RETINALOGPROJECTION)
-    {
-        std::cerr<<"ImageLogPolProjection::initLogRetinaSampling: could not initialize logPolar projection for a log projection system\n -> you probably chose the wrong init function, use initLogPolarCortexSampling() instead"<<std::endl;
-        return false;
-    }
-    if (reductionFactor<1.0)
-    {
-        std::cerr<<"ImageLogPolProjection::initLogRetinaSampling: reduction factor must be superior to 0, skeeping initialisation..."<<std::endl;
-        return false;
-    }
-
-    // compute image output size
-    _outputNBrows=predictOutputSize(this->getNBrows(), reductionFactor);
-    _outputNBcolumns=predictOutputSize(this->getNBcolumns(), reductionFactor);
-    _outputNBpixels=_outputNBrows*_outputNBcolumns;
-    _outputDoubleNBpixels=_outputNBrows*_outputNBcolumns*2;
-
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-    std::cout<<"ImageLogPolProjection::initLogRetinaSampling: Log resampled image resampling factor: "<<reductionFactor<<", strenght:"<<samplingStrenght<<std::endl;
-    std::cout<<"ImageLogPolProjection::initLogRetinaSampling: Log resampled image size: "<<_outputNBrows<<"*"<<_outputNBcolumns<<std::endl;
-#endif
-
-    // setup progressive prefilter that will be applied BEFORE log sampling
-    setProgressiveFilterConstants_CentredAccuracy(0.f, 0.f, 0.99f);
-
-    // (re)create the image output buffer and transform table if the reduction factor changed
-    _sampledFrame.resize(_outputNBpixels*(1+(unsigned int)_colorModeCapable*2));
-
-    // specifiying new reduction factor after preliminar checks
-    _reductionFactor=reductionFactor;
-    _samplingStrenght=samplingStrenght;
-
-    // compute the rlim for symetric rows/columns sampling, then, the rlim is based on the smallest dimension
-    _minDimension=(double)(_filterOutput.getNBrows() < _filterOutput.getNBcolumns() ? _filterOutput.getNBrows() : _filterOutput.getNBcolumns());
-
-    // input frame dimensions dependent log sampling:
-    //double rlim=1.0/reductionFactor*(minDimension/2.0+samplingStrenght);
-
-    // input frame dimensions INdependent log sampling:
-    _azero=(1.0+reductionFactor*std::sqrt(samplingStrenght))/(reductionFactor*reductionFactor*samplingStrenght-1.0);
-    _alim=(1.0+_azero)/reductionFactor;
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-    std::cout<<"ImageLogPolProjection::initLogRetinaSampling: rlim= "<<rlim<<std::endl;
-    std::cout<<"ImageLogPolProjection::initLogRetinaSampling: alim= "<<alim<<std::endl;
-#endif
-
-    // get half frame size
-    unsigned int halfOutputRows = _outputNBrows/2-1;
-    unsigned int halfOutputColumns = _outputNBcolumns/2-1;
-    unsigned int halfInputRows = _filterOutput.getNBrows()/2-1;
-    unsigned int halfInputColumns = _filterOutput.getNBcolumns()/2-1;
-
-    // computing log sampling matrix by computing quarters of images
-    // the original new image center (_filterOutput.getNBrows()/2, _filterOutput.getNBcolumns()/2) being at coordinate (_filterOutput.getNBrows()/(2*_reductionFactor), _filterOutput.getNBcolumns()/(2*_reductionFactor))
-
-    // -> use a temporary transform table which is bigger than the final one, we only report pixels coordinates that are included in the sampled picture
-    std::valarray<unsigned int> tempTransformTable(2*_outputNBpixels); // the structure would be: (pixelInputCoordinate n)(pixelOutputCoordinate n)(pixelInputCoordinate n+1)(pixelOutputCoordinate n+1)
-    _usefullpixelIndex=0;
-
-    double rMax=0;
-    halfInputRows<halfInputColumns ? rMax=(double)(halfInputRows*halfInputRows):rMax=(double)(halfInputColumns*halfInputColumns);
-
-    for (unsigned int idRow=0;idRow<halfOutputRows; ++idRow)
-    {
-        for (unsigned int idColumn=0;idColumn<halfOutputColumns; ++idColumn)
-        {
-            // get the pixel position in the original picture
-
-            // -> input frame dimensions dependent log sampling:
-            //double scale = samplingStrenght/(rlim-(double)std::sqrt(idRow*idRow+idColumn*idColumn));
-
-            // -> input frame dimensions INdependent log sampling:
-            double scale=getOriginalRadiusLength((double)std::sqrt((double)(idRow*idRow+idColumn*idColumn)));
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-            std::cout<<"ImageLogPolProjection::initLogRetinaSampling: scale= "<<scale<<std::endl;
-            std::cout<<"ImageLogPolProjection::initLogRetinaSampling: scale2= "<<scale2<<std::endl;
-#endif
-            if (scale < 0) ///check it later
-                scale = 10000;
-
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-            //            std::cout<<"ImageLogPolProjection::initLogRetinaSampling: scale= "<<scale<<std::endl;
-#endif
-
-            unsigned int u=(unsigned int)floor((double)idRow*scale);
-            unsigned int v=(unsigned int)floor((double)idColumn*scale);
-
-            // manage border effects
-            double length=u*u+v*v;
-            double radiusRatio=std::sqrt(rMax/length);
-
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-            std::cout<<"ImageLogPolProjection::(inputH, inputW)="<<halfInputRows<<", "<<halfInputColumns<<", Rmax2="<<rMax<<std::endl;
-            std::cout<<"before ==> ImageLogPolProjection::(u, v)="<<u<<", "<<v<<", r="<<u*u+v*v<<std::endl;
-            std::cout<<"ratio ="<<radiusRatio<<std::endl;
-#endif
-
-            if (radiusRatio < 1.0)
-            {
-                u=(unsigned int)floor(radiusRatio*double(u));
-                v=(unsigned int)floor(radiusRatio*double(v));
-            }
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-            std::cout<<"after ==> ImageLogPolProjection::(u, v)="<<u<<", "<<v<<", r="<<u*u+v*v<<std::endl;
-            std::cout<<"ImageLogPolProjection::("<<(halfOutputRows-idRow)<<", "<<idColumn+halfOutputColumns<<") <- ("<<halfInputRows-u<<", "<<v+halfInputColumns<<")"<<std::endl;
-            std::cout<<(halfOutputRows-idRow)+(halfOutputColumns+idColumn)*_outputNBrows<<" -> "<<(halfInputRows-u)+_filterOutput.getNBrows()*(halfInputColumns+v)<<std::endl;
-#endif
-
-            if ((u<halfInputRows)&&(v<halfInputColumns))
-            {
-
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-                std::cout<<"*** VALID ***"<<std::endl;
-#endif
-
-                // set pixel coordinate of the input picture in the transform table at the current log sampled pixel
-                // 1st quadrant
-                tempTransformTable[_usefullpixelIndex++]=(halfOutputColumns+idColumn)+(halfOutputRows-idRow)*_outputNBcolumns;
-                tempTransformTable[_usefullpixelIndex++]=_filterOutput.getNBcolumns()*(halfInputRows-u)+(halfInputColumns+v);
-                // 2nd quadrant
-                tempTransformTable[_usefullpixelIndex++]=(halfOutputColumns+idColumn)+(halfOutputRows+idRow)*_outputNBcolumns;
-                tempTransformTable[_usefullpixelIndex++]=_filterOutput.getNBcolumns()*(halfInputRows+u)+(halfInputColumns+v);
-                // 3rd quadrant
-                tempTransformTable[_usefullpixelIndex++]=(halfOutputColumns-idColumn)+(halfOutputRows-idRow)*_outputNBcolumns;
-                tempTransformTable[_usefullpixelIndex++]=_filterOutput.getNBcolumns()*(halfInputRows-u)+(halfInputColumns-v);
-                // 4td quadrant
-                tempTransformTable[_usefullpixelIndex++]=(halfOutputColumns-idColumn)+(halfOutputRows+idRow)*_outputNBcolumns;
-                tempTransformTable[_usefullpixelIndex++]=_filterOutput.getNBcolumns()*(halfInputRows+u)+(halfInputColumns-v);
-            }
-        }
-    }
-
-    // (re)creating and filling the transform table
-    _transformTable.resize(_usefullpixelIndex);
-    memcpy(&_transformTable[0], &tempTransformTable[0], sizeof(unsigned int)*_usefullpixelIndex);
-
-    // reset all buffers
-    clearAllBuffers();
-
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-    std::cout<<"ImageLogPolProjection::initLogRetinaSampling: init done successfully"<<std::endl;
-#endif
-    _initOK=true;
-    return _initOK;
-}
-
-bool ImageLogPolProjection::_initLogPolarCortexSampling(const double reductionFactor, const double)
-{
-    _initOK=false;
-
-    if (_selectedProjection!=CORTEXLOGPOLARPROJECTION)
-    {
-        std::cerr<<"ImageLogPolProjection::could not initialize log projection for a logPolar projection system\n -> you probably chose the wrong init function, use initLogRetinaSampling() instead"<<std::endl;
-        return false;
-    }
-
-    if (reductionFactor<1.0)
-    {
-        std::cerr<<"ImageLogPolProjection::reduction factor must be superior to 0, skeeping initialisation..."<<std::endl;
-        return false;
-    }
-
-    // compute the smallest image size
-    unsigned int minDimension=(_filterOutput.getNBrows() < _filterOutput.getNBcolumns() ? _filterOutput.getNBrows() : _filterOutput.getNBcolumns());
-    // specifiying new reduction factor after preliminar checks
-    _reductionFactor=reductionFactor;
-    // compute image output size
-    _outputNBrows=(unsigned int)((double)minDimension/reductionFactor);
-    _outputNBcolumns=(unsigned int)((double)minDimension/reductionFactor);
-    _outputNBpixels=_outputNBrows*_outputNBcolumns;
-    _outputDoubleNBpixels=_outputNBrows*_outputNBcolumns*2;
-
-    // get half frame size
-    //unsigned int halfOutputRows = _outputNBrows/2-1;
-    //unsigned int halfOutputColumns = _outputNBcolumns/2-1;
-    unsigned int halfInputRows = _filterOutput.getNBrows()/2-1;
-    unsigned int halfInputColumns = _filterOutput.getNBcolumns()/2-1;
-
-
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-    std::cout<<"ImageLogPolProjection::Log resampled image size: "<<_outputNBrows<<"*"<<_outputNBcolumns<<std::endl;
-#endif
-
-    // setup progressive prefilter that will be applied BEFORE log sampling
-    setProgressiveFilterConstants_CentredAccuracy(0.f, 0.f, 0.99f);
-
-    // (re)create the image output buffer and transform table if the reduction factor changed
-    _sampledFrame.resize(_outputNBpixels*(1+(unsigned int)_colorModeCapable*2));
-
-    // create the radius and orientation axis and fill them, radius E [0;1], orientation E[-pi, pi]
-    std::valarray<double> radiusAxis(_outputNBcolumns);
-    double radiusStep=2.30/(double)_outputNBcolumns;
-    for (unsigned int i=0;i<_outputNBcolumns;++i)
-    {
-        radiusAxis[i]=i*radiusStep;
-    }
-    std::valarray<double> orientationAxis(_outputNBrows);
-    double orientationStep=-2.0*CV_PI/(double)_outputNBrows;
-    for (unsigned int io=0;io<_outputNBrows;++io)
-    {
-        orientationAxis[io]=io*orientationStep;
-    }
-    // -> use a temporay transform table which is bigger than the final one, we only report pixels coordinates that are included in the sampled picture
-    std::valarray<unsigned int> tempTransformTable(2*_outputNBpixels); // the structure would be: (pixelInputCoordinate n)(pixelOutputCoordinate n)(pixelInputCoordinate n+1)(pixelOutputCoordinate n+1)
-    _usefullpixelIndex=0;
-
-    //std::cout<<"ImageLogPolProjection::Starting cortex projection"<<std::endl;
-    // compute transformation, get theta and Radius in reagrd of the output sampled pixel
-    double diagonalLenght=std::sqrt((double)(_outputNBcolumns*_outputNBcolumns+_outputNBrows*_outputNBrows));
-    for (unsigned int radiusIndex=0;radiusIndex<_outputNBcolumns;++radiusIndex)
-        for(unsigned int orientationIndex=0;orientationIndex<_outputNBrows;++orientationIndex)
-        {
-            double x=1.0+sinh(radiusAxis[radiusIndex])*cos(orientationAxis[orientationIndex]);
-            double y=sinh(radiusAxis[radiusIndex])*sin(orientationAxis[orientationIndex]);
-            // get the input picture coordinate
-            double R=diagonalLenght*std::sqrt(x*x+y*y)/(5.0+std::sqrt(x*x+y*y));
-            double theta=atan2(y,x);
-            // convert input polar coord into cartesian/C compatble coordinate
-            unsigned int columnIndex=(unsigned int)(cos(theta)*R)+halfInputColumns;
-            unsigned int rowIndex=(unsigned int)(sin(theta)*R)+halfInputRows;
-            //std::cout<<"ImageLogPolProjection::R="<<R<<" / Theta="<<theta<<" / (x, y)="<<columnIndex<<", "<<rowIndex<<std::endl;
-            if ((columnIndex<_filterOutput.getNBcolumns())&&(columnIndex>0)&&(rowIndex<_filterOutput.getNBrows())&&(rowIndex>0))
-            {
-                // set coordinate
-                tempTransformTable[_usefullpixelIndex++]=radiusIndex+orientationIndex*_outputNBcolumns;
-                tempTransformTable[_usefullpixelIndex++]= columnIndex+rowIndex*_filterOutput.getNBcolumns();
-            }
-        }
-
-    // (re)creating and filling the transform table
-    _transformTable.resize(_usefullpixelIndex);
-    memcpy(&_transformTable[0], &tempTransformTable[0], sizeof(unsigned int)*_usefullpixelIndex);
-
-    // reset all buffers
-    clearAllBuffers();
-    _initOK=true;
-    return true;
-}
-
-// action function
-std::valarray<float> &ImageLogPolProjection::runProjection(const std::valarray<float> &inputFrame, const bool colorMode)
-{
-    if (_colorModeCapable&&colorMode)
-    {
-        // progressive filtering and storage of the result in _tempBuffer
-        _spatiotemporalLPfilter_Irregular(get_data(inputFrame), &_irregularLPfilteredFrame[0]);
-        _spatiotemporalLPfilter_Irregular(&_irregularLPfilteredFrame[0], &_tempBuffer[0]); // warning, temporal issue may occur, if the temporal constant is not NULL !!!
-
-        _spatiotemporalLPfilter_Irregular(get_data(inputFrame)+_filterOutput.getNBpixels(), &_irregularLPfilteredFrame[0]);
-        _spatiotemporalLPfilter_Irregular(&_irregularLPfilteredFrame[0], &_tempBuffer[0]+_filterOutput.getNBpixels());
-
-        _spatiotemporalLPfilter_Irregular(get_data(inputFrame)+_filterOutput.getNBpixels()*2, &_irregularLPfilteredFrame[0]);
-        _spatiotemporalLPfilter_Irregular(&_irregularLPfilteredFrame[0], &_tempBuffer[0]+_filterOutput.getNBpixels()*2);
-
-        // applying image projection/resampling
-        register unsigned int *transformTablePTR=&_transformTable[0];
-        for (unsigned int i=0 ; i<_usefullpixelIndex ; i+=2, transformTablePTR+=2)
-        {
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-            std::cout<<"ImageLogPolProjection::i:"<<i<<"output(max="<<_outputNBpixels<<")="<<_transformTable[i]<<" / intput(max="<<_filterOutput.getNBpixels()<<")="<<_transformTable[i+1]<<std::endl;
-#endif
-            _sampledFrame[*(transformTablePTR)]=_tempBuffer[*(transformTablePTR+1)];
-            _sampledFrame[*(transformTablePTR)+_outputNBpixels]=_tempBuffer[*(transformTablePTR+1)+_filterOutput.getNBpixels()];
-            _sampledFrame[*(transformTablePTR)+_outputDoubleNBpixels]=_tempBuffer[*(transformTablePTR+1)+_inputDoubleNBpixels];
-        }
-
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-        std::cout<<"ImageLogPolProjection::runProjection: color image projection OK"<<std::endl;
-#endif
-        //normalizeGrayOutput_0_maxOutputValue(_sampledFrame, _outputNBpixels);
-    }else
-    {
-        _spatiotemporalLPfilter_Irregular(get_data(inputFrame), &_irregularLPfilteredFrame[0]);
-        _spatiotemporalLPfilter_Irregular(&_irregularLPfilteredFrame[0], &_irregularLPfilteredFrame[0]);
-        // applying image projection/resampling
-        register unsigned int *transformTablePTR=&_transformTable[0];
-        for (unsigned int i=0 ; i<_usefullpixelIndex ; i+=2, transformTablePTR+=2)
-        {
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-            std::cout<<"i:"<<i<<"output(max="<<_outputNBpixels<<")="<<_transformTable[i]<<" / intput(max="<<_filterOutput.getNBpixels()<<")="<<_transformTable[i+1]<<std::endl;
-#endif
-            _sampledFrame[*(transformTablePTR)]=_irregularLPfilteredFrame[*(transformTablePTR+1)];
-        }
-        //normalizeGrayOutput_0_maxOutputValue(_sampledFrame, _outputNBpixels);
-#ifdef IMAGELOGPOLPROJECTION_DEBUG
-        std::cout<<"ImageLogPolProjection::runProjection: gray level image projection OK"<<std::endl;
-#endif
-    }
-
-    return _sampledFrame;
-}
-
-}// end of namespace bioinspired
-}// end of namespace cv
diff --git a/modules/bioinspired/src/imagelogpolprojection.hpp b/modules/bioinspired/src/imagelogpolprojection.hpp
deleted file mode 100644
index 41ecd5eaf..000000000
--- a/modules/bioinspired/src/imagelogpolprojection.hpp
+++ /dev/null
@@ -1,243 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-#ifndef IMAGELOGPOLPROJECTION_H_
-#define IMAGELOGPOLPROJECTION_H_
-
-/**
-* @class ImageLogPolProjection
-* @brief class able to perform a log sampling of an image input (models the log sampling of the photoreceptors of the retina)
-* or a log polar projection which models the retina information projection on the primary visual cortex: a linear projection in the center for detail analysis and a log projection of the borders (low spatial frequency motion information in general)
-*
-* collaboration: Barthelemy DURETTE who experimented the retina log projection
--> "Traitement visuels Bio mimtiques pour la supplance perceptive", internal technical report, May 2005, Gipsa-lab/DIS, Grenoble, FRANCE
-*
-* * TYPICAL USE:
-*
-* // create object, here for a log sampling (keyword:RETINALOGPROJECTION): (dynamic object allocation sample)
-* ImageLogPolProjection *imageSamplingTool;
-* imageSamplingTool = new ImageLogPolProjection(frameSizeRows, frameSizeColumns, RETINALOGPROJECTION);
-*
-* // init log projection:
-* imageSamplingTool->initProjection(1.0, 15.0);
-*
-* // during program execution, call the log transform applied to a frame called "FrameBuffer" :
-* imageSamplingTool->runProjection(FrameBuffer);
-* // get output frame and its size:
-* const unsigned int logSampledFrame_nbRows=imageSamplingTool->getOutputNBrows();
-* const unsigned int logSampledFrame_nbColumns=imageSamplingTool->getOutputNBcolumns();
-* const double *logSampledFrame=imageSamplingTool->getSampledFrame();
-*
-* // at the end of the program, destroy object:
-* delete imageSamplingTool;
-*
-* @author Alexandre BENOIT, benoit.alexandre.vision@gmail.com, LISTIC : www.listic.univ-savoie.fr, Gipsa-Lab, France: www.gipsa-lab.inpg.fr/
-* Creation date 2007
-*/
-
-//#define __IMAGELOGPOLPROJECTION_DEBUG // used for std output debug information
-
-#include "basicretinafilter.hpp"
-
-
-namespace cv
-{
-namespace bioinspired
-{
-
-class ImageLogPolProjection:public BasicRetinaFilter
-{
-public:
-
-    enum PROJECTIONTYPE{RETINALOGPROJECTION, CORTEXLOGPOLARPROJECTION};
-
-    /**
-    * constructor, just specifies the image input size and the projection type, no projection initialisation is done
-    * -> use initLogRetinaSampling() or initLogPolarCortexSampling() for that
-    * @param nbRows: number of rows of the input image
-    * @param nbColumns: number of columns of the input image
-    * @param projection: the type of projection, RETINALOGPROJECTION or CORTEXLOGPOLARPROJECTION
-    * @param colorMode: specifies if the projection is applied on a grayscale image (false) or color images (3 layers) (true)
-    */
-    ImageLogPolProjection(const unsigned int nbRows, const unsigned int nbColumns, const PROJECTIONTYPE projection, const bool colorMode=false);
-
-    /**
-    * standard destructor
-    */
-    virtual ~ImageLogPolProjection();
-
-    /**
-    * function that clears all buffers of the object
-    */
-    void clearAllBuffers();
-
-    /**
-    * resize retina color filter object (resize all allocated buffers)
-    * @param NBrows: the new height size
-    * @param NBcolumns: the new width size
-    */
-    void resize(const unsigned int NBrows, const unsigned int NBcolumns);
-
-    /**
-    * init function depending on the projection type
-    * @param reductionFactor: the size reduction factor of the ouptup image in regard of the size of the input image, must be superior to 1
-    * @param samplingStrenght: specifies the strenght of the log compression effect (magnifying coefficient)
-    * @return true if the init was performed without any errors
-    */
-    bool initProjection(const double reductionFactor, const double samplingStrenght);
-
-    /**
-    * main funtion of the class: run projection function
-    * @param inputFrame: the input frame to be processed
-        * @param colorMode: the input buffer color mode: false=gray levels, true = 3 color channels mode
-    * @return the output frame
-    */
-    std::valarray<float> &runProjection(const std::valarray<float> &inputFrame, const bool colorMode=false);
-
-    /**
-    * @return the numbers of rows (height) of the images OUTPUTS of the object
-    */
-    inline unsigned int getOutputNBrows(){return _outputNBrows;};
-
-    /**
-    * @return the numbers of columns (width) of the images OUTPUTS of the object
-    */
-    inline unsigned int getOutputNBcolumns(){return _outputNBcolumns;};
-
-    /**
-    * main funtion of the class: run projection function
-    * @param size: one of the input frame initial dimensions to be processed
-    * @return the output frame dimension
-    */
-    inline static unsigned int predictOutputSize(const unsigned int size, const double reductionFactor){return (unsigned int)((double)size/reductionFactor);};
-
-    /**
-    * @return the output of the filter which applies an irregular Low Pass spatial filter to the imag input (see function
-    */
-    inline const std::valarray<float> &getIrregularLPfilteredInputFrame() const {return _irregularLPfilteredFrame;};
-
-    /**
-    * function which allows to retrieve the output frame which was updated after the "runProjection(...) function BasicRetinaFilter::runProgressiveFilter(...)
-    * @return the projection result
-    */
-    inline const std::valarray<float> &getSampledFrame() const {return _sampledFrame;};
-
-    /**
-    * function which allows gives the tranformation table, its size is (getNBrows()*getNBcolumns()*2)
-    * @return the transformation matrix [outputPixIndex_i, inputPixIndex_i, outputPixIndex_i+1, inputPixIndex_i+1....]
-    */
-    inline const std::valarray<unsigned int> &getSamplingMap() const {return _transformTable;};
-
-    inline double getOriginalRadiusLength(const double projectedRadiusLength){return _azero/(_alim-projectedRadiusLength*2.0/_minDimension);};
-
-    //    unsigned int getInputPixelIndex(const unsigned int ){ return  _transformTable[index*2+1]};
-
-private:
-    PROJECTIONTYPE _selectedProjection;
-
-    // size of the image output
-    unsigned int _outputNBrows;
-    unsigned int _outputNBcolumns;
-    unsigned int _outputNBpixels;
-    unsigned int _outputDoubleNBpixels;
-    unsigned int _inputDoubleNBpixels;
-
-    // is the object able to manage color flag
-    bool _colorModeCapable;
-    // sampling strenght factor
-    double _samplingStrenght;
-    // sampling reduction factor
-    double _reductionFactor;
-
-    // log sampling parameters
-    double _azero;
-    double _alim;
-    double _minDimension;
-
-    // template buffers
-    std::valarray<float>_sampledFrame;
-    std::valarray<float>&_tempBuffer;
-    std::valarray<unsigned int>_transformTable;
-
-    std::valarray<float> &_irregularLPfilteredFrame; // just a reference for easier understanding
-    unsigned int _usefullpixelIndex;
-
-    // init transformation tables
-    bool _computeLogProjection();
-    bool _computeLogPolarProjection();
-
-    // specifies if init was done correctly
-    bool _initOK;
-    // private init projections functions called by "initProjection(...)" function
-    bool _initLogRetinaSampling(const double reductionFactor, const double samplingStrenght);
-    bool _initLogPolarCortexSampling(const double reductionFactor, const double samplingStrenght);
-
-    ImageLogPolProjection(const ImageLogPolProjection&);
-    ImageLogPolProjection& operator=(const ImageLogPolProjection&);
-
-};
-
-}// end of namespace bioinspired
-}// end of namespace cv
-#endif /*IMAGELOGPOLPROJECTION_H_*/
diff --git a/modules/bioinspired/src/magnoretinafilter.cpp b/modules/bioinspired/src/magnoretinafilter.cpp
deleted file mode 100644
index 81fdb1df5..000000000
--- a/modules/bioinspired/src/magnoretinafilter.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-#include "precomp.hpp"
-
-#include <iostream>
-
-#include "magnoretinafilter.hpp"
-
-#include <cmath>
-
-namespace cv
-{
-namespace bioinspired
-{
-// Constructor and Desctructor of the OPL retina filter
-MagnoRetinaFilter::MagnoRetinaFilter(const unsigned int NBrows, const unsigned int NBcolumns)
-:BasicRetinaFilter(NBrows, NBcolumns, 2),
- _previousInput_ON(NBrows*NBcolumns),
- _previousInput_OFF(NBrows*NBcolumns),
- _amacrinCellsTempOutput_ON(NBrows*NBcolumns),
- _amacrinCellsTempOutput_OFF(NBrows*NBcolumns),
- _magnoXOutputON(NBrows*NBcolumns),
- _magnoXOutputOFF(NBrows*NBcolumns),
- _localProcessBufferON(NBrows*NBcolumns),
- _localProcessBufferOFF(NBrows*NBcolumns)
-{
-    _magnoYOutput=&_filterOutput;
-    _magnoYsaturated=&_localBuffer;
-
-
-    clearAllBuffers();
-
-#ifdef IPL_RETINA_ELEMENT_DEBUG
-    std::cout<<"MagnoRetinaFilter::Init IPL retina filter at specified frame size OK"<<std::endl;
-#endif
-}
-
-MagnoRetinaFilter::~MagnoRetinaFilter()
-{
-#ifdef IPL_RETINA_ELEMENT_DEBUG
-    std::cout<<"MagnoRetinaFilter::Delete IPL retina filter OK"<<std::endl;
-#endif
-}
-
-// function that clears all buffers of the object
-void MagnoRetinaFilter::clearAllBuffers()
-{
-    BasicRetinaFilter::clearAllBuffers();
-    _previousInput_ON=0;
-    _previousInput_OFF=0;
-    _amacrinCellsTempOutput_ON=0;
-    _amacrinCellsTempOutput_OFF=0;
-    _magnoXOutputON=0;
-    _magnoXOutputOFF=0;
-    _localProcessBufferON=0;
-    _localProcessBufferOFF=0;
-
-}
-
-/**
-* resize retina magno filter object (resize all allocated buffers
-* @param NBrows: the new height size
-* @param NBcolumns: the new width size
-*/
-void MagnoRetinaFilter::resize(const unsigned int NBrows, const unsigned int NBcolumns)
-{
-    BasicRetinaFilter::resize(NBrows, NBcolumns);
-    _previousInput_ON.resize(NBrows*NBcolumns);
-    _previousInput_OFF.resize(NBrows*NBcolumns);
-    _amacrinCellsTempOutput_ON.resize(NBrows*NBcolumns);
-    _amacrinCellsTempOutput_OFF.resize(NBrows*NBcolumns);
-    _magnoXOutputON.resize(NBrows*NBcolumns);
-    _magnoXOutputOFF.resize(NBrows*NBcolumns);
-    _localProcessBufferON.resize(NBrows*NBcolumns);
-    _localProcessBufferOFF.resize(NBrows*NBcolumns);
-
-    // to be sure, relink buffers
-    _magnoYOutput=&_filterOutput;
-    _magnoYsaturated=&_localBuffer;
-
-    // reset all buffers
-    clearAllBuffers();
-}
-
-void MagnoRetinaFilter::setCoefficientsTable(const float parasolCells_beta, const float parasolCells_tau, const float parasolCells_k, const float amacrinCellsTemporalCutFrequency, const float localAdaptIntegration_tau, const float localAdaptIntegration_k )
-{
-    _temporalCoefficient=(float)std::exp(-1.0f/amacrinCellsTemporalCutFrequency);
-    // the first set of parameters is dedicated to the low pass filtering property of the ganglion cells
-    BasicRetinaFilter::setLPfilterParameters(parasolCells_beta, parasolCells_tau, parasolCells_k, 0);
-    // the second set of parameters is dedicated to the ganglion cells output intergartion for their local adaptation property
-    BasicRetinaFilter::setLPfilterParameters(0, localAdaptIntegration_tau, localAdaptIntegration_k, 1);
-}
-
-void MagnoRetinaFilter::_amacrineCellsComputing(const float *OPL_ON, const float *OPL_OFF)
-{
-#ifdef MAKE_PARALLEL
-        cv::parallel_for_(cv::Range(0,_filterOutput.getNBpixels()), Parallel_amacrineCellsComputing(OPL_ON, OPL_OFF, &_previousInput_ON[0], &_previousInput_OFF[0], &_amacrinCellsTempOutput_ON[0], &_amacrinCellsTempOutput_OFF[0], _temporalCoefficient));
-#else
-    register const float *OPL_ON_PTR=OPL_ON;
-    register const float *OPL_OFF_PTR=OPL_OFF;
-    register float *previousInput_ON_PTR= &_previousInput_ON[0];
-    register float *previousInput_OFF_PTR= &_previousInput_OFF[0];
-    register float *amacrinCellsTempOutput_ON_PTR= &_amacrinCellsTempOutput_ON[0];
-    register float *amacrinCellsTempOutput_OFF_PTR= &_amacrinCellsTempOutput_OFF[0];
-
-    for (unsigned int IDpixel=0 ; IDpixel<this->getNBpixels(); ++IDpixel)
-    {
-
-        /* Compute ON and OFF amacrin cells high pass temporal filter */
-        float magnoXonPixelResult = _temporalCoefficient*(*amacrinCellsTempOutput_ON_PTR+ *OPL_ON_PTR-*previousInput_ON_PTR);
-        *(amacrinCellsTempOutput_ON_PTR++)=((float)(magnoXonPixelResult>0))*magnoXonPixelResult;
-
-        float magnoXoffPixelResult = _temporalCoefficient*(*amacrinCellsTempOutput_OFF_PTR+ *OPL_OFF_PTR-*previousInput_OFF_PTR);
-        *(amacrinCellsTempOutput_OFF_PTR++)=((float)(magnoXoffPixelResult>0))*magnoXoffPixelResult;
-
-        /* prepare next loop */
-        *(previousInput_ON_PTR++)=*(OPL_ON_PTR++);
-        *(previousInput_OFF_PTR++)=*(OPL_OFF_PTR++);
-
-    }
-#endif
-}
-
-// launch filter that runs all the IPL filter
-const std::valarray<float> &MagnoRetinaFilter::runFilter(const std::valarray<float> &OPL_ON, const std::valarray<float> &OPL_OFF)
-{
-    // Compute the high pass temporal filter
-    _amacrineCellsComputing(get_data(OPL_ON), get_data(OPL_OFF));
-
-    // apply low pass filtering on ON and OFF ways after temporal high pass filtering
-    _spatiotemporalLPfilter(&_amacrinCellsTempOutput_ON[0], &_magnoXOutputON[0], 0);
-    _spatiotemporalLPfilter(&_amacrinCellsTempOutput_OFF[0], &_magnoXOutputOFF[0], 0);
-
-    // local adaptation of the ganglion cells to the local contrast of the moving contours
-    _spatiotemporalLPfilter(&_magnoXOutputON[0], &_localProcessBufferON[0], 1);
-    _localLuminanceAdaptation(&_magnoXOutputON[0], &_localProcessBufferON[0]);
-    _spatiotemporalLPfilter(&_magnoXOutputOFF[0], &_localProcessBufferOFF[0], 1);
-    _localLuminanceAdaptation(&_magnoXOutputOFF[0], &_localProcessBufferOFF[0]);
-
-    /* Compute MagnoY */
-    register float *magnoYOutput= &(*_magnoYOutput)[0];
-    register float *magnoXOutputON_PTR= &_magnoXOutputON[0];
-    register float *magnoXOutputOFF_PTR= &_magnoXOutputOFF[0];
-    for (register unsigned int IDpixel=0 ; IDpixel<_filterOutput.getNBpixels() ; ++IDpixel)
-        *(magnoYOutput++)=*(magnoXOutputON_PTR++)+*(magnoXOutputOFF_PTR++);
-
-    return (*_magnoYOutput);
-}
-}// end of namespace bioinspired
-}// end of namespace cv
diff --git a/modules/bioinspired/src/magnoretinafilter.hpp b/modules/bioinspired/src/magnoretinafilter.hpp
deleted file mode 100644
index e06d14ddc..000000000
--- a/modules/bioinspired/src/magnoretinafilter.hpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-#ifndef MagnoRetinaFilter_H_
-#define MagnoRetinaFilter_H_
-
-/**
-* @class MagnoRetinaFilter
-* @brief class which describes the magnocellular channel of the retina:
-* -> performs a moving contours extraction with powerfull local data enhancement
-*
-* TYPICAL USE:
-*
-* // create object at a specified picture size
-* MagnoRetinaFilter *movingContoursExtractor;
-* movingContoursExtractor =new MagnoRetinaFilter(frameSizeRows, frameSizeColumns);
-*
-* // init gain, spatial and temporal parameters:
-* movingContoursExtractor->setCoefficientsTable(0, 0.7, 5, 3);
-*
-* // during program execution, call the filter for contours extraction for an input picture called "FrameBuffer":
-* movingContoursExtractor->runfilter(FrameBuffer);
-*
-* // get the output frame, check in the class description below for more outputs:
-* const float *movingContours=movingContoursExtractor->getMagnoYsaturated();
-*
-* // at the end of the program, destroy object:
-* delete movingContoursExtractor;
-
-* @author Alexandre BENOIT, benoit.alexandre.vision@gmail.com, LISTIC : www.listic.univ-savoie.fr, Gipsa-Lab, France: www.gipsa-lab.inpg.fr/
-* Creation date 2007
-* Based on Alexandre BENOIT thesis: "Le système visuel humain au secours de la vision par ordinateur"
-*/
-
-#include "basicretinafilter.hpp"
-
-//#define _IPL_RETINA_ELEMENT_DEBUG
-
-namespace cv
-{
-namespace bioinspired
-{
-    class MagnoRetinaFilter: public BasicRetinaFilter
-    {
-    public:
-        /**
-        * constructor parameters are only linked to image input size
-        * @param NBrows: number of rows of the input image
-        * @param NBcolumns: number of columns of the input image
-        */
-        MagnoRetinaFilter(const unsigned int NBrows, const unsigned int NBcolumns);
-
-
-        /**
-        * destructor
-        */
-        virtual ~MagnoRetinaFilter();
-
-        /**
-        * function that clears all buffers of the object
-        */
-        void clearAllBuffers();
-
-        /**
-        * resize retina magno filter object (resize all allocated buffers)
-        * @param NBrows: the new height size
-        * @param NBcolumns: the new width size
-        */
-        void resize(const unsigned int NBrows, const unsigned int NBcolumns);
-
-        /**
-        * set parameters values
-        * @param parasolCells_beta: the low pass filter gain used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), typical value is 0
-        * @param parasolCells_tau: the low pass filter time constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is frame, typical value is 0 (immediate response)
-        * @param parasolCells_k: the low pass filter spatial constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is pixels, typical value is 5
-        * @param amacrinCellsTemporalCutFrequency: the time constant of the first order high pass fiter of the magnocellular way (motion information channel), unit is frames, tipicall value is 5
-        * @param localAdaptIntegration_tau: specifies the temporal constant of the low pas filter involved in the computation of the local "motion mean" for the local adaptation computation
-        * @param localAdaptIntegration_k: specifies the spatial constant of the low pas filter involved in the computation of the local "motion mean" for the local adaptation computation
-        */
-        void setCoefficientsTable(const float parasolCells_beta, const float parasolCells_tau, const float parasolCells_k, const float amacrinCellsTemporalCutFrequency, const float localAdaptIntegration_tau, const float localAdaptIntegration_k);
-
-        /**
-        * launch filter that runs all the IPL magno filter (model of the magnocellular channel of the Inner Plexiform Layer of the retina)
-        * @param OPL_ON: the output of the bipolar ON cells of the retina (available from the ParvoRetinaFilter class (getBipolarCellsON() function)
-        * @param OPL_OFF: the output of the bipolar OFF cells of the retina (available from the ParvoRetinaFilter class (getBipolarCellsOFF() function)
-        * @return the processed result without post-processing
-        */
-        const std::valarray<float> &runFilter(const std::valarray<float> &OPL_ON, const std::valarray<float> &OPL_OFF);
-
-        /**
-        * @return the Magnocellular ON channel filtering output
-        */
-        inline const std::valarray<float> &getMagnoON() const {return _magnoXOutputON;};
-
-        /**
-        * @return the Magnocellular OFF channel filtering output
-        */
-        inline const std::valarray<float> &getMagnoOFF() const {return _magnoXOutputOFF;};
-
-        /**
-        * @return the Magnocellular Y (sum of the ON and OFF magno channels) filtering output
-        */
-        inline const std::valarray<float> &getMagnoYsaturated() const {return *_magnoYsaturated;};
-
-        /**
-        * applies an image normalization which saturates the high output values by the use of an assymetric sigmoide
-        */
-        inline void normalizeGrayOutputNearZeroCentreredSigmoide(){_filterOutput.normalizeGrayOutputNearZeroCentreredSigmoide(&(*_magnoYOutput)[0], &(*_magnoYsaturated)[0]);};
-
-        /**
-        * @return the horizontal cells' temporal constant
-        */
-        inline float getTemporalConstant(){return this->_filteringCoeficientsTable[2];};
-
-    private:
-
-        // related pointers to these buffers
-        std::valarray<float> _previousInput_ON;
-        std::valarray<float> _previousInput_OFF;
-        std::valarray<float> _amacrinCellsTempOutput_ON;
-        std::valarray<float> _amacrinCellsTempOutput_OFF;
-        std::valarray<float> _magnoXOutputON;
-        std::valarray<float> _magnoXOutputOFF;
-        std::valarray<float> _localProcessBufferON;
-        std::valarray<float> _localProcessBufferOFF;
-        // reference to parent buffers and allow better readability
-        TemplateBuffer<float> *_magnoYOutput;
-        std::valarray<float> *_magnoYsaturated;
-
-        // varialbles
-        float _temporalCoefficient;
-
-        // amacrine cells filter : high pass temporal filter
-        void _amacrineCellsComputing(const float *ONinput, const float *OFFinput);
-#ifdef MAKE_PARALLEL
-        /******************************************************
-        ** IF some parallelizing thread methods are available, then, main loops are parallelized using these functors
-        ** ==> main idea paralellise main filters loops, then, only the most used methods are parallelized... TODO : increase the number of parallelised methods as necessary
-        ** ==> functors names = Parallel_$$$ where $$$= the name of the serial method that is parallelised
-        ** ==> functors constructors can differ from the parameters used with their related serial functions
-        */
-        class Parallel_amacrineCellsComputing: public cv::ParallelLoopBody
-        {
-        private:
-            const float *OPL_ON, *OPL_OFF;
-            float *previousInput_ON, *previousInput_OFF, *amacrinCellsTempOutput_ON, *amacrinCellsTempOutput_OFF;
-            float temporalCoefficient;
-        public:
-            Parallel_amacrineCellsComputing(const float *OPL_ON_PTR, const float *OPL_OFF_PTR, float *previousInput_ON_PTR, float *previousInput_OFF_PTR, float *amacrinCellsTempOutput_ON_PTR, float *amacrinCellsTempOutput_OFF_PTR, float temporalCoefficientVal)
-                :OPL_ON(OPL_ON_PTR), OPL_OFF(OPL_OFF_PTR), previousInput_ON(previousInput_ON_PTR), previousInput_OFF(previousInput_OFF_PTR), amacrinCellsTempOutput_ON(amacrinCellsTempOutput_ON_PTR), amacrinCellsTempOutput_OFF(amacrinCellsTempOutput_OFF_PTR), temporalCoefficient(temporalCoefficientVal) {}
-
-            virtual void operator()( const Range& r ) const {
-                register const float *OPL_ON_PTR=OPL_ON+r.start;
-                register const float *OPL_OFF_PTR=OPL_OFF+r.start;
-                register float *previousInput_ON_PTR= previousInput_ON+r.start;
-                register float *previousInput_OFF_PTR= previousInput_OFF+r.start;
-                register float *amacrinCellsTempOutput_ON_PTR= amacrinCellsTempOutput_ON+r.start;
-                register float *amacrinCellsTempOutput_OFF_PTR= amacrinCellsTempOutput_OFF+r.start;
-
-                for (int IDpixel=r.start ; IDpixel!=r.end; ++IDpixel)
-                {
-
-                    /* Compute ON and OFF amacrin cells high pass temporal filter */
-                    float magnoXonPixelResult = temporalCoefficient*(*amacrinCellsTempOutput_ON_PTR+ *OPL_ON_PTR-*previousInput_ON_PTR);
-                    *(amacrinCellsTempOutput_ON_PTR++)=((float)(magnoXonPixelResult>0))*magnoXonPixelResult;
-
-                    float magnoXoffPixelResult = temporalCoefficient*(*amacrinCellsTempOutput_OFF_PTR+ *OPL_OFF_PTR-*previousInput_OFF_PTR);
-                    *(amacrinCellsTempOutput_OFF_PTR++)=((float)(magnoXoffPixelResult>0))*magnoXoffPixelResult;
-
-                    /* prepare next loop */
-                    *(previousInput_ON_PTR++)=*(OPL_ON_PTR++);
-                    *(previousInput_OFF_PTR++)=*(OPL_OFF_PTR++);
-
-                }
-            }
-
-        };
-#endif
-    };
-
-}// end of namespace bioinspired
-}// end of namespace cv
-
-#endif /*MagnoRetinaFilter_H_*/
diff --git a/modules/bioinspired/src/opencl/retina_kernel.cl b/modules/bioinspired/src/opencl/retina_kernel.cl
deleted file mode 100644
index 169be4d27..000000000
--- a/modules/bioinspired/src/opencl/retina_kernel.cl
+++ /dev/null
@@ -1,779 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-//data (which is float) is aligend in 32 bytes
-#define WIDTH_MULTIPLE (32 >> 2)
-
-/////////////////////////////////////////////////////////
-//*******************************************************
-// basicretinafilter
-//////////////// _spatiotemporalLPfilter ////////////////
-//_horizontalCausalFilter_addInput
-kernel void horizontalCausalFilter_addInput(
-    global const float * input,
-    global float * output,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const int in_offset,
-    const int out_offset,
-    const float _tau,
-    const float _a
-)
-{
-    int gid = get_global_id(0);
-    if(gid >= rows)
-    {
-        return;
-    }
-
-    global const float * iptr =
-        input  + mad24(gid, elements_per_row, in_offset / 4);
-    global float * optr =
-        output + mad24(gid, elements_per_row, out_offset / 4);
-
-    float res;
-    float4 in_v4, out_v4, res_v4 = (float4)(0);
-    //vectorize to increase throughput
-    for(int i = 0; i < cols / 4; ++i, iptr += 4, optr += 4)
-    {
-        in_v4  = vload4(0, iptr);
-        out_v4 = vload4(0, optr);
-
-        res_v4.x = in_v4.x + _tau * out_v4.x + _a * res_v4.w;
-        res_v4.y = in_v4.y + _tau * out_v4.y + _a * res_v4.x;
-        res_v4.z = in_v4.z + _tau * out_v4.z + _a * res_v4.y;
-        res_v4.w = in_v4.w + _tau * out_v4.w + _a * res_v4.z;
-
-        vstore4(res_v4, 0, optr);
-    }
-    res = res_v4.w;
-    // there may be left some
-    for(int i = 0; i < cols % 4;  ++i, ++iptr, ++optr)
-    {
-        res = *iptr + _tau * *optr + _a * res;
-        *optr = res;
-    }
-}
-
-//_horizontalAnticausalFilter
-kernel void horizontalAnticausalFilter(
-    global float * output,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const int out_offset,
-    const float _a
-)
-{
-    int gid = get_global_id(0);
-    if(gid >= rows)
-    {
-        return;
-    }
-
-    global float * optr = output +
-                          mad24(gid + 1, elements_per_row, - 1 + out_offset / 4);
-
-    float4 result_v4 = (float4)(0), out_v4;
-    float result = 0;
-    // we assume elements_per_row is multple of WIDTH_MULTIPLE
-    for(int i = 0; i < WIDTH_MULTIPLE; ++ i, -- optr)
-    {
-        if(i >= elements_per_row - cols)
-        {
-            result = *optr + _a * result;
-        }
-        *optr = result;
-    }
-    result_v4.x = result;
-    optr -= 3;
-    for(int i = WIDTH_MULTIPLE / 4; i < elements_per_row / 4; ++i, optr -= 4)
-    {
-        // shift left, `offset` is type `size_t` so it cannot be negative
-        out_v4 = vload4(0, optr);
-
-        result_v4.w = out_v4.w + _a * result_v4.x;
-        result_v4.z = out_v4.z + _a * result_v4.w;
-        result_v4.y = out_v4.y + _a * result_v4.z;
-        result_v4.x = out_v4.x + _a * result_v4.y;
-
-        vstore4(result_v4, 0, optr);
-    }
-}
-
-//_verticalCausalFilter
-kernel void verticalCausalFilter(
-    global float * output,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const int out_offset,
-    const float _a
-)
-{
-    int gid = get_global_id(0);
-    if(gid >= cols)
-    {
-        return;
-    }
-
-    global float * optr = output + gid + out_offset / 4;
-    float result = 0;
-    for(int i = 0; i < rows; ++i, optr += elements_per_row)
-    {
-        result = *optr + _a * result;
-        *optr = result;
-    }
-}
-
-//_verticalCausalFilter
-kernel void verticalAnticausalFilter_multGain(
-    global float * output,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const int out_offset,
-    const float _a,
-    const float _gain
-)
-{
-    int gid = get_global_id(0);
-    if(gid >= cols)
-    {
-        return;
-    }
-
-    global float * optr = output + (rows - 1) * elements_per_row + gid + out_offset / 4;
-    float result = 0;
-    for(int i = 0; i < rows; ++i, optr -= elements_per_row)
-    {
-        result = *optr + _a * result;
-        *optr = _gain * result;
-    }
-}
-//
-// end of _spatiotemporalLPfilter
-/////////////////////////////////////////////////////////////////////
-
-//////////////// horizontalAnticausalFilter_Irregular ////////////////
-kernel void horizontalAnticausalFilter_Irregular(
-    global float * output,
-    global float * buffer,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const int out_offset,
-    const int buffer_offset
-)
-{
-    int gid = get_global_id(0);
-    if(gid >= rows)
-    {
-        return;
-    }
-
-    global float * optr =
-        output + mad24(rows - gid, elements_per_row, -1 + out_offset / 4);
-    global float * bptr =
-        buffer + mad24(rows - gid, elements_per_row, -1 + buffer_offset / 4);
-
-    float4 buf_v4, out_v4, res_v4 = (float4)(0);
-    float result = 0;
-    // we assume elements_per_row is multple of WIDTH_MULTIPLE
-    for(int i = 0; i < WIDTH_MULTIPLE; ++ i, -- optr, -- bptr)
-    {
-        if(i >= elements_per_row - cols)
-        {
-            result = *optr + *bptr * result;
-        }
-        *optr = result;
-    }
-    res_v4.x = result;
-    optr -= 3;
-    bptr -= 3;
-    for(int i = WIDTH_MULTIPLE / 4; i < elements_per_row / 4; ++i, optr -= 4, bptr -= 4)
-    {
-        buf_v4 = vload4(0, bptr);
-        out_v4 = vload4(0, optr);
-
-        res_v4.w = out_v4.w + buf_v4.w * res_v4.x;
-        res_v4.z = out_v4.z + buf_v4.z * res_v4.w;
-        res_v4.y = out_v4.y + buf_v4.y * res_v4.z;
-        res_v4.x = out_v4.x + buf_v4.x * res_v4.y;
-
-        vstore4(res_v4, 0, optr);
-    }
-}
-
-//////////////// verticalCausalFilter_Irregular ////////////////
-kernel void verticalCausalFilter_Irregular(
-    global float * output,
-    global float * buffer,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const int out_offset,
-    const int buffer_offset
-)
-{
-    int gid = get_global_id(0);
-    if(gid >= cols)
-    {
-        return;
-    }
-
-    global float * optr = output + gid + out_offset / 4;
-    global float * bptr = buffer + gid + buffer_offset / 4;
-    float result = 0;
-    for(int i = 0; i < rows; ++i, optr += elements_per_row, bptr += elements_per_row)
-    {
-        result = *optr + *bptr * result;
-        *optr = result;
-    }
-}
-
-//////////////// _adaptiveHorizontalCausalFilter_addInput ////////////////
-kernel void adaptiveHorizontalCausalFilter_addInput(
-    global const float * input,
-    global const float * gradient,
-    global float * output,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const int in_offset,
-    const int grad_offset,
-    const int out_offset
-)
-{
-    int gid = get_global_id(0);
-    if(gid >= rows)
-    {
-        return;
-    }
-
-    global const float * iptr =
-        input + mad24(gid, elements_per_row, in_offset / 4);
-    global const float * gptr =
-        gradient + mad24(gid, elements_per_row, grad_offset / 4);
-    global float * optr =
-        output + mad24(gid, elements_per_row, out_offset / 4);
-
-    float4 in_v4, grad_v4, out_v4, res_v4 = (float4)(0);
-    for(int i = 0; i < cols / 4; ++i, iptr += 4, gptr += 4, optr += 4)
-    {
-        in_v4   = vload4(0, iptr);
-        grad_v4 = vload4(0, gptr);
-
-        res_v4.x = in_v4.x + grad_v4.x * res_v4.w;
-        res_v4.y = in_v4.y + grad_v4.y * res_v4.x;
-        res_v4.z = in_v4.z + grad_v4.z * res_v4.y;
-        res_v4.w = in_v4.w + grad_v4.w * res_v4.z;
-
-        vstore4(res_v4, 0, optr);
-    }
-    for(int i = 0; i < cols % 4; ++i, ++iptr, ++gptr, ++optr)
-    {
-        res_v4.w = *iptr + *gptr * res_v4.w;
-        *optr = res_v4.w;
-    }
-}
-
-//////////////// _adaptiveVerticalAnticausalFilter_multGain ////////////////
-kernel void adaptiveVerticalAnticausalFilter_multGain(
-    global const float * gradient,
-    global float * output,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const int grad_offset,
-    const int out_offset,
-    const float gain
-)
-{
-    int gid = get_global_id(0);
-    if(gid >= cols)
-    {
-        return;
-    }
-
-    int start_idx = mad24(rows - 1, elements_per_row, gid);
-
-    global const float * gptr = gradient + start_idx + grad_offset / 4;
-    global float * optr = output + start_idx + out_offset / 4;
-
-    float result = 0;
-    for(int i = 0; i < rows; ++i, gptr -= elements_per_row, optr -= elements_per_row)
-    {
-        result = *optr + *gptr * result;
-        *optr = gain * result;
-    }
-}
-
-//////////////// _localLuminanceAdaptation ////////////////
-// FIXME:
-//  This kernel seems to have precision problem on GPU
-kernel void localLuminanceAdaptation(
-    global const float * luma,
-    global const float * input,
-    global float * output,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const float _localLuminanceAddon,
-    const float _localLuminanceFactor,
-    const float _maxInputValue
-)
-{
-    int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-    int offset = mad24(gidy, elements_per_row, gidx);
-
-    float X0 = luma[offset] * _localLuminanceFactor + _localLuminanceAddon;
-    float input_val = input[offset];
-    // output of the following line may be different between GPU and CPU
-    output[offset] = (_maxInputValue + X0) * input_val / (input_val + X0 + 0.00000000001f);
-}
-// end of basicretinafilter
-//*******************************************************
-/////////////////////////////////////////////////////////
-
-
-
-/////////////////////////////////////////////////////////
-//******************************************************
-// magno
-// TODO: this kernel has too many buffer accesses, better to make it
-//   vector read/write for fetch efficiency
-kernel void amacrineCellsComputing(
-    global const float * opl_on,
-    global const float * opl_off,
-    global float * prev_in_on,
-    global float * prev_in_off,
-    global float * out_on,
-    global float * out_off,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const float coeff
-)
-{
-    int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-
-    int offset = mad24(gidy, elements_per_row, gidx);
-    opl_on      += offset;
-    opl_off     += offset;
-    prev_in_on  += offset;
-    prev_in_off += offset;
-    out_on      += offset;
-    out_off     += offset;
-
-    float magnoXonPixelResult = coeff * (*out_on + *opl_on - *prev_in_on);
-    *out_on = fmax(magnoXonPixelResult, 0);
-    float magnoXoffPixelResult = coeff * (*out_off + *opl_off - *prev_in_off);
-    *out_off = fmax(magnoXoffPixelResult, 0);
-
-    *prev_in_on = *opl_on;
-    *prev_in_off = *opl_off;
-}
-
-/////////////////////////////////////////////////////////
-//******************************************************
-// parvo
-// TODO: this kernel has too many buffer accesses, needs optimization
-kernel void OPL_OnOffWaysComputing(
-    global float4 * photo_out,
-    global float4 * horiz_out,
-    global float4 * bipol_on,
-    global float4 * bipol_off,
-    global float4 * parvo_on,
-    global float4 * parvo_off,
-    const int cols,
-    const int rows,
-    const int elements_per_row
-)
-{
-    int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx * 4 >= cols || gidy >= rows)
-    {
-        return;
-    }
-    // we assume elements_per_row must be multiples of 4
-    int offset = mad24(gidy, elements_per_row >> 2, gidx);
-    photo_out += offset;
-    horiz_out += offset;
-    bipol_on  += offset;
-    bipol_off += offset;
-    parvo_on  += offset;
-    parvo_off += offset;
-
-    float4 diff = *photo_out - *horiz_out;
-    float4 isPositive;// = convert_float4(diff > (float4)(0.0f, 0.0f, 0.0f, 0.0f));
-    isPositive.x = diff.x > 0.0f;
-    isPositive.y = diff.y > 0.0f;
-    isPositive.z = diff.z > 0.0f;
-    isPositive.w = diff.w > 0.0f;
-    float4 res_on  = isPositive * diff;
-    float4 res_off = (isPositive - (float4)(1.0f)) * diff;
-
-    *bipol_on = res_on;
-    *parvo_on = res_on;
-
-    *bipol_off = res_off;
-    *parvo_off = res_off;
-}
-
-/////////////////////////////////////////////////////////
-//******************************************************
-// retinacolor
-inline int bayerSampleOffset(int step, int rows, int x, int y)
-{
-    return mad24(y, step, x) +
-           ((y % 2) + (x % 2)) * rows * step;
-}
-
-
-/////// colorMultiplexing //////
-kernel void runColorMultiplexingBayer(
-    global const float * input,
-    global float * output,
-    const int cols,
-    const int rows,
-    const int elements_per_row
-)
-{
-    int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-
-    int offset = mad24(gidy, elements_per_row, gidx);
-    output[offset] = input[bayerSampleOffset(elements_per_row, rows, gidx, gidy)];
-}
-
-kernel void runColorDemultiplexingBayer(
-    global const float * input,
-    global float * output,
-    const int cols,
-    const int rows,
-    const int elements_per_row
-)
-{
-    int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-
-    int offset = mad24(gidy, elements_per_row, gidx);
-    output[bayerSampleOffset(elements_per_row, rows, gidx, gidy)] = input[offset];
-}
-
-kernel void demultiplexAssign(
-    global const float * input,
-    global float * output,
-    const int cols,
-    const int rows,
-    const int elements_per_row
-)
-{
-    int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-
-    int offset = bayerSampleOffset(elements_per_row, rows, gidx, gidy);
-    output[offset] = input[offset];
-}
-
-
-//// normalizeGrayOutputCentredSigmoide
-kernel void normalizeGrayOutputCentredSigmoide(
-    global const float * input,
-    global float * output,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const float meanval,
-    const float X0
-)
-
-{
-    int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-    int offset = mad24(gidy, elements_per_row, gidx);
-
-    float input_val = input[offset];
-    output[offset] = meanval +
-                     (meanval + X0) * (input_val - meanval) / (fabs(input_val - meanval) + X0);
-}
-
-//// normalize by photoreceptors density
-kernel void normalizePhotoDensity(
-    global const float * chroma,
-    global const float * colorDensity,
-    global const float * multiplex,
-    global float * luma,
-    global float * demultiplex,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const float pG
-)
-{
-    const int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-    const int offset = mad24(gidy, elements_per_row, gidx);
-    int index = offset;
-
-    float Cr = chroma[index] * colorDensity[index];
-    index += elements_per_row * rows;
-    float Cg = chroma[index] * colorDensity[index];
-    index += elements_per_row * rows;
-    float Cb = chroma[index] * colorDensity[index];
-
-    const float luma_res = (Cr + Cg + Cb) * pG;
-    luma[offset] = luma_res;
-    demultiplex[bayerSampleOffset(elements_per_row, rows, gidx, gidy)] =
-        multiplex[offset] - luma_res;
-}
-
-
-
-//////// computeGradient ///////
-// TODO:
-// this function maybe accelerated by image2d_t or lds
-kernel void computeGradient(
-    global const float * luma,
-    global float * gradient,
-    const int cols,
-    const int rows,
-    const int elements_per_row
-)
-{
-    int gidx = get_global_id(0) + 2, gidy = get_global_id(1) + 2;
-    if(gidx >= cols - 2 || gidy >= rows - 2)
-    {
-        return;
-    }
-    int offset = mad24(gidy, elements_per_row, gidx);
-    luma += offset;
-
-    // horizontal and vertical local gradients
-    const float v_grad = fabs(luma[elements_per_row] - luma[- elements_per_row]);
-    const float h_grad = fabs(luma[1] - luma[-1]);
-
-    // neighborhood horizontal and vertical gradients
-    const float cur_val  = luma[0];
-    const float v_grad_p = fabs(cur_val - luma[- 2 * elements_per_row]);
-    const float h_grad_p = fabs(cur_val - luma[- 2]);
-    const float v_grad_n = fabs(cur_val - luma[2 * elements_per_row]);
-    const float h_grad_n = fabs(cur_val - luma[2]);
-
-    const float horiz_grad = 0.5f * h_grad + 0.25f * (h_grad_p + h_grad_n);
-    const float verti_grad = 0.5f * v_grad + 0.25f * (v_grad_p + v_grad_n);
-    const bool is_vertical_greater = horiz_grad < verti_grad;
-
-    gradient[offset + elements_per_row * rows] = is_vertical_greater ? 0.06f : 0.57f;
-    gradient[offset                          ] = is_vertical_greater ? 0.57f : 0.06f;
-}
-
-
-/////// substractResidual ///////
-kernel void substractResidual(
-    global float * input,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const float pR,
-    const float pG,
-    const float pB
-)
-{
-    const int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-    int indices [3] =
-    {
-        mad24(gidy, elements_per_row, gidx),
-        mad24(gidy + rows, elements_per_row, gidx),
-        mad24(gidy + 2 * rows, elements_per_row, gidx)
-    };
-    float vals[3] = {input[indices[0]], input[indices[1]], input[indices[2]]};
-    float residu = pR * vals[0] + pG * vals[1] + pB * vals[2];
-
-    input[indices[0]] = vals[0] - residu;
-    input[indices[1]] = vals[1] - residu;
-    input[indices[2]] = vals[2] - residu;
-}
-
-///// clipRGBOutput_0_maxInputValue /////
-kernel void clipRGBOutput_0_maxInputValue(
-    global float * input,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const float maxVal
-)
-{
-    const int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-    const int offset = mad24(gidy, elements_per_row, gidx);
-    float val = input[offset];
-    val = clamp(val, 0.0f, maxVal);
-    input[offset] = val;
-}
-
-//// normalizeGrayOutputNearZeroCentreredSigmoide ////
-kernel void normalizeGrayOutputNearZeroCentreredSigmoide(
-    global float * input,
-    global float * output,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const float maxVal,
-    const float X0cube
-)
-{
-    const int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-    const int offset = mad24(gidy, elements_per_row, gidx);
-    float currentCubeLuminance = input[offset];
-    currentCubeLuminance = currentCubeLuminance * currentCubeLuminance * currentCubeLuminance;
-    output[offset] = currentCubeLuminance * X0cube / (X0cube + currentCubeLuminance);
-}
-
-//// centerReductImageLuminance ////
-kernel void centerReductImageLuminance(
-    global float * input,
-    const int cols,
-    const int rows,
-    const int elements_per_row,
-    const float mean,
-    const float std_dev
-)
-{
-    const int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-    const int offset = mad24(gidy, elements_per_row, gidx);
-
-    float val = input[offset];
-    input[offset] = (val - mean) / std_dev;
-}
-
-//// inverseValue ////
-kernel void inverseValue(
-    global float * input,
-    const int cols,
-    const int rows,
-    const int elements_per_row
-)
-{
-    const int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-    const int offset = mad24(gidy, elements_per_row, gidx);
-    input[offset] = 1.f / input[offset];
-}
-
-#define CV_PI 3.1415926535897932384626433832795
-
-//// _processRetinaParvoMagnoMapping ////
-kernel void processRetinaParvoMagnoMapping(
-    global float * parvo,
-    global float * magno,
-    global float * output,
-    const int cols,
-    const int rows,
-    const int halfCols,
-    const int halfRows,
-    const int elements_per_row,
-    const float minDistance
-)
-{
-    const int gidx = get_global_id(0), gidy = get_global_id(1);
-    if(gidx >= cols || gidy >= rows)
-    {
-        return;
-    }
-    const int offset = mad24(gidy, elements_per_row, gidx);
-
-    float distanceToCenter =
-        sqrt(((float)(gidy - halfRows) * (gidy - halfRows) + (gidx - halfCols) * (gidx - halfCols)));
-
-    float a = distanceToCenter < minDistance ?
-              (0.5f + 0.5f * (float)cos(CV_PI * distanceToCenter / minDistance)) : 0;
-    float b = 1.f - a;
-
-    output[offset] = parvo[offset] * a + magno[offset] * b;
-}
diff --git a/modules/bioinspired/src/parvoretinafilter.cpp b/modules/bioinspired/src/parvoretinafilter.cpp
deleted file mode 100644
index a276d97a4..000000000
--- a/modules/bioinspired/src/parvoretinafilter.cpp
+++ /dev/null
@@ -1,233 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-#include "precomp.hpp"
-
-#include "parvoretinafilter.hpp"
-
-// @author Alexandre BENOIT, benoit.alexandre.vision@gmail.com, LISTIC : www.listic.univ-savoie.fr, Gipsa-Lab, France: www.gipsa-lab.inpg.fr/
-
-#include <iostream>
-#include <cmath>
-
-namespace cv
-{
-namespace bioinspired
-{
-//////////////////////////////////////////////////////////
-//                 OPL RETINA FILTER
-//////////////////////////////////////////////////////////
-
-// Constructor and Desctructor of the OPL retina filter
-
-ParvoRetinaFilter::ParvoRetinaFilter(const unsigned int NBrows, const unsigned int NBcolumns)
-:BasicRetinaFilter(NBrows, NBcolumns, 3),
- _photoreceptorsOutput(NBrows*NBcolumns),
- _horizontalCellsOutput(NBrows*NBcolumns),
- _parvocellularOutputON(NBrows*NBcolumns),
- _parvocellularOutputOFF(NBrows*NBcolumns),
- _bipolarCellsOutputON(NBrows*NBcolumns),
- _bipolarCellsOutputOFF(NBrows*NBcolumns),
- _localAdaptationOFF(NBrows*NBcolumns)
-{
-    // link to the required local parent adaptation buffers
-    _localAdaptationON=&_localBuffer;
-    _parvocellularOutputONminusOFF=&_filterOutput;
-    // (*_localAdaptationON)=&_localBuffer;
-    // (*_parvocellularOutputONminusOFF)=&(BasicRetinaFilter::TemplateBuffer);
-
-    // init: set all the values to 0
-    clearAllBuffers();
-
-
-#ifdef OPL_RETINA_ELEMENT_DEBUG
-    std::cout<<"ParvoRetinaFilter::Init OPL retina filter at specified frame size OK\n"<<std::endl;
-#endif
-
-}
-
-ParvoRetinaFilter::~ParvoRetinaFilter()
-{
-
-#ifdef OPL_RETINA_ELEMENT_DEBUG
-    std::cout<<"ParvoRetinaFilter::Delete OPL retina filter OK"<<std::endl;
-#endif
-}
-
-////////////////////////////////////
-// functions of the PARVO filter
-////////////////////////////////////
-
-// function that clears all buffers of the object
-void ParvoRetinaFilter::clearAllBuffers()
-{
-    BasicRetinaFilter::clearAllBuffers();
-    _photoreceptorsOutput=0;
-    _horizontalCellsOutput=0;
-    _parvocellularOutputON=0;
-    _parvocellularOutputOFF=0;
-    _bipolarCellsOutputON=0;
-    _bipolarCellsOutputOFF=0;
-    _localAdaptationOFF=0;
-}
-
-/**
-* resize parvo retina filter object (resize all allocated buffers
-* @param NBrows: the new height size
-* @param NBcolumns: the new width size
-*/
-void ParvoRetinaFilter::resize(const unsigned int NBrows, const unsigned int NBcolumns)
-{
-    BasicRetinaFilter::resize(NBrows, NBcolumns);
-    _photoreceptorsOutput.resize(NBrows*NBcolumns);
-    _horizontalCellsOutput.resize(NBrows*NBcolumns);
-    _parvocellularOutputON.resize(NBrows*NBcolumns);
-    _parvocellularOutputOFF.resize(NBrows*NBcolumns);
-    _bipolarCellsOutputON.resize(NBrows*NBcolumns);
-    _bipolarCellsOutputOFF.resize(NBrows*NBcolumns);
-    _localAdaptationOFF.resize(NBrows*NBcolumns);
-
-    // link to the required local parent adaptation buffers
-    _localAdaptationON=&_localBuffer;
-    _parvocellularOutputONminusOFF=&_filterOutput;
-
-    // clean buffers
-    clearAllBuffers();
-}
-
-// change the parameters of the filter
-void ParvoRetinaFilter::setOPLandParvoFiltersParameters(const float beta1, const float tau1, const float k1, const float beta2, const float tau2, const float k2)
-{
-    // init photoreceptors low pass filter
-    setLPfilterParameters(beta1, tau1, k1);
-    // init horizontal cells low pass filter
-    setLPfilterParameters(beta2, tau2, k2, 1);
-    // init parasol ganglion cells low pass filter (default parameters)
-    setLPfilterParameters(0, tau1, k1, 2);
-
-}
-
-// update/set size of the frames
-
-// run filter for a new frame input
-// output return is (*_parvocellularOutputONminusOFF)
-const std::valarray<float> &ParvoRetinaFilter::runFilter(const std::valarray<float> &inputFrame, const bool useParvoOutput)
-{
-    _spatiotemporalLPfilter(get_data(inputFrame), &_photoreceptorsOutput[0]);
-    _spatiotemporalLPfilter(&_photoreceptorsOutput[0], &_horizontalCellsOutput[0], 1);
-    _OPL_OnOffWaysComputing();
-
-    if (useParvoOutput)
-    {
-        // local adaptation processes on ON and OFF ways
-        _spatiotemporalLPfilter(&_bipolarCellsOutputON[0], &(*_localAdaptationON)[0], 2);
-        _localLuminanceAdaptation(&_parvocellularOutputON[0], &(*_localAdaptationON)[0]);
-
-        _spatiotemporalLPfilter(&_bipolarCellsOutputOFF[0], &_localAdaptationOFF[0], 2);
-        _localLuminanceAdaptation(&_parvocellularOutputOFF[0], &_localAdaptationOFF[0]);
-
-        //// Final loop that computes the main output of this filter
-        //
-        //// loop that makes the difference between photoreceptor cells output and horizontal cells
-        //// positive part goes on the ON way, negative pat goes on the OFF way
-        register float *parvocellularOutputONminusOFF_PTR=&(*_parvocellularOutputONminusOFF)[0];
-        register float *parvocellularOutputON_PTR=&_parvocellularOutputON[0];
-        register float *parvocellularOutputOFF_PTR=&_parvocellularOutputOFF[0];
-
-        for (register unsigned int IDpixel=0 ; IDpixel<_filterOutput.getNBpixels() ; ++IDpixel)
-            *(parvocellularOutputONminusOFF_PTR++)= (*(parvocellularOutputON_PTR++)-*(parvocellularOutputOFF_PTR++));
-    }
-    return (*_parvocellularOutputONminusOFF);
-}
-
-void ParvoRetinaFilter::_OPL_OnOffWaysComputing() // WARNING : this method requires many buffer accesses, parallelizing can increase bandwith & core efficacy
-{
-    // loop that makes the difference between photoreceptor cells output and horizontal cells
-    // positive part goes on the ON way, negative pat goes on the OFF way
-
-#ifdef MAKE_PARALLEL
-        cv::parallel_for_(cv::Range(0,_filterOutput.getNBpixels()), Parallel_OPL_OnOffWaysComputing(&_photoreceptorsOutput[0], &_horizontalCellsOutput[0], &_bipolarCellsOutputON[0], &_bipolarCellsOutputOFF[0], &_parvocellularOutputON[0], &_parvocellularOutputOFF[0]));
-#else
-    float *photoreceptorsOutput_PTR= &_photoreceptorsOutput[0];
-    float *horizontalCellsOutput_PTR= &_horizontalCellsOutput[0];
-    float *bipolarCellsON_PTR = &_bipolarCellsOutputON[0];
-    float *bipolarCellsOFF_PTR = &_bipolarCellsOutputOFF[0];
-    float *parvocellularOutputON_PTR= &_parvocellularOutputON[0];
-    float *parvocellularOutputOFF_PTR= &_parvocellularOutputOFF[0];
-    // compute bipolar cells response equal to photoreceptors minus horizontal cells response
-    // and copy the result on parvo cellular outputs... keeping time before their local contrast adaptation for final result
-    for (register unsigned int IDpixel=0 ; IDpixel<_filterOutput.getNBpixels() ; ++IDpixel)
-    {
-        float pixelDifference = *(photoreceptorsOutput_PTR++) -*(horizontalCellsOutput_PTR++);
-        // test condition to allow write pixelDifference in ON or OFF buffer and 0 in the over
-        float isPositive=(float) (pixelDifference>0.0f);
-
-        // ON and OFF channels writing step
-        *(parvocellularOutputON_PTR++)=*(bipolarCellsON_PTR++) = isPositive*pixelDifference;
-        *(parvocellularOutputOFF_PTR++)=*(bipolarCellsOFF_PTR++)= (isPositive-1.0f)*pixelDifference;
-    }
-#endif
-}
-}// end of namespace bioinspired
-}// end of namespace cv
diff --git a/modules/bioinspired/src/parvoretinafilter.hpp b/modules/bioinspired/src/parvoretinafilter.hpp
deleted file mode 100644
index f5ffa1a06..000000000
--- a/modules/bioinspired/src/parvoretinafilter.hpp
+++ /dev/null
@@ -1,263 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-#ifndef ParvoRetinaFilter_H_
-#define ParvoRetinaFilter_H_
-
-/**
-* @class ParvoRetinaFilter
-* @brief class which describes the OPL retina model and the Inner Plexiform Layer parvocellular channel of the retina:
-* -> performs a contours extraction with powerfull local data enhancement as at the retina level
-* -> spectrum whitening occurs at the OPL (Outer Plexiform Layer) of the retina: corrects the 1/f spectrum tendancy of natural images
-* ---> enhances details with mid spatial frequencies, attenuates low spatial frequencies (luminance), attenuates high temporal frequencies and high spatial frequencies, etc.
-*
-* TYPICAL USE:
-*
-* // create object at a specified picture size
-* ParvoRetinaFilter *contoursExtractor;
-* contoursExtractor =new ParvoRetinaFilter(frameSizeRows, frameSizeColumns);
-*
-* // init gain, spatial and temporal parameters:
-* contoursExtractor->setCoefficientsTable(0, 0.7, 1, 0, 7, 1);
-*
-* // during program execution, call the filter for contours extraction for an input picture called "FrameBuffer":
-* contoursExtractor->runfilter(FrameBuffer);
-*
-* // get the output frame, check in the class description below for more outputs:
-* const float *contours=contoursExtractor->getParvoONminusOFF();
-*
-* // at the end of the program, destroy object:
-* delete contoursExtractor;
-
-* @author Alexandre BENOIT, benoit.alexandre.vision@gmail.com, LISTIC : www.listic.univ-savoie.fr, Gipsa-Lab, France: www.gipsa-lab.inpg.fr/
-* Creation date 2007
-* Based on Alexandre BENOIT thesis: "Le système visuel humain au secours de la vision par ordinateur"
-*
-*/
-
-#include "basicretinafilter.hpp"
-
-
-//#define _OPL_RETINA_ELEMENT_DEBUG
-
-namespace cv
-{
-namespace bioinspired
-{
-//retina classes that derivate from the Basic Retrina class
-class ParvoRetinaFilter: public BasicRetinaFilter
-{
-
-public:
-    /**
-    * constructor parameters are only linked to image input size
-    * @param NBrows: number of rows of the input image
-    * @param NBcolumns: number of columns of the input image
-    */
-    ParvoRetinaFilter(const unsigned int NBrows=480, const unsigned int NBcolumns=640);
-
-    /**
-    * standard desctructor
-    */
-    virtual ~ParvoRetinaFilter();
-
-    /**
-    * resize method, keeps initial parameters, all buffers are flushed
-    * @param NBrows: number of rows of the input image
-    * @param NBcolumns: number of columns of the input image
-    */
-    void resize(const unsigned int NBrows, const unsigned int NBcolumns);
-
-    /**
-    * function that clears all buffers of the object
-    */
-    void clearAllBuffers();
-
-    /**
-    * setup the OPL and IPL parvo channels
-    * @param beta1: gain of the horizontal cells network, if 0, then the mean value of the output is zero, if the parameter is near 1, the amplitude is boosted but it should only be used for values rescaling... if needed
-    * @param tau1: the time constant of the first order low pass filter of the photoreceptors, use it to cut high temporal frequencies (noise or fast motion), unit is frames, typical value is 1 frame
-    * @param k1: the spatial constant of the first order low pass filter of the photoreceptors, use it to cut high spatial frequencies (noise or thick contours), unit is pixels, typical value is 1 pixel
-    * @param beta2: gain of the horizontal cells network, if 0, then the mean value of the output is zero, if the parameter is near 1, then, the luminance is not filtered and is still reachable at the output, typicall value is 0
-    * @param tau2: the time constant of the first order low pass filter of the horizontal cells, use it to cut low temporal frequencies (local luminance variations), unit is frames, typical value is 1 frame, as the photoreceptors
-    * @param k2: the spatial constant of the first order low pass filter of the horizontal cells, use it to cut low spatial frequencies (local luminance), unit is pixels, typical value is 5 pixel, this value is also used for local contrast computing when computing the local contrast adaptation at the ganglion cells level (Inner Plexiform Layer parvocellular channel model)
-    */
-    void setOPLandParvoFiltersParameters(const float beta1, const float tau1, const float k1, const float beta2, const float tau2, const float k2);
-
-    /**
-    * setup more precisely the low pass filter used for the ganglion cells low pass filtering (used for local luminance adaptation)
-    * @param tau: time constant of the filter (unit is frame for video processing)
-    * @param k: spatial constant of the filter (unit is pixels)
-    */
-    void setGanglionCellsLocalAdaptationLPfilterParameters(const float tau, const float k){BasicRetinaFilter::setLPfilterParameters(0, tau, k, 2);}; // change the parameters of the filter
-
-
-    /**
-    * launch filter that runs the OPL spatiotemporal filtering and optionally finalizes IPL Pagno filter (model of the Parvocellular channel of the Inner Plexiform Layer of the retina)
-    * @param inputFrame: the input image to be processed, this can be the direct gray level input frame, but a better efficacy is expected if the input is preliminary processed by the photoreceptors local adaptation possible to acheive with the help of a BasicRetinaFilter object
-    * @param useParvoOutput: set true if the final IPL filtering step has to be computed (local contrast enhancement)
-    * @return the processed Parvocellular channel output (updated only if useParvoOutput is true)
-    * @details: in any case, after this function call, photoreceptors and horizontal cells output are updated, use getPhotoreceptorsLPfilteringOutput() and getHorizontalCellsOutput() to get them
-    * also, bipolar cells output are accessible (difference between photoreceptors and horizontal cells, ON output has positive values, OFF ouput has negative values), use the following access methods: getBipolarCellsON() and getBipolarCellsOFF()if useParvoOutput is true,
-    * if useParvoOutput is true, the complete Parvocellular channel is computed, more outputs are updated and can be accessed threw: getParvoON(), getParvoOFF() and their difference with getOutput()
-    */
-    const std::valarray<float> &runFilter(const std::valarray<float> &inputFrame, const bool useParvoOutput=true); // output return is _parvocellularOutputONminusOFF
-
-    /**
-    * @return the output of the photoreceptors filtering step (high cut frequency spatio-temporal low pass filter)
-    */
-    inline const std::valarray<float> &getPhotoreceptorsLPfilteringOutput() const {return _photoreceptorsOutput;};
-
-    /**
-    * @return the output of the photoreceptors filtering step (low cut frequency spatio-temporal low pass filter)
-    */
-    inline const std::valarray<float> &getHorizontalCellsOutput() const { return _horizontalCellsOutput;};
-
-    /**
-    * @return the output Parvocellular ON channel of the retina model
-    */
-    inline const std::valarray<float> &getParvoON() const {return _parvocellularOutputON;};
-
-    /**
-    * @return the output Parvocellular OFF channel of the retina model
-    */
-    inline const std::valarray<float> &getParvoOFF() const {return _parvocellularOutputOFF;};
-
-    /**
-    * @return the output of the Bipolar cells of the ON channel of the retina model same as function getParvoON() but without luminance local adaptation
-    */
-    inline const std::valarray<float> &getBipolarCellsON() const {return _bipolarCellsOutputON;};
-
-    /**
-    * @return the output of the Bipolar cells of the OFF channel of the retina model same as function getParvoON() but without luminance local adaptation
-    */
-    inline const std::valarray<float> &getBipolarCellsOFF() const {return _bipolarCellsOutputOFF;};
-
-    /**
-    * @return the photoreceptors's temporal constant
-    */
-    inline float getPhotoreceptorsTemporalConstant(){return this->_filteringCoeficientsTable[2];};
-
-    /**
-    * @return the horizontal cells' temporal constant
-    */
-    inline float getHcellsTemporalConstant(){return this->_filteringCoeficientsTable[5];};
-
-private:
-    // template buffers
-    std::valarray <float>_photoreceptorsOutput;
-    std::valarray <float>_horizontalCellsOutput;
-    std::valarray <float>_parvocellularOutputON;
-    std::valarray <float>_parvocellularOutputOFF;
-    std::valarray <float>_bipolarCellsOutputON;
-    std::valarray <float>_bipolarCellsOutputOFF;
-    std::valarray <float>_localAdaptationOFF;
-    std::valarray <float> *_localAdaptationON;
-    TemplateBuffer<float> *_parvocellularOutputONminusOFF;
-    // private functions
-    void _OPL_OnOffWaysComputing();
-
-#ifdef MAKE_PARALLEL
-/******************************************************
-** IF some parallelizing thread methods are available, then, main loops are parallelized using these functors
-** ==> main idea paralellise main filters loops, then, only the most used methods are parallelized... TODO : increase the number of parallelised methods as necessary
-** ==> functors names = Parallel_$$$ where $$$= the name of the serial method that is parallelised
-** ==> functors constructors can differ from the parameters used with their related serial functions
-*/
-    class Parallel_OPL_OnOffWaysComputing: public cv::ParallelLoopBody
-    {
-    private:
-    float *photoreceptorsOutput, *horizontalCellsOutput, *bipolarCellsON, *bipolarCellsOFF, *parvocellularOutputON, *parvocellularOutputOFF;
-    public:
-        Parallel_OPL_OnOffWaysComputing(float *photoreceptorsOutput_PTR, float *horizontalCellsOutput_PTR, float *bipolarCellsON_PTR, float *bipolarCellsOFF_PTR, float *parvocellularOutputON_PTR, float *parvocellularOutputOFF_PTR)
-        :photoreceptorsOutput(photoreceptorsOutput_PTR), horizontalCellsOutput(horizontalCellsOutput_PTR), bipolarCellsON(bipolarCellsON_PTR), bipolarCellsOFF(bipolarCellsOFF_PTR), parvocellularOutputON(parvocellularOutputON_PTR), parvocellularOutputOFF(parvocellularOutputOFF_PTR) {}
-
-        virtual void operator()( const Range& r ) const {
-        // compute bipolar cells response equal to photoreceptors minus horizontal cells response
-        // and copy the result on parvo cellular outputs... keeping time before their local contrast adaptation for final result
-        float *photoreceptorsOutput_PTR= photoreceptorsOutput+r.start;
-        float *horizontalCellsOutput_PTR= horizontalCellsOutput+r.start;
-        float *bipolarCellsON_PTR = bipolarCellsON+r.start;
-        float *bipolarCellsOFF_PTR = bipolarCellsOFF+r.start;
-        float *parvocellularOutputON_PTR= parvocellularOutputON+r.start;
-        float *parvocellularOutputOFF_PTR= parvocellularOutputOFF+r.start;
-
-            for (register int IDpixel=r.start ; IDpixel!=r.end ; ++IDpixel)
-        {
-        float pixelDifference = *(photoreceptorsOutput_PTR++) -*(horizontalCellsOutput_PTR++);
-        // test condition to allow write pixelDifference in ON or OFF buffer and 0 in the over
-        float isPositive=(float) (pixelDifference>0.0f);
-
-        // ON and OFF channels writing step
-        *(parvocellularOutputON_PTR++)=*(bipolarCellsON_PTR++) = isPositive*pixelDifference;
-        *(parvocellularOutputOFF_PTR++)=*(bipolarCellsOFF_PTR++)= (isPositive-1.0f)*pixelDifference;
-        }
-        }
-    };
-#endif
-
-};
-}// end of namespace bioinspired
-}// end of namespace cv
-#endif
diff --git a/modules/bioinspired/src/precomp.hpp b/modules/bioinspired/src/precomp.hpp
deleted file mode 100644
index 61aeb5409..000000000
--- a/modules/bioinspired/src/precomp.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_PRECOMP_H__
-#define __OPENCV_PRECOMP_H__
-
-#include "opencv2/opencv_modules.hpp"
-#include "opencv2/bioinspired.hpp"
-#include "opencv2/core/utility.hpp"
-#include "opencv2/core/private.hpp"
-#include "opencv2/core/ocl.hpp"
-
-#include <valarray>
-
-#ifdef HAVE_OPENCV_OCL
-    #include "opencv2/ocl/private/util.hpp"
-#endif
-
-namespace cv
-{
-
-// special function to get pointer to constant valarray elements, since
-// simple &arr[0] does not compile on VS2005/VS2008.
-template<typename T> inline const T* get_data(const std::valarray<T>& arr)
-{ return &((std::valarray<T>&)arr)[0]; }
-
-}
-
-#endif
diff --git a/modules/bioinspired/src/retina.cpp b/modules/bioinspired/src/retina.cpp
deleted file mode 100644
index d2193ea3e..000000000
--- a/modules/bioinspired/src/retina.cpp
+++ /dev/null
@@ -1,743 +0,0 @@
-/*#******************************************************************************
- ** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
- **
- ** By downloading, copying, installing or using the software you agree to this license.
- ** If you do not agree to this license, do not download, install,
- ** copy or use the software.
- **
- **
- ** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
- ** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
- **
- ** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
- **
- **  Creation - enhancement process 2007-2011
- **      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
- **
- ** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
- ** Refer to the following research paper for more information:
- ** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
- ** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
- ** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
- **
- ** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
- ** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
- ** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
- ** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
- ** ====> more informations in the above cited Jeanny Heraults's book.
- **
- **                          License Agreement
- **               For Open Source Computer Vision Library
- **
- ** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
- ** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
- **
- **               For Human Visual System tools (bioinspired)
- ** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
- **
- ** Third party copyrights are property of their respective owners.
- **
- ** Redistribution and use in source and binary forms, with or without modification,
- ** are permitted provided that the following conditions are met:
- **
- ** * Redistributions of source code must retain the above copyright notice,
- **    this list of conditions and the following disclaimer.
- **
- ** * Redistributions in binary form must reproduce the above copyright notice,
- **    this list of conditions and the following disclaimer in the documentation
- **    and/or other materials provided with the distribution.
- **
- ** * The name of the copyright holders may not be used to endorse or promote products
- **    derived from this software without specific prior written permission.
- **
- ** This software is provided by the copyright holders and contributors "as is" and
- ** any express or implied warranties, including, but not limited to, the implied
- ** warranties of merchantability and fitness for a particular purpose are disclaimed.
- ** In no event shall the Intel Corporation or contributors be liable for any direct,
- ** indirect, incidental, special, exemplary, or consequential damages
- ** (including, but not limited to, procurement of substitute goods or services;
- ** loss of use, data, or profits; or business interruption) however caused
- ** and on any theory of liability, whether in contract, strict liability,
- ** or tort (including negligence or otherwise) arising in any way out of
- ** the use of this software, even if advised of the possibility of such damage.
- *******************************************************************************/
-
-/*
- * Retina.cpp
- *
- *  Created on: Jul 19, 2011
- *      Author: Alexandre Benoit
- */
-#include "precomp.hpp"
-#include "retinafilter.hpp"
-#include <cstdio>
-#include <sstream>
-#include <valarray>
-
-namespace cv
-{
-namespace bioinspired
-{
-
-class RetinaImpl : public Retina
-{
-public:
-    /**
-     * Main constructor with most commun use setup : create an instance of color ready retina model
-     * @param inputSize : the input frame size
-     */
-    RetinaImpl(Size inputSize);
-
-    /**
-     * Complete Retina filter constructor which allows all basic structural parameters definition
-         * @param inputSize : the input frame size
-     * @param colorMode : the chosen processing mode : with or without color processing
-     * @param colorSamplingMethod: specifies which kind of color sampling will be used
-     * @param useRetinaLogSampling: activate retina log sampling, if true, the 2 following parameters can be used
-     * @param reductionFactor: only usefull if param useRetinaLogSampling=true, specifies the reduction factor of the output frame (as the center (fovea) is high resolution and corners can be underscaled, then a reduction of the output is allowed without precision leak
-     * @param samplingStrenght: only usefull if param useRetinaLogSampling=true, specifies the strenght of the log scale that is applied
-     */
-    RetinaImpl(Size inputSize, const bool colorMode, int colorSamplingMethod=RETINA_COLOR_BAYER, const bool useRetinaLogSampling=false, const double reductionFactor=1.0, const double samplingStrenght=10.0);
-
-    virtual ~RetinaImpl();
-    /**
-        * retreive retina input buffer size
-        */
-        Size getInputSize();
-
-    /**
-        * retreive retina output buffer size
-        */
-        Size getOutputSize();
-
-    /**
-     * try to open an XML retina parameters file to adjust current retina instance setup
-     * => if the xml file does not exist, then default setup is applied
-     * => warning, Exceptions are thrown if read XML file is not valid
-     * @param retinaParameterFile : the parameters filename
-         * @param applyDefaultSetupOnFailure : set to true if an error must be thrown on error
-     */
-    void setup(String retinaParameterFile="", const bool applyDefaultSetupOnFailure=true);
-
-
-    /**
-     * try to open an XML retina parameters file to adjust current retina instance setup
-     * => if the xml file does not exist, then default setup is applied
-     * => warning, Exceptions are thrown if read XML file is not valid
-     * @param fs : the open Filestorage which contains retina parameters
-         * @param applyDefaultSetupOnFailure : set to true if an error must be thrown on error
-     */
-        void setup(cv::FileStorage &fs, const bool applyDefaultSetupOnFailure=true);
-
-    /**
-     * try to open an XML retina parameters file to adjust current retina instance setup
-     * => if the xml file does not exist, then default setup is applied
-     * => warning, Exceptions are thrown if read XML file is not valid
-     * @param newParameters : a parameters structures updated with the new target configuration
-         * @param applyDefaultSetupOnFailure : set to true if an error must be thrown on error
-     */
-    void setup(Retina::RetinaParameters newParameters);
-
-    /**
-    * @return the current parameters setup
-    */
-    struct Retina::RetinaParameters getParameters();
-
-    /**
-     * parameters setup display method
-     * @return a string which contains formatted parameters information
-     */
-    const String printSetup();
-
-    /**
-     * write xml/yml formated parameters information
-     * @rparam fs : the filename of the xml file that will be open and writen with formatted parameters information
-     */
-    virtual void write( String fs ) const;
-
-
-    /**
-     * write xml/yml formated parameters information
-     * @param fs : a cv::Filestorage object ready to be filled
-         */
-    virtual void write( FileStorage& fs ) const;
-
-    /**
-     * setup the OPL and IPL parvo channels (see biologocal model)
-     * OPL is referred as Outer Plexiform Layer of the retina, it allows the spatio-temporal filtering which withens the spectrum and reduces spatio-temporal noise while attenuating global luminance (low frequency energy)
-     * IPL parvo is the OPL next processing stage, it refers to Inner Plexiform layer of the retina, it allows high contours sensitivity in foveal vision.
-     * for more informations, please have a look at the paper Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-     * @param colorMode : specifies if (true) color is processed of not (false) to then processing gray level image
-     * @param normaliseOutput : specifies if (true) output is rescaled between 0 and 255 of not (false)
-     * @param photoreceptorsLocalAdaptationSensitivity: the photoreceptors sensitivity renage is 0-1 (more log compression effect when value increases)
-     * @param photoreceptorsTemporalConstant: the time constant of the first order low pass filter of the photoreceptors, use it to cut high temporal frequencies (noise or fast motion), unit is frames, typical value is 1 frame
-     * @param photoreceptorsSpatialConstant: the spatial constant of the first order low pass filter of the photoreceptors, use it to cut high spatial frequencies (noise or thick contours), unit is pixels, typical value is 1 pixel
-     * @param horizontalCellsGain: gain of the horizontal cells network, if 0, then the mean value of the output is zero, if the parameter is near 1, then, the luminance is not filtered and is still reachable at the output, typicall value is 0
-     * @param HcellsTemporalConstant: the time constant of the first order low pass filter of the horizontal cells, use it to cut low temporal frequencies (local luminance variations), unit is frames, typical value is 1 frame, as the photoreceptors
-     * @param HcellsSpatialConstant: the spatial constant of the first order low pass filter of the horizontal cells, use it to cut low spatial frequencies (local luminance), unit is pixels, typical value is 5 pixel, this value is also used for local contrast computing when computing the local contrast adaptation at the ganglion cells level (Inner Plexiform Layer parvocellular channel model)
-     * @param ganglionCellsSensitivity: the compression strengh of the ganglion cells local adaptation output, set a value between 160 and 250 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 230
-     */
-    void setupOPLandIPLParvoChannel(const bool colorMode=true, const bool normaliseOutput = true, const float photoreceptorsLocalAdaptationSensitivity=0.7, const float photoreceptorsTemporalConstant=0.5, const float photoreceptorsSpatialConstant=0.53, const float horizontalCellsGain=0, const float HcellsTemporalConstant=1, const float HcellsSpatialConstant=7, const float ganglionCellsSensitivity=0.7);
-
-    /**
-     * set parameters values for the Inner Plexiform Layer (IPL) magnocellular channel
-     * this channel processes signals outpint from OPL processing stage in peripheral vision, it allows motion information enhancement. It is decorrelated from the details channel. See reference paper for more details.
-     * @param normaliseOutput : specifies if (true) output is rescaled between 0 and 255 of not (false)
-     * @param parasolCells_beta: the low pass filter gain used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), typical value is 0
-     * @param parasolCells_tau: the low pass filter time constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is frame, typical value is 0 (immediate response)
-     * @param parasolCells_k: the low pass filter spatial constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is pixels, typical value is 5
-     * @param amacrinCellsTemporalCutFrequency: the time constant of the first order high pass fiter of the magnocellular way (motion information channel), unit is frames, tipicall value is 5
-     * @param V0CompressionParameter: the compression strengh of the ganglion cells local adaptation output, set a value between 160 and 250 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 200
-     * @param localAdaptintegration_tau: specifies the temporal constant of the low pas filter involved in the computation of the local "motion mean" for the local adaptation computation
-     * @param localAdaptintegration_k: specifies the spatial constant of the low pas filter involved in the computation of the local "motion mean" for the local adaptation computation
-     */
-    void setupIPLMagnoChannel(const bool normaliseOutput = true, const float parasolCells_beta=0, const float parasolCells_tau=0, const float parasolCells_k=7, const float amacrinCellsTemporalCutFrequency=1.2, const float V0CompressionParameter=0.95, const float localAdaptintegration_tau=0, const float localAdaptintegration_k=7);
-
-    /**
-     * method which allows retina to be applied on an input image, after run, encapsulated retina module is ready to deliver its outputs using dedicated acccessors, see getParvo and getMagno methods
-     * @param inputImage : the input cv::Mat image to be processed, can be gray level or BGR coded in any format (from 8bit to 16bits)
-     */
-    void run(InputArray inputImage);
-
-    /**
-     * method that applies a luminance correction (initially High Dynamic Range (HDR) tone mapping) using only the 2 local adaptation stages of the retina parvo channel : photoreceptors level and ganlion cells level. Spatio temporal filtering is applied but limited to temporal smoothing and eventually high frequencies attenuation. This is a lighter method than the one available using the regular run method. It is then faster but it does not include complete temporal filtering nor retina spectral whitening. This is an adptation of the original still image HDR tone mapping algorithm of David Alleyson, Sabine Susstruck and Laurence Meylan's work, please cite:
-    * -> Meylan L., Alleysson D., and Susstrunk S., A Model of Retinal Local Adaptation for the Tone Mapping of Color Filter Array Images, Journal of Optical Society of America, A, Vol. 24, N 9, September, 1st, 2007, pp. 2807-2816
-     @param inputImage the input image to process RGB or gray levels
-     @param outputToneMappedImage the output tone mapped image
-     */
-    void applyFastToneMapping(InputArray inputImage, OutputArray outputToneMappedImage);
-
-    /**
-     * accessor of the details channel of the retina (models foveal vision)
-     * @param retinaOutput_parvo : the output buffer (reallocated if necessary), this output is rescaled for standard 8bits image processing use in OpenCV
-     */
-    void getParvo(OutputArray retinaOutput_parvo);
-
-    /**
-     * accessor of the details channel of the retina (models foveal vision)
-     * @param retinaOutput_parvo : a cv::Mat header filled with the internal parvo buffer of the retina module. This output is the original retina filter model output, without any quantification or rescaling
-     */
-    void getParvoRAW(OutputArray retinaOutput_parvo);
-
-    /**
-     * accessor of the motion channel of the retina (models peripheral vision)
-     * @param retinaOutput_magno : the output buffer (reallocated if necessary), this output is rescaled for standard 8bits image processing use in OpenCV
-     */
-    void getMagno(OutputArray retinaOutput_magno);
-
-    /**
-     * accessor of the motion channel of the retina (models peripheral vision)
-     * @param retinaOutput_magno : a cv::Mat header filled with the internal retina magno buffer of the retina module. This output is the original retina filter model output, without any quantification or rescaling
-     */
-    void getMagnoRAW(OutputArray retinaOutput_magno);
-
-    // original API level data accessors : get buffers addresses from a Mat header, similar to getParvoRAW and getMagnoRAW...
-    const Mat getMagnoRAW() const;
-    const Mat getParvoRAW() const;
-
-    /**
-     * activate color saturation as the final step of the color demultiplexing process
-     * -> this saturation is a sigmoide function applied to each channel of the demultiplexed image.
-     * @param saturateColors: boolean that activates color saturation (if true) or desactivate (if false)
-     * @param colorSaturationValue: the saturation factor
-     */
-    void setColorSaturation(const bool saturateColors=true, const float colorSaturationValue=4.0);
-
-    /**
-     * clear all retina buffers (equivalent to opening the eyes after a long period of eye close ;o)
-     */
-    void clearBuffers();
-
-        /**
-        * Activate/desactivate the Magnocellular pathway processing (motion information extraction), by default, it is activated
-        * @param activate: true if Magnocellular output should be activated, false if not
-        */
-        void activateMovingContoursProcessing(const bool activate);
-
-        /**
-        * Activate/desactivate the Parvocellular pathway processing (contours information extraction), by default, it is activated
-        * @param activate: true if Parvocellular (contours information extraction) output should be activated, false if not
-        */
-        void activateContoursProcessing(const bool activate);
-private:
-
-    // Parameteres setup members
-    RetinaParameters _retinaParameters; // structure of parameters
-
-    // Retina model related modules
-    std::valarray<float> _inputBuffer; //!< buffer used to convert input cv::Mat to internal retina buffers format (valarrays)
-
-    // pointer to retina model
-    RetinaFilter* _retinaFilter; //!< the pointer to the retina module, allocated with instance construction
-
-    //! private method called by constructors, gathers their parameters and use them in a unified way
-    void _init(const Size inputSize, const bool colorMode, int colorSamplingMethod=RETINA_COLOR_BAYER, const bool useRetinaLogSampling=false, const double reductionFactor=1.0, const double samplingStrenght=10.0);
-
-    /**
-     * exports a valarray buffer outing from bioinspired objects to a cv::Mat in CV_8UC1 (gray level picture) or CV_8UC3 (color) format
-     * @param grayMatrixToConvert the valarray to export to OpenCV
-     * @param nbRows : the number of rows of the valarray flatten matrix
-     * @param nbColumns : the number of rows of the valarray flatten matrix
-     * @param colorMode : a flag which mentions if matrix is color (true) or graylevel (false)
-     * @param outBuffer : the output matrix which is reallocated to satisfy Retina output buffer dimensions
-     */
-    void _convertValarrayBuffer2cvMat(const std::valarray<float> &grayMatrixToConvert, const unsigned int nbRows, const unsigned int nbColumns, const bool colorMode, OutputArray outBuffer);
-
-    /**
-     * convert a cv::Mat to a valarray buffer in float format
-     * @param inputMatToConvert : the OpenCV cv::Mat that has to be converted to gray or RGB valarray buffer that will be processed by the retina model
-     * @param outputValarrayMatrix : the output valarray
-     * @return the input image color mode (color=true, gray levels=false)
-     */
-    bool _convertCvMat2ValarrayBuffer(InputArray inputMatToConvert, std::valarray<float> &outputValarrayMatrix);
-
-
-};
-
-// smart pointers allocation :
-Ptr<Retina> createRetina(Size inputSize){ return makePtr<RetinaImpl>(inputSize); }
-Ptr<Retina> createRetina(Size inputSize, const bool colorMode, int colorSamplingMethod, const bool useRetinaLogSampling, const double reductionFactor, const double samplingStrenght){
-    return makePtr<RetinaImpl>(inputSize, colorMode, colorSamplingMethod, useRetinaLogSampling, reductionFactor, samplingStrenght);
-}
-
-
-// RetinaImpl code
-RetinaImpl::RetinaImpl(const cv::Size inputSz)
-{
-    _retinaFilter = 0;
-    _init(inputSz, true, RETINA_COLOR_BAYER, false);
-}
-
-RetinaImpl::RetinaImpl(const cv::Size inputSz, const bool colorMode, int colorSamplingMethod, const bool useRetinaLogSampling, const double reductionFactor, const double samplingStrenght)
-{
-    _retinaFilter = 0;
-    _init(inputSz, colorMode, colorSamplingMethod, useRetinaLogSampling, reductionFactor, samplingStrenght);
-};
-
-RetinaImpl::~RetinaImpl()
-{
-    if (_retinaFilter)
-        delete _retinaFilter;
-}
-
-/**
-* retreive retina input buffer size
-*/
-Size RetinaImpl::getInputSize(){return cv::Size(_retinaFilter->getInputNBcolumns(), _retinaFilter->getInputNBrows());}
-
-/**
-* retreive retina output buffer size
-*/
-Size RetinaImpl::getOutputSize(){return cv::Size(_retinaFilter->getOutputNBcolumns(), _retinaFilter->getOutputNBrows());}
-
-
-void RetinaImpl::setColorSaturation(const bool saturateColors, const float colorSaturationValue)
-{
-    _retinaFilter->setColorSaturation(saturateColors, colorSaturationValue);
-}
-
-struct Retina::RetinaParameters RetinaImpl::getParameters(){return _retinaParameters;}
-
-void RetinaImpl::setup(String retinaParameterFile, const bool applyDefaultSetupOnFailure)
-{
-    try
-    {
-        // opening retinaParameterFile in read mode
-        cv::FileStorage fs(retinaParameterFile, cv::FileStorage::READ);
-        setup(fs, applyDefaultSetupOnFailure);
-    }
-    catch(Exception &e)
-    {
-        printf("Retina::setup: wrong/unappropriate xml parameter file : error report :`n=>%s\n", e.what());
-        if (applyDefaultSetupOnFailure)
-        {
-            printf("Retina::setup: resetting retina with default parameters\n");
-            setupOPLandIPLParvoChannel();
-            setupIPLMagnoChannel();
-        }
-        else
-        {
-            printf("=> keeping current parameters\n");
-        }
-    }
-}
-
-void RetinaImpl::setup(cv::FileStorage &fs, const bool applyDefaultSetupOnFailure)
-{
-    try
-    {
-        // read parameters file if it exists or apply default setup if asked for
-        if (!fs.isOpened())
-        {
-            printf("Retina::setup: provided parameters file could not be open... skeeping configuration\n");
-            return;
-            // implicit else case : retinaParameterFile could be open (it exists at least)
-        }
-                // OPL and Parvo init first... update at the same time the parameters structure and the retina core
-        cv::FileNode rootFn = fs.root(), currFn=rootFn["OPLandIPLparvo"];
-        currFn["colorMode"]>>_retinaParameters.OPLandIplParvo.colorMode;
-        currFn["normaliseOutput"]>>_retinaParameters.OPLandIplParvo.normaliseOutput;
-        currFn["photoreceptorsLocalAdaptationSensitivity"]>>_retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity;
-        currFn["photoreceptorsTemporalConstant"]>>_retinaParameters.OPLandIplParvo.photoreceptorsTemporalConstant;
-        currFn["photoreceptorsSpatialConstant"]>>_retinaParameters.OPLandIplParvo.photoreceptorsSpatialConstant;
-        currFn["horizontalCellsGain"]>>_retinaParameters.OPLandIplParvo.horizontalCellsGain;
-        currFn["hcellsTemporalConstant"]>>_retinaParameters.OPLandIplParvo.hcellsTemporalConstant;
-        currFn["hcellsSpatialConstant"]>>_retinaParameters.OPLandIplParvo.hcellsSpatialConstant;
-        currFn["ganglionCellsSensitivity"]>>_retinaParameters.OPLandIplParvo.ganglionCellsSensitivity;
-        setupOPLandIPLParvoChannel(_retinaParameters.OPLandIplParvo.colorMode, _retinaParameters.OPLandIplParvo.normaliseOutput, _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity, _retinaParameters.OPLandIplParvo.photoreceptorsTemporalConstant, _retinaParameters.OPLandIplParvo.photoreceptorsSpatialConstant, _retinaParameters.OPLandIplParvo.horizontalCellsGain, _retinaParameters.OPLandIplParvo.hcellsTemporalConstant, _retinaParameters.OPLandIplParvo.hcellsSpatialConstant, _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity);
-
-        // init retina IPL magno setup... update at the same time the parameters structure and the retina core
-        currFn=rootFn["IPLmagno"];
-        currFn["normaliseOutput"]>>_retinaParameters.IplMagno.normaliseOutput;
-        currFn["parasolCells_beta"]>>_retinaParameters.IplMagno.parasolCells_beta;
-        currFn["parasolCells_tau"]>>_retinaParameters.IplMagno.parasolCells_tau;
-        currFn["parasolCells_k"]>>_retinaParameters.IplMagno.parasolCells_k;
-        currFn["amacrinCellsTemporalCutFrequency"]>>_retinaParameters.IplMagno.amacrinCellsTemporalCutFrequency;
-        currFn["V0CompressionParameter"]>>_retinaParameters.IplMagno.V0CompressionParameter;
-        currFn["localAdaptintegration_tau"]>>_retinaParameters.IplMagno.localAdaptintegration_tau;
-        currFn["localAdaptintegration_k"]>>_retinaParameters.IplMagno.localAdaptintegration_k;
-
-        setupIPLMagnoChannel(_retinaParameters.IplMagno.normaliseOutput, _retinaParameters.IplMagno.parasolCells_beta, _retinaParameters.IplMagno.parasolCells_tau, _retinaParameters.IplMagno.parasolCells_k, _retinaParameters.IplMagno.amacrinCellsTemporalCutFrequency,_retinaParameters.IplMagno.V0CompressionParameter, _retinaParameters.IplMagno.localAdaptintegration_tau, _retinaParameters.IplMagno.localAdaptintegration_k);
-
-    }catch(Exception &e)
-    {
-        printf("RetinaImpl::setup: resetting retina with default parameters\n");
-        if (applyDefaultSetupOnFailure)
-        {
-            setupOPLandIPLParvoChannel();
-            setupIPLMagnoChannel();
-        }
-        printf("Retina::setup: wrong/unappropriate xml parameter file : error report :`n=>%s\n", e.what());
-        printf("=> keeping current parameters\n");
-    }
-
-    // report current configuration
-    printf("%s\n", printSetup().c_str());
-}
-
-void RetinaImpl::setup(Retina::RetinaParameters newConfiguration)
-{
-    // simply copy structures
-    memcpy(&_retinaParameters, &newConfiguration, sizeof(Retina::RetinaParameters));
-    // apply setup
-    setupOPLandIPLParvoChannel(_retinaParameters.OPLandIplParvo.colorMode, _retinaParameters.OPLandIplParvo.normaliseOutput, _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity, _retinaParameters.OPLandIplParvo.photoreceptorsTemporalConstant, _retinaParameters.OPLandIplParvo.photoreceptorsSpatialConstant, _retinaParameters.OPLandIplParvo.horizontalCellsGain, _retinaParameters.OPLandIplParvo.hcellsTemporalConstant, _retinaParameters.OPLandIplParvo.hcellsSpatialConstant, _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity);
-    setupIPLMagnoChannel(_retinaParameters.IplMagno.normaliseOutput, _retinaParameters.IplMagno.parasolCells_beta, _retinaParameters.IplMagno.parasolCells_tau, _retinaParameters.IplMagno.parasolCells_k, _retinaParameters.IplMagno.amacrinCellsTemporalCutFrequency,_retinaParameters.IplMagno.V0CompressionParameter, _retinaParameters.IplMagno.localAdaptintegration_tau, _retinaParameters.IplMagno.localAdaptintegration_k);
-
-}
-
-const String RetinaImpl::printSetup()
-{
-    std::stringstream outmessage;
-
-    // displaying OPL and IPL parvo setup
-    outmessage<<"Current Retina instance setup :"
-            <<"\nOPLandIPLparvo"<<"{"
-            << "\n\t colorMode : " << _retinaParameters.OPLandIplParvo.colorMode
-            << "\n\t normalizeParvoOutput :" << _retinaParameters.OPLandIplParvo.normaliseOutput
-            << "\n\t photoreceptorsLocalAdaptationSensitivity : " << _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity
-            << "\n\t photoreceptorsTemporalConstant : " << _retinaParameters.OPLandIplParvo.photoreceptorsTemporalConstant
-            << "\n\t photoreceptorsSpatialConstant : " << _retinaParameters.OPLandIplParvo.photoreceptorsSpatialConstant
-            << "\n\t horizontalCellsGain : " << _retinaParameters.OPLandIplParvo.horizontalCellsGain
-            << "\n\t hcellsTemporalConstant : " << _retinaParameters.OPLandIplParvo.hcellsTemporalConstant
-            << "\n\t hcellsSpatialConstant : " << _retinaParameters.OPLandIplParvo.hcellsSpatialConstant
-            << "\n\t parvoGanglionCellsSensitivity : " << _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity
-            <<"}\n";
-
-    // displaying IPL magno setup
-    outmessage<<"Current Retina instance setup :"
-            <<"\nIPLmagno"<<"{"
-            << "\n\t normaliseOutput : " << _retinaParameters.IplMagno.normaliseOutput
-            << "\n\t parasolCells_beta : " << _retinaParameters.IplMagno.parasolCells_beta
-            << "\n\t parasolCells_tau : " << _retinaParameters.IplMagno.parasolCells_tau
-            << "\n\t parasolCells_k : " << _retinaParameters.IplMagno.parasolCells_k
-            << "\n\t amacrinCellsTemporalCutFrequency : " << _retinaParameters.IplMagno.amacrinCellsTemporalCutFrequency
-            << "\n\t V0CompressionParameter : " << _retinaParameters.IplMagno.V0CompressionParameter
-            << "\n\t localAdaptintegration_tau : " << _retinaParameters.IplMagno.localAdaptintegration_tau
-            << "\n\t localAdaptintegration_k : " << _retinaParameters.IplMagno.localAdaptintegration_k
-            <<"}";
-    return outmessage.str().c_str();
-}
-
-void RetinaImpl::write( String fs ) const
-{
-    FileStorage parametersSaveFile(fs, cv::FileStorage::WRITE );
-    write(parametersSaveFile);
-}
-
-void RetinaImpl::write( FileStorage& fs ) const
-{
-    if (!fs.isOpened())
-        return; // basic error case
-    fs<<"OPLandIPLparvo"<<"{";
-    fs << "colorMode" << _retinaParameters.OPLandIplParvo.colorMode;
-    fs << "normaliseOutput" << _retinaParameters.OPLandIplParvo.normaliseOutput;
-    fs << "photoreceptorsLocalAdaptationSensitivity" << _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity;
-    fs << "photoreceptorsTemporalConstant" << _retinaParameters.OPLandIplParvo.photoreceptorsTemporalConstant;
-    fs << "photoreceptorsSpatialConstant" << _retinaParameters.OPLandIplParvo.photoreceptorsSpatialConstant;
-    fs << "horizontalCellsGain" << _retinaParameters.OPLandIplParvo.horizontalCellsGain;
-    fs << "hcellsTemporalConstant" << _retinaParameters.OPLandIplParvo.hcellsTemporalConstant;
-    fs << "hcellsSpatialConstant" << _retinaParameters.OPLandIplParvo.hcellsSpatialConstant;
-    fs << "ganglionCellsSensitivity" << _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity;
-    fs << "}";
-    fs<<"IPLmagno"<<"{";
-    fs << "normaliseOutput" << _retinaParameters.IplMagno.normaliseOutput;
-    fs << "parasolCells_beta" << _retinaParameters.IplMagno.parasolCells_beta;
-    fs << "parasolCells_tau" << _retinaParameters.IplMagno.parasolCells_tau;
-    fs << "parasolCells_k" << _retinaParameters.IplMagno.parasolCells_k;
-    fs << "amacrinCellsTemporalCutFrequency" << _retinaParameters.IplMagno.amacrinCellsTemporalCutFrequency;
-    fs << "V0CompressionParameter" << _retinaParameters.IplMagno.V0CompressionParameter;
-    fs << "localAdaptintegration_tau" << _retinaParameters.IplMagno.localAdaptintegration_tau;
-    fs << "localAdaptintegration_k" << _retinaParameters.IplMagno.localAdaptintegration_k;
-    fs<<"}";
-}
-
-void RetinaImpl::setupOPLandIPLParvoChannel(const bool colorMode, const bool normaliseOutput, const float photoreceptorsLocalAdaptationSensitivity, const float photoreceptorsTemporalConstant, const float photoreceptorsSpatialConstant, const float horizontalCellsGain, const float HcellsTemporalConstant, const float HcellsSpatialConstant, const float ganglionCellsSensitivity)
-{
-    // retina core parameters setup
-    _retinaFilter->setColorMode(colorMode);
-    _retinaFilter->setPhotoreceptorsLocalAdaptationSensitivity(photoreceptorsLocalAdaptationSensitivity);
-    _retinaFilter->setOPLandParvoParameters(0, photoreceptorsTemporalConstant, photoreceptorsSpatialConstant, horizontalCellsGain, HcellsTemporalConstant, HcellsSpatialConstant, ganglionCellsSensitivity);
-    _retinaFilter->setParvoGanglionCellsLocalAdaptationSensitivity(ganglionCellsSensitivity);
-    _retinaFilter->activateNormalizeParvoOutput_0_maxOutputValue(normaliseOutput);
-
-        // update parameters struture
-
-    _retinaParameters.OPLandIplParvo.colorMode = colorMode;
-    _retinaParameters.OPLandIplParvo.normaliseOutput = normaliseOutput;
-    _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity = photoreceptorsLocalAdaptationSensitivity;
-    _retinaParameters.OPLandIplParvo.photoreceptorsTemporalConstant = photoreceptorsTemporalConstant;
-    _retinaParameters.OPLandIplParvo.photoreceptorsSpatialConstant = photoreceptorsSpatialConstant;
-    _retinaParameters.OPLandIplParvo.horizontalCellsGain = horizontalCellsGain;
-    _retinaParameters.OPLandIplParvo.hcellsTemporalConstant = HcellsTemporalConstant;
-    _retinaParameters.OPLandIplParvo.hcellsSpatialConstant = HcellsSpatialConstant;
-    _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity = ganglionCellsSensitivity;
-
-}
-
-void RetinaImpl::setupIPLMagnoChannel(const bool normaliseOutput, const float parasolCells_beta, const float parasolCells_tau, const float parasolCells_k, const float amacrinCellsTemporalCutFrequency, const float V0CompressionParameter, const float localAdaptintegration_tau, const float localAdaptintegration_k)
-{
-
-    _retinaFilter->setMagnoCoefficientsTable(parasolCells_beta, parasolCells_tau, parasolCells_k, amacrinCellsTemporalCutFrequency, V0CompressionParameter, localAdaptintegration_tau, localAdaptintegration_k);
-    _retinaFilter->activateNormalizeMagnoOutput_0_maxOutputValue(normaliseOutput);
-
-        // update parameters struture
-    _retinaParameters.IplMagno.normaliseOutput = normaliseOutput;
-    _retinaParameters.IplMagno.parasolCells_beta = parasolCells_beta;
-    _retinaParameters.IplMagno.parasolCells_tau = parasolCells_tau;
-    _retinaParameters.IplMagno.parasolCells_k = parasolCells_k;
-    _retinaParameters.IplMagno.amacrinCellsTemporalCutFrequency = amacrinCellsTemporalCutFrequency;
-    _retinaParameters.IplMagno.V0CompressionParameter = V0CompressionParameter;
-    _retinaParameters.IplMagno.localAdaptintegration_tau = localAdaptintegration_tau;
-    _retinaParameters.IplMagno.localAdaptintegration_k = localAdaptintegration_k;
-}
-
-void RetinaImpl::run(InputArray inputMatToConvert)
-{
-    // first convert input image to the compatible format : std::valarray<float>
-    const bool colorMode = _convertCvMat2ValarrayBuffer(inputMatToConvert.getMat(), _inputBuffer);
-    // process the retina
-    if (!_retinaFilter->runFilter(_inputBuffer, colorMode, false, _retinaParameters.OPLandIplParvo.colorMode && colorMode, false))
-        throw cv::Exception(-1, "RetinaImpl cannot be applied, wrong input buffer size", "RetinaImpl::run", "RetinaImpl.h", 0);
-}
-
-void RetinaImpl::applyFastToneMapping(InputArray inputImage, OutputArray outputToneMappedImage)
-{
-    // first convert input image to the compatible format :
-    const bool colorMode = _convertCvMat2ValarrayBuffer(inputImage.getMat(), _inputBuffer);
-    const unsigned int nbPixels=_retinaFilter->getOutputNBrows()*_retinaFilter->getOutputNBcolumns();
-
-    // process tone mapping
-    if (colorMode)
-    {
-        std::valarray<float> imageOutput(nbPixels*3);
-        _retinaFilter->runRGBToneMapping(_inputBuffer, imageOutput, true, _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity, _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity);
-        _convertValarrayBuffer2cvMat(imageOutput, _retinaFilter->getOutputNBrows(), _retinaFilter->getOutputNBcolumns(), true, outputToneMappedImage);
-    }else
-    {
-        std::valarray<float> imageOutput(nbPixels);
-        _retinaFilter->runGrayToneMapping(_inputBuffer, imageOutput, _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity, _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity);
-        _convertValarrayBuffer2cvMat(imageOutput, _retinaFilter->getOutputNBrows(), _retinaFilter->getOutputNBcolumns(), false, outputToneMappedImage);
-    }
-
-}
-
-void RetinaImpl::getParvo(OutputArray retinaOutput_parvo)
-{
-    if (_retinaFilter->getColorMode())
-    {
-        // reallocate output buffer (if necessary)
-        _convertValarrayBuffer2cvMat(_retinaFilter->getColorOutput(), _retinaFilter->getOutputNBrows(), _retinaFilter->getOutputNBcolumns(), true, retinaOutput_parvo);
-    }else
-    {
-        // reallocate output buffer (if necessary)
-        _convertValarrayBuffer2cvMat(_retinaFilter->getContours(), _retinaFilter->getOutputNBrows(), _retinaFilter->getOutputNBcolumns(), false, retinaOutput_parvo);
-    }
-    //retinaOutput_parvo/=255.0;
-}
-void RetinaImpl::getMagno(OutputArray retinaOutput_magno)
-{
-    // reallocate output buffer (if necessary)
-    _convertValarrayBuffer2cvMat(_retinaFilter->getMovingContours(), _retinaFilter->getOutputNBrows(), _retinaFilter->getOutputNBcolumns(), false, retinaOutput_magno);
-    //retinaOutput_magno/=255.0;
-}
-
-// original API level data accessors : copy buffers if size matches, reallocate if required
-void RetinaImpl::getMagnoRAW(OutputArray magnoOutputBufferCopy){
-    // get magno channel header
-    const cv::Mat magnoChannel=cv::Mat(getMagnoRAW());
-    // copy data
-    magnoChannel.copyTo(magnoOutputBufferCopy);
-}
-
-void RetinaImpl::getParvoRAW(OutputArray parvoOutputBufferCopy){
-    // get parvo channel header
-    const cv::Mat parvoChannel=cv::Mat(getMagnoRAW());
-    // copy data
-    parvoChannel.copyTo(parvoOutputBufferCopy);
-}
-
-// original API level data accessors : get buffers addresses...
-const Mat RetinaImpl::getMagnoRAW() const {
-    // create a cv::Mat header for the valarray
-    return Mat((int)_retinaFilter->getMovingContours().size(),1, CV_32F, (void*)get_data(_retinaFilter->getMovingContours()));
-
-}
-
-const Mat RetinaImpl::getParvoRAW() const {
-    if (_retinaFilter->getColorMode()) // check if color mode is enabled
-    {
-        // create a cv::Mat table (for RGB planes as a single vector)
-        return Mat((int)_retinaFilter->getColorOutput().size(), 1, CV_32F, (void*)get_data(_retinaFilter->getColorOutput()));
-    }
-    // otherwise, output is gray level
-    // create a cv::Mat header for the valarray
-    return Mat((int)_retinaFilter->getContours().size(), 1, CV_32F, (void*)get_data(_retinaFilter->getContours()));
-}
-
-// private method called by constructirs
-void RetinaImpl::_init(const cv::Size inputSz, const bool colorMode, int colorSamplingMethod, const bool useRetinaLogSampling, const double reductionFactor, const double samplingStrenght)
-{
-    // basic error check
-    if (inputSz.height*inputSz.width <= 0)
-        throw cv::Exception(-1, "Bad retina size setup : size height and with must be superior to zero", "RetinaImpl::setup", "Retina.cpp", 0);
-
-    unsigned int nbPixels=inputSz.height*inputSz.width;
-    // resize buffers if size does not match
-    _inputBuffer.resize(nbPixels*3); // buffer supports gray images but also 3 channels color buffers... (larger is better...)
-
-    // allocate the retina model
-        if (_retinaFilter)
-           delete _retinaFilter;
-    _retinaFilter = new RetinaFilter(inputSz.height, inputSz.width, colorMode, colorSamplingMethod, useRetinaLogSampling, reductionFactor, samplingStrenght);
-
-    _retinaParameters.OPLandIplParvo.colorMode = colorMode;
-    // prepare the default parameter XML file with default setup
-    setup(_retinaParameters);
-
-    // init retina
-    _retinaFilter->clearAllBuffers();
-
-    // report current configuration
-    printf("%s\n", printSetup().c_str());
-}
-
-void RetinaImpl::_convertValarrayBuffer2cvMat(const std::valarray<float> &grayMatrixToConvert, const unsigned int nbRows, const unsigned int nbColumns, const bool colorMode, OutputArray outBuffer)
-{
-    // fill output buffer with the valarray buffer
-    const float *valarrayPTR=get_data(grayMatrixToConvert);
-    if (!colorMode)
-    {
-        outBuffer.create(cv::Size(nbColumns, nbRows), CV_8U);
-        Mat outMat = outBuffer.getMat();
-        for (unsigned int i=0;i<nbRows;++i)
-        {
-            for (unsigned int j=0;j<nbColumns;++j)
-            {
-                cv::Point2d pixel(j,i);
-                outMat.at<unsigned char>(pixel)=(unsigned char)*(valarrayPTR++);
-            }
-        }
-    }else
-    {
-        const unsigned int nbPixels=nbColumns*nbRows;
-        const unsigned int doubleNBpixels=nbColumns*nbRows*2;
-        outBuffer.create(cv::Size(nbColumns, nbRows), CV_8UC3);
-        Mat outMat = outBuffer.getMat();
-        for (unsigned int i=0;i<nbRows;++i)
-        {
-            for (unsigned int j=0;j<nbColumns;++j,++valarrayPTR)
-            {
-                cv::Point2d pixel(j,i);
-                cv::Vec3b pixelValues;
-                pixelValues[2]=(unsigned char)*(valarrayPTR);
-                pixelValues[1]=(unsigned char)*(valarrayPTR+nbPixels);
-                pixelValues[0]=(unsigned char)*(valarrayPTR+doubleNBpixels);
-
-                outMat.at<cv::Vec3b>(pixel)=pixelValues;
-            }
-        }
-    }
-}
-
-bool RetinaImpl::_convertCvMat2ValarrayBuffer(InputArray inputMat, std::valarray<float> &outputValarrayMatrix)
-{
-    const Mat inputMatToConvert=inputMat.getMat();
-    // first check input consistency
-    if (inputMatToConvert.empty())
-        throw cv::Exception(-1, "RetinaImpl cannot be applied, input buffer is empty", "RetinaImpl::run", "RetinaImpl.h", 0);
-
-    // retreive color mode from image input
-    int imageNumberOfChannels = inputMatToConvert.channels();
-
-        // convert to float AND fill the valarray buffer
-    typedef float T; // define here the target pixel format, here, float
-    const int dsttype = DataType<T>::depth; // output buffer is float format
-
-    const unsigned int nbPixels=inputMat.getMat().rows*inputMat.getMat().cols;
-    const unsigned int doubleNBpixels=inputMat.getMat().rows*inputMat.getMat().cols*2;
-
-    if(imageNumberOfChannels==4)
-    {
-    // create a cv::Mat table (for RGBA planes)
-        cv::Mat planes[4] =
-        {
-            cv::Mat(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[doubleNBpixels]),
-            cv::Mat(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[nbPixels]),
-            cv::Mat(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[0])
-        };
-        planes[3] = cv::Mat(inputMatToConvert.size(), dsttype);     // last channel (alpha) does not point on the valarray (not usefull in our case)
-        // split color cv::Mat in 4 planes... it fills valarray directely
-        cv::split(Mat_<Vec<T, 4> >(inputMatToConvert), planes);
-    }
-    else if (imageNumberOfChannels==3)
-    {
-        // create a cv::Mat table (for RGB planes)
-        cv::Mat planes[] =
-        {
-        cv::Mat(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[doubleNBpixels]),
-        cv::Mat(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[nbPixels]),
-        cv::Mat(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[0])
-        };
-        // split color cv::Mat in 3 planes... it fills valarray directely
-        cv::split(cv::Mat_<Vec<T, 3> >(inputMatToConvert), planes);
-    }
-    else if(imageNumberOfChannels==1)
-    {
-        // create a cv::Mat header for the valarray
-        cv::Mat dst(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[0]);
-        inputMatToConvert.convertTo(dst, dsttype);
-    }
-        else
-            CV_Error(Error::StsUnsupportedFormat, "input image must be single channel (gray levels), bgr format (color) or bgra (color with transparency which won't be considered");
-
-    return imageNumberOfChannels>1; // return bool : false for gray level image processing, true for color mode
-}
-
-void RetinaImpl::clearBuffers() {_retinaFilter->clearAllBuffers();}
-
-void RetinaImpl::activateMovingContoursProcessing(const bool activate){_retinaFilter->activateMovingContoursProcessing(activate);}
-
-void RetinaImpl::activateContoursProcessing(const bool activate){_retinaFilter->activateContoursProcessing(activate);}
-
-}// end of namespace bioinspired
-}// end of namespace cv
diff --git a/modules/bioinspired/src/retina_ocl.cpp b/modules/bioinspired/src/retina_ocl.cpp
deleted file mode 100644
index 5d2b4bd15..000000000
--- a/modules/bioinspired/src/retina_ocl.cpp
+++ /dev/null
@@ -1,1643 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "retina_ocl.hpp"
-#include <iostream>
-#include <sstream>
-
-#ifdef HAVE_OPENCV_OCL
-
-#include "opencl_kernels.hpp"
-
-#define NOT_IMPLEMENTED CV_Error(cv::Error::StsNotImplemented, "Not implemented")
-
-namespace cv
-{
-static ocl::ProgramEntry retina_kernel = ocl::bioinspired::retina_kernel;
-
-namespace bioinspired
-{
-namespace ocl
-{
-using namespace cv::ocl;
-
-class RetinaOCLImpl : public Retina
-{
-public:
-    RetinaOCLImpl(Size getInputSize);
-    RetinaOCLImpl(Size getInputSize, const bool colorMode, int colorSamplingMethod = RETINA_COLOR_BAYER, const bool useRetinaLogSampling = false, const double reductionFactor = 1.0, const double samplingStrenght = 10.0);
-    virtual ~RetinaOCLImpl();
-
-    Size getInputSize();
-    Size getOutputSize();
-
-    void setup(String retinaParameterFile = "", const bool applyDefaultSetupOnFailure = true);
-    void setup(cv::FileStorage &fs, const bool applyDefaultSetupOnFailure = true);
-    void setup(RetinaParameters newParameters);
-
-    RetinaOCLImpl::RetinaParameters getParameters();
-
-    const String printSetup();
-    virtual void write( String fs ) const;
-    virtual void write( FileStorage& fs ) const;
-
-    void setupOPLandIPLParvoChannel(const bool colorMode = true, const bool normaliseOutput = true, const float photoreceptorsLocalAdaptationSensitivity = 0.7, const float photoreceptorsTemporalConstant = 0.5, const float photoreceptorsSpatialConstant = 0.53, const float horizontalCellsGain = 0, const float HcellsTemporalConstant = 1, const float HcellsSpatialConstant = 7, const float ganglionCellsSensitivity = 0.7);
-    void setupIPLMagnoChannel(const bool normaliseOutput = true, const float parasolCells_beta = 0, const float parasolCells_tau = 0, const float parasolCells_k = 7, const float amacrinCellsTemporalCutFrequency = 1.2, const float V0CompressionParameter = 0.95, const float localAdaptintegration_tau = 0, const float localAdaptintegration_k = 7);
-
-    void run(InputArray inputImage);
-    void getParvo(OutputArray retinaOutput_parvo);
-    void getMagno(OutputArray retinaOutput_magno);
-
-    void setColorSaturation(const bool saturateColors = true, const float colorSaturationValue = 4.0);
-    void clearBuffers();
-    void activateMovingContoursProcessing(const bool activate);
-    void activateContoursProcessing(const bool activate);
-
-    // unimplemented interfaces:
-    void applyFastToneMapping(InputArray /*inputImage*/, OutputArray /*outputToneMappedImage*/) { NOT_IMPLEMENTED; }
-    void getParvoRAW(OutputArray /*retinaOutput_parvo*/) { NOT_IMPLEMENTED; }
-    void getMagnoRAW(OutputArray /*retinaOutput_magno*/) { NOT_IMPLEMENTED; }
-    const Mat getMagnoRAW() const { NOT_IMPLEMENTED; return Mat(); }
-    const Mat getParvoRAW() const { NOT_IMPLEMENTED; return Mat(); }
-
-protected:
-    RetinaParameters _retinaParameters;
-    cv::ocl::oclMat _inputBuffer;
-    RetinaFilter* _retinaFilter;
-    bool convertToColorPlanes(const cv::ocl::oclMat& input, cv::ocl::oclMat &output);
-    void convertToInterleaved(const cv::ocl::oclMat& input, bool colorMode, cv::ocl::oclMat &output);
-    void _init(const Size getInputSize, const bool colorMode, int colorSamplingMethod = RETINA_COLOR_BAYER, const bool useRetinaLogSampling = false, const double reductionFactor = 1.0, const double samplingStrenght = 10.0);
-};
-
-RetinaOCLImpl::RetinaOCLImpl(const cv::Size inputSz)
-{
-    _retinaFilter = 0;
-    _init(inputSz, true, RETINA_COLOR_BAYER, false);
-}
-
-RetinaOCLImpl::RetinaOCLImpl(const cv::Size inputSz, const bool colorMode, int colorSamplingMethod, const bool useRetinaLogSampling, const double reductionFactor, const double samplingStrenght)
-{
-    _retinaFilter = 0;
-    _init(inputSz, colorMode, colorSamplingMethod, useRetinaLogSampling, reductionFactor, samplingStrenght);
-};
-
-RetinaOCLImpl::~RetinaOCLImpl()
-{
-    if (_retinaFilter)
-    {
-        delete _retinaFilter;
-    }
-}
-
-/**
-* retreive retina input buffer size
-*/
-Size RetinaOCLImpl::getInputSize()
-{
-    return cv::Size(_retinaFilter->getInputNBcolumns(), _retinaFilter->getInputNBrows());
-}
-
-/**
-* retreive retina output buffer size
-*/
-Size RetinaOCLImpl::getOutputSize()
-{
-    return cv::Size(_retinaFilter->getOutputNBcolumns(), _retinaFilter->getOutputNBrows());
-}
-
-
-void RetinaOCLImpl::setColorSaturation(const bool saturateColors, const float colorSaturationValue)
-{
-    _retinaFilter->setColorSaturation(saturateColors, colorSaturationValue);
-}
-
-struct RetinaOCLImpl::RetinaParameters RetinaOCLImpl::getParameters()
-{
-    return _retinaParameters;
-}
-
-
-void RetinaOCLImpl::setup(String retinaParameterFile, const bool applyDefaultSetupOnFailure)
-{
-    try
-    {
-        // opening retinaParameterFile in read mode
-        cv::FileStorage fs(retinaParameterFile, cv::FileStorage::READ);
-        setup(fs, applyDefaultSetupOnFailure);
-    }
-    catch(Exception &e)
-    {
-        std::cout << "RetinaOCLImpl::setup: wrong/unappropriate xml parameter file : error report :`n=>" << e.what() << std::endl;
-        if (applyDefaultSetupOnFailure)
-        {
-            std::cout << "RetinaOCLImpl::setup: resetting retina with default parameters" << std::endl;
-            setupOPLandIPLParvoChannel();
-            setupIPLMagnoChannel();
-        }
-        else
-        {
-            std::cout << "=> keeping current parameters" << std::endl;
-        }
-    }
-}
-
-void RetinaOCLImpl::setup(cv::FileStorage &fs, const bool applyDefaultSetupOnFailure)
-{
-    try
-    {
-        // read parameters file if it exists or apply default setup if asked for
-        if (!fs.isOpened())
-        {
-            std::cout << "RetinaOCLImpl::setup: provided parameters file could not be open... skeeping configuration" << std::endl;
-            return;
-            // implicit else case : retinaParameterFile could be open (it exists at least)
-        }
-        // OPL and Parvo init first... update at the same time the parameters structure and the retina core
-        cv::FileNode rootFn = fs.root(), currFn = rootFn["OPLandIPLparvo"];
-        currFn["colorMode"] >> _retinaParameters.OPLandIplParvo.colorMode;
-        currFn["normaliseOutput"] >> _retinaParameters.OPLandIplParvo.normaliseOutput;
-        currFn["photoreceptorsLocalAdaptationSensitivity"] >> _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity;
-        currFn["photoreceptorsTemporalConstant"] >> _retinaParameters.OPLandIplParvo.photoreceptorsTemporalConstant;
-        currFn["photoreceptorsSpatialConstant"] >> _retinaParameters.OPLandIplParvo.photoreceptorsSpatialConstant;
-        currFn["horizontalCellsGain"] >> _retinaParameters.OPLandIplParvo.horizontalCellsGain;
-        currFn["hcellsTemporalConstant"] >> _retinaParameters.OPLandIplParvo.hcellsTemporalConstant;
-        currFn["hcellsSpatialConstant"] >> _retinaParameters.OPLandIplParvo.hcellsSpatialConstant;
-        currFn["ganglionCellsSensitivity"] >> _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity;
-        setupOPLandIPLParvoChannel(_retinaParameters.OPLandIplParvo.colorMode, _retinaParameters.OPLandIplParvo.normaliseOutput, _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity, _retinaParameters.OPLandIplParvo.photoreceptorsTemporalConstant, _retinaParameters.OPLandIplParvo.photoreceptorsSpatialConstant, _retinaParameters.OPLandIplParvo.horizontalCellsGain, _retinaParameters.OPLandIplParvo.hcellsTemporalConstant, _retinaParameters.OPLandIplParvo.hcellsSpatialConstant, _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity);
-
-        // init retina IPL magno setup... update at the same time the parameters structure and the retina core
-        currFn = rootFn["IPLmagno"];
-        currFn["normaliseOutput"] >> _retinaParameters.IplMagno.normaliseOutput;
-        currFn["parasolCells_beta"] >> _retinaParameters.IplMagno.parasolCells_beta;
-        currFn["parasolCells_tau"] >> _retinaParameters.IplMagno.parasolCells_tau;
-        currFn["parasolCells_k"] >> _retinaParameters.IplMagno.parasolCells_k;
-        currFn["amacrinCellsTemporalCutFrequency"] >> _retinaParameters.IplMagno.amacrinCellsTemporalCutFrequency;
-        currFn["V0CompressionParameter"] >> _retinaParameters.IplMagno.V0CompressionParameter;
-        currFn["localAdaptintegration_tau"] >> _retinaParameters.IplMagno.localAdaptintegration_tau;
-        currFn["localAdaptintegration_k"] >> _retinaParameters.IplMagno.localAdaptintegration_k;
-
-        setupIPLMagnoChannel(_retinaParameters.IplMagno.normaliseOutput, _retinaParameters.IplMagno.parasolCells_beta, _retinaParameters.IplMagno.parasolCells_tau, _retinaParameters.IplMagno.parasolCells_k, _retinaParameters.IplMagno.amacrinCellsTemporalCutFrequency, _retinaParameters.IplMagno.V0CompressionParameter, _retinaParameters.IplMagno.localAdaptintegration_tau, _retinaParameters.IplMagno.localAdaptintegration_k);
-
-    }
-    catch(Exception &e)
-    {
-        std::cout << "RetinaOCLImpl::setup: resetting retina with default parameters" << std::endl;
-        if (applyDefaultSetupOnFailure)
-        {
-            setupOPLandIPLParvoChannel();
-            setupIPLMagnoChannel();
-        }
-        std::cout << "RetinaOCLImpl::setup: wrong/unappropriate xml parameter file : error report :`n=>" << e.what() << std::endl;
-        std::cout << "=> keeping current parameters" << std::endl;
-    }
-}
-
-void RetinaOCLImpl::setup(cv::bioinspired::Retina::RetinaParameters newConfiguration)
-{
-    // simply copy structures
-    memcpy(&_retinaParameters, &newConfiguration, sizeof(cv::bioinspired::Retina::RetinaParameters));
-    // apply setup
-    setupOPLandIPLParvoChannel(_retinaParameters.OPLandIplParvo.colorMode, _retinaParameters.OPLandIplParvo.normaliseOutput, _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity, _retinaParameters.OPLandIplParvo.photoreceptorsTemporalConstant, _retinaParameters.OPLandIplParvo.photoreceptorsSpatialConstant, _retinaParameters.OPLandIplParvo.horizontalCellsGain, _retinaParameters.OPLandIplParvo.hcellsTemporalConstant, _retinaParameters.OPLandIplParvo.hcellsSpatialConstant, _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity);
-    setupIPLMagnoChannel(_retinaParameters.IplMagno.normaliseOutput, _retinaParameters.IplMagno.parasolCells_beta, _retinaParameters.IplMagno.parasolCells_tau, _retinaParameters.IplMagno.parasolCells_k, _retinaParameters.IplMagno.amacrinCellsTemporalCutFrequency, _retinaParameters.IplMagno.V0CompressionParameter, _retinaParameters.IplMagno.localAdaptintegration_tau, _retinaParameters.IplMagno.localAdaptintegration_k);
-}
-
-const String RetinaOCLImpl::printSetup()
-{
-    std::stringstream outmessage;
-
-    // displaying OPL and IPL parvo setup
-    outmessage << "Current Retina instance setup :"
-               << "\nOPLandIPLparvo" << "{"
-               << "\n==> colorMode : " << _retinaParameters.OPLandIplParvo.colorMode
-               << "\n==> normalizeParvoOutput :" << _retinaParameters.OPLandIplParvo.normaliseOutput
-               << "\n==> photoreceptorsLocalAdaptationSensitivity : " << _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity
-               << "\n==> photoreceptorsTemporalConstant : " << _retinaParameters.OPLandIplParvo.photoreceptorsTemporalConstant
-               << "\n==> photoreceptorsSpatialConstant : " << _retinaParameters.OPLandIplParvo.photoreceptorsSpatialConstant
-               << "\n==> horizontalCellsGain : " << _retinaParameters.OPLandIplParvo.horizontalCellsGain
-               << "\n==> hcellsTemporalConstant : " << _retinaParameters.OPLandIplParvo.hcellsTemporalConstant
-               << "\n==> hcellsSpatialConstant : " << _retinaParameters.OPLandIplParvo.hcellsSpatialConstant
-               << "\n==> parvoGanglionCellsSensitivity : " << _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity
-               << "}\n";
-
-    // displaying IPL magno setup
-    outmessage << "Current Retina instance setup :"
-               << "\nIPLmagno" << "{"
-               << "\n==> normaliseOutput : " << _retinaParameters.IplMagno.normaliseOutput
-               << "\n==> parasolCells_beta : " << _retinaParameters.IplMagno.parasolCells_beta
-               << "\n==> parasolCells_tau : " << _retinaParameters.IplMagno.parasolCells_tau
-               << "\n==> parasolCells_k : " << _retinaParameters.IplMagno.parasolCells_k
-               << "\n==> amacrinCellsTemporalCutFrequency : " << _retinaParameters.IplMagno.amacrinCellsTemporalCutFrequency
-               << "\n==> V0CompressionParameter : " << _retinaParameters.IplMagno.V0CompressionParameter
-               << "\n==> localAdaptintegration_tau : " << _retinaParameters.IplMagno.localAdaptintegration_tau
-               << "\n==> localAdaptintegration_k : " << _retinaParameters.IplMagno.localAdaptintegration_k
-               << "}";
-    return outmessage.str().c_str();
-}
-
-void RetinaOCLImpl::write( String fs ) const
-{
-    FileStorage parametersSaveFile(fs, cv::FileStorage::WRITE );
-    write(parametersSaveFile);
-}
-
-void RetinaOCLImpl::write( FileStorage& fs ) const
-{
-    if (!fs.isOpened())
-    {
-        return;    // basic error case
-    }
-    fs << "OPLandIPLparvo" << "{";
-    fs << "colorMode" << _retinaParameters.OPLandIplParvo.colorMode;
-    fs << "normaliseOutput" << _retinaParameters.OPLandIplParvo.normaliseOutput;
-    fs << "photoreceptorsLocalAdaptationSensitivity" << _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity;
-    fs << "photoreceptorsTemporalConstant" << _retinaParameters.OPLandIplParvo.photoreceptorsTemporalConstant;
-    fs << "photoreceptorsSpatialConstant" << _retinaParameters.OPLandIplParvo.photoreceptorsSpatialConstant;
-    fs << "horizontalCellsGain" << _retinaParameters.OPLandIplParvo.horizontalCellsGain;
-    fs << "hcellsTemporalConstant" << _retinaParameters.OPLandIplParvo.hcellsTemporalConstant;
-    fs << "hcellsSpatialConstant" << _retinaParameters.OPLandIplParvo.hcellsSpatialConstant;
-    fs << "ganglionCellsSensitivity" << _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity;
-    fs << "}";
-    fs << "IPLmagno" << "{";
-    fs << "normaliseOutput" << _retinaParameters.IplMagno.normaliseOutput;
-    fs << "parasolCells_beta" << _retinaParameters.IplMagno.parasolCells_beta;
-    fs << "parasolCells_tau" << _retinaParameters.IplMagno.parasolCells_tau;
-    fs << "parasolCells_k" << _retinaParameters.IplMagno.parasolCells_k;
-    fs << "amacrinCellsTemporalCutFrequency" << _retinaParameters.IplMagno.amacrinCellsTemporalCutFrequency;
-    fs << "V0CompressionParameter" << _retinaParameters.IplMagno.V0CompressionParameter;
-    fs << "localAdaptintegration_tau" << _retinaParameters.IplMagno.localAdaptintegration_tau;
-    fs << "localAdaptintegration_k" << _retinaParameters.IplMagno.localAdaptintegration_k;
-    fs << "}";
-}
-
-void RetinaOCLImpl::setupOPLandIPLParvoChannel(const bool colorMode, const bool normaliseOutput, const float photoreceptorsLocalAdaptationSensitivity, const float photoreceptorsTemporalConstant, const float photoreceptorsSpatialConstant, const float horizontalCellsGain, const float HcellsTemporalConstant, const float HcellsSpatialConstant, const float ganglionCellsSensitivity)
-{
-    // retina core parameters setup
-    _retinaFilter->setColorMode(colorMode);
-    _retinaFilter->setPhotoreceptorsLocalAdaptationSensitivity(photoreceptorsLocalAdaptationSensitivity);
-    _retinaFilter->setOPLandParvoParameters(0, photoreceptorsTemporalConstant, photoreceptorsSpatialConstant, horizontalCellsGain, HcellsTemporalConstant, HcellsSpatialConstant, ganglionCellsSensitivity);
-    _retinaFilter->setParvoGanglionCellsLocalAdaptationSensitivity(ganglionCellsSensitivity);
-    _retinaFilter->activateNormalizeParvoOutput_0_maxOutputValue(normaliseOutput);
-
-    // update parameters struture
-
-    _retinaParameters.OPLandIplParvo.colorMode = colorMode;
-    _retinaParameters.OPLandIplParvo.normaliseOutput = normaliseOutput;
-    _retinaParameters.OPLandIplParvo.photoreceptorsLocalAdaptationSensitivity = photoreceptorsLocalAdaptationSensitivity;
-    _retinaParameters.OPLandIplParvo.photoreceptorsTemporalConstant = photoreceptorsTemporalConstant;
-    _retinaParameters.OPLandIplParvo.photoreceptorsSpatialConstant = photoreceptorsSpatialConstant;
-    _retinaParameters.OPLandIplParvo.horizontalCellsGain = horizontalCellsGain;
-    _retinaParameters.OPLandIplParvo.hcellsTemporalConstant = HcellsTemporalConstant;
-    _retinaParameters.OPLandIplParvo.hcellsSpatialConstant = HcellsSpatialConstant;
-    _retinaParameters.OPLandIplParvo.ganglionCellsSensitivity = ganglionCellsSensitivity;
-}
-
-void RetinaOCLImpl::setupIPLMagnoChannel(const bool normaliseOutput, const float parasolCells_beta, const float parasolCells_tau, const float parasolCells_k, const float amacrinCellsTemporalCutFrequency, const float V0CompressionParameter, const float localAdaptintegration_tau, const float localAdaptintegration_k)
-{
-
-    _retinaFilter->setMagnoCoefficientsTable(parasolCells_beta, parasolCells_tau, parasolCells_k, amacrinCellsTemporalCutFrequency, V0CompressionParameter, localAdaptintegration_tau, localAdaptintegration_k);
-    _retinaFilter->activateNormalizeMagnoOutput_0_maxOutputValue(normaliseOutput);
-
-    // update parameters struture
-    _retinaParameters.IplMagno.normaliseOutput = normaliseOutput;
-    _retinaParameters.IplMagno.parasolCells_beta = parasolCells_beta;
-    _retinaParameters.IplMagno.parasolCells_tau = parasolCells_tau;
-    _retinaParameters.IplMagno.parasolCells_k = parasolCells_k;
-    _retinaParameters.IplMagno.amacrinCellsTemporalCutFrequency = amacrinCellsTemporalCutFrequency;
-    _retinaParameters.IplMagno.V0CompressionParameter = V0CompressionParameter;
-    _retinaParameters.IplMagno.localAdaptintegration_tau = localAdaptintegration_tau;
-    _retinaParameters.IplMagno.localAdaptintegration_k = localAdaptintegration_k;
-}
-
-void RetinaOCLImpl::run(const InputArray input)
-{
-    oclMat &inputMatToConvert = getOclMatRef(input);
-    bool colorMode = convertToColorPlanes(inputMatToConvert, _inputBuffer);
-    // first convert input image to the compatible format : std::valarray<float>
-    // process the retina
-    if (!_retinaFilter->runFilter(_inputBuffer, colorMode, false, _retinaParameters.OPLandIplParvo.colorMode && colorMode, false))
-    {
-        throw cv::Exception(-1, "Retina cannot be applied, wrong input buffer size", "RetinaOCLImpl::run", "Retina.h", 0);
-    }
-}
-
-void RetinaOCLImpl::getParvo(OutputArray output)
-{
-    oclMat &retinaOutput_parvo = getOclMatRef(output);
-    if (_retinaFilter->getColorMode())
-    {
-        // reallocate output buffer (if necessary)
-        convertToInterleaved(_retinaFilter->getColorOutput(), true, retinaOutput_parvo);
-    }
-    else
-    {
-        // reallocate output buffer (if necessary)
-        convertToInterleaved(_retinaFilter->getContours(), false, retinaOutput_parvo);
-    }
-    //retinaOutput_parvo/=255.0;
-}
-void RetinaOCLImpl::getMagno(OutputArray output)
-{
-    oclMat &retinaOutput_magno = getOclMatRef(output);
-    // reallocate output buffer (if necessary)
-    convertToInterleaved(_retinaFilter->getMovingContours(), false, retinaOutput_magno);
-    //retinaOutput_magno/=255.0;
-}
-// private method called by constructirs
-void RetinaOCLImpl::_init(const cv::Size inputSz, const bool colorMode, int colorSamplingMethod, const bool useRetinaLogSampling, const double reductionFactor, const double samplingStrenght)
-{
-    // basic error check
-    if (inputSz.height*inputSz.width <= 0)
-    {
-        throw cv::Exception(-1, "Bad retina size setup : size height and with must be superior to zero", "RetinaOCLImpl::setup", "Retina.h", 0);
-    }
-
-    // allocate the retina model
-    if (_retinaFilter)
-    {
-        delete _retinaFilter;
-    }
-    _retinaFilter = new RetinaFilter(inputSz.height, inputSz.width, colorMode, colorSamplingMethod, useRetinaLogSampling, reductionFactor, samplingStrenght);
-
-    // prepare the default parameter XML file with default setup
-    setup(_retinaParameters);
-
-    // init retina
-    _retinaFilter->clearAllBuffers();
-}
-
-bool RetinaOCLImpl::convertToColorPlanes(const oclMat& input, oclMat &output)
-{
-    oclMat convert_input;
-    input.convertTo(convert_input, CV_32F);
-    if(convert_input.channels() == 3 || convert_input.channels() == 4)
-    {
-        ocl::ensureSizeIsEnough(int(_retinaFilter->getInputNBrows() * 4),
-                                int(_retinaFilter->getInputNBcolumns()), CV_32FC1, output);
-        oclMat channel_splits[4] =
-        {
-            output(Rect(Point(0, _retinaFilter->getInputNBrows() * 2), getInputSize())),
-            output(Rect(Point(0, _retinaFilter->getInputNBrows()), getInputSize())),
-            output(Rect(Point(0, 0), getInputSize())),
-            output(Rect(Point(0, _retinaFilter->getInputNBrows() * 3), getInputSize()))
-        };
-        ocl::split(convert_input, channel_splits);
-        return true;
-    }
-    else if(convert_input.channels() == 1)
-    {
-        convert_input.copyTo(output);
-        return false;
-    }
-    else
-    {
-        CV_Error(-1, "Retina ocl only support 1, 3, 4 channel input");
-        return false;
-    }
-}
-void RetinaOCLImpl::convertToInterleaved(const oclMat& input, bool colorMode, oclMat &output)
-{
-    input.convertTo(output, CV_8U);
-    if(colorMode)
-    {
-        int numOfSplits = input.rows / getInputSize().height;
-        std::vector<oclMat> channel_splits(numOfSplits);
-        for(int i = 0; i < static_cast<int>(channel_splits.size()); i ++)
-        {
-            channel_splits[i] =
-                output(Rect(Point(0, _retinaFilter->getInputNBrows() * (numOfSplits - i - 1)), getInputSize()));
-        }
-        merge(channel_splits, output);
-    }
-    else
-    {
-        //...
-    }
-}
-
-void RetinaOCLImpl::clearBuffers()
-{
-    _retinaFilter->clearAllBuffers();
-}
-
-void RetinaOCLImpl::activateMovingContoursProcessing(const bool activate)
-{
-    _retinaFilter->activateMovingContoursProcessing(activate);
-}
-
-void RetinaOCLImpl::activateContoursProcessing(const bool activate)
-{
-    _retinaFilter->activateContoursProcessing(activate);
-}
-
-///////////////////////////////////////
-///////// BasicRetinaFilter ///////////
-///////////////////////////////////////
-BasicRetinaFilter::BasicRetinaFilter(const unsigned int NBrows, const unsigned int NBcolumns, const unsigned int parametersListSize, const bool)
-    : _NBrows(NBrows), _NBcols(NBcolumns),
-      _filterOutput(NBrows, NBcolumns, CV_32FC1),
-      _localBuffer(NBrows, NBcolumns, CV_32FC1),
-      _filteringCoeficientsTable(3 * parametersListSize)
-{
-    _halfNBrows = _filterOutput.rows / 2;
-    _halfNBcolumns = _filterOutput.cols / 2;
-
-    // set default values
-    _maxInputValue = 256.0;
-
-    // reset all buffers
-    clearAllBuffers();
-}
-
-BasicRetinaFilter::~BasicRetinaFilter()
-{
-}
-
-void BasicRetinaFilter::resize(const unsigned int NBrows, const unsigned int NBcolumns)
-{
-    // resizing buffers
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _filterOutput);
-
-    // updating variables
-    _halfNBrows = _filterOutput.rows / 2;
-    _halfNBcolumns = _filterOutput.cols / 2;
-
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _localBuffer);
-    // reset buffers
-    clearAllBuffers();
-}
-
-void BasicRetinaFilter::setLPfilterParameters(const float beta, const float tau, const float desired_k, const unsigned int filterIndex)
-{
-    float _beta = beta + tau;
-    float k = desired_k;
-    // check if the spatial constant is correct (avoid 0 value to avoid division by 0)
-    if (desired_k <= 0)
-    {
-        k = 0.001f;
-        std::cerr << "BasicRetinaFilter::spatial constant of the low pass filter must be superior to zero !!! correcting parameter setting to 0,001" << std::endl;
-    }
-
-    float _alpha = k * k;
-    float _mu = 0.8f;
-    unsigned int tableOffset = filterIndex * 3;
-    if (k <= 0)
-    {
-        std::cerr << "BasicRetinaFilter::spatial filtering coefficient must be superior to zero, correcting value to 0.01" << std::endl;
-        _alpha = 0.0001f;
-    }
-
-    float _temp =  (1.0f + _beta) / (2.0f * _mu * _alpha);
-    float a = _filteringCoeficientsTable[tableOffset] = 1.0f + _temp - (float)sqrt( (1.0f + _temp) * (1.0f + _temp) - 1.0f);
-    _filteringCoeficientsTable[1 + tableOffset] = (1.0f - a) * (1.0f - a) * (1.0f - a) * (1.0f - a) / (1.0f + _beta);
-    _filteringCoeficientsTable[2 + tableOffset] = tau;
-}
-const oclMat &BasicRetinaFilter::runFilter_LocalAdapdation(const oclMat &inputFrame, const oclMat &localLuminance)
-{
-    _localLuminanceAdaptation(inputFrame, localLuminance, _filterOutput);
-    return _filterOutput;
-}
-
-
-void BasicRetinaFilter::runFilter_LocalAdapdation(const oclMat &inputFrame, const oclMat &localLuminance, oclMat &outputFrame)
-{
-    _localLuminanceAdaptation(inputFrame, localLuminance, outputFrame);
-}
-
-const oclMat &BasicRetinaFilter::runFilter_LocalAdapdation_autonomous(const oclMat &inputFrame)
-{
-    _spatiotemporalLPfilter(inputFrame, _filterOutput);
-    _localLuminanceAdaptation(inputFrame, _filterOutput, _filterOutput);
-    return _filterOutput;
-}
-void BasicRetinaFilter::runFilter_LocalAdapdation_autonomous(const oclMat &inputFrame, oclMat &outputFrame)
-{
-    _spatiotemporalLPfilter(inputFrame, _filterOutput);
-    _localLuminanceAdaptation(inputFrame, _filterOutput, outputFrame);
-}
-
-void BasicRetinaFilter::_localLuminanceAdaptation(oclMat &inputOutputFrame, const oclMat &localLuminance)
-{
-    _localLuminanceAdaptation(inputOutputFrame, localLuminance, inputOutputFrame, false);
-}
-
-void BasicRetinaFilter::_localLuminanceAdaptation(const oclMat &inputFrame, const oclMat &localLuminance, oclMat &outputFrame, const bool updateLuminanceMean)
-{
-    if (updateLuminanceMean)
-    {
-        float meanLuminance = saturate_cast<float>(ocl::sum(inputFrame)[0]) / getNBpixels();
-        updateCompressionParameter(meanLuminance);
-    }
-    int elements_per_row = static_cast<int>(inputFrame.step / inputFrame.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {_NBcols, _NBrows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &localLuminance.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &inputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &outputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBcols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBrows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_float), &_localLuminanceAddon));
-    args.push_back(std::make_pair(sizeof(cl_float), &_localLuminanceFactor));
-    args.push_back(std::make_pair(sizeof(cl_float), &_maxInputValue));
-    openCLExecuteKernel(ctx, &retina_kernel, "localLuminanceAdaptation", globalSize, localSize, args, -1, -1);
-}
-
-const oclMat &BasicRetinaFilter::runFilter_LPfilter(const oclMat &inputFrame, const unsigned int filterIndex)
-{
-    _spatiotemporalLPfilter(inputFrame, _filterOutput, filterIndex);
-    return _filterOutput;
-}
-void BasicRetinaFilter::runFilter_LPfilter(const oclMat &inputFrame, oclMat &outputFrame, const unsigned int filterIndex)
-{
-    _spatiotemporalLPfilter(inputFrame, outputFrame, filterIndex);
-}
-
-void BasicRetinaFilter::_spatiotemporalLPfilter(const oclMat &inputFrame, oclMat &LPfilterOutput, const unsigned int filterIndex)
-{
-    unsigned int coefTableOffset = filterIndex * 3;
-
-    _a = _filteringCoeficientsTable[coefTableOffset];
-    _gain = _filteringCoeficientsTable[1 + coefTableOffset];
-    _tau = _filteringCoeficientsTable[2 + coefTableOffset];
-
-    _horizontalCausalFilter_addInput(inputFrame, LPfilterOutput);
-    _horizontalAnticausalFilter(LPfilterOutput);
-    _verticalCausalFilter(LPfilterOutput);
-    _verticalAnticausalFilter_multGain(LPfilterOutput);
-}
-
-void BasicRetinaFilter::_horizontalCausalFilter_addInput(const oclMat &inputFrame, oclMat &outputFrame)
-{
-    int elements_per_row = static_cast<int>(inputFrame.step / inputFrame.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {_NBrows, 1, 1};
-    size_t localSize[]  = {256, 1, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &inputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &outputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBcols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBrows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_int),   &inputFrame.offset));
-    args.push_back(std::make_pair(sizeof(cl_int),   &inputFrame.offset));
-    args.push_back(std::make_pair(sizeof(cl_float), &_tau));
-    args.push_back(std::make_pair(sizeof(cl_float), &_a));
-    openCLExecuteKernel(ctx, &retina_kernel, "horizontalCausalFilter_addInput", globalSize, localSize, args, -1, -1);
-}
-
-void BasicRetinaFilter::_horizontalAnticausalFilter(oclMat &outputFrame)
-{
-    int elements_per_row = static_cast<int>(outputFrame.step / outputFrame.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {_NBrows, 1, 1};
-    size_t localSize[]  = {256, 1, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &outputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBcols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBrows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_int),   &outputFrame.offset));
-    args.push_back(std::make_pair(sizeof(cl_float), &_a));
-    openCLExecuteKernel(ctx, &retina_kernel, "horizontalAnticausalFilter", globalSize, localSize, args, -1, -1);
-}
-
-void BasicRetinaFilter::_verticalCausalFilter(oclMat &outputFrame)
-{
-    int elements_per_row = static_cast<int>(outputFrame.step / outputFrame.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {_NBcols, 1, 1};
-    size_t localSize[]  = {256, 1, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &outputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBcols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBrows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_int),   &outputFrame.offset));
-    args.push_back(std::make_pair(sizeof(cl_float), &_a));
-    openCLExecuteKernel(ctx, &retina_kernel, "verticalCausalFilter", globalSize, localSize, args, -1, -1);
-}
-
-void BasicRetinaFilter::_verticalAnticausalFilter_multGain(oclMat &outputFrame)
-{
-    int elements_per_row = static_cast<int>(outputFrame.step / outputFrame.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {_NBcols, 1, 1};
-    size_t localSize[]  = {256, 1, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &outputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBcols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBrows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_int),   &outputFrame.offset));
-    args.push_back(std::make_pair(sizeof(cl_float), &_a));
-    args.push_back(std::make_pair(sizeof(cl_float), &_gain));
-    openCLExecuteKernel(ctx, &retina_kernel, "verticalAnticausalFilter_multGain", globalSize, localSize, args, -1, -1);
-}
-
-void BasicRetinaFilter::_horizontalAnticausalFilter_Irregular(oclMat &outputFrame, const oclMat &spatialConstantBuffer)
-{
-    int elements_per_row = static_cast<int>(outputFrame.step / outputFrame.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {outputFrame.rows, 1, 1};
-    size_t localSize[]  = {256, 1, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &outputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &spatialConstantBuffer.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &outputFrame.cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &outputFrame.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_int),   &outputFrame.offset));
-    args.push_back(std::make_pair(sizeof(cl_int),   &spatialConstantBuffer.offset));
-    openCLExecuteKernel(ctx, &retina_kernel, "horizontalAnticausalFilter_Irregular", globalSize, localSize, args, -1, -1);
-}
-
-//  vertical anticausal filter
-void BasicRetinaFilter::_verticalCausalFilter_Irregular(oclMat &outputFrame, const oclMat &spatialConstantBuffer)
-{
-    int elements_per_row = static_cast<int>(outputFrame.step / outputFrame.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {outputFrame.cols, 1, 1};
-    size_t localSize[]  = {256, 1, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &outputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &spatialConstantBuffer.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &outputFrame.cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &outputFrame.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_int),   &outputFrame.offset));
-    args.push_back(std::make_pair(sizeof(cl_int),   &spatialConstantBuffer.offset));
-    openCLExecuteKernel(ctx, &retina_kernel, "verticalCausalFilter_Irregular", globalSize, localSize, args, -1, -1);
-}
-
-void normalizeGrayOutput_0_maxOutputValue(oclMat &inputOutputBuffer, const float maxOutputValue)
-{
-    double min_val, max_val;
-    ocl::minMax(inputOutputBuffer, &min_val, &max_val);
-    float factor = maxOutputValue / static_cast<float>(max_val - min_val);
-    float offset = - static_cast<float>(min_val) * factor;
-    ocl::multiply(factor, inputOutputBuffer, inputOutputBuffer);
-    ocl::add(inputOutputBuffer, offset, inputOutputBuffer);
-}
-
-void normalizeGrayOutputCentredSigmoide(const float meanValue, const float sensitivity, oclMat &in, oclMat &out, const float maxValue)
-{
-    if (sensitivity == 1.0f)
-    {
-        std::cerr << "TemplateBuffer::TemplateBuffer<type>::normalizeGrayOutputCentredSigmoide error: 2nd parameter (sensitivity) must not equal 0, copying original data..." << std::endl;
-        in.copyTo(out);
-        return;
-    }
-
-    float X0 = maxValue / (sensitivity - 1.0f);
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {in.cols, out.rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    int elements_per_row = static_cast<int>(out.step / out.elemSize());
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &in.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &out.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &in.cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &in.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_float), &meanValue));
-    args.push_back(std::make_pair(sizeof(cl_float), &X0));
-    openCLExecuteKernel(ctx, &retina_kernel, "normalizeGrayOutputCentredSigmoide", globalSize, localSize, args, -1, -1);
-}
-
-void normalizeGrayOutputNearZeroCentreredSigmoide(oclMat &inputPicture, oclMat &outputBuffer, const float sensitivity, const float maxOutputValue)
-{
-    float X0cube = sensitivity * sensitivity * sensitivity;
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {inputPicture.cols, inputPicture.rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    int elements_per_row = static_cast<int>(inputPicture.step / inputPicture.elemSize());
-    args.push_back(std::make_pair(sizeof(cl_mem),   &inputPicture.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &outputBuffer.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &inputPicture.cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &inputPicture.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_float), &maxOutputValue));
-    args.push_back(std::make_pair(sizeof(cl_float), &X0cube));
-    openCLExecuteKernel(ctx, &retina_kernel, "normalizeGrayOutputNearZeroCentreredSigmoide", globalSize, localSize, args, -1, -1);
-}
-
-void centerReductImageLuminance(oclMat &inputoutput)
-{
-    Scalar mean, stddev;
-    cv::meanStdDev((Mat)inputoutput, mean, stddev);
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {inputoutput.cols, inputoutput.rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    float f_mean = static_cast<float>(mean[0]);
-    float f_stddev = static_cast<float>(stddev[0]);
-    int elements_per_row = static_cast<int>(inputoutput.step / inputoutput.elemSize());
-    args.push_back(std::make_pair(sizeof(cl_mem),   &inputoutput.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &inputoutput.cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &inputoutput.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_float), &f_mean));
-    args.push_back(std::make_pair(sizeof(cl_float), &f_stddev));
-    openCLExecuteKernel(ctx, &retina_kernel, "centerReductImageLuminance", globalSize, localSize, args, -1, -1);
-}
-
-///////////////////////////////////////
-///////// ParvoRetinaFilter ///////////
-///////////////////////////////////////
-ParvoRetinaFilter::ParvoRetinaFilter(const unsigned int NBrows, const unsigned int NBcolumns)
-    : BasicRetinaFilter(NBrows, NBcolumns, 3),
-      _photoreceptorsOutput(NBrows, NBcolumns, CV_32FC1),
-      _horizontalCellsOutput(NBrows, NBcolumns, CV_32FC1),
-      _parvocellularOutputON(NBrows, NBcolumns, CV_32FC1),
-      _parvocellularOutputOFF(NBrows, NBcolumns, CV_32FC1),
-      _bipolarCellsOutputON(NBrows, NBcolumns, CV_32FC1),
-      _bipolarCellsOutputOFF(NBrows, NBcolumns, CV_32FC1),
-      _localAdaptationOFF(NBrows, NBcolumns, CV_32FC1)
-{
-    // link to the required local parent adaptation buffers
-    _localAdaptationON = _localBuffer;
-    _parvocellularOutputONminusOFF = _filterOutput;
-
-    // init: set all the values to 0
-    clearAllBuffers();
-}
-
-ParvoRetinaFilter::~ParvoRetinaFilter()
-{
-}
-
-void ParvoRetinaFilter::clearAllBuffers()
-{
-    BasicRetinaFilter::clearAllBuffers();
-    _photoreceptorsOutput = 0;
-    _horizontalCellsOutput = 0;
-    _parvocellularOutputON = 0;
-    _parvocellularOutputOFF = 0;
-    _bipolarCellsOutputON = 0;
-    _bipolarCellsOutputOFF = 0;
-    _localAdaptationOFF = 0;
-}
-void ParvoRetinaFilter::resize(const unsigned int NBrows, const unsigned int NBcolumns)
-{
-    BasicRetinaFilter::resize(NBrows, NBcolumns);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _photoreceptorsOutput);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _horizontalCellsOutput);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _parvocellularOutputON);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _parvocellularOutputOFF);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _bipolarCellsOutputON);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _bipolarCellsOutputOFF);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _localAdaptationOFF);
-
-    // link to the required local parent adaptation buffers
-    _localAdaptationON = _localBuffer;
-    _parvocellularOutputONminusOFF = _filterOutput;
-
-    // clean buffers
-    clearAllBuffers();
-}
-
-void ParvoRetinaFilter::setOPLandParvoFiltersParameters(const float beta1, const float tau1, const float k1, const float beta2, const float tau2, const float k2)
-{
-    // init photoreceptors low pass filter
-    setLPfilterParameters(beta1, tau1, k1);
-    // init horizontal cells low pass filter
-    setLPfilterParameters(beta2, tau2, k2, 1);
-    // init parasol ganglion cells low pass filter (default parameters)
-    setLPfilterParameters(0, tau1, k1, 2);
-
-}
-const oclMat &ParvoRetinaFilter::runFilter(const oclMat &inputFrame, const bool useParvoOutput)
-{
-    _spatiotemporalLPfilter(inputFrame, _photoreceptorsOutput);
-    _spatiotemporalLPfilter(_photoreceptorsOutput, _horizontalCellsOutput, 1);
-    _OPL_OnOffWaysComputing();
-
-    if (useParvoOutput)
-    {
-        // local adaptation processes on ON and OFF ways
-        _spatiotemporalLPfilter(_bipolarCellsOutputON, _localAdaptationON, 2);
-        _localLuminanceAdaptation(_parvocellularOutputON, _localAdaptationON);
-        _spatiotemporalLPfilter(_bipolarCellsOutputOFF, _localAdaptationOFF, 2);
-        _localLuminanceAdaptation(_parvocellularOutputOFF, _localAdaptationOFF);
-        ocl::subtract(_parvocellularOutputON, _parvocellularOutputOFF, _parvocellularOutputONminusOFF);
-    }
-
-    return _parvocellularOutputONminusOFF;
-}
-void ParvoRetinaFilter::_OPL_OnOffWaysComputing()
-{
-    int elements_per_row = static_cast<int>(_photoreceptorsOutput.step / _photoreceptorsOutput.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {(_photoreceptorsOutput.cols + 3) / 4, _photoreceptorsOutput.rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &_photoreceptorsOutput.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &_horizontalCellsOutput.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &_bipolarCellsOutputON.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &_bipolarCellsOutputOFF.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &_parvocellularOutputON.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &_parvocellularOutputOFF.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_photoreceptorsOutput.cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_photoreceptorsOutput.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    openCLExecuteKernel(ctx, &retina_kernel, "OPL_OnOffWaysComputing", globalSize, localSize, args, -1, -1);
-}
-
-///////////////////////////////////////
-//////////// MagnoFilter //////////////
-///////////////////////////////////////
-MagnoRetinaFilter::MagnoRetinaFilter(const unsigned int NBrows, const unsigned int NBcolumns)
-    : BasicRetinaFilter(NBrows, NBcolumns, 2),
-      _previousInput_ON(NBrows, NBcolumns, CV_32FC1),
-      _previousInput_OFF(NBrows, NBcolumns, CV_32FC1),
-      _amacrinCellsTempOutput_ON(NBrows, NBcolumns, CV_32FC1),
-      _amacrinCellsTempOutput_OFF(NBrows, NBcolumns, CV_32FC1),
-      _magnoXOutputON(NBrows, NBcolumns, CV_32FC1),
-      _magnoXOutputOFF(NBrows, NBcolumns, CV_32FC1),
-      _localProcessBufferON(NBrows, NBcolumns, CV_32FC1),
-      _localProcessBufferOFF(NBrows, NBcolumns, CV_32FC1)
-{
-    _magnoYOutput = _filterOutput;
-    _magnoYsaturated = _localBuffer;
-
-    clearAllBuffers();
-}
-
-MagnoRetinaFilter::~MagnoRetinaFilter()
-{
-}
-void MagnoRetinaFilter::clearAllBuffers()
-{
-    BasicRetinaFilter::clearAllBuffers();
-    _previousInput_ON = 0;
-    _previousInput_OFF = 0;
-    _amacrinCellsTempOutput_ON = 0;
-    _amacrinCellsTempOutput_OFF = 0;
-    _magnoXOutputON = 0;
-    _magnoXOutputOFF = 0;
-    _localProcessBufferON = 0;
-    _localProcessBufferOFF = 0;
-
-}
-void MagnoRetinaFilter::resize(const unsigned int NBrows, const unsigned int NBcolumns)
-{
-    BasicRetinaFilter::resize(NBrows, NBcolumns);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _previousInput_ON);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _previousInput_OFF);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _amacrinCellsTempOutput_ON);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _amacrinCellsTempOutput_OFF);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _magnoXOutputON);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _magnoXOutputOFF);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _localProcessBufferON);
-    ensureSizeIsEnough(NBrows, NBcolumns, CV_32FC1, _localProcessBufferOFF);
-
-    // to be sure, relink buffers
-    _magnoYOutput = _filterOutput;
-    _magnoYsaturated = _localBuffer;
-
-    // reset all buffers
-    clearAllBuffers();
-}
-
-void MagnoRetinaFilter::setCoefficientsTable(const float parasolCells_beta, const float parasolCells_tau, const float parasolCells_k, const float amacrinCellsTemporalCutFrequency, const float localAdaptIntegration_tau, const float localAdaptIntegration_k )
-{
-    _temporalCoefficient = (float)std::exp(-1.0f / amacrinCellsTemporalCutFrequency);
-    // the first set of parameters is dedicated to the low pass filtering property of the ganglion cells
-    BasicRetinaFilter::setLPfilterParameters(parasolCells_beta, parasolCells_tau, parasolCells_k, 0);
-    // the second set of parameters is dedicated to the ganglion cells output intergartion for their local adaptation property
-    BasicRetinaFilter::setLPfilterParameters(0, localAdaptIntegration_tau, localAdaptIntegration_k, 1);
-}
-
-void MagnoRetinaFilter::_amacrineCellsComputing(
-    const oclMat &OPL_ON,
-    const oclMat &OPL_OFF
-)
-{
-    int elements_per_row = static_cast<int>(OPL_ON.step / OPL_ON.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {OPL_ON.cols, OPL_ON.rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &OPL_ON.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &OPL_OFF.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &_previousInput_ON.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &_previousInput_OFF.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &_amacrinCellsTempOutput_ON.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &_amacrinCellsTempOutput_OFF.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &OPL_ON.cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &OPL_ON.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_float), &_temporalCoefficient));
-    openCLExecuteKernel(ctx, &retina_kernel, "amacrineCellsComputing", globalSize, localSize, args, -1, -1);
-}
-
-const oclMat &MagnoRetinaFilter::runFilter(const oclMat &OPL_ON, const oclMat &OPL_OFF)
-{
-    // Compute the high pass temporal filter
-    _amacrineCellsComputing(OPL_ON, OPL_OFF);
-
-    // apply low pass filtering on ON and OFF ways after temporal high pass filtering
-    _spatiotemporalLPfilter(_amacrinCellsTempOutput_ON, _magnoXOutputON, 0);
-    _spatiotemporalLPfilter(_amacrinCellsTempOutput_OFF, _magnoXOutputOFF, 0);
-
-    // local adaptation of the ganglion cells to the local contrast of the moving contours
-    _spatiotemporalLPfilter(_magnoXOutputON, _localProcessBufferON, 1);
-    _localLuminanceAdaptation(_magnoXOutputON, _localProcessBufferON);
-
-    _spatiotemporalLPfilter(_magnoXOutputOFF, _localProcessBufferOFF, 1);
-    _localLuminanceAdaptation(_magnoXOutputOFF, _localProcessBufferOFF);
-
-    _magnoYOutput = _magnoXOutputON + _magnoXOutputOFF;
-
-    return _magnoYOutput;
-}
-
-///////////////////////////////////////
-//////////// RetinaColor //////////////
-///////////////////////////////////////
-
-// define an array of ROI headers of input x
-#define MAKE_OCLMAT_SLICES(x, n) \
-    oclMat x##_slices[n];\
-    for(int _SLICE_INDEX_ = 0; _SLICE_INDEX_ < n; _SLICE_INDEX_ ++)\
-    {\
-        x##_slices[_SLICE_INDEX_] = x(getROI(_SLICE_INDEX_));\
-    }
-
-RetinaColor::RetinaColor(const unsigned int NBrows, const unsigned int NBcolumns, const int samplingMethod)
-    : BasicRetinaFilter(NBrows, NBcolumns, 3),
-      _RGBmosaic(NBrows * 3, NBcolumns, CV_32FC1),
-      _tempMultiplexedFrame(NBrows, NBcolumns, CV_32FC1),
-      _demultiplexedTempBuffer(NBrows * 3, NBcolumns, CV_32FC1),
-      _demultiplexedColorFrame(NBrows * 3, NBcolumns, CV_32FC1),
-      _chrominance(NBrows * 3, NBcolumns, CV_32FC1),
-      _colorLocalDensity(NBrows * 3, NBcolumns, CV_32FC1),
-      _imageGradient(NBrows * 3, NBcolumns, CV_32FC1)
-{
-    // link to parent buffers (let's recycle !)
-    _luminance = _filterOutput;
-    _multiplexedFrame = _localBuffer;
-
-    _objectInit = false;
-    _samplingMethod = samplingMethod;
-    _saturateColors = false;
-    _colorSaturationValue = 4.0;
-
-    // set default spatio-temporal filter parameters
-    setLPfilterParameters(0.0, 0.0, 1.5);
-    setLPfilterParameters(0.0, 0.0, 10.5, 1);// for the low pass filter dedicated to contours energy extraction (demultiplexing process)
-    setLPfilterParameters(0.f, 0.f, 0.9f, 2);
-
-    // init default value on image Gradient
-    _imageGradient = 0.57f;
-
-    // init color sampling map
-    _initColorSampling();
-
-    // flush all buffers
-    clearAllBuffers();
-}
-
-RetinaColor::~RetinaColor()
-{
-
-}
-
-void RetinaColor::clearAllBuffers()
-{
-    BasicRetinaFilter::clearAllBuffers();
-    _tempMultiplexedFrame = 0.f;
-    _demultiplexedTempBuffer = 0.f;
-
-    _demultiplexedColorFrame = 0.f;
-    _chrominance = 0.f;
-    _imageGradient = 0.57f;
-}
-
-void RetinaColor::resize(const unsigned int NBrows, const unsigned int NBcolumns)
-{
-    BasicRetinaFilter::clearAllBuffers();
-    ensureSizeIsEnough(NBrows,     NBcolumns, CV_32FC1, _tempMultiplexedFrame);
-    ensureSizeIsEnough(NBrows * 2, NBcolumns, CV_32FC1, _imageGradient);
-    ensureSizeIsEnough(NBrows * 3, NBcolumns, CV_32FC1, _RGBmosaic);
-    ensureSizeIsEnough(NBrows * 3, NBcolumns, CV_32FC1, _demultiplexedTempBuffer);
-    ensureSizeIsEnough(NBrows * 3, NBcolumns, CV_32FC1, _demultiplexedColorFrame);
-    ensureSizeIsEnough(NBrows * 3, NBcolumns, CV_32FC1, _chrominance);
-    ensureSizeIsEnough(NBrows * 3, NBcolumns, CV_32FC1, _colorLocalDensity);
-
-    // link to parent buffers (let's recycle !)
-    _luminance = _filterOutput;
-    _multiplexedFrame = _localBuffer;
-
-    // init color sampling map
-    _initColorSampling();
-
-    // clean buffers
-    clearAllBuffers();
-}
-
-static void inverseValue(oclMat &input)
-{
-    int elements_per_row = static_cast<int>(input.step / input.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {input.cols, input.rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &input.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &input.cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &input.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    openCLExecuteKernel(ctx, &retina_kernel, "inverseValue", globalSize, localSize, args, -1, -1);
-}
-
-void RetinaColor::_initColorSampling()
-{
-    CV_Assert(_samplingMethod == RETINA_COLOR_BAYER);
-    _pR = _pB = 0.25;
-    _pG = 0.5;
-    // filling the mosaic buffer:
-    _RGBmosaic = 0;
-    Mat tmp_mat(_NBrows * 3, _NBcols, CV_32FC1);
-    float * tmp_mat_ptr = tmp_mat.ptr<float>();
-    tmp_mat.setTo(0);
-    for (unsigned int index = 0 ; index < getNBpixels(); ++index)
-    {
-        tmp_mat_ptr[bayerSampleOffset(index)] = 1.0;
-    }
-    _RGBmosaic.upload(tmp_mat);
-    // computing photoreceptors local density
-    MAKE_OCLMAT_SLICES(_RGBmosaic, 3);
-    MAKE_OCLMAT_SLICES(_colorLocalDensity, 3);
-    _colorLocalDensity.setTo(0);
-    _spatiotemporalLPfilter(_RGBmosaic_slices[0], _colorLocalDensity_slices[0]);
-    _spatiotemporalLPfilter(_RGBmosaic_slices[1], _colorLocalDensity_slices[1]);
-    _spatiotemporalLPfilter(_RGBmosaic_slices[2], _colorLocalDensity_slices[2]);
-
-    //_colorLocalDensity = oclMat(_colorLocalDensity.size(), _colorLocalDensity.type(), 1.f) / _colorLocalDensity;
-    inverseValue(_colorLocalDensity);
-
-    _objectInit = true;
-}
-
-static void demultiplex(const oclMat &input, oclMat &ouput)
-{
-    int elements_per_row = static_cast<int>(input.step / input.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {input.cols, input.rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &input.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &ouput.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &input.cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &input.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    openCLExecuteKernel(ctx, &retina_kernel, "runColorDemultiplexingBayer", globalSize, localSize, args, -1, -1);
-}
-
-static void normalizePhotoDensity(
-    const oclMat &chroma,
-    const oclMat &colorDensity,
-    const oclMat &multiplex,
-    oclMat &ocl_luma,
-    oclMat &demultiplex,
-    const float pG
-)
-{
-    int elements_per_row = static_cast<int>(ocl_luma.step / ocl_luma.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {ocl_luma.cols, ocl_luma.rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &chroma.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &colorDensity.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &multiplex.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &ocl_luma.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &demultiplex.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &ocl_luma.cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &ocl_luma.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_float), &pG));
-    openCLExecuteKernel(ctx, &retina_kernel, "normalizePhotoDensity", globalSize, localSize, args, -1, -1);
-}
-
-static void substractResidual(
-    oclMat &colorDemultiplex,
-    float pR,
-    float pG,
-    float pB
-)
-{
-    int elements_per_row = static_cast<int>(colorDemultiplex.step / colorDemultiplex.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    int rows = colorDemultiplex.rows / 3, cols = colorDemultiplex.cols;
-    size_t globalSize[] = {cols, rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &colorDemultiplex.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_float), &pR));
-    args.push_back(std::make_pair(sizeof(cl_float), &pG));
-    args.push_back(std::make_pair(sizeof(cl_float), &pB));
-    openCLExecuteKernel(ctx, &retina_kernel, "substractResidual", globalSize, localSize, args, -1, -1);
-}
-
-static void demultiplexAssign(const oclMat& input, const oclMat& output)
-{
-    // only supports bayer
-    int elements_per_row = static_cast<int>(input.step / input.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    int rows = input.rows / 3, cols = input.cols;
-    size_t globalSize[] = {cols, rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &input.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &output.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    openCLExecuteKernel(ctx, &retina_kernel, "demultiplexAssign", globalSize, localSize, args, -1, -1);
-}
-
-void RetinaColor::runColorDemultiplexing(
-    const oclMat &ocl_multiplexed_input,
-    const bool adaptiveFiltering,
-    const float maxInputValue
-)
-{
-    MAKE_OCLMAT_SLICES(_demultiplexedTempBuffer, 3);
-    MAKE_OCLMAT_SLICES(_chrominance, 3);
-    MAKE_OCLMAT_SLICES(_RGBmosaic, 3);
-    MAKE_OCLMAT_SLICES(_demultiplexedColorFrame, 3);
-    MAKE_OCLMAT_SLICES(_colorLocalDensity, 3);
-
-    _demultiplexedTempBuffer.setTo(0);
-    demultiplex(ocl_multiplexed_input, _demultiplexedTempBuffer);
-
-    // interpolate the demultiplexed frame depending on the color sampling method
-    if (!adaptiveFiltering)
-    {
-        CV_Assert(adaptiveFiltering == false);
-    }
-
-    _spatiotemporalLPfilter(_demultiplexedTempBuffer_slices[0], _chrominance_slices[0]);
-    _spatiotemporalLPfilter(_demultiplexedTempBuffer_slices[1], _chrominance_slices[1]);
-    _spatiotemporalLPfilter(_demultiplexedTempBuffer_slices[2], _chrominance_slices[2]);
-
-    if (!adaptiveFiltering)// compute the gradient on the luminance
-    {
-        // TODO: implement me!
-        CV_Assert(adaptiveFiltering == false);
-    }
-    else
-    {
-        normalizePhotoDensity(_chrominance, _colorLocalDensity, ocl_multiplexed_input, _luminance, _demultiplexedTempBuffer, _pG);
-        // compute the gradient of the luminance
-        _computeGradient(_luminance, _imageGradient);
-
-        _adaptiveSpatialLPfilter(_RGBmosaic_slices[0], _imageGradient, _chrominance_slices[0]);
-        _adaptiveSpatialLPfilter(_RGBmosaic_slices[1], _imageGradient, _chrominance_slices[1]);
-        _adaptiveSpatialLPfilter(_RGBmosaic_slices[2], _imageGradient, _chrominance_slices[2]);
-
-        _adaptiveSpatialLPfilter(_demultiplexedTempBuffer_slices[0], _imageGradient, _demultiplexedColorFrame_slices[0]);
-        _adaptiveSpatialLPfilter(_demultiplexedTempBuffer_slices[1], _imageGradient, _demultiplexedColorFrame_slices[1]);
-        _adaptiveSpatialLPfilter(_demultiplexedTempBuffer_slices[2], _imageGradient, _demultiplexedColorFrame_slices[2]);
-
-        _demultiplexedColorFrame /= _chrominance; // per element division
-        substractResidual(_demultiplexedColorFrame, _pR, _pG, _pB);
-        runColorMultiplexing(_demultiplexedColorFrame, _tempMultiplexedFrame);
-
-        _demultiplexedTempBuffer.setTo(0);
-        _luminance = ocl_multiplexed_input - _tempMultiplexedFrame;
-        demultiplexAssign(_demultiplexedColorFrame, _demultiplexedTempBuffer);
-
-        for(int i = 0; i < 3; i ++)
-        {
-            _spatiotemporalLPfilter(_demultiplexedTempBuffer_slices[i], _demultiplexedTempBuffer_slices[i]);
-            _demultiplexedColorFrame_slices[i] = _demultiplexedTempBuffer_slices[i] * _colorLocalDensity_slices[i] + _luminance;
-        }
-    }
-    // eliminate saturated colors by simple clipping values to the input range
-    clipRGBOutput_0_maxInputValue(_demultiplexedColorFrame, maxInputValue);
-
-    if (_saturateColors)
-    {
-        ocl::normalizeGrayOutputCentredSigmoide(128, maxInputValue, _demultiplexedColorFrame, _demultiplexedColorFrame);
-    }
-}
-void RetinaColor::runColorMultiplexing(const oclMat &demultiplexedInputFrame, oclMat &multiplexedFrame)
-{
-    int elements_per_row = static_cast<int>(multiplexedFrame.step / multiplexedFrame.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {multiplexedFrame.cols, multiplexedFrame.rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &demultiplexedInputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &multiplexedFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &multiplexedFrame.cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &multiplexedFrame.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    openCLExecuteKernel(ctx, &retina_kernel, "runColorMultiplexingBayer", globalSize, localSize, args, -1, -1);
-}
-
-void RetinaColor::clipRGBOutput_0_maxInputValue(oclMat &inputOutputBuffer, const float maxInputValue)
-{
-    // the kernel is equivalent to:
-    //ocl::threshold(inputOutputBuffer, inputOutputBuffer, maxInputValue, maxInputValue, THRESH_TRUNC);
-    //ocl::threshold(inputOutputBuffer, inputOutputBuffer, 0, 0, THRESH_TOZERO);
-    int elements_per_row = static_cast<int>(inputOutputBuffer.step / inputOutputBuffer.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {_NBcols, inputOutputBuffer.rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &inputOutputBuffer.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBcols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &inputOutputBuffer.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_float), &maxInputValue));
-    openCLExecuteKernel(ctx, &retina_kernel, "clipRGBOutput_0_maxInputValue", globalSize, localSize, args, -1, -1);
-}
-
-void RetinaColor::_adaptiveSpatialLPfilter(const oclMat &inputFrame, const oclMat &gradient, oclMat &outputFrame)
-{
-    /**********/
-    _gain = (1 - 0.57f) * (1 - 0.57f) * (1 - 0.06f) * (1 - 0.06f);
-
-    // launch the serie of 1D directional filters in order to compute the 2D low pass filter
-    // -> horizontal filters work with the first layer of imageGradient
-    _adaptiveHorizontalCausalFilter_addInput(inputFrame, gradient, outputFrame);
-    _horizontalAnticausalFilter_Irregular(outputFrame, gradient);
-    // -> horizontal filters work with the second layer of imageGradient
-    _verticalCausalFilter_Irregular(outputFrame, gradient(getROI(1)));
-    _adaptiveVerticalAnticausalFilter_multGain(gradient, outputFrame);
-}
-
-void RetinaColor::_adaptiveHorizontalCausalFilter_addInput(const oclMat &inputFrame, const oclMat &gradient, oclMat &outputFrame)
-{
-    int elements_per_row = static_cast<int>(inputFrame.step / inputFrame.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {_NBrows, 1, 1};
-    size_t localSize[]  = {256, 1, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &inputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &gradient.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &outputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBcols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBrows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_int),   &inputFrame.offset));
-    args.push_back(std::make_pair(sizeof(cl_int),   &gradient.offset));
-    args.push_back(std::make_pair(sizeof(cl_int),   &outputFrame.offset));
-    openCLExecuteKernel(ctx, &retina_kernel, "adaptiveHorizontalCausalFilter_addInput", globalSize, localSize, args, -1, -1);
-}
-
-void RetinaColor::_adaptiveVerticalAnticausalFilter_multGain(const oclMat &gradient, oclMat &outputFrame)
-{
-    int elements_per_row = static_cast<int>(outputFrame.step / outputFrame.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {_NBcols, 1, 1};
-    size_t localSize[]  = {256, 1, 1};
-
-    int gradOffset = gradient.offset + static_cast<int>(gradient.step * _NBrows);
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &gradient.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &outputFrame.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBcols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBrows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_int),   &gradOffset));
-    args.push_back(std::make_pair(sizeof(cl_int),   &outputFrame.offset));
-    args.push_back(std::make_pair(sizeof(cl_float), &_gain));
-    openCLExecuteKernel(ctx, &retina_kernel, "adaptiveVerticalAnticausalFilter_multGain", globalSize, localSize, args, -1, -1);
-}
-void RetinaColor::_computeGradient(const oclMat &luminance, oclMat &gradient)
-{
-    int elements_per_row = static_cast<int>(luminance.step / luminance.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {_NBcols, _NBrows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &luminance.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &gradient.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBcols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &_NBrows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    openCLExecuteKernel(ctx, &retina_kernel, "computeGradient", globalSize, localSize, args, -1, -1);
-}
-
-///////////////////////////////////////
-//////////// RetinaFilter /////////////
-///////////////////////////////////////
-RetinaFilter::RetinaFilter(const unsigned int sizeRows, const unsigned int sizeColumns, const bool colorMode, const int samplingMethod, const bool useRetinaLogSampling, const double, const double)
-    :
-    _photoreceptorsPrefilter(sizeRows, sizeColumns, 4),
-    _ParvoRetinaFilter(sizeRows, sizeColumns),
-    _MagnoRetinaFilter(sizeRows, sizeColumns),
-    _colorEngine(sizeRows, sizeColumns, samplingMethod)
-{
-    CV_Assert(!useRetinaLogSampling);
-
-    // set default processing activities
-    _useParvoOutput = true;
-    _useMagnoOutput = true;
-
-    _useColorMode = colorMode;
-
-    // set default parameters
-    setGlobalParameters();
-
-    // stability controls values init
-    _setInitPeriodCount();
-    _globalTemporalConstant = 25;
-
-    // reset all buffers
-    clearAllBuffers();
-}
-
-RetinaFilter::~RetinaFilter()
-{
-}
-
-void RetinaFilter::clearAllBuffers()
-{
-    _photoreceptorsPrefilter.clearAllBuffers();
-    _ParvoRetinaFilter.clearAllBuffers();
-    _MagnoRetinaFilter.clearAllBuffers();
-    _colorEngine.clearAllBuffers();
-    // stability controls value init
-    _setInitPeriodCount();
-}
-
-void RetinaFilter::resize(const unsigned int NBrows, const unsigned int NBcolumns)
-{
-    unsigned int rows = NBrows, cols = NBcolumns;
-
-    // resize optionnal member and adjust other modules size if required
-    _photoreceptorsPrefilter.resize(rows, cols);
-    _ParvoRetinaFilter.resize(rows, cols);
-    _MagnoRetinaFilter.resize(rows, cols);
-    _colorEngine.resize(rows, cols);
-
-    // clean buffers
-    clearAllBuffers();
-
-}
-
-void RetinaFilter::_setInitPeriodCount()
-{
-    // find out the maximum temporal constant value and apply a security factor
-    // false value (obviously too long) but appropriate for simple use
-    _globalTemporalConstant = (unsigned int)(_ParvoRetinaFilter.getPhotoreceptorsTemporalConstant() + _ParvoRetinaFilter.getHcellsTemporalConstant() + _MagnoRetinaFilter.getTemporalConstant());
-    // reset frame counter
-    _ellapsedFramesSinceLastReset = 0;
-}
-
-void RetinaFilter::setGlobalParameters(const float OPLspatialResponse1, const float OPLtemporalresponse1, const float OPLassymetryGain, const float OPLspatialResponse2, const float OPLtemporalresponse2, const float LPfilterSpatialResponse, const float LPfilterGain, const float LPfilterTemporalresponse, const float MovingContoursExtractorCoefficient, const bool normalizeParvoOutput_0_maxOutputValue, const bool normalizeMagnoOutput_0_maxOutputValue, const float maxOutputValue, const float maxInputValue, const float meanValue)
-{
-    _normalizeParvoOutput_0_maxOutputValue = normalizeParvoOutput_0_maxOutputValue;
-    _normalizeMagnoOutput_0_maxOutputValue = normalizeMagnoOutput_0_maxOutputValue;
-    _maxOutputValue = maxOutputValue;
-    _photoreceptorsPrefilter.setV0CompressionParameter(0.9f, maxInputValue, meanValue);
-    _photoreceptorsPrefilter.setLPfilterParameters(0, 0, 10, 3); // keeps low pass filter with low cut frequency in memory (usefull for the tone mapping function)
-    _ParvoRetinaFilter.setOPLandParvoFiltersParameters(0, OPLtemporalresponse1, OPLspatialResponse1, OPLassymetryGain, OPLtemporalresponse2, OPLspatialResponse2);
-    _ParvoRetinaFilter.setV0CompressionParameter(0.9f, maxInputValue, meanValue);
-    _MagnoRetinaFilter.setCoefficientsTable(LPfilterGain, LPfilterTemporalresponse, LPfilterSpatialResponse, MovingContoursExtractorCoefficient, 0, 2.0f * LPfilterSpatialResponse);
-    _MagnoRetinaFilter.setV0CompressionParameter(0.7f, maxInputValue, meanValue);
-
-    // stability controls value init
-    _setInitPeriodCount();
-}
-
-bool RetinaFilter::checkInput(const oclMat &input, const bool)
-{
-    BasicRetinaFilter *inputTarget = &_photoreceptorsPrefilter;
-
-    bool test = (input.rows == static_cast<int>(inputTarget->getNBrows())
-                 || input.rows == static_cast<int>(inputTarget->getNBrows()) * 3
-                 || input.rows == static_cast<int>(inputTarget->getNBrows()) * 4)
-                && input.cols == static_cast<int>(inputTarget->getNBcolumns());
-    if (!test)
-    {
-        std::cerr << "RetinaFilter::checkInput: input buffer does not match retina buffer size, conversion aborted" << std::endl;
-        return false;
-    }
-
-    return true;
-}
-
-// main function that runs the filter for a given input frame
-bool RetinaFilter::runFilter(const oclMat &imageInput, const bool useAdaptiveFiltering, const bool processRetinaParvoMagnoMapping, const bool useColorMode, const bool inputIsColorMultiplexed)
-{
-    // preliminary check
-    bool processSuccess = true;
-    if (!checkInput(imageInput, useColorMode))
-    {
-        return false;
-    }
-
-    // run the color multiplexing if needed and compute each suub filter of the retina:
-    // -> local adaptation
-    // -> contours OPL extraction
-    // -> moving contours extraction
-
-    // stability controls value update
-    ++_ellapsedFramesSinceLastReset;
-
-    _useColorMode = useColorMode;
-
-    oclMat selectedPhotoreceptorsLocalAdaptationInput = imageInput;
-    oclMat selectedPhotoreceptorsColorInput = imageInput;
-
-    //********** Following is input data specific photoreceptors processing
-    if (useColorMode && (!inputIsColorMultiplexed)) // not multiplexed color input case
-    {
-        _colorEngine.runColorMultiplexing(selectedPhotoreceptorsColorInput);
-        selectedPhotoreceptorsLocalAdaptationInput = _colorEngine.getMultiplexedFrame();
-    }
-    //********** Following is generic Retina processing
-
-    // photoreceptors local adaptation
-    _photoreceptorsPrefilter.runFilter_LocalAdapdation(selectedPhotoreceptorsLocalAdaptationInput, _ParvoRetinaFilter.getHorizontalCellsOutput());
-
-    // run parvo filter
-    _ParvoRetinaFilter.runFilter(_photoreceptorsPrefilter.getOutput(), _useParvoOutput);
-
-    if (_useParvoOutput)
-    {
-        _ParvoRetinaFilter.normalizeGrayOutputCentredSigmoide(); // models the saturation of the cells, usefull for visualisation of the ON-OFF Parvo Output, Bipolar cells outputs do not change !!!
-        _ParvoRetinaFilter.centerReductImageLuminance(); // best for further spectrum analysis
-
-        if (_normalizeParvoOutput_0_maxOutputValue)
-        {
-            _ParvoRetinaFilter.normalizeGrayOutput_0_maxOutputValue(_maxOutputValue);
-        }
-    }
-
-    if (_useParvoOutput && _useMagnoOutput)
-    {
-        _MagnoRetinaFilter.runFilter(_ParvoRetinaFilter.getBipolarCellsON(), _ParvoRetinaFilter.getBipolarCellsOFF());
-        if (_normalizeMagnoOutput_0_maxOutputValue)
-        {
-            _MagnoRetinaFilter.normalizeGrayOutput_0_maxOutputValue(_maxOutputValue);
-        }
-        _MagnoRetinaFilter.normalizeGrayOutputNearZeroCentreredSigmoide();
-    }
-
-    if (_useParvoOutput && _useMagnoOutput && processRetinaParvoMagnoMapping)
-    {
-        _processRetinaParvoMagnoMapping();
-        if (_useColorMode)
-        {
-            _colorEngine.runColorDemultiplexing(_retinaParvoMagnoMappedFrame, useAdaptiveFiltering, _maxOutputValue);
-        }
-        return processSuccess;
-    }
-
-    if (_useParvoOutput && _useColorMode)
-    {
-        _colorEngine.runColorDemultiplexing(_ParvoRetinaFilter.getOutput(), useAdaptiveFiltering, _maxOutputValue);
-    }
-    return processSuccess;
-}
-
-const oclMat &RetinaFilter::getContours()
-{
-    if (_useColorMode)
-    {
-        return _colorEngine.getLuminance();
-    }
-    else
-    {
-        return _ParvoRetinaFilter.getOutput();
-    }
-}
-void RetinaFilter::_processRetinaParvoMagnoMapping()
-{
-    oclMat parvo = _ParvoRetinaFilter.getOutput();
-    oclMat magno = _MagnoRetinaFilter.getOutput();
-
-    int halfRows = parvo.rows / 2;
-    int halfCols = parvo.cols / 2;
-    float minDistance = MIN(halfRows, halfCols) * 0.7f;
-
-    int elements_per_row = static_cast<int>(parvo.step / parvo.elemSize());
-
-    Context * ctx = Context::getContext();
-    std::vector<std::pair<size_t, const void *> > args;
-    size_t globalSize[] = {parvo.cols, parvo.rows, 1};
-    size_t localSize[]  = {16, 16, 1};
-
-    args.push_back(std::make_pair(sizeof(cl_mem),   &parvo.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),   &magno.data));
-    args.push_back(std::make_pair(sizeof(cl_int),   &parvo.cols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &parvo.rows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &halfCols));
-    args.push_back(std::make_pair(sizeof(cl_int),   &halfRows));
-    args.push_back(std::make_pair(sizeof(cl_int),   &elements_per_row));
-    args.push_back(std::make_pair(sizeof(cl_float), &minDistance));
-    openCLExecuteKernel(ctx, &retina_kernel, "processRetinaParvoMagnoMapping", globalSize, localSize, args, -1, -1);
-}
-}  /* namespace ocl */
-
-Ptr<Retina> createRetina_OCL(Size getInputSize){ return makePtr<ocl::RetinaOCLImpl>(getInputSize); }
-Ptr<Retina> createRetina_OCL(Size getInputSize, const bool colorMode, int colorSamplingMethod, const bool useRetinaLogSampling, const double reductionFactor, const double samplingStrenght)
-{
-    return makePtr<ocl::RetinaOCLImpl>(getInputSize, colorMode, colorSamplingMethod, useRetinaLogSampling, reductionFactor, samplingStrenght);
-}
-
-}  /* namespace bioinspired */
-}  /* namespace cv */
-
-#endif /* #ifdef HAVE_OPENCV_OCL */
diff --git a/modules/bioinspired/src/retina_ocl.hpp b/modules/bioinspired/src/retina_ocl.hpp
deleted file mode 100644
index 90df0601c..000000000
--- a/modules/bioinspired/src/retina_ocl.hpp
+++ /dev/null
@@ -1,634 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OCL_RETINA_HPP__
-#define __OCL_RETINA_HPP__
-
-#include "precomp.hpp"
-
-#ifdef HAVE_OPENCV_OCL
-
-// please refer to c++ headers for API comments
-namespace cv
-{
-namespace bioinspired
-{
-namespace ocl
-{
-void normalizeGrayOutputCentredSigmoide(const float meanValue, const float sensitivity, cv::ocl::oclMat &in, cv::ocl::oclMat &out, const float maxValue = 255.f);
-void normalizeGrayOutput_0_maxOutputValue(cv::ocl::oclMat &inputOutputBuffer, const float maxOutputValue = 255.0);
-void normalizeGrayOutputNearZeroCentreredSigmoide(cv::ocl::oclMat &inputPicture, cv::ocl::oclMat &outputBuffer, const float sensitivity = 40, const float maxOutputValue = 255.0f);
-void centerReductImageLuminance(cv::ocl::oclMat &inputOutputBuffer);
-
-class BasicRetinaFilter
-{
-public:
-    BasicRetinaFilter(const unsigned int NBrows, const unsigned int NBcolumns, const unsigned int parametersListSize = 1, const bool useProgressiveFilter = false);
-    ~BasicRetinaFilter();
-    inline void clearOutputBuffer()
-    {
-        _filterOutput = 0;
-    };
-    inline void clearSecondaryBuffer()
-    {
-        _localBuffer = 0;
-    };
-    inline void clearAllBuffers()
-    {
-        clearOutputBuffer();
-        clearSecondaryBuffer();
-    };
-    void  resize(const unsigned int NBrows, const unsigned int NBcolumns);
-    const cv::ocl::oclMat &runFilter_LPfilter(const cv::ocl::oclMat &inputFrame, const unsigned int filterIndex = 0);
-    void  runFilter_LPfilter(const cv::ocl::oclMat &inputFrame, cv::ocl::oclMat &outputFrame, const unsigned int filterIndex = 0);
-    void  runFilter_LPfilter_Autonomous(cv::ocl::oclMat &inputOutputFrame, const unsigned int filterIndex = 0);
-    const cv::ocl::oclMat &runFilter_LocalAdapdation(const cv::ocl::oclMat &inputOutputFrame, const cv::ocl::oclMat &localLuminance);
-    void  runFilter_LocalAdapdation(const cv::ocl::oclMat &inputFrame, const cv::ocl::oclMat &localLuminance, cv::ocl::oclMat &outputFrame);
-    const cv::ocl::oclMat &runFilter_LocalAdapdation_autonomous(const cv::ocl::oclMat &inputFrame);
-    void  runFilter_LocalAdapdation_autonomous(const cv::ocl::oclMat &inputFrame, cv::ocl::oclMat &outputFrame);
-    void  setLPfilterParameters(const float beta, const float tau, const float k, const unsigned int filterIndex = 0);
-    inline void setV0CompressionParameter(const float v0, const float maxInputValue, const float)
-    {
-        _v0 = v0 * maxInputValue;
-        _localLuminanceFactor = v0;
-        _localLuminanceAddon = maxInputValue * (1.0f - v0);
-        _maxInputValue = maxInputValue;
-    };
-    inline void setV0CompressionParameter(const float v0, const float meanLuminance)
-    {
-        this->setV0CompressionParameter(v0, _maxInputValue, meanLuminance);
-    };
-    inline void setV0CompressionParameter(const float v0)
-    {
-        _v0 = v0 * _maxInputValue;
-        _localLuminanceFactor = v0;
-        _localLuminanceAddon = _maxInputValue * (1.0f - v0);
-    };
-    inline void setV0CompressionParameterToneMapping(const float v0, const float maxInputValue, const float meanLuminance = 128.0f)
-    {
-        _v0 = v0 * maxInputValue;
-        _localLuminanceFactor = 1.0f;
-        _localLuminanceAddon = meanLuminance * _v0;
-        _maxInputValue = maxInputValue;
-    };
-    inline void updateCompressionParameter(const float meanLuminance)
-    {
-        _localLuminanceFactor = 1;
-        _localLuminanceAddon = meanLuminance * _v0;
-    };
-    inline float getV0CompressionParameter()
-    {
-        return _v0 / _maxInputValue;
-    };
-    inline const cv::ocl::oclMat &getOutput() const
-    {
-        return _filterOutput;
-    };
-    inline unsigned int getNBrows()
-    {
-        return _filterOutput.rows;
-    };
-    inline unsigned int getNBcolumns()
-    {
-        return _filterOutput.cols;
-    };
-    inline unsigned int getNBpixels()
-    {
-        return _filterOutput.size().area();
-    };
-    inline void normalizeGrayOutput_0_maxOutputValue(const float maxValue)
-    {
-        ocl::normalizeGrayOutput_0_maxOutputValue(_filterOutput, maxValue);
-    };
-    inline void normalizeGrayOutputCentredSigmoide()
-    {
-        ocl::normalizeGrayOutputCentredSigmoide(0.0, 2.0, _filterOutput, _filterOutput);
-    };
-    inline void centerReductImageLuminance()
-    {
-        ocl::centerReductImageLuminance(_filterOutput);
-    };
-    inline float getMaxInputValue()
-    {
-        return this->_maxInputValue;
-    };
-    inline void setMaxInputValue(const float newMaxInputValue)
-    {
-        this->_maxInputValue = newMaxInputValue;
-    };
-
-protected:
-
-    int _NBrows;
-    int _NBcols;
-    unsigned int _halfNBrows;
-    unsigned int _halfNBcolumns;
-
-    cv::ocl::oclMat _filterOutput;
-    cv::ocl::oclMat _localBuffer;
-
-    std::valarray <float>_filteringCoeficientsTable;
-    float _v0;
-    float _maxInputValue;
-    float _meanInputValue;
-    float _localLuminanceFactor;
-    float _localLuminanceAddon;
-
-    float _a;
-    float _tau;
-    float _gain;
-
-    void _spatiotemporalLPfilter(const cv::ocl::oclMat &inputFrame, cv::ocl::oclMat &LPfilterOutput, const unsigned int coefTableOffset = 0);
-    float _squaringSpatiotemporalLPfilter(const cv::ocl::oclMat &inputFrame, cv::ocl::oclMat &outputFrame, const unsigned int filterIndex = 0);
-    void _spatiotemporalLPfilter_Irregular(const cv::ocl::oclMat &inputFrame, cv::ocl::oclMat &outputFrame, const unsigned int filterIndex = 0);
-    void _localSquaringSpatioTemporalLPfilter(const cv::ocl::oclMat &inputFrame, cv::ocl::oclMat &LPfilterOutput, const unsigned int *integrationAreas, const unsigned int filterIndex = 0);
-    void _localLuminanceAdaptation(const cv::ocl::oclMat &inputFrame, const cv::ocl::oclMat &localLuminance, cv::ocl::oclMat &outputFrame, const bool updateLuminanceMean = true);
-    void _localLuminanceAdaptation(cv::ocl::oclMat &inputOutputFrame, const cv::ocl::oclMat &localLuminance);
-    void _localLuminanceAdaptationPosNegValues(const cv::ocl::oclMat &inputFrame, const cv::ocl::oclMat &localLuminance, float *outputFrame);
-    void _horizontalCausalFilter_addInput(const cv::ocl::oclMat &inputFrame, cv::ocl::oclMat &outputFrame);
-    void _horizontalAnticausalFilter(cv::ocl::oclMat &outputFrame);
-    void _verticalCausalFilter(cv::ocl::oclMat &outputFrame);
-    void _horizontalAnticausalFilter_Irregular(cv::ocl::oclMat &outputFrame, const cv::ocl::oclMat &spatialConstantBuffer);
-    void _verticalCausalFilter_Irregular(cv::ocl::oclMat &outputFrame, const cv::ocl::oclMat &spatialConstantBuffer);
-    void _verticalAnticausalFilter_multGain(cv::ocl::oclMat &outputFrame);
-};
-
-class MagnoRetinaFilter: public BasicRetinaFilter
-{
-public:
-    MagnoRetinaFilter(const unsigned int NBrows, const unsigned int NBcolumns);
-    virtual ~MagnoRetinaFilter();
-    void clearAllBuffers();
-    void resize(const unsigned int NBrows, const unsigned int NBcolumns);
-    void setCoefficientsTable(const float parasolCells_beta, const float parasolCells_tau, const float parasolCells_k, const float amacrinCellsTemporalCutFrequency, const float localAdaptIntegration_tau, const float localAdaptIntegration_k);
-
-    const cv::ocl::oclMat &runFilter(const cv::ocl::oclMat &OPL_ON, const cv::ocl::oclMat &OPL_OFF);
-
-    inline const cv::ocl::oclMat &getMagnoON() const
-    {
-        return _magnoXOutputON;
-    };
-    inline const cv::ocl::oclMat &getMagnoOFF() const
-    {
-        return _magnoXOutputOFF;
-    };
-    inline const cv::ocl::oclMat &getMagnoYsaturated() const
-    {
-        return _magnoYsaturated;
-    };
-    inline void normalizeGrayOutputNearZeroCentreredSigmoide()
-    {
-        ocl::normalizeGrayOutputNearZeroCentreredSigmoide(_magnoYOutput, _magnoYsaturated);
-    };
-    inline float getTemporalConstant()
-    {
-        return this->_filteringCoeficientsTable[2];
-    };
-private:
-    cv::ocl::oclMat _previousInput_ON;
-    cv::ocl::oclMat _previousInput_OFF;
-    cv::ocl::oclMat _amacrinCellsTempOutput_ON;
-    cv::ocl::oclMat _amacrinCellsTempOutput_OFF;
-    cv::ocl::oclMat _magnoXOutputON;
-    cv::ocl::oclMat _magnoXOutputOFF;
-    cv::ocl::oclMat _localProcessBufferON;
-    cv::ocl::oclMat _localProcessBufferOFF;
-    cv::ocl::oclMat _magnoYOutput;
-    cv::ocl::oclMat _magnoYsaturated;
-
-    float _temporalCoefficient;
-    void _amacrineCellsComputing(const cv::ocl::oclMat &OPL_ON,  const cv::ocl::oclMat &OPL_OFF);
-};
-
-class ParvoRetinaFilter: public BasicRetinaFilter
-{
-public:
-    ParvoRetinaFilter(const unsigned int NBrows = 480, const unsigned int NBcolumns = 640);
-    virtual ~ParvoRetinaFilter();
-    void resize(const unsigned int NBrows, const unsigned int NBcolumns);
-    void clearAllBuffers();
-    void setOPLandParvoFiltersParameters(const float beta1, const float tau1, const float k1, const float beta2, const float tau2, const float k2);
-
-    inline void setGanglionCellsLocalAdaptationLPfilterParameters(const float tau, const float k)
-    {
-        BasicRetinaFilter::setLPfilterParameters(0, tau, k, 2);
-    };
-    const cv::ocl::oclMat &runFilter(const cv::ocl::oclMat &inputFrame, const bool useParvoOutput = true);
-
-    inline const cv::ocl::oclMat &getPhotoreceptorsLPfilteringOutput() const
-    {
-        return _photoreceptorsOutput;
-    };
-
-    inline const cv::ocl::oclMat &getHorizontalCellsOutput() const
-    {
-        return _horizontalCellsOutput;
-    };
-
-    inline const cv::ocl::oclMat &getParvoON() const
-    {
-        return _parvocellularOutputON;
-    };
-
-    inline const cv::ocl::oclMat &getParvoOFF() const
-    {
-        return _parvocellularOutputOFF;
-    };
-
-    inline const cv::ocl::oclMat &getBipolarCellsON() const
-    {
-        return _bipolarCellsOutputON;
-    };
-
-    inline const cv::ocl::oclMat &getBipolarCellsOFF() const
-    {
-        return _bipolarCellsOutputOFF;
-    };
-
-    inline float getPhotoreceptorsTemporalConstant()
-    {
-        return this->_filteringCoeficientsTable[2];
-    };
-
-    inline float getHcellsTemporalConstant()
-    {
-        return this->_filteringCoeficientsTable[5];
-    };
-private:
-    cv::ocl::oclMat _photoreceptorsOutput;
-    cv::ocl::oclMat _horizontalCellsOutput;
-    cv::ocl::oclMat _parvocellularOutputON;
-    cv::ocl::oclMat _parvocellularOutputOFF;
-    cv::ocl::oclMat _bipolarCellsOutputON;
-    cv::ocl::oclMat _bipolarCellsOutputOFF;
-    cv::ocl::oclMat _localAdaptationOFF;
-    cv::ocl::oclMat _localAdaptationON;
-    cv::ocl::oclMat _parvocellularOutputONminusOFF;
-    void _OPL_OnOffWaysComputing();
-};
-class RetinaColor: public BasicRetinaFilter
-{
-public:
-    RetinaColor(const unsigned int NBrows, const unsigned int NBcolumns, const int samplingMethod = RETINA_COLOR_DIAGONAL);
-    virtual ~RetinaColor();
-
-    void clearAllBuffers();
-    void resize(const unsigned int NBrows, const unsigned int NBcolumns);
-    inline void runColorMultiplexing(const cv::ocl::oclMat &inputRGBFrame)
-    {
-        runColorMultiplexing(inputRGBFrame, _multiplexedFrame);
-    };
-    void runColorMultiplexing(const cv::ocl::oclMat &demultiplexedInputFrame, cv::ocl::oclMat &multiplexedFrame);
-    void runColorDemultiplexing(const cv::ocl::oclMat &multiplexedColorFrame, const bool adaptiveFiltering = false, const float maxInputValue = 255.0);
-
-    void setColorSaturation(const bool saturateColors = true, const float colorSaturationValue = 4.0)
-    {
-        _saturateColors = saturateColors;
-        _colorSaturationValue = colorSaturationValue;
-    };
-
-    void setChrominanceLPfilterParameters(const float beta, const float tau, const float k)
-    {
-        setLPfilterParameters(beta, tau, k);
-    };
-
-    bool applyKrauskopfLMS2Acr1cr2Transform(cv::ocl::oclMat &result);
-    bool applyLMS2LabTransform(cv::ocl::oclMat &result);
-    inline const cv::ocl::oclMat &getMultiplexedFrame() const
-    {
-        return _multiplexedFrame;
-    };
-
-    inline const cv::ocl::oclMat &getDemultiplexedColorFrame() const
-    {
-        return _demultiplexedColorFrame;
-    };
-
-    inline const cv::ocl::oclMat &getLuminance() const
-    {
-        return _luminance;
-    };
-    inline const cv::ocl::oclMat &getChrominance() const
-    {
-        return _chrominance;
-    };
-    void clipRGBOutput_0_maxInputValue(cv::ocl::oclMat &inputOutputBuffer, const float maxOutputValue = 255.0);
-    void normalizeRGBOutput_0_maxOutputValue(const float maxOutputValue = 255.0);
-    inline void setDemultiplexedColorFrame(const cv::ocl::oclMat &demultiplexedImage)
-    {
-        _demultiplexedColorFrame = demultiplexedImage;
-    };
-protected:
-    inline unsigned int bayerSampleOffset(unsigned int index)
-    {
-        return index + ((index / getNBcolumns()) % 2) * getNBpixels() + ((index % getNBcolumns()) % 2) * getNBpixels();
-    }
-    inline Rect getROI(int idx)
-    {
-        return Rect(0, idx * _NBrows, _NBcols, _NBrows);
-    }
-    int _samplingMethod;
-    bool _saturateColors;
-    float _colorSaturationValue;
-    cv::ocl::oclMat _luminance;
-    cv::ocl::oclMat _multiplexedFrame;
-    cv::ocl::oclMat _RGBmosaic;
-    cv::ocl::oclMat _tempMultiplexedFrame;
-    cv::ocl::oclMat _demultiplexedTempBuffer;
-    cv::ocl::oclMat _demultiplexedColorFrame;
-    cv::ocl::oclMat _chrominance;
-    cv::ocl::oclMat _colorLocalDensity;
-    cv::ocl::oclMat _imageGradient;
-
-    float _pR, _pG, _pB;
-    bool _objectInit;
-
-    void _initColorSampling();
-    void _adaptiveSpatialLPfilter(const cv::ocl::oclMat &inputFrame, const cv::ocl::oclMat &gradient, cv::ocl::oclMat &outputFrame);
-    void _adaptiveHorizontalCausalFilter_addInput(const cv::ocl::oclMat &inputFrame, const cv::ocl::oclMat &gradient, cv::ocl::oclMat &outputFrame);
-    void _adaptiveVerticalAnticausalFilter_multGain(const cv::ocl::oclMat &gradient, cv::ocl::oclMat &outputFrame);
-    void _computeGradient(const cv::ocl::oclMat &luminance, cv::ocl::oclMat &gradient);
-    void _normalizeOutputs_0_maxOutputValue(void);
-    void _applyImageColorSpaceConversion(const cv::ocl::oclMat &inputFrame, cv::ocl::oclMat &outputFrame, const float *transformTable);
-};
-class RetinaFilter
-{
-public:
-    RetinaFilter(const unsigned int sizeRows, const unsigned int sizeColumns, const bool colorMode = false, const int samplingMethod = RETINA_COLOR_BAYER, const bool useRetinaLogSampling = false, const double reductionFactor = 1.0, const double samplingStrenght = 10.0);
-    ~RetinaFilter();
-
-    void clearAllBuffers();
-    void resize(const unsigned int NBrows, const unsigned int NBcolumns);
-    bool checkInput(const cv::ocl::oclMat &input, const bool colorMode);
-    bool runFilter(const cv::ocl::oclMat &imageInput, const bool useAdaptiveFiltering = true, const bool processRetinaParvoMagnoMapping = false, const bool useColorMode = false, const bool inputIsColorMultiplexed = false);
-
-    void setGlobalParameters(const float OPLspatialResponse1 = 0.7, const float OPLtemporalresponse1 = 1, const float OPLassymetryGain = 0, const float OPLspatialResponse2 = 5, const float OPLtemporalresponse2 = 1, const float LPfilterSpatialResponse = 5, const float LPfilterGain = 0, const float LPfilterTemporalresponse = 0, const float MovingContoursExtractorCoefficient = 5, const bool normalizeParvoOutput_0_maxOutputValue = false, const bool normalizeMagnoOutput_0_maxOutputValue = false, const float maxOutputValue = 255.0, const float maxInputValue = 255.0, const float meanValue = 128.0);
-
-    inline void setPhotoreceptorsLocalAdaptationSensitivity(const float V0CompressionParameter)
-    {
-        _photoreceptorsPrefilter.setV0CompressionParameter(1 - V0CompressionParameter);
-        _setInitPeriodCount();
-    };
-
-    inline void setParvoGanglionCellsLocalAdaptationSensitivity(const float V0CompressionParameter)
-    {
-        _ParvoRetinaFilter.setV0CompressionParameter(V0CompressionParameter);
-        _setInitPeriodCount();
-    };
-
-    inline void setGanglionCellsLocalAdaptationLPfilterParameters(const float spatialResponse, const float temporalResponse)
-    {
-        _ParvoRetinaFilter.setGanglionCellsLocalAdaptationLPfilterParameters(temporalResponse, spatialResponse);
-        _setInitPeriodCount();
-    };
-
-    inline void setMagnoGanglionCellsLocalAdaptationSensitivity(const float V0CompressionParameter)
-    {
-        _MagnoRetinaFilter.setV0CompressionParameter(V0CompressionParameter);
-        _setInitPeriodCount();
-    };
-
-    void setOPLandParvoParameters(const float beta1, const float tau1, const float k1, const float beta2, const float tau2, const float k2, const float V0CompressionParameter)
-    {
-        _ParvoRetinaFilter.setOPLandParvoFiltersParameters(beta1, tau1, k1, beta2, tau2, k2);
-        _ParvoRetinaFilter.setV0CompressionParameter(V0CompressionParameter);
-        _setInitPeriodCount();
-    };
-
-    void setMagnoCoefficientsTable(const float parasolCells_beta, const float parasolCells_tau, const float parasolCells_k, const float amacrinCellsTemporalCutFrequency, const float V0CompressionParameter, const float localAdaptintegration_tau, const float localAdaptintegration_k)
-    {
-        _MagnoRetinaFilter.setCoefficientsTable(parasolCells_beta, parasolCells_tau, parasolCells_k, amacrinCellsTemporalCutFrequency, localAdaptintegration_tau, localAdaptintegration_k);
-        _MagnoRetinaFilter.setV0CompressionParameter(V0CompressionParameter);
-        _setInitPeriodCount();
-    };
-
-    inline void activateNormalizeParvoOutput_0_maxOutputValue(const bool normalizeParvoOutput_0_maxOutputValue)
-    {
-        _normalizeParvoOutput_0_maxOutputValue = normalizeParvoOutput_0_maxOutputValue;
-    };
-
-    inline void activateNormalizeMagnoOutput_0_maxOutputValue(const bool normalizeMagnoOutput_0_maxOutputValue)
-    {
-        _normalizeMagnoOutput_0_maxOutputValue = normalizeMagnoOutput_0_maxOutputValue;
-    };
-
-    inline void setMaxOutputValue(const float maxOutputValue)
-    {
-        _maxOutputValue = maxOutputValue;
-    };
-
-    void setColorMode(const bool desiredColorMode)
-    {
-        _useColorMode = desiredColorMode;
-    };
-    inline void setColorSaturation(const bool saturateColors = true, const float colorSaturationValue = 4.0)
-    {
-        _colorEngine.setColorSaturation(saturateColors, colorSaturationValue);
-    };
-    inline const cv::ocl::oclMat &getLocalAdaptation() const
-    {
-        return _photoreceptorsPrefilter.getOutput();
-    };
-    inline const cv::ocl::oclMat &getPhotoreceptors() const
-    {
-        return _ParvoRetinaFilter.getPhotoreceptorsLPfilteringOutput();
-    };
-
-    inline const cv::ocl::oclMat &getHorizontalCells() const
-    {
-        return _ParvoRetinaFilter.getHorizontalCellsOutput();
-    };
-    inline bool areContoursProcessed()
-    {
-        return _useParvoOutput;
-    };
-    bool getParvoFoveaResponse(cv::ocl::oclMat &parvoFovealResponse);
-    inline void activateContoursProcessing(const bool useParvoOutput)
-    {
-        _useParvoOutput = useParvoOutput;
-    };
-
-    const cv::ocl::oclMat &getContours();
-
-    inline const cv::ocl::oclMat &getContoursON() const
-    {
-        return _ParvoRetinaFilter.getParvoON();
-    };
-
-    inline const cv::ocl::oclMat &getContoursOFF() const
-    {
-        return _ParvoRetinaFilter.getParvoOFF();
-    };
-
-    inline bool areMovingContoursProcessed()
-    {
-        return _useMagnoOutput;
-    };
-
-    inline void activateMovingContoursProcessing(const bool useMagnoOutput)
-    {
-        _useMagnoOutput = useMagnoOutput;
-    };
-
-    inline const cv::ocl::oclMat &getMovingContours() const
-    {
-        return _MagnoRetinaFilter.getOutput();
-    };
-
-    inline const cv::ocl::oclMat &getMovingContoursSaturated() const
-    {
-        return _MagnoRetinaFilter.getMagnoYsaturated();
-    };
-
-    inline const cv::ocl::oclMat &getMovingContoursON() const
-    {
-        return _MagnoRetinaFilter.getMagnoON();
-    };
-
-    inline const cv::ocl::oclMat &getMovingContoursOFF() const
-    {
-        return _MagnoRetinaFilter.getMagnoOFF();
-    };
-
-    inline const cv::ocl::oclMat &getRetinaParvoMagnoMappedOutput() const
-    {
-        return _retinaParvoMagnoMappedFrame;
-    };
-
-    inline const cv::ocl::oclMat &getParvoContoursChannel() const
-    {
-        return _colorEngine.getLuminance();
-    };
-
-    inline const cv::ocl::oclMat &getParvoChrominance() const
-    {
-        return _colorEngine.getChrominance();
-    };
-    inline const cv::ocl::oclMat &getColorOutput() const
-    {
-        return _colorEngine.getDemultiplexedColorFrame();
-    };
-
-    inline bool isColorMode()
-    {
-        return _useColorMode;
-    };
-    bool getColorMode()
-    {
-        return _useColorMode;
-    };
-
-    inline bool isInitTransitionDone()
-    {
-        if (_ellapsedFramesSinceLastReset < _globalTemporalConstant)
-        {
-            return false;
-        }
-        return true;
-    };
-    inline float getRetinaSamplingBackProjection(const float projectedRadiusLength)
-    {
-        return projectedRadiusLength;
-    };
-
-    inline unsigned int getInputNBrows()
-    {
-        return _photoreceptorsPrefilter.getNBrows();
-    };
-
-    inline unsigned int getInputNBcolumns()
-    {
-        return _photoreceptorsPrefilter.getNBcolumns();
-    };
-
-    inline unsigned int getInputNBpixels()
-    {
-        return _photoreceptorsPrefilter.getNBpixels();
-    };
-
-    inline unsigned int getOutputNBrows()
-    {
-        return _photoreceptorsPrefilter.getNBrows();
-    };
-
-    inline unsigned int getOutputNBcolumns()
-    {
-        return _photoreceptorsPrefilter.getNBcolumns();
-    };
-
-    inline unsigned int getOutputNBpixels()
-    {
-        return _photoreceptorsPrefilter.getNBpixels();
-    };
-private:
-    bool _useParvoOutput;
-    bool _useMagnoOutput;
-
-    unsigned int _ellapsedFramesSinceLastReset;
-    unsigned int _globalTemporalConstant;
-
-    cv::ocl::oclMat _retinaParvoMagnoMappedFrame;
-    BasicRetinaFilter _photoreceptorsPrefilter;
-    ParvoRetinaFilter _ParvoRetinaFilter;
-    MagnoRetinaFilter _MagnoRetinaFilter;
-    RetinaColor       _colorEngine;
-
-    bool _useMinimalMemoryForToneMappingONLY;
-    bool _normalizeParvoOutput_0_maxOutputValue;
-    bool _normalizeMagnoOutput_0_maxOutputValue;
-    float _maxOutputValue;
-    bool _useColorMode;
-
-    void _setInitPeriodCount();
-    void _processRetinaParvoMagnoMapping();
-    void _runGrayToneMapping(const cv::ocl::oclMat &grayImageInput, cv::ocl::oclMat &grayImageOutput , const float PhotoreceptorsCompression = 0.6, const float ganglionCellsCompression = 0.6);
-};
-
-}  /* namespace ocl */
-}  /* namespace bioinspired */
-}  /* namespace cv */
-
-#endif  /* HAVE_OPENCV_OCL */
-#endif  /* __OCL_RETINA_HPP__ */
diff --git a/modules/bioinspired/src/retinacolor.cpp b/modules/bioinspired/src/retinacolor.cpp
deleted file mode 100644
index 3fbc55385..000000000
--- a/modules/bioinspired/src/retinacolor.cpp
+++ /dev/null
@@ -1,725 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-#include "precomp.hpp"
-
-#include "retinacolor.hpp"
-
-// @author Alexandre BENOIT, benoit.alexandre.vision@gmail.com, LISTIC : www.listic.univ-savoie.fr, Gipsa-Lab, France: www.gipsa-lab.inpg.fr/
-
-#include <iostream>
-#include <ctime>
-
-namespace cv
-{
-namespace bioinspired
-{
-// init static values
-static float _LMStoACr1Cr2[]={1.0,  1.0, 0.0,  1.0, -1.0, 0.0,  -0.5, -0.5, 1.0};
-//static double _ACr1Cr2toLMS[]={0.5,  0.5, 0.0,   0.5, -0.5, 0.0,  0.5,  0.0, 1.0};
-static float _LMStoLab[]={0.5774f, 0.5774f, 0.5774f, 0.4082f, 0.4082f, -0.8165f, 0.7071f, -0.7071f, 0.f};
-
-// constructor/desctructor
-RetinaColor::RetinaColor(const unsigned int NBrows, const unsigned int NBcolumns, const int samplingMethod)
-:BasicRetinaFilter(NBrows, NBcolumns, 3),
- _colorSampling(NBrows*NBcolumns),
- _RGBmosaic(NBrows*NBcolumns*3),
- _tempMultiplexedFrame(NBrows*NBcolumns),
- _demultiplexedTempBuffer(NBrows*NBcolumns*3),
- _demultiplexedColorFrame(NBrows*NBcolumns*3),
- _chrominance(NBrows*NBcolumns*3),
- _colorLocalDensity(NBrows*NBcolumns*3),
- _imageGradient(NBrows*NBcolumns*2)
-{
-    // link to parent buffers (let's recycle !)
-    _luminance=&_filterOutput;
-    _multiplexedFrame=&_localBuffer;
-
-    _objectInit=false;
-    _samplingMethod=samplingMethod;
-    _saturateColors=false;
-    _colorSaturationValue=4.0;
-
-    // set default spatio-temporal filter parameters
-    setLPfilterParameters(0.0, 0.0, 1.5);
-    setLPfilterParameters(0.0, 0.0, 10.5, 1);// for the low pass filter dedicated to contours energy extraction (demultiplexing process)
-    setLPfilterParameters(0.f, 0.f, 0.9f, 2);
-
-    // init default value on image Gradient
-    _imageGradient=0.57f;
-
-    // init color sampling map
-    _initColorSampling();
-
-    // flush all buffers
-    clearAllBuffers();
-}
-
-RetinaColor::~RetinaColor()
-{
-
-}
-
-/**
-* function that clears all buffers of the object
-*/
-void RetinaColor::clearAllBuffers()
-{
-    BasicRetinaFilter::clearAllBuffers();
-    _tempMultiplexedFrame=0.f;
-    _demultiplexedTempBuffer=0.f;
-
-    _demultiplexedColorFrame=0.f;
-    _chrominance=0.f;
-    _imageGradient=0.57f;
-}
-
-/**
-* resize retina color filter object (resize all allocated buffers)
-* @param NBrows: the new height size
-* @param NBcolumns: the new width size
-*/
-void RetinaColor::resize(const unsigned int NBrows, const unsigned int NBcolumns)
-{
-    BasicRetinaFilter::clearAllBuffers();
-    _colorSampling.resize(NBrows*NBcolumns);
-    _RGBmosaic.resize(NBrows*NBcolumns*3);
-    _tempMultiplexedFrame.resize(NBrows*NBcolumns);
-    _demultiplexedTempBuffer.resize(NBrows*NBcolumns*3);
-    _demultiplexedColorFrame.resize(NBrows*NBcolumns*3);
-    _chrominance.resize(NBrows*NBcolumns*3);
-    _colorLocalDensity.resize(NBrows*NBcolumns*3);
-    _imageGradient.resize(NBrows*NBcolumns*2);
-
-    // link to parent buffers (let's recycle !)
-    _luminance=&_filterOutput;
-    _multiplexedFrame=&_localBuffer;
-
-    // init color sampling map
-    _initColorSampling();
-
-    // clean buffers
-    clearAllBuffers();
-}
-
-
-void RetinaColor::_initColorSampling()
-{
-
-    // filling the conversion table for multiplexed <=> demultiplexed frame
-    srand((unsigned)time(NULL));
-
-    // preInit cones probabilities
-    _pR=_pB=_pG=0;
-    switch (_samplingMethod)
-    {
-    case RETINA_COLOR_RANDOM:
-        for (unsigned int index=0 ; index<this->getNBpixels(); ++index)
-        {
-
-            // random RGB sampling
-            unsigned int colorIndex=rand()%24;
-
-            if (colorIndex<8){
-                colorIndex=0;
-
-                ++_pR;
-            }else
-            {
-                if (colorIndex<21){
-                    colorIndex=1;
-                    ++_pG;
-                }else{
-                    colorIndex=2;
-                    ++_pB;
-                }
-            }
-            _colorSampling[index] = colorIndex*this->getNBpixels()+index;
-        }
-        _pR/=(float)this->getNBpixels();
-        _pG/=(float)this->getNBpixels();
-        _pB/=(float)this->getNBpixels();
-        std::cout<<"Color channels proportions: pR, pG, pB= "<<_pR<<", "<<_pG<<", "<<_pB<<", "<<std::endl;
-        break;
-    case RETINA_COLOR_DIAGONAL:
-        for (unsigned int index=0 ; index<this->getNBpixels(); ++index)
-        {
-            _colorSampling[index] = index+((index%3+(index%_filterOutput.getNBcolumns()))%3)*_filterOutput.getNBpixels();
-        }
-        _pR=_pB=_pG=1.f/3;
-        break;
-    case RETINA_COLOR_BAYER: // default sets bayer sampling
-        for (unsigned int index=0 ; index<_filterOutput.getNBpixels(); ++index)
-        {
-            //First line: R G R G
-            _colorSampling[index] = index+((index/_filterOutput.getNBcolumns())%2)*_filterOutput.getNBpixels()+((index%_filterOutput.getNBcolumns())%2)*_filterOutput.getNBpixels();
-            //First line: G R G R
-            //_colorSampling[index] = 3*index+((index/_filterOutput.getNBcolumns())%2)+((index%_filterOutput.getNBcolumns()+1)%2);
-        }
-        _pR=_pB=0.25;
-        _pG=0.5;
-        break;
-    default:
-#ifdef RETINACOLORDEBUG
-        std::cerr<<"RetinaColor::No or wrong color sampling method, skeeping"<<std::endl;
-#endif
-        return;
-        break;//.. not useful, yes
-
-    }
-    // feeling the mosaic buffer:
-    _RGBmosaic=0;
-    for (unsigned int index=0 ; index<_filterOutput.getNBpixels(); ++index)
-        // the RGB _RGBmosaic buffer contains 1 where the pixel corresponds to a sampled color
-        _RGBmosaic[_colorSampling[index]]=1.0;
-
-    // computing photoreceptors local density
-    _spatiotemporalLPfilter(&_RGBmosaic[0], &_colorLocalDensity[0]);
-    _spatiotemporalLPfilter(&_RGBmosaic[0]+_filterOutput.getNBpixels(), &_colorLocalDensity[0]+_filterOutput.getNBpixels());
-    _spatiotemporalLPfilter(&_RGBmosaic[0]+_filterOutput.getDoubleNBpixels(), &_colorLocalDensity[0]+_filterOutput.getDoubleNBpixels());
-    unsigned int maxNBpixels=3*_filterOutput.getNBpixels();
-    register float *colorLocalDensityPTR=&_colorLocalDensity[0];
-    for (unsigned int i=0;i<maxNBpixels;++i, ++colorLocalDensityPTR)
-        *colorLocalDensityPTR=1.f/ *colorLocalDensityPTR;
-
-#ifdef RETINACOLORDEBUG
-    std::cout<<"INIT    _colorLocalDensity max, min: "<<_colorLocalDensity.max()<<", "<<_colorLocalDensity.min()<<std::endl;
-#endif
-    // end of the init step
-    _objectInit=true;
-}
-
-// public functions
-
-void RetinaColor::runColorDemultiplexing(const std::valarray<float> &multiplexedColorFrame, const bool adaptiveFiltering, const float maxInputValue)
-{
-    // demultiplex the grey frame to RGB frame
-    // -> first set demultiplexed frame to 0
-    _demultiplexedTempBuffer=0;
-    // -> demultiplex process
-    register unsigned int *colorSamplingPRT=&_colorSampling[0];
-    register const float *multiplexedColorFramePtr=get_data(multiplexedColorFrame);
-    for (unsigned int indexa=0; indexa<_filterOutput.getNBpixels() ; ++indexa)
-        _demultiplexedTempBuffer[*(colorSamplingPRT++)]=*(multiplexedColorFramePtr++);
-
-    // interpolate the demultiplexed frame depending on the color sampling method
-    if (!adaptiveFiltering)
-        _interpolateImageDemultiplexedImage(&_demultiplexedTempBuffer[0]);
-
-    // low pass filtering the demultiplexed frame
-    _spatiotemporalLPfilter(&_demultiplexedTempBuffer[0], &_chrominance[0]);
-    _spatiotemporalLPfilter(&_demultiplexedTempBuffer[0]+_filterOutput.getNBpixels(), &_chrominance[0]+_filterOutput.getNBpixels());
-    _spatiotemporalLPfilter(&_demultiplexedTempBuffer[0]+_filterOutput.getDoubleNBpixels(), &_chrominance[0]+_filterOutput.getDoubleNBpixels());
-
-    /*if (_samplingMethod=BAYER)
-    {
-        _applyRIFfilter(_chrominance, _chrominance);
-        _applyRIFfilter(_chrominance+_filterOutput.getNBpixels(), _chrominance+_filterOutput.getNBpixels());
-        _applyRIFfilter(_chrominance+_filterOutput.getDoubleNBpixels(), _chrominance+_filterOutput.getDoubleNBpixels());
-    }*/
-
-    // normalize by the photoreceptors local density and retrieve the local luminance
-    register float *chrominancePTR= &_chrominance[0];
-    register float *colorLocalDensityPTR= &_colorLocalDensity[0];
-    register float *luminance= &(*_luminance)[0];
-    if (!adaptiveFiltering)// compute the gradient on the luminance
-    {
-        if (_samplingMethod==RETINA_COLOR_RANDOM)
-            for (unsigned int indexc=0; indexc<_filterOutput.getNBpixels() ; ++indexc, ++chrominancePTR, ++colorLocalDensityPTR, ++luminance)
-            {
-                // normalize by photoreceptors density
-                float Cr=*(chrominancePTR)*_colorLocalDensity[indexc];
-                float Cg=*(chrominancePTR+_filterOutput.getNBpixels())*_colorLocalDensity[indexc+_filterOutput.getNBpixels()];
-                float Cb=*(chrominancePTR+_filterOutput.getDoubleNBpixels())*_colorLocalDensity[indexc+_filterOutput.getDoubleNBpixels()];
-                *luminance=(Cr+Cg+Cb)*_pG;
-                *(chrominancePTR)=Cr-*luminance;
-                *(chrominancePTR+_filterOutput.getNBpixels())=Cg-*luminance;
-                *(chrominancePTR+_filterOutput.getDoubleNBpixels())=Cb-*luminance;
-            }
-        else
-            for (unsigned int indexc=0; indexc<_filterOutput.getNBpixels() ; ++indexc, ++chrominancePTR, ++colorLocalDensityPTR, ++luminance)
-            {
-                float Cr=*(chrominancePTR);
-                float Cg=*(chrominancePTR+_filterOutput.getNBpixels());
-                float Cb=*(chrominancePTR+_filterOutput.getDoubleNBpixels());
-                *luminance=_pR*Cr+_pG*Cg+_pB*Cb;
-                *(chrominancePTR)=Cr-*luminance;
-                *(chrominancePTR+_filterOutput.getNBpixels())=Cg-*luminance;
-                *(chrominancePTR+_filterOutput.getDoubleNBpixels())=Cb-*luminance;
-            }
-
-        // in order to get the color image, each colored map needs to be added the luminance
-        // -> to do so, compute:  multiplexedColorFrame - remultiplexed chrominances
-        runColorMultiplexing(_chrominance, _tempMultiplexedFrame);
-        //lum = 1/3((f*(ImR))/(f*mR) + (f*(ImG))/(f*mG) + (f*(ImB))/(f*mB));
-        float *luminancePTR= &(*_luminance)[0];
-        chrominancePTR= &_chrominance[0];
-        float *demultiplexedColorFramePTR= &_demultiplexedColorFrame[0];
-        for (unsigned int indexp=0; indexp<_filterOutput.getNBpixels() ; ++indexp, ++luminancePTR, ++chrominancePTR, ++demultiplexedColorFramePTR)
-        {
-            *luminancePTR=(multiplexedColorFrame[indexp]-_tempMultiplexedFrame[indexp]);
-            *(demultiplexedColorFramePTR)=*(chrominancePTR)+*luminancePTR;
-            *(demultiplexedColorFramePTR+_filterOutput.getNBpixels())=*(chrominancePTR+_filterOutput.getNBpixels())+*luminancePTR;
-            *(demultiplexedColorFramePTR+_filterOutput.getDoubleNBpixels())=*(chrominancePTR+_filterOutput.getDoubleNBpixels())+*luminancePTR;
-        }
-
-    }else
-    {
-        register const float *multiplexedColorFramePTR= get_data(multiplexedColorFrame);
-        for (unsigned int indexc=0; indexc<_filterOutput.getNBpixels() ; ++indexc, ++chrominancePTR, ++colorLocalDensityPTR, ++luminance, ++multiplexedColorFramePTR)
-        {
-            // normalize by photoreceptors density
-            float Cr=*(chrominancePTR)*_colorLocalDensity[indexc];
-            float Cg=*(chrominancePTR+_filterOutput.getNBpixels())*_colorLocalDensity[indexc+_filterOutput.getNBpixels()];
-            float Cb=*(chrominancePTR+_filterOutput.getDoubleNBpixels())*_colorLocalDensity[indexc+_filterOutput.getDoubleNBpixels()];
-            *luminance=(Cr+Cg+Cb)*_pG;
-            _demultiplexedTempBuffer[_colorSampling[indexc]] = *multiplexedColorFramePTR - *luminance;
-
-        }
-
-        // compute the gradient of the luminance
-#ifdef MAKE_PARALLEL // call the TemplateBuffer TBB clipping method
-        cv::parallel_for_(cv::Range(2,_filterOutput.getNBrows()-2), Parallel_computeGradient(_filterOutput.getNBcolumns(), _filterOutput.getNBrows(), &(*_luminance)[0], &_imageGradient[0]));
-#else
-        _computeGradient(&(*_luminance)[0]);
-#endif
-        // adaptively filter the submosaics to get the adaptive densities, here the buffer _chrominance is used as a temp buffer
-        _adaptiveSpatialLPfilter(&_RGBmosaic[0], &_chrominance[0]);
-        _adaptiveSpatialLPfilter(&_RGBmosaic[0]+_filterOutput.getNBpixels(), &_chrominance[0]+_filterOutput.getNBpixels());
-        _adaptiveSpatialLPfilter(&_RGBmosaic[0]+_filterOutput.getDoubleNBpixels(), &_chrominance[0]+_filterOutput.getDoubleNBpixels());
-
-        _adaptiveSpatialLPfilter(&_demultiplexedTempBuffer[0], &_demultiplexedColorFrame[0]);
-        _adaptiveSpatialLPfilter(&_demultiplexedTempBuffer[0]+_filterOutput.getNBpixels(), &_demultiplexedColorFrame[0]+_filterOutput.getNBpixels());
-        _adaptiveSpatialLPfilter(&_demultiplexedTempBuffer[0]+_filterOutput.getDoubleNBpixels(), &_demultiplexedColorFrame[0]+_filterOutput.getDoubleNBpixels());
-
-/*      for (unsigned int index=0; index<_filterOutput.getNBpixels()*3 ; ++index) // cette boucle pourrait �tre supprimee en passant la densit� � la fonction de filtrage
-            _demultiplexedColorFrame[index] /= _chrominance[index];*/
-        _demultiplexedColorFrame/=_chrominance; // more optimal ;o)
-
-        // compute and substract the residual luminance
-        for (unsigned int index=0; index<_filterOutput.getNBpixels() ; ++index)
-        {
-            float residu = _pR*_demultiplexedColorFrame[index] + _pG*_demultiplexedColorFrame[index+_filterOutput.getNBpixels()] + _pB*_demultiplexedColorFrame[index+_filterOutput.getDoubleNBpixels()];
-            _demultiplexedColorFrame[index] = _demultiplexedColorFrame[index] - residu;
-            _demultiplexedColorFrame[index+_filterOutput.getNBpixels()] = _demultiplexedColorFrame[index+_filterOutput.getNBpixels()] - residu;
-            _demultiplexedColorFrame[index+_filterOutput.getDoubleNBpixels()] = _demultiplexedColorFrame[index+_filterOutput.getDoubleNBpixels()] - residu;
-        }
-
-        // multiplex the obtained chrominance
-        runColorMultiplexing(_demultiplexedColorFrame, _tempMultiplexedFrame);
-        _demultiplexedTempBuffer=0;
-
-        // get the luminance, et and add it to each chrominance
-        for (unsigned int index=0; index<_filterOutput.getNBpixels() ; ++index)
-        {
-            (*_luminance)[index]=multiplexedColorFrame[index]-_tempMultiplexedFrame[index];
-            _demultiplexedTempBuffer[_colorSampling[index]] = _demultiplexedColorFrame[_colorSampling[index]];//multiplexedColorFrame[index] - (*_luminance)[index];
-        }
-
-        _spatiotemporalLPfilter(&_demultiplexedTempBuffer[0], &_demultiplexedTempBuffer[0]);
-        _spatiotemporalLPfilter(&_demultiplexedTempBuffer[0]+_filterOutput.getNBpixels(), &_demultiplexedTempBuffer[0]+_filterOutput.getNBpixels());
-        _spatiotemporalLPfilter(&_demultiplexedTempBuffer[0]+_filterOutput.getDoubleNBpixels(), &_demultiplexedTempBuffer[0]+_filterOutput.getDoubleNBpixels());
-
-        // get the luminance and add it to each chrominance
-        for (unsigned int index=0; index<_filterOutput.getNBpixels() ; ++index)
-        {
-            _demultiplexedColorFrame[index] = _demultiplexedTempBuffer[index]*_colorLocalDensity[index]+ (*_luminance)[index];
-            _demultiplexedColorFrame[index+_filterOutput.getNBpixels()] = _demultiplexedTempBuffer[index+_filterOutput.getNBpixels()]*_colorLocalDensity[index+_filterOutput.getNBpixels()]+ (*_luminance)[index];
-            _demultiplexedColorFrame[index+_filterOutput.getDoubleNBpixels()] = _demultiplexedTempBuffer[index+_filterOutput.getDoubleNBpixels()]*_colorLocalDensity[index+_filterOutput.getDoubleNBpixels()]+ (*_luminance)[index];
-        }
-    }
-
-    // eliminate saturated colors by simple clipping values to the input range
-    clipRGBOutput_0_maxInputValue(NULL, maxInputValue);
-
-    /* transfert image gradient in order to check validity
-    memcpy((*_luminance), _imageGradient, sizeof(float)*_filterOutput.getNBpixels());
-    memcpy(_demultiplexedColorFrame, _imageGradient+_filterOutput.getNBpixels(), sizeof(float)*_filterOutput.getNBpixels());
-    memcpy(_demultiplexedColorFrame+_filterOutput.getNBpixels(), _imageGradient+_filterOutput.getNBpixels(), sizeof(float)*_filterOutput.getNBpixels());
-    memcpy(_demultiplexedColorFrame+2*_filterOutput.getNBpixels(), _imageGradient+_filterOutput.getNBpixels(), sizeof(float)*_filterOutput.getNBpixels());
-     */
-
-    if (_saturateColors)
-    {
-        TemplateBuffer<float>::normalizeGrayOutputCentredSigmoide(128, _colorSaturationValue, maxInputValue, &_demultiplexedColorFrame[0], &_demultiplexedColorFrame[0], _filterOutput.getNBpixels());
-        TemplateBuffer<float>::normalizeGrayOutputCentredSigmoide(128, _colorSaturationValue, maxInputValue, &_demultiplexedColorFrame[0]+_filterOutput.getNBpixels(), &_demultiplexedColorFrame[0]+_filterOutput.getNBpixels(), _filterOutput.getNBpixels());
-        TemplateBuffer<float>::normalizeGrayOutputCentredSigmoide(128, _colorSaturationValue, maxInputValue, &_demultiplexedColorFrame[0]+_filterOutput.getNBpixels()*2, &_demultiplexedColorFrame[0]+_filterOutput.getNBpixels()*2, _filterOutput.getNBpixels());
-    }
-}
-
-// color multiplexing: input frame size=_NBrows*_filterOutput.getNBcolumns()*3, multiplexedFrame output size=_NBrows*_filterOutput.getNBcolumns()
-void RetinaColor::runColorMultiplexing(const std::valarray<float> &demultiplexedInputFrame, std::valarray<float> &multiplexedFrame)
-{
-    // multiply each color layer by its bayer mask
-    register unsigned int *colorSamplingPTR= &_colorSampling[0];
-    register float *multiplexedFramePTR= &multiplexedFrame[0];
-    for (unsigned int indexp=0; indexp<_filterOutput.getNBpixels(); ++indexp)
-        *(multiplexedFramePTR++)=demultiplexedInputFrame[*(colorSamplingPTR++)];
-}
-
-void RetinaColor::normalizeRGBOutput_0_maxOutputValue(const float maxOutputValue)
-{
-    //normalizeGrayOutputCentredSigmoide(0.0, 2, _chrominance);
-    TemplateBuffer<float>::normalizeGrayOutput_0_maxOutputValue(&_demultiplexedColorFrame[0], 3*_filterOutput.getNBpixels(), maxOutputValue);
-    //normalizeGrayOutputCentredSigmoide(0.0, 2, _chrominance+_filterOutput.getNBpixels());
-    //normalizeGrayOutput_0_maxOutputValue(_demultiplexedColorFrame+_filterOutput.getNBpixels(), _filterOutput.getNBpixels(), maxOutputValue);
-    //normalizeGrayOutputCentredSigmoide(0.0, 2, _chrominance+2*_filterOutput.getNBpixels());
-    //normalizeGrayOutput_0_maxOutputValue(_demultiplexedColorFrame+_filterOutput.getDoubleNBpixels(), _filterOutput.getNBpixels(), maxOutputValue);
-    TemplateBuffer<float>::normalizeGrayOutput_0_maxOutputValue(&(*_luminance)[0], _filterOutput.getNBpixels(), maxOutputValue);
-}
-
-/// normalize output between 0 and maxOutputValue;
-void RetinaColor::clipRGBOutput_0_maxInputValue(float *inputOutputBuffer, const float maxInputValue)
-{
-    //std::cout<<"RetinaColor::normalizing RGB frame..."<<std::endl;
-    // if outputBuffer unsassigned, the rewrite the buffer
-    if (inputOutputBuffer==NULL)
-        inputOutputBuffer= &_demultiplexedColorFrame[0];
-
-#ifdef MAKE_PARALLEL // call the TemplateBuffer TBB clipping method
-        cv::parallel_for_(cv::Range(0,_filterOutput.getNBpixels()*3), Parallel_clipBufferValues<float>(inputOutputBuffer, 0,  maxInputValue));
-#else
-    register float *inputOutputBufferPTR=inputOutputBuffer;
-    for (register unsigned int jf = 0; jf < _filterOutput.getNBpixels()*3; ++jf, ++inputOutputBufferPTR)
-    {
-        if (*inputOutputBufferPTR>maxInputValue)
-            *inputOutputBufferPTR=maxInputValue;
-        else if (*inputOutputBufferPTR<0)
-            *inputOutputBufferPTR=0;
-    }
-#endif
-    //std::cout<<"RetinaColor::...normalizing RGB frame OK"<<std::endl;
-}
-
-void RetinaColor::_interpolateImageDemultiplexedImage(float *inputOutputBuffer)
-{
-
-    switch(_samplingMethod)
-    {
-
-    case RETINA_COLOR_RANDOM:
-        return; // no need to interpolate
-        break;
-
-    case RETINA_COLOR_DIAGONAL:
-        _interpolateSingleChannelImage111(inputOutputBuffer);
-        break;
-
-    case RETINA_COLOR_BAYER: // default sets bayer sampling
-        _interpolateBayerRGBchannels(inputOutputBuffer);
-        break;
-    default:
-        std::cerr<<"RetinaColor::No or wrong color sampling method, skeeping"<<std::endl;
-        return;
-        break;//.. not useful, yes
-
-    }
-
-}
-
-void RetinaColor::_interpolateSingleChannelImage111(float *inputOutputBuffer)
-{
-    for (unsigned int indexr=0 ; indexr<_filterOutput.getNBrows(); ++indexr)
-    {
-        for (unsigned int indexc=1 ; indexc<_filterOutput.getNBcolumns()-1; ++indexc)
-        {
-            unsigned int index=indexc+indexr*_filterOutput.getNBcolumns();
-            inputOutputBuffer[index]=(inputOutputBuffer[index-1]+inputOutputBuffer[index]+inputOutputBuffer[index+1])/3.f;
-        }
-    }
-    for (unsigned int indexc=0 ; indexc<_filterOutput.getNBcolumns(); ++indexc)
-    {
-        for (unsigned int indexr=1 ; indexr<_filterOutput.getNBrows()-1; ++indexr)
-        {
-            unsigned int index=indexc+indexr*_filterOutput.getNBcolumns();
-            inputOutputBuffer[index]=(inputOutputBuffer[index-_filterOutput.getNBcolumns()]+inputOutputBuffer[index]+inputOutputBuffer[index+_filterOutput.getNBcolumns()])/3.f;
-        }
-    }
-}
-
-void RetinaColor::_interpolateBayerRGBchannels(float *inputOutputBuffer)
-{
-    for (unsigned int indexr=0 ; indexr<_filterOutput.getNBrows()-1; indexr+=2)
-    {
-        for (unsigned int indexc=1 ; indexc<_filterOutput.getNBcolumns()-1; indexc+=2)
-        {
-            unsigned int indexR=indexc+indexr*_filterOutput.getNBcolumns();
-            unsigned int indexB=_filterOutput.getDoubleNBpixels()+indexc+1+(indexr+1)*_filterOutput.getNBcolumns();
-            inputOutputBuffer[indexR]=(inputOutputBuffer[indexR-1]+inputOutputBuffer[indexR+1])/2.f;
-            inputOutputBuffer[indexB]=(inputOutputBuffer[indexB-1]+inputOutputBuffer[indexB+1])/2.f;
-        }
-    }
-    for (unsigned int indexr=1 ; indexr<_filterOutput.getNBrows()-1; indexr+=2)
-    {
-        for (unsigned int indexc=0 ; indexc<_filterOutput.getNBcolumns(); ++indexc)
-        {
-            unsigned int indexR=indexc+indexr*_filterOutput.getNBcolumns();
-            unsigned int indexB=_filterOutput.getDoubleNBpixels()+indexc+1+(indexr+1)*_filterOutput.getNBcolumns();
-            inputOutputBuffer[indexR]=(inputOutputBuffer[indexR-_filterOutput.getNBcolumns()]+inputOutputBuffer[indexR+_filterOutput.getNBcolumns()])/2.f;
-            inputOutputBuffer[indexB]=(inputOutputBuffer[indexB-_filterOutput.getNBcolumns()]+inputOutputBuffer[indexB+_filterOutput.getNBcolumns()])/2.f;
-
-        }
-    }
-    for (unsigned int indexr=1 ; indexr<_filterOutput.getNBrows()-1; ++indexr)
-        for (unsigned int indexc=0 ; indexc<_filterOutput.getNBcolumns(); indexc+=2)
-        {
-            unsigned int indexG=_filterOutput.getNBpixels()+indexc+(indexr)*_filterOutput.getNBcolumns()+indexr%2;
-            inputOutputBuffer[indexG]=(inputOutputBuffer[indexG-1]+inputOutputBuffer[indexG+1]+inputOutputBuffer[indexG-_filterOutput.getNBcolumns()]+inputOutputBuffer[indexG+_filterOutput.getNBcolumns()])*0.25f;
-        }
-}
-
-void RetinaColor::_applyRIFfilter(const float *sourceBuffer, float *destinationBuffer)
-{
-    for (unsigned int indexr=1 ; indexr<_filterOutput.getNBrows()-1; ++indexr)
-    {
-        for (unsigned int indexc=1 ; indexc<_filterOutput.getNBcolumns()-1; ++indexc)
-        {
-            unsigned int index=indexc+indexr*_filterOutput.getNBcolumns();
-            _tempMultiplexedFrame[index]=(4.f*sourceBuffer[index]+sourceBuffer[index-1-_filterOutput.getNBcolumns()]+sourceBuffer[index-1+_filterOutput.getNBcolumns()]+sourceBuffer[index+1-_filterOutput.getNBcolumns()]+sourceBuffer[index+1+_filterOutput.getNBcolumns()])*0.125f;
-        }
-    }
-    memcpy(destinationBuffer, &_tempMultiplexedFrame[0], sizeof(float)*_filterOutput.getNBpixels());
-}
-
-void RetinaColor::_getNormalizedContoursImage(const float *inputFrame, float *outputFrame)
-{
-    float maxValue=0.f;
-    float normalisationFactor=1.f/3.f;
-    for (unsigned int indexr=1 ; indexr<_filterOutput.getNBrows()-1; ++indexr)
-    {
-        for (unsigned int indexc=1 ; indexc<_filterOutput.getNBcolumns()-1; ++indexc)
-        {
-            unsigned int index=indexc+indexr*_filterOutput.getNBcolumns();
-            outputFrame[index]=normalisationFactor*fabs(8.f*inputFrame[index]-inputFrame[index-1]-inputFrame[index+1]-inputFrame[index-_filterOutput.getNBcolumns()]-inputFrame[index+_filterOutput.getNBcolumns()]-inputFrame[index-1-_filterOutput.getNBcolumns()]-inputFrame[index-1+_filterOutput.getNBcolumns()]-inputFrame[index+1-_filterOutput.getNBcolumns()]-inputFrame[index+1+_filterOutput.getNBcolumns()]);
-            if (outputFrame[index]>maxValue)
-                maxValue=outputFrame[index];
-        }
-    }
-    normalisationFactor=1.f/maxValue;
-    // normalisation [0, 1]
-    for (unsigned int indexp=1 ; indexp<_filterOutput.getNBrows()-1; ++indexp)
-       outputFrame[indexp]=outputFrame[indexp]*normalisationFactor;
-}
-
-//////////////////////////////////////////////////////////
-//        ADAPTIVE BASIC RETINA FILTER
-//////////////////////////////////////////////////////////
-// run LP filter for a new frame input and save result at a specific output adress
-void RetinaColor::_adaptiveSpatialLPfilter(const float *inputFrame, float *outputFrame)
-{
-
-    /**********/
-    _gain = (1-0.57f)*(1-0.57f)*(1-0.06f)*(1-0.06f);
-
-    // launch the serie of 1D directional filters in order to compute the 2D low pass filter
-    // -> horizontal filters work with the first layer of imageGradient
-    _adaptiveHorizontalCausalFilter_addInput(inputFrame, outputFrame, 0, _filterOutput.getNBrows());
-    _horizontalAnticausalFilter_Irregular(outputFrame, 0, _filterOutput.getNBrows(), &_imageGradient[0]);
-    // -> horizontal filters work with the second layer of imageGradient
-    _verticalCausalFilter_Irregular(outputFrame, 0, _filterOutput.getNBcolumns(), &_imageGradient[0]+_filterOutput.getNBpixels());
-    _adaptiveVerticalAnticausalFilter_multGain(outputFrame, 0, _filterOutput.getNBcolumns());
-}
-
-//  horizontal causal filter which adds the input inside... replaces the parent _horizontalCausalFilter_Irregular_addInput by avoiding a product for each pixel
-void RetinaColor::_adaptiveHorizontalCausalFilter_addInput(const float *inputFrame, float *outputFrame, unsigned int IDrowStart, unsigned int IDrowEnd)
-{
-#ifdef MAKE_PARALLEL
-        cv::parallel_for_(cv::Range(IDrowStart,IDrowEnd), Parallel_adaptiveHorizontalCausalFilter_addInput(inputFrame, outputFrame, &_imageGradient[0], _filterOutput.getNBcolumns()));
-#else
-    register float* outputPTR=outputFrame+IDrowStart*_filterOutput.getNBcolumns();
-    register const float* inputPTR=inputFrame+IDrowStart*_filterOutput.getNBcolumns();
-    register const float *imageGradientPTR= &_imageGradient[0]+IDrowStart*_filterOutput.getNBcolumns();
-    for (unsigned int IDrow=IDrowStart; IDrow<IDrowEnd; ++IDrow)
-    {
-        register float result=0;
-        for (unsigned int index=0; index<_filterOutput.getNBcolumns(); ++index)
-        {
-            //std::cout<<(*imageGradientPTR)<<" ";
-            result = *(inputPTR++) + (*imageGradientPTR)* result;
-            *(outputPTR++) = result;
-            ++imageGradientPTR;
-        }
-        //        std::cout<<" "<<std::endl;
-    }
-#endif
-}
-
-//  vertical anticausal filter which multiplies the output by _gain... replaces the parent _verticalAnticausalFilter_multGain by avoiding a product for each pixel and taking into account the second layer of the _imageGradient buffer
-void RetinaColor::_adaptiveVerticalAnticausalFilter_multGain(float *outputFrame, unsigned int IDcolumnStart, unsigned int IDcolumnEnd)
-{
-#ifdef MAKE_PARALLEL
-        cv::parallel_for_(cv::Range(IDcolumnStart,IDcolumnEnd), Parallel_adaptiveVerticalAnticausalFilter_multGain(outputFrame, &_imageGradient[0]+_filterOutput.getNBpixels(), _filterOutput.getNBrows(), _filterOutput.getNBcolumns(), _gain));
-#else
-    float* outputOffset=outputFrame+_filterOutput.getNBpixels()-_filterOutput.getNBcolumns();
-    float* gradOffset= &_imageGradient[0]+_filterOutput.getNBpixels()*2-_filterOutput.getNBcolumns();
-
-    for (unsigned int IDcolumn=IDcolumnStart; IDcolumn<IDcolumnEnd; ++IDcolumn)
-    {
-        register float result=0;
-        register float *outputPTR=outputOffset+IDcolumn;
-        register float *imageGradientPTR=gradOffset+IDcolumn;
-        for (unsigned int index=0; index<_filterOutput.getNBrows(); ++index)
-        {
-            result = *(outputPTR) + (*(imageGradientPTR)) * result;
-            *(outputPTR) = _gain*result;
-            outputPTR-=_filterOutput.getNBcolumns();
-            imageGradientPTR-=_filterOutput.getNBcolumns();
-        }
-    }
-#endif
-}
-
-///////////////////////////
-void RetinaColor::_computeGradient(const float *luminance)
-{
-    for (unsigned int idLine=2;idLine<_filterOutput.getNBrows()-2;++idLine)
-    {
-        for (unsigned int idColumn=2;idColumn<_filterOutput.getNBcolumns()-2;++idColumn)
-        {
-            const unsigned int pixelIndex=idColumn+_filterOutput.getNBcolumns()*idLine;
-
-            // horizontal and vertical local gradients
-            const float verticalGrad=fabs(luminance[pixelIndex+_filterOutput.getNBcolumns()]-luminance[pixelIndex-_filterOutput.getNBcolumns()]);
-            const float horizontalGrad=fabs(luminance[pixelIndex+1]-luminance[pixelIndex-1]);
-
-            // neighborhood horizontal and vertical gradients
-            const float verticalGrad_p=fabs(luminance[pixelIndex]-luminance[pixelIndex-2*_filterOutput.getNBcolumns()]);
-            const float horizontalGrad_p=fabs(luminance[pixelIndex]-luminance[pixelIndex-2]);
-            const float verticalGrad_n=fabs(luminance[pixelIndex+2*_filterOutput.getNBcolumns()]-luminance[pixelIndex]);
-            const float horizontalGrad_n=fabs(luminance[pixelIndex+2]-luminance[pixelIndex]);
-
-            const float horizontalGradient=0.5f*horizontalGrad+0.25f*(horizontalGrad_p+horizontalGrad_n);
-            const float verticalGradient=0.5f*verticalGrad+0.25f*(verticalGrad_p+verticalGrad_n);
-
-            // compare local gradient means and fill the appropriate filtering coefficient value that will be used in adaptative filters
-            if (horizontalGradient<verticalGradient)
-            {
-                _imageGradient[pixelIndex+_filterOutput.getNBpixels()]=0.06f;
-                _imageGradient[pixelIndex]=0.57f;
-            }
-            else
-            {
-                _imageGradient[pixelIndex+_filterOutput.getNBpixels()]=0.57f;
-                _imageGradient[pixelIndex]=0.06f;
-            }
-        }
-    }
-}
-
-bool RetinaColor::applyKrauskopfLMS2Acr1cr2Transform(std::valarray<float> &result)
-{
-    bool processSuccess=true;
-    // basic preliminary error check
-    if (result.size()!=_demultiplexedColorFrame.size())
-    {
-        std::cerr<<"RetinaColor::applyKrauskopfLMS2Acr1cr2Transform: input buffer does not match retina buffer size, conversion aborted"<<std::endl;
-        return false;
-    }
-
-    // apply transformation
-    _applyImageColorSpaceConversion(_demultiplexedColorFrame, result, _LMStoACr1Cr2);
-
-    return processSuccess;
-}
-
-bool RetinaColor::applyLMS2LabTransform(std::valarray<float> &result)
-{
-    bool processSuccess=true;
-    // basic preliminary error check
-    if (result.size()!=_demultiplexedColorFrame.size())
-    {
-        std::cerr<<"RetinaColor::applyKrauskopfLMS2Acr1cr2Transform: input buffer does not match retina buffer size, conversion aborted"<<std::endl;
-        return false;
-    }
-
-    // apply transformation
-    _applyImageColorSpaceConversion(_demultiplexedColorFrame, result, _LMStoLab);
-
-    return processSuccess;
-}
-
-// template function able to perform a custom color space transformation
-void RetinaColor::_applyImageColorSpaceConversion(const std::valarray<float> &inputFrameBuffer, std::valarray<float> &outputFrameBuffer, const float *transformTable)
-{
-    // two step methods in order to allow inputFrame and outputFrame to be the same
-    unsigned int nbPixels=(unsigned int)(inputFrameBuffer.size()/3), dbpixels=(unsigned int)(2*inputFrameBuffer.size()/3);
-
-    const float *inputFrame=get_data(inputFrameBuffer);
-    float *outputFrame= &outputFrameBuffer[0];
-
-    for (unsigned int dataIndex=0; dataIndex<nbPixels;++dataIndex, ++outputFrame, ++inputFrame)
-    {
-        // first step, compute each new values
-        float layer1 = *(inputFrame)**(transformTable+0)  +*(inputFrame+nbPixels)**(transformTable+1)  +*(inputFrame+dbpixels)**(transformTable+2);
-        float layer2 = *(inputFrame)**(transformTable+3)  +*(inputFrame+nbPixels)**(transformTable+4)  +*(inputFrame+dbpixels)**(transformTable+5);
-        float layer3 = *(inputFrame)**(transformTable+6)  +*(inputFrame+nbPixels)**(transformTable+7)  +*(inputFrame+dbpixels)**(transformTable+8);
-        // second, affect the output
-        *(outputFrame)          = layer1;
-        *(outputFrame+nbPixels) = layer2;
-        *(outputFrame+dbpixels) = layer3;
-    }
-}
-
-}// end of namespace bioinspired
-}// end of namespace cv
diff --git a/modules/bioinspired/src/retinacolor.hpp b/modules/bioinspired/src/retinacolor.hpp
deleted file mode 100644
index 3fb6be8f9..000000000
--- a/modules/bioinspired/src/retinacolor.hpp
+++ /dev/null
@@ -1,389 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-/**
-* @class RetinaColor a color multilexing/demultiplexing (demosaicing) based on a human vision inspiration. Different mosaicing strategies can be used, included random sampling !
-* => please take a look at the nice and efficient demosaicing strategy introduced by B.Chaix de Lavarene, take a look at the cited paper for more mathematical details
-* @brief Retina color sampling model which allows classical bayer sampling, random and potentially several other method ! Low color errors on corners !
-* -> Based on the research of:
-*		.Brice Chaix Lavarene (chaix@lis.inpg.fr)
-*		.Jeanny Herault (herault@lis.inpg.fr)
-*		.David Alleyson (david.alleyson@upmf-grenoble.fr)
-*      .collaboration: alexandre benoit (benoit.alexandre.vision@gmail.com or benoit@lis.inpg.fr)
-* Please cite: B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-* @author Alexandre BENOIT, benoit.alexandre.vision@gmail.com, LISTIC / Gipsa-Lab, France: www.gipsa-lab.inpg.fr/
-* Creation date 2007
-*/
-
-#ifndef RETINACOLOR_HPP_
-#define RETINACOLOR_HPP_
-
-#include "basicretinafilter.hpp"
-
-//#define __RETINACOLORDEBUG //define RETINACOLORDEBUG in order to display debug data
-
-namespace cv
-{
-namespace bioinspired
-{
-    class RetinaColor: public BasicRetinaFilter
-    {
-    public:
-        /**
-        * @typedef which allows to select the type of photoreceptors color sampling
-        */
-
-        /**
-        * constructor of the retina color processing model
-        * @param NBrows: number of rows of the input image
-        * @param NBcolumns: number of columns of the input image
-        * @param samplingMethod: the chosen color sampling method
-        */
-        RetinaColor(const unsigned int NBrows, const unsigned int NBcolumns, const int samplingMethod=RETINA_COLOR_BAYER);
-
-        /**
-        * standard destructor
-        */
-        virtual ~RetinaColor();
-
-        /**
-        * function that clears all buffers of the object
-        */
-        void clearAllBuffers();
-
-        /**
-        * resize retina color filter object (resize all allocated buffers)
-        * @param NBrows: the new height size
-        * @param NBcolumns: the new width size
-        */
-        void resize(const unsigned int NBrows, const unsigned int NBcolumns);
-
-
-        /**
-        * color multiplexing function: a demultiplexed RGB frame of size M*N*3 is transformed into a multiplexed M*N*1 pixels frame where each pixel is either Red, or Green or Blue
-        * @param inputRGBFrame: the input RGB frame to be processed
-        * @return, nothing but the multiplexed frame is available by the use of the getMultiplexedFrame() function
-        */
-        inline void runColorMultiplexing(const std::valarray<float> &inputRGBFrame){runColorMultiplexing(inputRGBFrame, *_multiplexedFrame);};
-
-        /**
-        * color multiplexing function: a demultipleed RGB frame of size M*N*3 is transformed into a multiplexed M*N*1 pixels frame where each pixel is either Red, or Green or Blue if using RGB images
-        * @param demultiplexedInputFrame: the demultiplexed input frame to be processed of size M*N*3
-        * @param multiplexedFrame: the resulting multiplexed frame
-        */
-        void runColorMultiplexing(const std::valarray<float> &demultiplexedInputFrame, std::valarray<float> &multiplexedFrame);
-
-        /**
-        * color demultiplexing function: a multiplexed frame of size M*N*1 pixels is transformed into a RGB demultiplexed M*N*3 pixels frame
-        * @param multiplexedColorFrame: the input multiplexed frame to be processed
-        * @param adaptiveFiltering: specifies if an adaptive filtering has to be perform rather than standard filtering (adaptive filtering allows a better rendering)
-        * @param maxInputValue: the maximum input data value (should be 255 for 8 bits images but it can change in the case of High Dynamic Range Images (HDRI)
-        * @return, nothing but the output demultiplexed frame is available by the use of the getDemultiplexedColorFrame() function, also use getLuminance() and getChrominance() in order to retreive either luminance or chrominance
-        */
-        void runColorDemultiplexing(const std::valarray<float> &multiplexedColorFrame, const bool adaptiveFiltering=false, const float maxInputValue=255.0);
-
-        /**
-        * activate color saturation as the final step of the color demultiplexing process
-        * -> this saturation is a sigmoide function applied to each channel of the demultiplexed image.
-        * @param saturateColors: boolean that activates color saturation (if true) or desactivate (if false)
-        * @param colorSaturationValue: the saturation factor
-        * */
-        void setColorSaturation(const bool saturateColors=true, const float colorSaturationValue=4.0){_saturateColors=saturateColors; _colorSaturationValue=colorSaturationValue;};
-
-        /**
-        * set parameters of the low pass spatio-temporal filter used to retreive the low chrominance
-        * @param beta: gain of the filter (generally set to zero)
-        * @param tau: time constant of the filter (unit is frame for video processing), typically 0 when considering static processing, 1 or more if a temporal smoothing effect is required
-        * @param k: spatial constant of the filter (unit is pixels), typical value is 2.5
-        */
-        void setChrominanceLPfilterParameters(const float beta, const float tau, const float k){setLPfilterParameters(beta, tau, k);};
-
-        /**
-        * apply to the retina color output the Krauskopf transformation which leads to an opponent color system: output colorspace if Acr1cr2 if input of the retina was LMS color space
-        * @param result: the input buffer to fill with the transformed colorspace retina output
-        * @return true if process ended successfully
-        */
-        bool applyKrauskopfLMS2Acr1cr2Transform(std::valarray<float> &result);
-
-        /**
-        * apply to the retina color output the CIE Lab color transformation
-        * @param result: the input buffer to fill with the transformed colorspace retina output
-        * @return true if process ended successfully
-        */
-        bool applyLMS2LabTransform(std::valarray<float> &result);
-
-        /**
-        * @return the multiplexed frame result (use this after function runColorMultiplexing)
-        */
-        inline const std::valarray<float> &getMultiplexedFrame() const {return *_multiplexedFrame;};
-
-        /**
-        * @return the demultiplexed frame result (use this after function runColorDemultiplexing)
-        */
-        inline const std::valarray<float> &getDemultiplexedColorFrame() const {return _demultiplexedColorFrame;};
-
-        /**
-        * @return the luminance of the processed frame (use this after function runColorDemultiplexing)
-        */
-        inline const std::valarray<float> &getLuminance() const {return *_luminance;};
-
-        /**
-        * @return the chrominance of the processed frame (use this after function runColorDemultiplexing)
-        */
-        inline const std::valarray<float> &getChrominance() const {return _chrominance;};
-
-        /**
-        * standard 0 to 255 image clipping function appled to RGB images (of size M*N*3 pixels)
-        * @param inputOutputBuffer: the image to be normalized (rewrites the input), if no parameter, then, the built in buffer reachable by getOutput() function is normalized
-        * @param maxOutputValue: the maximum value allowed at the output (values superior to it would be clipped
-        */
-        void clipRGBOutput_0_maxInputValue(float *inputOutputBuffer, const float maxOutputValue=255.0);
-
-        /**
-        * standard 0 to 255 image normalization function appled to RGB images (of size M*N*3 pixels)
-        * @param maxOutputValue: the maximum value allowed at the output (values superior to it would be clipped
-        */
-        void normalizeRGBOutput_0_maxOutputValue(const float maxOutputValue=255.0);
-
-        /**
-        * return the color sampling map: a Nrows*Mcolumns image in which each pixel value is the ofsset adress which gives the adress of the sampled pixel on an Nrows*Mcolumns*3 color image ordered by layers: layer1, layer2, layer3
-        */
-        inline const std::valarray<unsigned int> &getSamplingMap() const {return _colorSampling;};
-
-        /**
-        * function used (to bypass processing) to manually set the color output
-        * @param demultiplexedImage: the color image (luminance+chrominance) which has to be written in the object buffer
-        */
-        inline void setDemultiplexedColorFrame(const std::valarray<float> &demultiplexedImage){_demultiplexedColorFrame=demultiplexedImage;};
-
-    protected:
-
-        // private functions
-        int _samplingMethod;
-        bool _saturateColors;
-        float _colorSaturationValue;
-        // links to parent buffers (more convienient names
-        TemplateBuffer<float> *_luminance;
-        std::valarray<float> *_multiplexedFrame;
-        // instance buffers
-        std::valarray<unsigned int> _colorSampling; // table (size (_nbRows*_nbColumns) which specifies the color of each pixel
-        std::valarray<float> _RGBmosaic;
-        std::valarray<float> _tempMultiplexedFrame;
-        std::valarray<float> _demultiplexedTempBuffer;
-        std::valarray<float> _demultiplexedColorFrame;
-        std::valarray<float> _chrominance;
-        std::valarray<float> _colorLocalDensity;// buffer which contains the local density of the R, G and B photoreceptors for a normalization use
-        std::valarray<float> _imageGradient;
-
-        // variables
-        float _pR, _pG, _pB; // probabilities of color R, G and B
-        bool _objectInit;
-
-        // protected functions
-        void _initColorSampling();
-        void _interpolateImageDemultiplexedImage(float *inputOutputBuffer);
-        void _interpolateSingleChannelImage111(float *inputOutputBuffer);
-        void _interpolateBayerRGBchannels(float *inputOutputBuffer);
-        void _applyRIFfilter(const float *sourceBuffer, float *destinationBuffer);
-        void _getNormalizedContoursImage(const float *inputFrame, float *outputFrame);
-        // -> special adaptive filters dedicated to low pass filtering on the chrominance (skeeps filtering on the edges)
-        void _adaptiveSpatialLPfilter(const float *inputFrame,  float *outputFrame);
-        void _adaptiveHorizontalCausalFilter_addInput(const float *inputFrame, float *outputFrame, const unsigned int IDrowStart, const unsigned int IDrowEnd); // TBB parallelized
-        void _adaptiveVerticalAnticausalFilter_multGain(float *outputFrame, const unsigned int IDcolumnStart, const unsigned int IDcolumnEnd);
-        void _computeGradient(const float *luminance);
-        void _normalizeOutputs_0_maxOutputValue(void);
-
-        // color space transform
-        void _applyImageColorSpaceConversion(const std::valarray<float> &inputFrame, std::valarray<float> &outputFrame, const float *transformTable);
-
-#ifdef MAKE_PARALLEL
-        /******************************************************
-        ** IF some parallelizing thread methods are available, then, main loops are parallelized using these functors
-        ** ==> main idea paralellise main filters loops, then, only the most used methods are parallelized... TODO : increase the number of parallelised methods as necessary
-        ** ==> functors names = Parallel_$$$ where $$$= the name of the serial method that is parallelised
-        ** ==> functors constructors can differ from the parameters used with their related serial functions
-        */
-
-        /* Template :
-        class Parallel_ : public cv::ParallelLoopBody
-        {
-        private:
-
-        public:
-        Parallel_()
-        : {}
-
-        virtual void operator()( const cv::Range& r ) const {
-
-        }
-        }:
-        */
-        class Parallel_adaptiveHorizontalCausalFilter_addInput: public cv::ParallelLoopBody
-        {
-        private:
-            float *outputFrame;
-            const float *inputFrame, *imageGradient;
-            unsigned int nbColumns;
-        public:
-            Parallel_adaptiveHorizontalCausalFilter_addInput(const float *inputImg, float *bufferToProcess, const float *imageGrad, const unsigned int nbCols)
-                :outputFrame(bufferToProcess), inputFrame(inputImg), imageGradient(imageGrad), nbColumns(nbCols) {};
-
-            virtual void operator()( const Range& r ) const {
-                register float* outputPTR=outputFrame+r.start*nbColumns;
-                register const float* inputPTR=inputFrame+r.start*nbColumns;
-                register const float *imageGradientPTR= imageGradient+r.start*nbColumns;
-                for (int IDrow=r.start; IDrow!=r.end; ++IDrow)
-                {
-                    register float result=0;
-                    for (unsigned int index=0; index<nbColumns; ++index)
-                    {
-                        result = *(inputPTR++) + (*imageGradientPTR++)* result;
-                        *(outputPTR++) = result;
-                    }
-                }
-            }
-        };
-
-        class Parallel_adaptiveVerticalAnticausalFilter_multGain: public cv::ParallelLoopBody
-        {
-        private:
-            float *outputFrame;
-            const float *imageGradient;
-            unsigned int nbRows, nbColumns;
-            float filterParam_gain;
-        public:
-            Parallel_adaptiveVerticalAnticausalFilter_multGain(float *bufferToProcess, const float *imageGrad, const unsigned int nbRws, const unsigned int nbCols, const float  gain)
-                :outputFrame(bufferToProcess), imageGradient(imageGrad), nbRows(nbRws), nbColumns(nbCols), filterParam_gain(gain){}
-
-            virtual void operator()( const Range& r ) const {
-                float* offset=outputFrame+nbColumns*nbRows-nbColumns;
-                const float* gradOffset= imageGradient+nbColumns*nbRows-nbColumns;
-                for (int IDcolumn=r.start; IDcolumn!=r.end; ++IDcolumn)
-                {
-                    register float result=0;
-                    register float *outputPTR=offset+IDcolumn;
-                    register const float *imageGradientPTR=gradOffset+IDcolumn;
-                    for (unsigned int index=0; index<nbRows; ++index)
-                    {
-                        result = *(outputPTR) + *(imageGradientPTR) * result;
-                        *(outputPTR) = filterParam_gain*result;
-                        outputPTR-=nbColumns;
-                        imageGradientPTR-=nbColumns;
-                    }
-                }
-            }
-        };
-
-        class Parallel_computeGradient: public cv::ParallelLoopBody
-        {
-        private:
-            float *imageGradient;
-            const float *luminance;
-            unsigned int nbColumns, doubleNbColumns, nbRows, nbPixels;
-        public:
-            Parallel_computeGradient(const unsigned int nbCols, const unsigned int nbRws, const float *lum, float *imageGrad)
-            :imageGradient(imageGrad), luminance(lum), nbColumns(nbCols), doubleNbColumns(2*nbCols), nbRows(nbRws), nbPixels(nbRws*nbCols){};
-
-            virtual void operator()( const Range& r ) const {
-                for (int idLine=r.start;idLine!=r.end;++idLine)
-                {
-                    for (unsigned int idColumn=2;idColumn<nbColumns-2;++idColumn)
-                    {
-                        const unsigned int pixelIndex=idColumn+nbColumns*idLine;
-
-                        // horizontal and vertical local gradients
-                        const float verticalGrad=fabs(luminance[pixelIndex+nbColumns]-luminance[pixelIndex-nbColumns]);
-                        const float horizontalGrad=fabs(luminance[pixelIndex+1]-luminance[pixelIndex-1]);
-
-                        // neighborhood horizontal and vertical gradients
-                        const float verticalGrad_p=fabs(luminance[pixelIndex]-luminance[pixelIndex-doubleNbColumns]);
-                        const float horizontalGrad_p=fabs(luminance[pixelIndex]-luminance[pixelIndex-2]);
-                        const float verticalGrad_n=fabs(luminance[pixelIndex+doubleNbColumns]-luminance[pixelIndex]);
-                        const float horizontalGrad_n=fabs(luminance[pixelIndex+2]-luminance[pixelIndex]);
-
-                        const float horizontalGradient=0.5f*horizontalGrad+0.25f*(horizontalGrad_p+horizontalGrad_n);
-                        const float verticalGradient=0.5f*verticalGrad+0.25f*(verticalGrad_p+verticalGrad_n);
-
-                        // compare local gradient means and fill the appropriate filtering coefficient value that will be used in adaptative filters
-                        if (horizontalGradient<verticalGradient)
-                        {
-                            imageGradient[pixelIndex+nbPixels]=0.06f;
-                            imageGradient[pixelIndex]=0.57f;
-                        }
-                        else
-                        {
-                            imageGradient[pixelIndex+nbPixels]=0.57f;
-                            imageGradient[pixelIndex]=0.06f;
-                        }
-                    }
-                }
-            }
-        };
-
-#endif
-    };
-}// end of namespace bioinspired
-}// end of namespace cv
-
-#endif /*RETINACOLOR_HPP_*/
diff --git a/modules/bioinspired/src/retinafasttonemapping.cpp b/modules/bioinspired/src/retinafasttonemapping.cpp
deleted file mode 100644
index 2713d7449..000000000
--- a/modules/bioinspired/src/retinafasttonemapping.cpp
+++ /dev/null
@@ -1,316 +0,0 @@
-
-/*#******************************************************************************
- ** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
- **
- ** By downloading, copying, installing or using the software you agree to this license.
- ** If you do not agree to this license, do not download, install,
- ** copy or use the software.
- **
- **
- ** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
- **
- ** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
- **
- **  Creation - enhancement process 2007-2013
- **      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
- **
- ** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
- ** Refer to the following research paper for more information:
- ** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
- ** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
- ** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
- **
- **
- ** This class is based on image processing tools of the author and already used within the Retina class (this is the same code as method retina::applyFastToneMapping, but in an independent class, it is ligth from a memory requirement point of view). It implements an adaptation of the efficient tone mapping algorithm propose by David Alleyson, Sabine Susstruck and Laurence Meylan's work, please cite:
- ** -> Meylan L., Alleysson D., and Susstrunk S., A Model of Retinal Local Adaptation for the Tone Mapping of Color Filter Array Images, Journal of Optical Society of America, A, Vol. 24, N 9, September, 1st, 2007, pp. 2807-2816
- **
- **
- **                          License Agreement
- **               For Open Source Computer Vision Library
- **
- ** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
- ** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
- **
- **               For Human Visual System tools (bioinspired)
- ** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
- **
- ** Third party copyrights are property of their respective owners.
- **
- ** Redistribution and use in source and binary forms, with or without modification,
- ** are permitted provided that the following conditions are met:
- **
- ** * Redistributions of source code must retain the above copyright notice,
- **    this list of conditions and the following disclaimer.
- **
- ** * Redistributions in binary form must reproduce the above copyright notice,
- **    this list of conditions and the following disclaimer in the documentation
- **    and/or other materials provided with the distribution.
- **
- ** * The name of the copyright holders may not be used to endorse or promote products
- **    derived from this software without specific prior written permission.
- **
- ** This software is provided by the copyright holders and contributors "as is" and
- ** any express or implied warranties, including, but not limited to, the implied
- ** warranties of merchantability and fitness for a particular purpose are disclaimed.
- ** In no event shall the Intel Corporation or contributors be liable for any direct,
- ** indirect, incidental, special, exemplary, or consequential damages
- ** (including, but not limited to, procurement of substitute goods or services;
- ** loss of use, data, or profits; or business interruption) however caused
- ** and on any theory of liability, whether in contract, strict liability,
- ** or tort (including negligence or otherwise) arising in any way out of
- ** the use of this software, even if advised of the possibility of such damage.
- *******************************************************************************/
-
-/*
- * retinafasttonemapping.cpp
- *
- *  Created on: May 26, 2013
- *      Author: Alexandre Benoit
- */
-
-#include "precomp.hpp"
-#include "basicretinafilter.hpp"
-#include "retinacolor.hpp"
-#include <cstdio>
-#include <sstream>
-#include <valarray>
-
-namespace cv
-{
-namespace bioinspired
-{
-/**
- * @class RetinaFastToneMappingImpl a wrapper class which allows the tone mapping algorithm of Meylan&al(2007) to be used with OpenCV.
- * This algorithm is already implemented in thre Retina class (retina::applyFastToneMapping) but used it does not require all the retina model to be allocated. This allows a light memory use for low memory devices (smartphones, etc.
- * As a summary, these are the model properties:
- * => 2 stages of local luminance adaptation with a different local neighborhood for each.
- * => first stage models the retina photorecetors local luminance adaptation
- * => second stage models th ganglion cells local information adaptation
- * => compared to the initial publication, this class uses spatio-temporal low pass filters instead of spatial only filters.
- * ====> this can help noise robustness and temporal stability for video sequence use cases.
- * for more information, read to the following papers :
- *  Meylan L., Alleysson D., and Susstrunk S., A Model of Retinal Local Adaptation for the Tone Mapping of Color Filter Array Images, Journal of Optical Society of America, A, Vol. 24, N 9, September, 1st, 2007, pp. 2807-2816Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
- * regarding spatio-temporal filter and the bigger retina model :
- * Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
- */
-
-class RetinaFastToneMappingImpl : public RetinaFastToneMapping
-{
-public:
-    /**
-     * constructor
-     * @param imageInput: the size of the images to process
-     */
-    RetinaFastToneMappingImpl(Size imageInput)
-    {
-        unsigned int nbPixels=imageInput.height*imageInput.width;
-
-        // basic error check
-        if (nbPixels <= 0)
-        throw cv::Exception(-1, "Bad retina size setup : size height and with must be superior to zero", "RetinaImpl::setup", "retinafasttonemapping.cpp", 0);
-
-        // resize buffers
-        _inputBuffer.resize(nbPixels*3); // buffer supports gray images but also 3 channels color buffers... (larger is better...)
-        _imageOutput.resize(nbPixels*3);
-        _temp2.resize(nbPixels);
-        // allocate the main filter with 2 setup sets properties (one for each low pass filter
-        _multiuseFilter = makePtr<BasicRetinaFilter>(imageInput.height, imageInput.width, 2);
-        // allocate the color manager (multiplexer/demultiplexer
-        _colorEngine = makePtr<RetinaColor>(imageInput.height, imageInput.width);
-        // setup filter behaviors with default values
-        setup();
-    }
-
-    /**
-     * basic destructor
-     */
-    virtual ~RetinaFastToneMappingImpl(){};
-
-    /**
-     * method that applies a luminance correction (initially High Dynamic Range (HDR) tone mapping) using only the 2 local adaptation stages of the retina parvocellular channel : photoreceptors level and ganlion cells level. Spatio temporal filtering is applied but limited to temporal smoothing and eventually high frequencies attenuation. This is a lighter method than the one available using the regular retina::run method. It is then faster but it does not include complete temporal filtering nor retina spectral whitening. Then, it can have a more limited effect on images with a very high dynamic range. This is an adptation of the original still image HDR tone mapping algorithm of David Alleyson, Sabine Susstruck and Laurence Meylan's work, please cite:
-    * -> Meylan L., Alleysson D., and Susstrunk S., A Model of Retinal Local Adaptation for the Tone Mapping of Color Filter Array Images, Journal of Optical Society of America, A, Vol. 24, N 9, September, 1st, 2007, pp. 2807-2816
-     @param inputImage the input image to process RGB or gray levels
-     @param outputToneMappedImage the output tone mapped image
-     */
-    virtual void applyFastToneMapping(InputArray inputImage, OutputArray outputToneMappedImage)
-    {
-        // first convert input image to the compatible format :
-        const bool colorMode = _convertCvMat2ValarrayBuffer(inputImage.getMat(), _inputBuffer);
-
-        // process tone mapping
-        if (colorMode)
-        {
-            _runRGBToneMapping(_inputBuffer, _imageOutput, true);
-            _convertValarrayBuffer2cvMat(_imageOutput, _multiuseFilter->getNBrows(), _multiuseFilter->getNBcolumns(), true, outputToneMappedImage);
-        }else
-        {
-            _runGrayToneMapping(_inputBuffer, _imageOutput);
-            _convertValarrayBuffer2cvMat(_imageOutput, _multiuseFilter->getNBrows(), _multiuseFilter->getNBcolumns(), false, outputToneMappedImage);
-        }
-
-    }
-
-    /**
-     * setup method that updates tone mapping behaviors by adjusing the local luminance computation area
-     * @param photoreceptorsNeighborhoodRadius the first stage local adaptation area
-     * @param ganglioncellsNeighborhoodRadius the second stage local adaptation area
-     * @param meanLuminanceModulatorK the factor applied to modulate the meanLuminance information (default is 1, see reference paper)
-     */
-    virtual void setup(const float photoreceptorsNeighborhoodRadius=3.f, const float ganglioncellsNeighborhoodRadius=1.f, const float meanLuminanceModulatorK=1.f)
-    {
-        // setup the spatio-temporal properties of each filter
-        _meanLuminanceModulatorK = meanLuminanceModulatorK;
-        _multiuseFilter->setV0CompressionParameter(1.f, 255.f, 128.f);
-        _multiuseFilter->setLPfilterParameters(0.f, 0.f, photoreceptorsNeighborhoodRadius, 1);
-        _multiuseFilter->setLPfilterParameters(0.f, 0.f, ganglioncellsNeighborhoodRadius, 2);
-    }
-
-private:
-    // a filter able to perform local adaptation and low pass spatio-temporal filtering
-    cv::Ptr <BasicRetinaFilter> _multiuseFilter;
-    cv::Ptr <RetinaColor> _colorEngine;
-
-    //!< buffer used to convert input cv::Mat to internal retina buffers format (valarrays)
-    std::valarray<float> _inputBuffer;
-    std::valarray<float> _imageOutput;
-    std::valarray<float> _temp2;
-    float _meanLuminanceModulatorK;
-
-
-void _convertValarrayBuffer2cvMat(const std::valarray<float> &grayMatrixToConvert, const unsigned int nbRows, const unsigned int nbColumns, const bool colorMode, OutputArray outBuffer)
-{
-    // fill output buffer with the valarray buffer
-    const float *valarrayPTR=get_data(grayMatrixToConvert);
-    if (!colorMode)
-    {
-        outBuffer.create(cv::Size(nbColumns, nbRows), CV_8U);
-        Mat outMat = outBuffer.getMat();
-        for (unsigned int i=0;i<nbRows;++i)
-        {
-            for (unsigned int j=0;j<nbColumns;++j)
-            {
-                cv::Point2d pixel(j,i);
-                outMat.at<unsigned char>(pixel)=(unsigned char)*(valarrayPTR++);
-            }
-        }
-    }else
-    {
-        const unsigned int nbPixels=nbColumns*nbRows;
-        const unsigned int doubleNBpixels=nbColumns*nbRows*2;
-        outBuffer.create(cv::Size(nbColumns, nbRows), CV_8UC3);
-        Mat outMat = outBuffer.getMat();
-        for (unsigned int i=0;i<nbRows;++i)
-        {
-            for (unsigned int j=0;j<nbColumns;++j,++valarrayPTR)
-            {
-                cv::Point2d pixel(j,i);
-                cv::Vec3b pixelValues;
-                pixelValues[2]=(unsigned char)*(valarrayPTR);
-                pixelValues[1]=(unsigned char)*(valarrayPTR+nbPixels);
-                pixelValues[0]=(unsigned char)*(valarrayPTR+doubleNBpixels);
-
-                outMat.at<cv::Vec3b>(pixel)=pixelValues;
-            }
-        }
-    }
-}
-
-bool _convertCvMat2ValarrayBuffer(InputArray inputMat, std::valarray<float> &outputValarrayMatrix)
-{
-    const Mat inputMatToConvert=inputMat.getMat();
-    // first check input consistency
-    if (inputMatToConvert.empty())
-        throw cv::Exception(-1, "RetinaImpl cannot be applied, input buffer is empty", "RetinaImpl::run", "RetinaImpl.h", 0);
-
-    // retreive color mode from image input
-    int imageNumberOfChannels = inputMatToConvert.channels();
-
-        // convert to float AND fill the valarray buffer
-    typedef float T; // define here the target pixel format, here, float
-    const int dsttype = DataType<T>::depth; // output buffer is float format
-
-    const unsigned int nbPixels=inputMat.getMat().rows*inputMat.getMat().cols;
-    const unsigned int doubleNBpixels=inputMat.getMat().rows*inputMat.getMat().cols*2;
-
-    if(imageNumberOfChannels==4)
-    {
-    // create a cv::Mat table (for RGBA planes)
-        cv::Mat planes[4] =
-        {
-            cv::Mat(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[doubleNBpixels]),
-            cv::Mat(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[nbPixels]),
-            cv::Mat(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[0])
-        };
-        planes[3] = cv::Mat(inputMatToConvert.size(), dsttype);     // last channel (alpha) does not point on the valarray (not usefull in our case)
-        // split color cv::Mat in 4 planes... it fills valarray directely
-        cv::split(Mat_<Vec<T, 4> >(inputMatToConvert), planes);
-    }
-    else if (imageNumberOfChannels==3)
-    {
-        // create a cv::Mat table (for RGB planes)
-        cv::Mat planes[] =
-        {
-        cv::Mat(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[doubleNBpixels]),
-        cv::Mat(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[nbPixels]),
-        cv::Mat(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[0])
-        };
-        // split color cv::Mat in 3 planes... it fills valarray directely
-        cv::split(cv::Mat_<Vec<T, 3> >(inputMatToConvert), planes);
-    }
-    else if(imageNumberOfChannels==1)
-    {
-        // create a cv::Mat header for the valarray
-        cv::Mat dst(inputMatToConvert.size(), dsttype, &outputValarrayMatrix[0]);
-        inputMatToConvert.convertTo(dst, dsttype);
-    }
-        else
-            CV_Error(Error::StsUnsupportedFormat, "input image must be single channel (gray levels), bgr format (color) or bgra (color with transparency which won't be considered");
-
-    return imageNumberOfChannels>1; // return bool : false for gray level image processing, true for color mode
-}
-
-
-    // run the initilized retina filter in order to perform gray image tone mapping, after this call all retina outputs are updated
-    void _runGrayToneMapping(const std::valarray<float> &grayImageInput, std::valarray<float> &grayImageOutput)
-    {
-         // apply tone mapping on the multiplexed image
-        // -> photoreceptors local adaptation (large area adaptation)
-        _multiuseFilter->runFilter_LPfilter(grayImageInput, grayImageOutput, 0); // compute low pass filtering modeling the horizontal cells filtering to acess local luminance
-        _multiuseFilter->setV0CompressionParameterToneMapping(1.f, grayImageOutput.max(), _meanLuminanceModulatorK*grayImageOutput.sum()/(float)_multiuseFilter->getNBpixels());
-        _multiuseFilter->runFilter_LocalAdapdation(grayImageInput, grayImageOutput, _temp2); // adapt contrast to local luminance
-
-        // -> ganglion cells local adaptation (short area adaptation)
-        _multiuseFilter->runFilter_LPfilter(_temp2, grayImageOutput, 1); // compute low pass filtering (high cut frequency (remove spatio-temporal noise)
-        _multiuseFilter->setV0CompressionParameterToneMapping(1.f, _temp2.max(), _meanLuminanceModulatorK*grayImageOutput.sum()/(float)_multiuseFilter->getNBpixels());
-        _multiuseFilter->runFilter_LocalAdapdation(_temp2, grayImageOutput, grayImageOutput); // adapt contrast to local luminance
-
-    }
-
- // run the initilized retina filter in order to perform color tone mapping, after this call all retina outputs are updated
-    void _runRGBToneMapping(const std::valarray<float> &RGBimageInput, std::valarray<float> &RGBimageOutput, const bool useAdaptiveFiltering)
-    {
-        // multiplex the image with the color sampling method specified in the constructor
-        _colorEngine->runColorMultiplexing(RGBimageInput);
-
-        // apply tone mapping on the multiplexed image
-        _runGrayToneMapping(_colorEngine->getMultiplexedFrame(), RGBimageOutput);
-
-        // demultiplex tone maped image
-        _colorEngine->runColorDemultiplexing(RGBimageOutput, useAdaptiveFiltering, _multiuseFilter->getMaxInputValue());//_ColorEngine->getMultiplexedFrame());//_ParvoRetinaFilter->getPhotoreceptorsLPfilteringOutput());
-
-        // rescaling result between 0 and 255
-        _colorEngine->normalizeRGBOutput_0_maxOutputValue(255.0);
-
-        // return the result
-        RGBimageOutput=_colorEngine->getDemultiplexedColorFrame();
-    }
-
-};
-
-CV_EXPORTS Ptr<RetinaFastToneMapping> createRetinaFastToneMapping(Size inputSize)
-{
-    return makePtr<RetinaFastToneMappingImpl>(inputSize);
-}
-
-}// end of namespace bioinspired
-}// end of namespace cv
diff --git a/modules/bioinspired/src/retinafilter.cpp b/modules/bioinspired/src/retinafilter.cpp
deleted file mode 100644
index e1e24c89b..000000000
--- a/modules/bioinspired/src/retinafilter.cpp
+++ /dev/null
@@ -1,526 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-#include "precomp.hpp"
-
-#include "retinafilter.hpp"
-
-// @author Alexandre BENOIT, benoit.alexandre.vision@gmail.com, LISTIC : www.listic.univ-savoie.fr, Gipsa-Lab, France: www.gipsa-lab.inpg.fr/
-
-#include <iostream>
-#include <cmath>
-
-namespace cv
-{
-namespace bioinspired
-{
-    // standard constructor without any log sampling of the input frame
-    RetinaFilter::RetinaFilter(const unsigned int sizeRows, const unsigned int sizeColumns, const bool colorMode, const int samplingMethod, const bool useRetinaLogSampling, const double reductionFactor, const double samplingStrenght)
-        :
-    _retinaParvoMagnoMappedFrame(0),
-        _retinaParvoMagnoMapCoefTable(0),
-        _photoreceptorsPrefilter((1-(int)useRetinaLogSampling)*sizeRows+useRetinaLogSampling*ImageLogPolProjection::predictOutputSize(sizeRows, reductionFactor), (1-(int)useRetinaLogSampling)*sizeColumns+useRetinaLogSampling*ImageLogPolProjection::predictOutputSize(sizeColumns, reductionFactor), 4),
-        _ParvoRetinaFilter((1-(int)useRetinaLogSampling)*sizeRows+useRetinaLogSampling*ImageLogPolProjection::predictOutputSize(sizeRows, reductionFactor), (1-(int)useRetinaLogSampling)*sizeColumns+useRetinaLogSampling*ImageLogPolProjection::predictOutputSize(sizeColumns, reductionFactor)),
-        _MagnoRetinaFilter((1-(int)useRetinaLogSampling)*sizeRows+useRetinaLogSampling*ImageLogPolProjection::predictOutputSize(sizeRows, reductionFactor), (1-(int)useRetinaLogSampling)*sizeColumns+useRetinaLogSampling*ImageLogPolProjection::predictOutputSize(sizeColumns, reductionFactor)),
-        _colorEngine((1-(int)useRetinaLogSampling)*sizeRows+useRetinaLogSampling*ImageLogPolProjection::predictOutputSize(sizeRows, reductionFactor), (1-(int)useRetinaLogSampling)*sizeColumns+useRetinaLogSampling*ImageLogPolProjection::predictOutputSize(sizeColumns, reductionFactor), samplingMethod),
-        // configure retina photoreceptors log sampling... if necessary
-        _photoreceptorsLogSampling(NULL)
-    {
-
-#ifdef RETINADEBUG
-        std::cout<<"RetinaFilter::size( "<<_photoreceptorsPrefilter.getNBrows()<<", "<<_photoreceptorsPrefilter.getNBcolumns()<<")"<<" =? "<<_photoreceptorsPrefilter.getNBpixels()<<std::endl;
-#endif
-        if (useRetinaLogSampling)
-        {
-            _photoreceptorsLogSampling = new ImageLogPolProjection(sizeRows, sizeColumns, ImageLogPolProjection::RETINALOGPROJECTION, true);
-            if (!_photoreceptorsLogSampling->initProjection(reductionFactor, samplingStrenght))
-            {
-                std::cerr<<"RetinaFilter::Problem initializing photoreceptors log sampling, could not setup retina filter"<<std::endl;
-                delete _photoreceptorsLogSampling;
-                _photoreceptorsLogSampling=NULL;
-            }
-            else
-            {
-#ifdef RETINADEBUG
-                std::cout<<"_photoreceptorsLogSampling::size( "<<_photoreceptorsLogSampling->getNBrows()<<", "<<_photoreceptorsLogSampling->getNBcolumns()<<")"<<" =? "<<_photoreceptorsLogSampling->getNBpixels()<<std::endl;
-#endif
-            }
-        }
-
-        // set default processing activities
-        _useParvoOutput=true;
-        _useMagnoOutput=true;
-
-        _useColorMode=colorMode;
-
-        // create hybrid output and related coefficient table
-        _createHybridTable();
-
-        // set default parameters
-        setGlobalParameters();
-
-        // stability controls values init
-        _setInitPeriodCount();
-        _globalTemporalConstant=25;
-
-        // reset all buffers
-        clearAllBuffers();
-
-
-        //  std::cout<<"RetinaFilter::size( "<<this->getNBrows()<<", "<<this->getNBcolumns()<<")"<<_filterOutput.size()<<" =? "<<_filterOutput.getNBpixels()<<std::endl;
-
-    }
-
-    // destructor
-    RetinaFilter::~RetinaFilter()
-    {
-        if (_photoreceptorsLogSampling!=NULL)
-            delete _photoreceptorsLogSampling;
-    }
-
-    // function that clears all buffers of the object
-    void RetinaFilter::clearAllBuffers()
-    {
-        _photoreceptorsPrefilter.clearAllBuffers();
-        _ParvoRetinaFilter.clearAllBuffers();
-        _MagnoRetinaFilter.clearAllBuffers();
-        _colorEngine.clearAllBuffers();
-        if (_photoreceptorsLogSampling!=NULL)
-            _photoreceptorsLogSampling->clearAllBuffers();
-        // stability controls value init
-        _setInitPeriodCount();
-    }
-
-    /**
-    * resize retina filter object (resize all allocated buffers
-    * @param NBrows: the new height size
-    * @param NBcolumns: the new width size
-    */
-    void RetinaFilter::resize(const unsigned int NBrows, const unsigned int NBcolumns)
-    {
-        unsigned int rows=NBrows, cols=NBcolumns;
-
-        // resize optionnal member and adjust other modules size if required
-        if (_photoreceptorsLogSampling)
-        {
-            _photoreceptorsLogSampling->resize(NBrows, NBcolumns);
-            rows=_photoreceptorsLogSampling->getOutputNBrows();
-            cols=_photoreceptorsLogSampling->getOutputNBcolumns();
-        }
-
-        _photoreceptorsPrefilter.resize(rows, cols);
-        _ParvoRetinaFilter.resize(rows, cols);
-        _MagnoRetinaFilter.resize(rows, cols);
-        _colorEngine.resize(rows, cols);
-
-        // reset parvo magno mapping
-        _createHybridTable();
-
-        // clean buffers
-        clearAllBuffers();
-
-    }
-
-    // stability controls value init
-    void RetinaFilter::_setInitPeriodCount()
-    {
-
-        // find out the maximum temporal constant value and apply a security factor
-        // false value (obviously too long) but appropriate for simple use
-        _globalTemporalConstant=(unsigned int)(_ParvoRetinaFilter.getPhotoreceptorsTemporalConstant()+_ParvoRetinaFilter.getHcellsTemporalConstant()+_MagnoRetinaFilter.getTemporalConstant());
-        // reset frame counter
-        _ellapsedFramesSinceLastReset=0;
-    }
-
-    void RetinaFilter::_createHybridTable()
-    {
-        // create hybrid output and related coefficient table
-        _retinaParvoMagnoMappedFrame.resize(_photoreceptorsPrefilter.getNBpixels());
-
-        _retinaParvoMagnoMapCoefTable.resize(_photoreceptorsPrefilter.getNBpixels()*2);
-
-        // fill _hybridParvoMagnoCoefTable
-        int i, j, halfRows=_photoreceptorsPrefilter.getNBrows()/2, halfColumns=_photoreceptorsPrefilter.getNBcolumns()/2;
-        float *hybridParvoMagnoCoefTablePTR= &_retinaParvoMagnoMapCoefTable[0];
-        float minDistance=MIN(halfRows, halfColumns)*0.7f;
-        for (i=0;i<(int)_photoreceptorsPrefilter.getNBrows();++i)
-        {
-            for (j=0;j<(int)_photoreceptorsPrefilter.getNBcolumns();++j)
-            {
-                float distanceToCenter=std::sqrt(((float)(i-halfRows)*(i-halfRows)+(j-halfColumns)*(j-halfColumns)));
-                if (distanceToCenter<minDistance)
-                {
-                    float a=*(hybridParvoMagnoCoefTablePTR++)=0.5f+0.5f*(float)cos(CV_PI*distanceToCenter/minDistance);
-                    *(hybridParvoMagnoCoefTablePTR++)=1.f-a;
-                }else
-                {
-                    *(hybridParvoMagnoCoefTablePTR++)=0.f;
-                    *(hybridParvoMagnoCoefTablePTR++)=1.f;
-                }
-            }
-        }
-    }
-
-    // setup parameters function and global data filling
-    void RetinaFilter::setGlobalParameters(const float OPLspatialResponse1, const float OPLtemporalresponse1, const float OPLassymetryGain, const float OPLspatialResponse2, const float OPLtemporalresponse2, const float LPfilterSpatialResponse, const float LPfilterGain, const float LPfilterTemporalresponse, const float MovingContoursExtractorCoefficient, const bool normalizeParvoOutput_0_maxOutputValue, const bool normalizeMagnoOutput_0_maxOutputValue, const float maxOutputValue, const float maxInputValue, const float meanValue)
-    {
-        _normalizeParvoOutput_0_maxOutputValue=normalizeParvoOutput_0_maxOutputValue;
-        _normalizeMagnoOutput_0_maxOutputValue=normalizeMagnoOutput_0_maxOutputValue;
-        _maxOutputValue=maxOutputValue;
-        _photoreceptorsPrefilter.setV0CompressionParameter(0.9f, maxInputValue, meanValue);
-        _photoreceptorsPrefilter.setLPfilterParameters(10, 0, 1.5, 1); // keeps low pass filter with high cut frequency in memory (usefull for the tone mapping function)
-        _photoreceptorsPrefilter.setLPfilterParameters(10, 0, 3.0, 2); // keeps low pass filter with low cut frequency in memory (usefull for the tone mapping function)
-        _photoreceptorsPrefilter.setLPfilterParameters(0, 0, 10, 3); // keeps low pass filter with low cut frequency in memory (usefull for the tone mapping function)
-        //this->setV0CompressionParameter(0.6, maxInputValue, meanValue); // keeps log compression sensitivity parameter (usefull for the tone mapping function)
-        _ParvoRetinaFilter.setOPLandParvoFiltersParameters(0,OPLtemporalresponse1, OPLspatialResponse1, OPLassymetryGain, OPLtemporalresponse2, OPLspatialResponse2);
-        _ParvoRetinaFilter.setV0CompressionParameter(0.9f, maxInputValue, meanValue);
-        _MagnoRetinaFilter.setCoefficientsTable(LPfilterGain, LPfilterTemporalresponse, LPfilterSpatialResponse, MovingContoursExtractorCoefficient, 0, 2.0f*LPfilterSpatialResponse);
-        _MagnoRetinaFilter.setV0CompressionParameter(0.7f, maxInputValue, meanValue);
-
-        // stability controls value init
-        _setInitPeriodCount();
-    }
-
-    bool RetinaFilter::checkInput(const std::valarray<float> &input, const bool)
-    {
-
-        BasicRetinaFilter *inputTarget=&_photoreceptorsPrefilter;
-        if (_photoreceptorsLogSampling)
-            inputTarget=_photoreceptorsLogSampling;
-
-        bool test=input.size()==inputTarget->getNBpixels() || input.size()==(inputTarget->getNBpixels()*3) ;
-        if (!test)
-        {
-            std::cerr<<"RetinaFilter::checkInput: input buffer does not match retina buffer size, conversion aborted"<<std::endl;
-            std::cout<<"RetinaFilter::checkInput: input size="<<input.size()<<" / "<<"retina size="<<inputTarget->getNBpixels()<<std::endl;
-            return false;
-        }
-
-        return true;
-    }
-
-    // main function that runs the filter for a given input frame
-    bool RetinaFilter::runFilter(const std::valarray<float> &imageInput, const bool useAdaptiveFiltering, const bool processRetinaParvoMagnoMapping, const bool useColorMode, const bool inputIsColorMultiplexed)
-    {
-        // preliminary check
-        bool processSuccess=true;
-        if (!checkInput(imageInput, useColorMode))
-            return false;
-
-        // run the color multiplexing if needed and compute each suub filter of the retina:
-        // -> local adaptation
-        // -> contours OPL extraction
-        // -> moving contours extraction
-
-        // stability controls value update
-        ++_ellapsedFramesSinceLastReset;
-
-        _useColorMode=useColorMode;
-
-        /* pointer to the appropriate input data after,
-        * by default, if graylevel mode, the input is processed,
-        * if color or something else must be considered, specific preprocessing are applied
-        */
-
-        const std::valarray<float> *selectedPhotoreceptorsLocalAdaptationInput= &imageInput;
-        const std::valarray<float> *selectedPhotoreceptorsColorInput=&imageInput;
-
-        //********** Following is input data specific photoreceptors processing
-        if (_photoreceptorsLogSampling)
-        {
-            _photoreceptorsLogSampling->runProjection(imageInput, useColorMode);
-            selectedPhotoreceptorsColorInput=selectedPhotoreceptorsLocalAdaptationInput=&(_photoreceptorsLogSampling->getSampledFrame());
-        }
-
-        if (useColorMode&& (!inputIsColorMultiplexed)) // not multiplexed color input case
-        {
-            _colorEngine.runColorMultiplexing(*selectedPhotoreceptorsColorInput);
-            selectedPhotoreceptorsLocalAdaptationInput=&(_colorEngine.getMultiplexedFrame());
-        }
-
-        //********** Following is generic Retina processing
-
-        // photoreceptors local adaptation
-        _photoreceptorsPrefilter.runFilter_LocalAdapdation(*selectedPhotoreceptorsLocalAdaptationInput, _ParvoRetinaFilter.getHorizontalCellsOutput());
-        // safety pixel values checks
-        //_photoreceptorsPrefilter.normalizeGrayOutput_0_maxOutputValue(_maxOutputValue);
-
-        // run parvo filter
-        _ParvoRetinaFilter.runFilter(_photoreceptorsPrefilter.getOutput(), _useParvoOutput);
-
-        if (_useParvoOutput)
-        {
-            _ParvoRetinaFilter.normalizeGrayOutputCentredSigmoide(); // models the saturation of the cells, usefull for visualisation of the ON-OFF Parvo Output, Bipolar cells outputs do not change !!!
-            _ParvoRetinaFilter.centerReductImageLuminance(); // best for further spectrum analysis
-
-            if (_normalizeParvoOutput_0_maxOutputValue)
-                _ParvoRetinaFilter.normalizeGrayOutput_0_maxOutputValue(_maxOutputValue);
-        }
-
-        if (_useParvoOutput&&_useMagnoOutput)
-        {
-            _MagnoRetinaFilter.runFilter(_ParvoRetinaFilter.getBipolarCellsON(), _ParvoRetinaFilter.getBipolarCellsOFF());
-            if (_normalizeMagnoOutput_0_maxOutputValue)
-            {
-                _MagnoRetinaFilter.normalizeGrayOutput_0_maxOutputValue(_maxOutputValue);
-            }
-            _MagnoRetinaFilter.normalizeGrayOutputNearZeroCentreredSigmoide();
-        }
-
-        if (_useParvoOutput&&_useMagnoOutput&&processRetinaParvoMagnoMapping)
-        {
-            _processRetinaParvoMagnoMapping();
-            if (_useColorMode)
-                _colorEngine.runColorDemultiplexing(_retinaParvoMagnoMappedFrame, useAdaptiveFiltering, _maxOutputValue);//_ColorEngine->getMultiplexedFrame());//_ParvoRetinaFilter->getPhotoreceptorsLPfilteringOutput());
-
-            return processSuccess;
-        }
-
-        if (_useParvoOutput&&_useColorMode)
-        {
-            _colorEngine.runColorDemultiplexing(_ParvoRetinaFilter.getOutput(), useAdaptiveFiltering, _maxOutputValue);//_ColorEngine->getMultiplexedFrame());//_ParvoRetinaFilter->getPhotoreceptorsLPfilteringOutput());
-            // compute A Cr1 Cr2 to LMS color space conversion
-            //if (true)
-            //  _applyImageColorSpaceConversion(_ColorEngine->getChrominance(), lmsTempBuffer.Buffer(), _LMStoACr1Cr2);
-        }
-
-        return processSuccess;
-    }
-
-    const std::valarray<float> &RetinaFilter::getContours()
-    {
-        if (_useColorMode)
-            return _colorEngine.getLuminance();
-        else
-            return _ParvoRetinaFilter.getOutput();
-    }
-
-    // run the initilized retina filter in order to perform gray image tone mapping, after this call all retina outputs are updated
-    void RetinaFilter::runGrayToneMapping(const std::valarray<float> &grayImageInput, std::valarray<float> &grayImageOutput, const float PhotoreceptorsCompression, const float ganglionCellsCompression)
-    {
-        // preliminary check
-        if (!checkInput(grayImageInput, false))
-            return;
-
-        this->_runGrayToneMapping(grayImageInput, grayImageOutput, PhotoreceptorsCompression, ganglionCellsCompression);
-    }
-
-    // run the initilized retina filter in order to perform gray image tone mapping, after this call all retina outputs are updated
-    void RetinaFilter::_runGrayToneMapping(const std::valarray<float> &grayImageInput, std::valarray<float> &grayImageOutput, const float PhotoreceptorsCompression, const float ganglionCellsCompression)
-    {
-        // stability controls value update
-        ++_ellapsedFramesSinceLastReset;
-
-        std::valarray<float> temp2(grayImageInput.size());
-
-        // apply tone mapping on the multiplexed image
-        // -> photoreceptors local adaptation (large area adaptation)
-        _photoreceptorsPrefilter.runFilter_LPfilter(grayImageInput, grayImageOutput, 2); // compute low pass filtering modeling the horizontal cells filtering to acess local luminance
-        _photoreceptorsPrefilter.setV0CompressionParameterToneMapping(1.f-PhotoreceptorsCompression, grayImageOutput.max(), 1.f*grayImageOutput.sum()/(float)_photoreceptorsPrefilter.getNBpixels());
-        _photoreceptorsPrefilter.runFilter_LocalAdapdation(grayImageInput, grayImageOutput, temp2); // adapt contrast to local luminance
-
-        // -> ganglion cells local adaptation (short area adaptation)
-        _photoreceptorsPrefilter.runFilter_LPfilter(temp2, grayImageOutput, 1); // compute low pass filtering (high cut frequency (remove spatio-temporal noise)
-        _photoreceptorsPrefilter.setV0CompressionParameterToneMapping(1.f-ganglionCellsCompression, temp2.max(), 1.f*temp2.sum()/(float)_photoreceptorsPrefilter.getNBpixels());
-        _photoreceptorsPrefilter.runFilter_LocalAdapdation(temp2, grayImageOutput, grayImageOutput); // adapt contrast to local luminance
-    }
-
-    // run the initilized retina filter in order to perform color tone mapping, after this call all retina outputs are updated
-    void RetinaFilter::runRGBToneMapping(const std::valarray<float> &RGBimageInput, std::valarray<float> &RGBimageOutput, const bool useAdaptiveFiltering, const float PhotoreceptorsCompression, const float ganglionCellsCompression)
-    {
-        // preliminary check
-        if (!checkInput(RGBimageInput, true))
-            return;
-
-        // multiplex the image with the color sampling method specified in the constructor
-        _colorEngine.runColorMultiplexing(RGBimageInput);
-
-        // apply tone mapping on the multiplexed image
-        _runGrayToneMapping(_colorEngine.getMultiplexedFrame(), RGBimageOutput, PhotoreceptorsCompression, ganglionCellsCompression);
-
-        // demultiplex tone maped image
-        _colorEngine.runColorDemultiplexing(RGBimageOutput, useAdaptiveFiltering, _photoreceptorsPrefilter.getMaxInputValue());//_ColorEngine->getMultiplexedFrame());//_ParvoRetinaFilter->getPhotoreceptorsLPfilteringOutput());
-
-        // rescaling result between 0 and 255
-        _colorEngine.normalizeRGBOutput_0_maxOutputValue(255.0);
-
-        // return the result
-        RGBimageOutput=_colorEngine.getDemultiplexedColorFrame();
-    }
-
-    void RetinaFilter::runLMSToneMapping(const std::valarray<float> &, std::valarray<float> &, const bool, const float, const float)
-    {
-        std::cerr<<"not working, sorry"<<std::endl;
-
-        /*  // preliminary check
-        const std::valarray<float> &bufferInput=checkInput(LMSimageInput, true);
-        if (!bufferInput)
-        return NULL;
-
-        if (!_useColorMode)
-        std::cerr<<"RetinaFilter::Can not call tone mapping oeration if the retina filter was created for gray scale images"<<std::endl;
-
-        // create a temporary buffer of size nrows, Mcolumns, 3 layers
-        std::valarray<float> lmsTempBuffer(LMSimageInput);
-        std::cout<<"RetinaFilter::--->min LMS value="<<lmsTempBuffer.min()<<std::endl;
-
-        // setup local adaptation parameter at the photoreceptors level
-        setV0CompressionParameter(PhotoreceptorsCompression, _maxInputValue);
-        // get the local energy of each color channel
-        // ->L
-        _spatiotemporalLPfilter(LMSimageInput, _filterOutput, 1);
-        setV0CompressionParameterToneMapping(PhotoreceptorsCompression, _maxInputValue, this->sum()/_NBpixels);
-        _localLuminanceAdaptation(LMSimageInput, _filterOutput, lmsTempBuffer.Buffer());
-        // ->M
-        _spatiotemporalLPfilter(LMSimageInput+_NBpixels, _filterOutput, 1);
-        setV0CompressionParameterToneMapping(PhotoreceptorsCompression, _maxInputValue, this->sum()/_NBpixels);
-        _localLuminanceAdaptation(LMSimageInput+_NBpixels, _filterOutput, lmsTempBuffer.Buffer()+_NBpixels);
-        // ->S
-        _spatiotemporalLPfilter(LMSimageInput+_NBpixels*2, _filterOutput, 1);
-        setV0CompressionParameterToneMapping(PhotoreceptorsCompression, _maxInputValue, this->sum()/_NBpixels);
-        _localLuminanceAdaptation(LMSimageInput+_NBpixels*2, _filterOutput, lmsTempBuffer.Buffer()+_NBpixels*2);
-
-        // eliminate negative values
-        for (unsigned int i=0;i<lmsTempBuffer.size();++i)
-        if (lmsTempBuffer.Buffer()[i]<0)
-        lmsTempBuffer.Buffer()[i]=0;
-        std::cout<<"RetinaFilter::->min LMS value="<<lmsTempBuffer.min()<<std::endl;
-
-        // compute LMS to A Cr1 Cr2 color space conversion
-        _applyImageColorSpaceConversion(lmsTempBuffer.Buffer(), lmsTempBuffer.Buffer(), _LMStoACr1Cr2);
-
-        TemplateBuffer <float> acr1cr2TempBuffer(_NBrows, _NBcolumns, 3);
-        memcpy(acr1cr2TempBuffer.Buffer(), lmsTempBuffer.Buffer(), sizeof(float)*_NBpixels*3);
-
-        // compute A Cr1 Cr2 to LMS color space conversion
-        _applyImageColorSpaceConversion(acr1cr2TempBuffer.Buffer(), lmsTempBuffer.Buffer(), _ACr1Cr2toLMS);
-
-        // eliminate negative values
-        for (unsigned int i=0;i<lmsTempBuffer.size();++i)
-        if (lmsTempBuffer.Buffer()[i]<0)
-        lmsTempBuffer.Buffer()[i]=0;
-
-        // rewrite output to the appropriate buffer
-        _colorEngine->setDemultiplexedColorFrame(lmsTempBuffer.Buffer());
-        */
-    }
-
-    // return image with center Parvo and peripheral Magno channels
-    void RetinaFilter::_processRetinaParvoMagnoMapping()
-    {
-        register float *hybridParvoMagnoPTR= &_retinaParvoMagnoMappedFrame[0];
-        register const float *parvoOutputPTR= get_data(_ParvoRetinaFilter.getOutput());
-        register const float *magnoXOutputPTR= get_data(_MagnoRetinaFilter.getOutput());
-        register float *hybridParvoMagnoCoefTablePTR= &_retinaParvoMagnoMapCoefTable[0];
-
-        for (unsigned int i=0 ; i<_photoreceptorsPrefilter.getNBpixels() ; ++i, hybridParvoMagnoCoefTablePTR+=2)
-        {
-            float hybridValue=*(parvoOutputPTR++)**(hybridParvoMagnoCoefTablePTR)+*(magnoXOutputPTR++)**(hybridParvoMagnoCoefTablePTR+1);
-            *(hybridParvoMagnoPTR++)=hybridValue;
-        }
-
-        TemplateBuffer<float>::normalizeGrayOutput_0_maxOutputValue(&_retinaParvoMagnoMappedFrame[0], _photoreceptorsPrefilter.getNBpixels());
-
-    }
-
-    bool RetinaFilter::getParvoFoveaResponse(std::valarray<float> &parvoFovealResponse)
-    {
-        if (!_useParvoOutput)
-            return false;
-        if (parvoFovealResponse.size() != _ParvoRetinaFilter.getNBpixels())
-            return false;
-
-        register const float *parvoOutputPTR= get_data(_ParvoRetinaFilter.getOutput());
-        register float *fovealParvoResponsePTR= &parvoFovealResponse[0];
-        register float *hybridParvoMagnoCoefTablePTR= &_retinaParvoMagnoMapCoefTable[0];
-
-        for (unsigned int i=0 ; i<_photoreceptorsPrefilter.getNBpixels() ; ++i, hybridParvoMagnoCoefTablePTR+=2)
-        {
-            *(fovealParvoResponsePTR++)=*(parvoOutputPTR++)**(hybridParvoMagnoCoefTablePTR);
-        }
-
-        return true;
-    }
-
-    // method to retrieve the parafoveal magnocellular pathway response (no energy motion in fovea)
-    bool RetinaFilter::getMagnoParaFoveaResponse(std::valarray<float> &magnoParafovealResponse)
-    {
-        if (!_useMagnoOutput)
-            return false;
-        if (magnoParafovealResponse.size() != _MagnoRetinaFilter.getNBpixels())
-            return false;
-
-        register const float *magnoXOutputPTR= get_data(_MagnoRetinaFilter.getOutput());
-        register float *parafovealMagnoResponsePTR=&magnoParafovealResponse[0];
-        register float *hybridParvoMagnoCoefTablePTR=&_retinaParvoMagnoMapCoefTable[0]+1;
-
-        for (unsigned int i=0 ; i<_photoreceptorsPrefilter.getNBpixels() ; ++i, hybridParvoMagnoCoefTablePTR+=2)
-        {
-            *(parafovealMagnoResponsePTR++)=*(magnoXOutputPTR++)**(hybridParvoMagnoCoefTablePTR);
-        }
-
-        return true;
-    }
-}// end of namespace bioinspired
-}// end of namespace cv
diff --git a/modules/bioinspired/src/retinafilter.hpp b/modules/bioinspired/src/retinafilter.hpp
deleted file mode 100644
index 3e204885f..000000000
--- a/modules/bioinspired/src/retinafilter.hpp
+++ /dev/null
@@ -1,548 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-/**
-* @class  RetinaFilter
-* @brief class which describes the retina model developped at the LIS/GIPSA-LAB www.gipsa-lab.inpg.fr:
-* -> performs a contours and moving contours extraction with powerfull local data enhancement as at the retina level
-* Based on Alexandre BENOIT thesis: "Le systeme visuel humain au secours de la vision par ordinateur"
-*
-* => various optimisations and enhancements added after 2007 such as tone mapping capabilities, see reference paper cited in the licence and :
-* Benoit A.,Alleysson D., Herault J., Le Callet P. (2009),  "Spatio-Temporal Tone Mapping Operator based on a Retina model", Computational Color Imaging Workshop (CCIW09),pp 12-22, Saint Etienne, France
-*
-* TYPICAL USE:
-*
-* // create object at a specified picture size
-* Retina *retina;
-* retina =new Retina(frameSizeRows, frameSizeColumns, RGBmode);
-*
-* // init gain, spatial and temporal parameters:
-* retina->setParameters(0.7, 1, 0, 7, 1, 5, 0, 0, 3 , true);
-*
-* // during program execution, call the filter for local luminance correction,  contours extraction, moving contours extraction from an input picture called "FrameBuffer":
-* retina->runfilter(FrameBuffer);
-*
-* // get the different output frames, check in the class description below for more outputs:
-* const std::valarray<float> correctedLuminance=retina->getLocalAdaptation();
-* const std::valarray<float> contours=retina->getContours();
-* const std::valarray<float> movingContours=retina->getMovingContours();
-*
-* // at the end of the program, destroy object:
-* delete retina;
-*
-* @author Alexandre BENOIT, benoit.alexandre.vision@gmail.com, LISTIC / Gipsa-Lab, France: www.gipsa-lab.inpg.fr/
-* Creation date 2007
-*/
-
-#ifndef RETINACLASSES_H_
-#define RETINACLASSES_H_
-
-#include "basicretinafilter.hpp"
-#include "parvoretinafilter.hpp"
-#include "magnoretinafilter.hpp"
-
-// optional includes (depending on the related publications)
-#include "imagelogpolprojection.hpp"
-
-#include "retinacolor.hpp"
-
-//#define __RETINADEBUG // define RETINADEBUG to display debug data
-namespace cv
-{
-namespace bioinspired
-{
-// retina class that process the 3 outputs of the retina filtering stages
-class RetinaFilter//: public BasicRetinaFilter
-{
-public:
-
-    /**
-    * constructor of the retina filter model with log sampling of the input frame (models the photoreceptors log sampling (central high resolution fovea and lower precision borders))
-    * @param sizeRows: number of rows of the input image
-    * @param sizeColumns: number of columns of the input image
-    * @param colorMode: specifies if the retina works with color (true) of stays in grayscale processing (false), can be adjusted online by the use of setColorMode method
-    * @param samplingMethod: specifies which kind of color sampling will be used
-    * @param useRetinaLogSampling: activate retina log sampling, if true, the 2 following parameters can be used
-    * @param reductionFactor: only usefull if param useRetinaLogSampling=true, specifies the reduction factor of the output frame (as the center (fovea) is high resolution and corners can be underscaled, then a reduction of the output is allowed without precision leak
-    * @param samplingStrenght: only usefull if param useRetinaLogSampling=true, specifies the strenght of the log scale that is applied
-    */
-    RetinaFilter(const unsigned int sizeRows, const unsigned int sizeColumns, const bool colorMode=false, const int samplingMethod=RETINA_COLOR_BAYER, const bool useRetinaLogSampling=false, const double reductionFactor=1.0, const double samplingStrenght=10.0);
-
-    /**
-    * standard destructor
-    */
-    ~RetinaFilter();
-
-    /**
-    * function that clears all buffers of the object
-    */
-    void clearAllBuffers();
-
-    /**
-    * resize retina parvo filter object (resize all allocated buffers)
-    * @param NBrows: the new height size
-    * @param NBcolumns: the new width size
-    */
-    void resize(const unsigned int NBrows, const unsigned int NBcolumns);
-
-    /**
-    * Input buffer checker: allows to check if the passed image buffer corresponds to retina filter expectations
-    * @param input: the input image buffer
-    * @param colorMode: specifiy if the input should be considered by the retina as colored of not
-    * @return false if not compatible or it returns true if OK
-    */
-    bool checkInput(const std::valarray<float> &input, const bool colorMode);
-
-    /**
-    * run the initilized retina filter, after this call all retina outputs are updated
-    * @param imageInput: image input buffer, can be grayscale or RGB image respecting the size specified at the constructor level
-    * @param useAdaptiveFiltering: set true if you want to use adaptive color demultilexing (solve some color artefact problems), see RetinaColor for citation references
-    * @param processRetinaParvoMagnoMapping: tels if the main outputs takes into account the mapping of the Parvo and Magno channels on the retina (centred parvo (fovea) and magno outside (parafovea))
-    * @param useColorMode: color information is used if true, warning, if input is only gray level, a buffer overflow error will occur
-    -> note that if color mode is activated and processRetinaParvoMagnoMapping==true, then the demultiplexed color frame (accessible throw getColorOutput() will be a color contours frame in the fovea and gray level moving contours outside
-    @param inputIsColorMultiplexed: set trus if the input data is a multiplexed color image (using Bayer sampling for example), the color sampling method must correspond to the RETINA_COLORSAMPLINGMETHOD passed at constructor!
-    * @return true if process ran well, false in case of failure
-    */
-    bool runFilter(const std::valarray<float> &imageInput, const bool useAdaptiveFiltering=true, const bool processRetinaParvoMagnoMapping=false, const bool useColorMode=false, const bool inputIsColorMultiplexed=false);
-
-    /**
-    * run the initilized retina filter in order to perform color tone mapping applied on an RGB image, after this call the color output of the retina is updated (use function getColorOutput() to grab it)
-    * the algorithm is based on David Alleyson, Sabine Susstruck and Laurence Meylan's work, please cite:
-    * -> Meylan L., Alleysson D., and S�sstrunk S., A Model of Retinal Local Adaptation for the Tone Mapping of Color Filter Array Images, Journal of Optical Society of America, A, Vol. 24, N� 9, September, 1st, 2007, pp. 2807-2816
-    * get the resulting gray frame by calling function getParvoColor()
-    * @param grayImageInput: RGB image input buffer respecting the size specified at the constructor level
-    * @param PhotoreceptorsCompression: sets the log compression parameters applied at the photoreceptors level (enhance luminance in dark areas)
-    * @param ganglionCellsCompression: sets the log compression applied at the gnaglion cells output (enhance contrast)
-    */
-    void runGrayToneMapping(const std::valarray<float> &grayImageInput, std::valarray<float> &grayImageOutput, const float PhotoreceptorsCompression=0.6, const float ganglionCellsCompression=0.6);
-
-    /**
-    * run the initilized retina filter in order to perform color tone mapping applied on an RGB image, after this call the color output of the retina is updated (use function getColorOutput() to grab it)
-    * the algorithm is based on David Alleyson, Sabine Susstruck and Laurence Meylan's work, please cite:
-    * -> Meylan L., Alleysson D., and S�sstrunk S., A Model of Retinal Local Adaptation for the Tone Mapping of Color Filter Array Images, Journal of Optical Society of America, A, Vol. 24, N� 9, September, 1st, 2007, pp. 2807-2816
-    * get the resulting RGB frame by calling function getParvoColor()
-    * @param RGBimageInput: RGB image input buffer respecting the size specified at the constructor level
-    * @param useAdaptiveFiltering: set true if you want to use adaptive color demultilexing (solve some color artefact problems), see RetinaColor for citation references
-    * @param PhotoreceptorsCompression: sets the log compression parameters applied at the photoreceptors level (enhance luminance in dark areas)
-    * @param ganglionCellsCompression: sets the log compression applied at the ganglion cells output (enhance contrast)
-    */
-    void runRGBToneMapping(const std::valarray<float> &RGBimageInput, std::valarray<float> &imageOutput, const bool useAdaptiveFiltering, const float PhotoreceptorsCompression=0.6, const float ganglionCellsCompression=0.6);
-
-    /**
-    * run the initilized retina filter in order to perform color tone mapping applied on an RGB image, after this call the color output of the retina is updated (use function getColorOutput() to grab it)
-    * get the resulting RGB frame by calling function getParvoColor()
-    * @param LMSimageInput: RGB image input buffer respecting the size specified at the constructor level
-    * @param useAdaptiveFiltering: set true if you want to use adaptive color demultilexing (solve some color artefact problems), see RetinaColor for citation references
-    * @param PhotoreceptorsCompression: sets the log compression parameters applied at the photoreceptors level (enhance luminance in dark areas)
-    * @param ganglionCellsCompression: sets the log compression applied at the gnaglion cells output (enhance contrast)
-    */
-    void runLMSToneMapping(const std::valarray<float> &LMSimageInput, std::valarray<float> &imageOutput, const bool useAdaptiveFiltering, const float PhotoreceptorsCompression=0.6, const float ganglionCellsCompression=0.6);
-
-    /**
-    * set up function of the retina filter: all the retina is initialized at this step, some specific parameters are set by default, use setOPLandParvoCoefficientsTable() and setMagnoCoefficientsTable in order to setup the retina with more options
-    * @param OPLspatialResponse1: (equal to k1 in setOPLandParvoCoefficientsTable() function) the spatial constant of the first order low pass filter of the photoreceptors, use it to cut high spatial frequencies (noise or thick contours), unit is pixels, typical value is 1 pixel
-    * @param OPLtemporalresponse1: (equal to tau1 in setOPLandParvoCoefficientsTable() function) the time constant of the first order low pass filter of the photoreceptors, use it to cut high temporal frequencies (noise or fast motion), unit is frames, typical value is 1 frame
-    * @param OPLassymetryGain: (equal to beta2 in setOPLandParvoCoefficientsTable() function) gain of the horizontal cells network, if 0, then the mean value of the output is zero, if the parameter is near 1, then, the luminance is not filtered and is still reachable at the output, typicall value is 0
-    * @param OPLspatialResponse2: (equal to k2 in setOPLandParvoCoefficientsTable() function) the spatial constant of the first order low pass filter of the horizontal cells, use it to cut low spatial frequencies (local luminance), unit is pixels, typical value is 5 pixel
-    * @param OPLtemporalresponse2: (equal to tau2 in setOPLandParvoCoefficientsTable() function) the time constant of the first order low pass filter of the horizontal cells, use it to cut low temporal frequencies (local luminance variations), unit is frames, typical value is 1 frame, as the photoreceptors
-    * @param LPfilterSpatialResponse: (equal to parasolCells_k in setMagnoCoefficientsTable() function) the low pass filter spatial constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is pixels, typical value is 5
-    * @param LPfilterGain: (equal to parasolCells_beta in setMagnoCoefficientsTable() function) the low pass filter gain used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), typical value is 0
-    * @param LPfilterTemporalresponse: (equal to parasolCells_tau in setMagnoCoefficientsTable() function) the low pass filter time constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is frame, typical value is 0 (immediate response)
-    * @param MovingContoursExtractorCoefficient: (equal to amacrinCellsTemporalCutFrequency in setMagnoCoefficientsTable() function)the time constant of the first order high pass fiter of the magnocellular way (motion information channel), unit is frames, tipicall value is 5
-    * @param normalizeParvoOutput_0_maxOutputValue: specifies if the Parvo cellular output should be normalized between 0 and maxOutputValue (true) or not (false) in order to remain at a null mean value, true value is recommended for visualisation
-    * @param normalizeMagnoOutput_0_maxOutputValue: specifies if the Magno cellular output should be normalized between 0 and maxOutputValue (true) or not (false), setting true may be hazardous because it can enhace the noise response when nothing is moving
-    * @param maxOutputValue: the maximum amplitude value of the normalized outputs (generally 255 for 8bit per channel pictures)
-    * @param maxInputValue: the maximum pixel value of the input picture (generally 255 for 8bit per channel pictures), specify it in other case (for example High Dynamic Range Images)
-    * @param meanValue: the global mean value of the input data usefull for local adaptation setup
-    */
-    void setGlobalParameters(const float OPLspatialResponse1=0.7, const float OPLtemporalresponse1=1, const float OPLassymetryGain=0, const float OPLspatialResponse2=5, const float OPLtemporalresponse2=1, const float LPfilterSpatialResponse=5, const float LPfilterGain=0, const float LPfilterTemporalresponse=0, const float MovingContoursExtractorCoefficient=5, const bool normalizeParvoOutput_0_maxOutputValue=false, const bool normalizeMagnoOutput_0_maxOutputValue=false, const float maxOutputValue=255.0, const float maxInputValue=255.0, const float meanValue=128.0);
-
-    /**
-    * setup the local luminance adaptation capability
-    * @param V0CompressionParameter: the compression strengh of the photoreceptors local adaptation output, set a value between 160 and 250 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 160
-    */
-    inline void setPhotoreceptorsLocalAdaptationSensitivity(const float V0CompressionParameter){_photoreceptorsPrefilter.setV0CompressionParameter(1-V0CompressionParameter);_setInitPeriodCount();};
-
-    /**
-    * setup the local luminance adaptation capability
-    * @param V0CompressionParameter: the compression strengh of the parvocellular pathway (details) local adaptation output, set a value between 160 and 250 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 160
-    */
-    inline void setParvoGanglionCellsLocalAdaptationSensitivity(const float V0CompressionParameter){_ParvoRetinaFilter.setV0CompressionParameter(V0CompressionParameter);_setInitPeriodCount();};
-
-    /**
-    * setup the local luminance adaptation area of integration
-    * @param spatialResponse: the spatial constant of the low pass filter applied on the bipolar cells output in order to compute local contrast mean values
-    * @param temporalResponse: the spatial constant of the low pass filter applied on the bipolar cells output in order to compute local contrast mean values (generally set to zero: immediate response)
-    */
-    inline void setGanglionCellsLocalAdaptationLPfilterParameters(const float spatialResponse, const float temporalResponse){_ParvoRetinaFilter.setGanglionCellsLocalAdaptationLPfilterParameters(temporalResponse, spatialResponse);_setInitPeriodCount();};
-
-    /**
-    * setup the local luminance adaptation capability
-    * @param V0CompressionParameter: the compression strengh of the magnocellular pathway (motion) local adaptation output, set a value between 160 and 250 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 160
-    */
-    inline void setMagnoGanglionCellsLocalAdaptationSensitivity(const float V0CompressionParameter){_MagnoRetinaFilter.setV0CompressionParameter(V0CompressionParameter);_setInitPeriodCount();};
-
-    /**
-    * setup the OPL and IPL parvo channels
-    * @param beta1: gain of the horizontal cells network, if 0, then the mean value of the output is zero (default value), if the parameter is near 1, the amplitude is boosted but it should only be used for values rescaling... if needed
-    * @param tau1: the time constant of the first order low pass filter of the photoreceptors, use it to cut high temporal frequencies (noise or fast motion), unit is frames, typical value is 1 frame
-    * @param k1: the spatial constant of the first order low pass filter of the photoreceptors, use it to cut high spatial frequencies (noise or thick contours), unit is pixels, typical value is 1 pixel
-    * @param beta2: gain of the horizontal cells network, if 0, then the mean value of the output is zero, if the parameter is near 1, then, the luminance is not filtered and is still reachable at the output, typicall value is 0
-    * @param tau2: the time constant of the first order low pass filter of the horizontal cells, use it to cut low temporal frequencies (local luminance variations), unit is frames, typical value is 1 frame, as the photoreceptors
-    * @param k2: the spatial constant of the first order low pass filter of the horizontal cells, use it to cut low spatial frequencies (local luminance), unit is pixels, typical value is 5 pixel, this value is also used for local contrast computing when computing the local contrast adaptation at the ganglion cells level (Inner Plexiform Layer parvocellular channel model)
-    * @param V0CompressionParameter: the compression strengh of the ganglion cells local adaptation output, set a value between 160 and 250 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 230
-    */
-    void setOPLandParvoParameters(const float beta1, const float tau1, const float k1, const float beta2, const float tau2, const float k2, const float V0CompressionParameter){_ParvoRetinaFilter.setOPLandParvoFiltersParameters(beta1, tau1, k1, beta2, tau2, k2);_ParvoRetinaFilter.setV0CompressionParameter(V0CompressionParameter);_setInitPeriodCount();};
-
-    /**
-    * set parameters values for the Inner Plexiform Layer (IPL) magnocellular channel
-    * @param parasolCells_beta: the low pass filter gain used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), typical value is 0
-    * @param parasolCells_tau: the low pass filter time constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is frame, typical value is 0 (immediate response)
-    * @param parasolCells_k: the low pass filter spatial constant used for local contrast adaptation at the IPL level of the retina (for ganglion cells local adaptation), unit is pixels, typical value is 5
-    * @param amacrinCellsTemporalCutFrequency: the time constant of the first order high pass fiter of the magnocellular way (motion information channel), unit is frames, tipicall value is 5
-    * @param V0CompressionParameter: the compression strengh of the ganglion cells local adaptation output, set a value between 160 and 250 for best results, a high value increases more the low value sensitivity... and the output saturates faster, recommended value: 200
-    * @param localAdaptintegration_tau: specifies the temporal constant of the low pas filter involved in the computation of the local "motion mean" for the local adaptation computation
-    * @param localAdaptintegration_k: specifies the spatial constant of the low pas filter involved in the computation of the local "motion mean" for the local adaptation computation
-    */
-    void setMagnoCoefficientsTable(const float parasolCells_beta, const float parasolCells_tau, const float parasolCells_k, const float amacrinCellsTemporalCutFrequency, const float V0CompressionParameter, const float localAdaptintegration_tau, const float localAdaptintegration_k){_MagnoRetinaFilter.setCoefficientsTable(parasolCells_beta, parasolCells_tau, parasolCells_k, amacrinCellsTemporalCutFrequency, localAdaptintegration_tau, localAdaptintegration_k);_MagnoRetinaFilter.setV0CompressionParameter(V0CompressionParameter);_setInitPeriodCount();};
-
-    /**
-    * set if the parvo output should be or not normalized between 0 and 255 (for display purpose generally)
-    * @param normalizeParvoOutput_0_maxOutputValue: true if normalization should be done
-    */
-    inline void activateNormalizeParvoOutput_0_maxOutputValue(const bool normalizeParvoOutput_0_maxOutputValue){_normalizeParvoOutput_0_maxOutputValue=normalizeParvoOutput_0_maxOutputValue;};
-
-    /**
-    * set if the magno output should be or not normalized between 0 and 255 (for display purpose generally), take care, if nothing is moving, then, the noise will be enanced !!!
-    * @param normalizeMagnoOutput_0_maxOutputValue: true if normalization should be done
-    */
-    inline void activateNormalizeMagnoOutput_0_maxOutputValue(const bool normalizeMagnoOutput_0_maxOutputValue){_normalizeMagnoOutput_0_maxOutputValue=normalizeMagnoOutput_0_maxOutputValue;};
-
-    /**
-    * setup the maximum amplitude value of the normalized outputs (generally 255 for 8bit per channel pictures)
-    * @param maxOutputValue: maximum amplitude value of the normalized outputs (generally 255 for 8bit per channel pictures)
-    */
-    inline void setMaxOutputValue(const float maxOutputValue){_maxOutputValue=maxOutputValue;};
-
-    /**
-    * sets the color mode of the frame grabber
-    * @param desiredColorMode: true if the user needs color information, false for graylevels
-    */
-    void setColorMode(const bool desiredColorMode){_useColorMode=desiredColorMode;};
-
-    /**
-    * activate color saturation as the final step of the color demultiplexing process
-    * -> this saturation is a sigmoide function applied to each channel of the demultiplexed image.
-    * @param saturateColors: boolean that activates color saturation (if true) or desactivate (if false)
-    * @param colorSaturationValue: the saturation factor
-    * */
-    inline void setColorSaturation(const bool saturateColors=true, const float colorSaturationValue=4.0){_colorEngine.setColorSaturation(saturateColors, colorSaturationValue);};
-
-    /////////////////////////////////////////////////////////////////
-    // function that retrieve the main retina outputs, one by one, or all in a structure
-
-    /**
-    * @return the input image sampled by the photoreceptors spatial sampling
-    */
-    inline const std::valarray<float> &getPhotoreceptorsSampledFrame() const
-    {
-        CV_Assert(_photoreceptorsLogSampling);
-        return _photoreceptorsLogSampling->getSampledFrame();
-    };
-
-    /**
-    * @return photoreceptors output, locally adapted luminance only, no high frequency spatio-temporal noise reduction at the next retina processing stages, use getPhotoreceptors method to get complete photoreceptors output
-    */
-    inline const std::valarray<float> &getLocalAdaptation() const {return _photoreceptorsPrefilter.getOutput();};
-
-    /**
-    * @return photoreceptors output: locally adapted luminance and high frequency spatio-temporal noise reduction, high luminance is a little saturated at this stage, but this is corrected naturally at the next retina processing stages
-    */
-    inline const std::valarray<float> &getPhotoreceptors() const {return _ParvoRetinaFilter.getPhotoreceptorsLPfilteringOutput();};
-
-    /**
-    * @return the local luminance of the processed frame (it is the horizontal cells output)
-    */
-    inline const std::valarray<float> &getHorizontalCells() const {return _ParvoRetinaFilter.getHorizontalCellsOutput();};
-
-    ///////// CONTOURS part, PARVOCELLULAR RETINA PATHWAY
-    /**
-    * @return true if Parvocellular output is activated, false if not
-    */
-    inline bool areContoursProcessed(){return _useParvoOutput;};
-
-    /**
-    *  method to retrieve the foveal parvocellular pathway response (no details energy in parafovea)
-    * @param parvoParafovealResponse: buffer that will be filled with the response of the magnocellular pathway in the parafoveal area
-    * @return true if process succeeded (if buffer exists, is its size matches retina size, if magno channel is activated and if mapping is initialized
-    */
-    bool getParvoFoveaResponse(std::valarray<float> &parvoFovealResponse);
-
-    /**
-    * @param useParvoOutput: true if Parvocellular output should be activated, false if not
-    */
-    inline void activateContoursProcessing(const bool useParvoOutput){_useParvoOutput=useParvoOutput;};
-
-    /**
-    * @return the parvocellular contours information (details), should be used at the fovea level
-    */
-    const std::valarray<float> &getContours(); // Parvocellular output
-
-    /**
-    * @return the parvocellular contours ON information (details), should be used at the fovea level
-    */
-    inline const std::valarray<float> &getContoursON() const {return _ParvoRetinaFilter.getParvoON();};// Parvocellular ON output
-
-    /**
-    * @return the parvocellular contours OFF information (details), should be used at the fovea level
-    */
-    inline const std::valarray<float> &getContoursOFF() const {return _ParvoRetinaFilter.getParvoOFF();};// Parvocellular OFF output
-
-    ///////// MOVING CONTOURS part, MAGNOCELLULAR RETINA PATHWAY
-    /**
-    * @return true if Magnocellular output is activated, false if not
-    */
-    inline bool areMovingContoursProcessed(){return _useMagnoOutput;};
-
-    /**
-    *  method to retrieve the parafoveal magnocellular pathway response (no motion energy in fovea)
-    * @param magnoParafovealResponse: buffer that will be filled with the response of the magnocellular pathway in the parafoveal area
-    * @return true if process succeeded (if buffer exists, is its size matches retina size, if magno channel is activated and if mapping is initialized
-    */
-    bool getMagnoParaFoveaResponse(std::valarray<float> &magnoParafovealResponse);
-
-    /**
-    * @param useMagnoOutput: true if Magnoocellular output should be activated, false if not
-    */
-    inline void activateMovingContoursProcessing(const bool useMagnoOutput){_useMagnoOutput=useMagnoOutput;};
-
-    /**
-    * @return the magnocellular moving contours information (motion), should be used at the parafovea level without post-processing
-    */
-    inline const std::valarray<float> &getMovingContours() const {return _MagnoRetinaFilter.getOutput();};// Magnocellular output
-
-    /**
-    * @return the magnocellular moving contours information (motion), should be used at the parafovea level with assymetric sigmoide post-processing which saturates motion information
-    */
-    inline const std::valarray<float> &getMovingContoursSaturated() const {return _MagnoRetinaFilter.getMagnoYsaturated();};// Saturated Magnocellular output
-
-    /**
-    * @return the magnocellular moving contours ON information (motion), should be used at the parafovea level without post-processing
-    */
-    inline const std::valarray<float> &getMovingContoursON() const {return _MagnoRetinaFilter.getMagnoON();};// Magnocellular ON output
-
-    /**
-    * @return the magnocellular moving contours OFF information (motion), should be used at the parafovea level without post-processing
-    */
-    inline const std::valarray<float> &getMovingContoursOFF() const {return _MagnoRetinaFilter.getMagnoOFF();};// Magnocellular OFF output
-
-    /**
-    * @return a gray level image with center Parvo and peripheral Magno X channels, WARNING, the result will be ok if you called previously fucntion runFilter(imageInput, processRetinaParvoMagnoMapping=true);
-    *    -> will be accessible even if color mode is activated (but the image is color sampled so quality is poor), but get the same thing but in color by the use of function getParvoColor()
-    */
-    inline const std::valarray<float> &getRetinaParvoMagnoMappedOutput() const {return _retinaParvoMagnoMappedFrame;};// return image with center Parvo and peripheral Magno channels
-
-    /**
-    * color processing dedicated functions
-    * @return the parvo channel (contours, details) of the processed frame, grayscale output
-    */
-    inline const std::valarray<float> &getParvoContoursChannel() const {return _colorEngine.getLuminance();};
-
-    /**
-    * color processing dedicated functions
-    * @return the chrominance of the processed frame (same colorspace as the input output, usually RGB)
-    */
-    inline const std::valarray<float> &getParvoChrominance() const {return _colorEngine.getChrominance();}; // only retreive chrominance
-
-    /**
-    * color processing dedicated functions
-    * @return the parvo + chrominance channels of the processed frame (same colorspace as the input output, usually RGB)
-    */
-    inline const std::valarray<float> &getColorOutput() const {return _colorEngine.getDemultiplexedColorFrame();};// retrieve luminance+chrominance
-
-    /**
-    * apply to the retina color output the Krauskopf transformation which leads to an opponent color system: output colorspace if Acr1cr2 if input of the retina was LMS color space
-    * @param result: the input buffer to fill with the transformed colorspace retina output
-    * @return true if process ended successfully
-    */
-    inline bool applyKrauskopfLMS2Acr1cr2Transform(std::valarray<float> &result){return _colorEngine.applyKrauskopfLMS2Acr1cr2Transform(result);};
-
-    /**
-    * apply to the retina color output the Krauskopf transformation which leads to an opponent color system: output colorspace if Acr1cr2 if input of the retina was LMS color space
-    * @param result: the input buffer to fill with the transformed colorspace retina output
-    * @return true if process ended successfully
-    */
-    inline bool applyLMS2LabTransform(std::valarray<float> &result){return _colorEngine.applyLMS2LabTransform(result);};
-
-    /**
-    * color processing dedicated functions
-    * @return the retina initialized mode, true if color mode (RGB), false if grayscale
-    */
-    inline bool isColorMode(){return _useColorMode;}; // return true if RGB mode, false if gray level mode
-
-    /**
-    * @return the irregular low pass filter ouput at the photoreceptors level
-    */
-    inline const std::valarray<float> &getIrregularLPfilteredInputFrame() const {return _photoreceptorsLogSampling->getIrregularLPfilteredInputFrame();};
-
-    /**
-    * @return true if color mode is activated, false if gray levels processing
-    */
-    bool getColorMode(){return _useColorMode;};
-
-    /**
-    *
-    * @return true if a sufficient number of processed frames has been done since the last parameters update in order to get the stable state (r�gime permanent)
-    */
-    inline bool isInitTransitionDone(){if (_ellapsedFramesSinceLastReset<_globalTemporalConstant)return false; return true;};
-
-    /**
-    * find a distance in the image input space when the distance is known in the retina log sampled space...read again if it is not clear enough....sorry, i should sleep
-    * @param projectedRadiusLength: the distance to image center in the retina log sampled space
-    * @return the distance to image center in the input image space
-    */
-    inline float getRetinaSamplingBackProjection(const float projectedRadiusLength)
-    {
-        if (_photoreceptorsLogSampling)
-            return (float)_photoreceptorsLogSampling->getOriginalRadiusLength(projectedRadiusLength);
-        return projectedRadiusLength;
-    };
-
-    /////////////////:
-    // retina dimensions getters
-
-    /**
-    * @return number of rows of the filter
-    */
-    inline unsigned int getInputNBrows(){if (_photoreceptorsLogSampling) return _photoreceptorsLogSampling->getNBrows();else return _photoreceptorsPrefilter.getNBrows();};
-
-    /**
-    * @return number of columns of the filter
-    */
-    inline unsigned int getInputNBcolumns(){if (_photoreceptorsLogSampling) return _photoreceptorsLogSampling->getNBcolumns();else return _photoreceptorsPrefilter.getNBcolumns();};
-
-    /**
-    * @return number of pixels of the filter
-    */
-    inline unsigned int getInputNBpixels(){if (_photoreceptorsLogSampling) return _photoreceptorsLogSampling->getNBpixels();else return _photoreceptorsPrefilter.getNBpixels();};
-
-    /**
-    * @return the height of the frame output
-    */
-    inline unsigned int getOutputNBrows(){return _photoreceptorsPrefilter.getNBrows();};
-
-    /**
-    * @return the width of the frame output
-    */
-    inline unsigned int getOutputNBcolumns(){return _photoreceptorsPrefilter.getNBcolumns();};
-
-    /**
-    * @return the numbers of output pixels (width*height) of the images used by the object
-    */
-    inline unsigned int getOutputNBpixels(){return _photoreceptorsPrefilter.getNBpixels();};
-
-
-private:
-
-    // processing activation flags
-    bool _useParvoOutput;
-    bool _useMagnoOutput;
-
-
-    // filter stability controls
-    unsigned int _ellapsedFramesSinceLastReset;
-    unsigned int _globalTemporalConstant;
-
-    // private template buffers and related access pointers
-    std::valarray<float> _retinaParvoMagnoMappedFrame;
-    std::valarray<float> _retinaParvoMagnoMapCoefTable;
-    // private objects of the class
-    BasicRetinaFilter _photoreceptorsPrefilter;
-    ParvoRetinaFilter _ParvoRetinaFilter;
-    MagnoRetinaFilter _MagnoRetinaFilter;
-    RetinaColor       _colorEngine;
-    ImageLogPolProjection *_photoreceptorsLogSampling;
-
-    bool _useMinimalMemoryForToneMappingONLY;
-
-    bool _normalizeParvoOutput_0_maxOutputValue;
-    bool _normalizeMagnoOutput_0_maxOutputValue;
-    float _maxOutputValue;
-    bool _useColorMode;
-
-
-
-    // private functions
-    void _setInitPeriodCount();
-    void _createHybridTable();
-    void _processRetinaParvoMagnoMapping();
-    void _runGrayToneMapping(const std::valarray<float> &grayImageInput, std::valarray<float> &grayImageOutput ,const float PhotoreceptorsCompression=0.6, const float ganglionCellsCompression=0.6);
-
-
-};
-
-}// end of namespace bioinspired
-}// end of namespace cv
-
-#endif /*RETINACLASSES_H_*/
diff --git a/modules/bioinspired/src/templatebuffer.hpp b/modules/bioinspired/src/templatebuffer.hpp
deleted file mode 100644
index 827eb709f..000000000
--- a/modules/bioinspired/src/templatebuffer.hpp
+++ /dev/null
@@ -1,555 +0,0 @@
-/*#******************************************************************************
-** IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-**
-** By downloading, copying, installing or using the software you agree to this license.
-** If you do not agree to this license, do not download, install,
-** copy or use the software.
-**
-**
-** bioinspired : interfaces allowing OpenCV users to integrate Human Vision System models. Presented models originate from Jeanny Herault's original research and have been reused and adapted by the author&collaborators for computed vision applications since his thesis with Alice Caplier at Gipsa-Lab.
-** Use: extract still images & image sequences features, from contours details to motion spatio-temporal features, etc. for high level visual scene analysis. Also contribute to image enhancement/compression such as tone mapping.
-**
-** Maintainers : Listic lab (code author current affiliation & applications) and Gipsa Lab (original research origins & applications)
-**
-**  Creation - enhancement process 2007-2011
-**      Author: Alexandre Benoit (benoit.alexandre.vision@gmail.com), LISTIC lab, Annecy le vieux, France
-**
-** Theses algorithm have been developped by Alexandre BENOIT since his thesis with Alice Caplier at Gipsa-Lab (www.gipsa-lab.inpg.fr) and the research he pursues at LISTIC Lab (www.listic.univ-savoie.fr).
-** Refer to the following research paper for more information:
-** Benoit A., Caplier A., Durette B., Herault, J., "USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011
-** This work have been carried out thanks to Jeanny Herault who's research and great discussions are the basis of all this work, please take a look at his book:
-** Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891.
-**
-** The retina filter includes the research contributions of phd/research collegues from which code has been redrawn by the author :
-** _take a look at the retinacolor.hpp module to discover Brice Chaix de Lavarene color mosaicing/demosaicing and the reference paper:
-** ====> B. Chaix de Lavarene, D. Alleysson, B. Durette, J. Herault (2007). "Efficient demosaicing through recursive filtering", IEEE International Conference on Image Processing ICIP 2007
-** _take a look at imagelogpolprojection.hpp to discover retina spatial log sampling which originates from Barthelemy Durette phd with Jeanny Herault. A Retina / V1 cortex projection is also proposed and originates from Jeanny's discussions.
-** ====> more informations in the above cited Jeanny Heraults's book.
-**
-**                          License Agreement
-**               For Open Source Computer Vision Library
-**
-** Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-** Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-**
-**               For Human Visual System tools (bioinspired)
-** Copyright (C) 2007-2011, LISTIC Lab, Annecy le Vieux and GIPSA Lab, Grenoble, France, all rights reserved.
-**
-** Third party copyrights are property of their respective owners.
-**
-** Redistribution and use in source and binary forms, with or without modification,
-** are permitted provided that the following conditions are met:
-**
-** * Redistributions of source code must retain the above copyright notice,
-**    this list of conditions and the following disclaimer.
-**
-** * Redistributions in binary form must reproduce the above copyright notice,
-**    this list of conditions and the following disclaimer in the documentation
-**    and/or other materials provided with the distribution.
-**
-** * The name of the copyright holders may not be used to endorse or promote products
-**    derived from this software without specific prior written permission.
-**
-** This software is provided by the copyright holders and contributors "as is" and
-** any express or implied warranties, including, but not limited to, the implied
-** warranties of merchantability and fitness for a particular purpose are disclaimed.
-** In no event shall the Intel Corporation or contributors be liable for any direct,
-** indirect, incidental, special, exemplary, or consequential damages
-** (including, but not limited to, procurement of substitute goods or services;
-** loss of use, data, or profits; or business interruption) however caused
-** and on any theory of liability, whether in contract, strict liability,
-** or tort (including negligence or otherwise) arising in any way out of
-** the use of this software, even if advised of the possibility of such damage.
-*******************************************************************************/
-
-#ifndef __TEMPLATEBUFFER_HPP__
-#define __TEMPLATEBUFFER_HPP__
-
-#include <valarray>
-#include <cstdlib>
-#include <iostream>
-#include <cmath>
-
-
-//#define __TEMPLATEBUFFERDEBUG //define TEMPLATEBUFFERDEBUG in order to display debug information
-
-namespace cv
-{
-namespace bioinspired
-{
-//// If a parallelization method is available then, you should define MAKE_PARALLEL, in the other case, the classical serial code will be used
-#define MAKE_PARALLEL
-// ==> then include required includes
-#ifdef MAKE_PARALLEL
-
-// ==> declare usefull generic tools
-template <class type>
-class Parallel_clipBufferValues: public cv::ParallelLoopBody
-{
-private:
-    type *bufferToClip;
-    type minValue, maxValue;
-
-public:
-    Parallel_clipBufferValues(type* bufferToProcess, const type min, const type max)
-        : bufferToClip(bufferToProcess), minValue(min), maxValue(max){}
-
-    virtual void operator()( const cv::Range &r ) const {
-        register type *inputOutputBufferPTR=bufferToClip+r.start;
-        for (register int jf = r.start; jf != r.end; ++jf, ++inputOutputBufferPTR)
-        {
-            if (*inputOutputBufferPTR>maxValue)
-                *inputOutputBufferPTR=maxValue;
-            else if (*inputOutputBufferPTR<minValue)
-                *inputOutputBufferPTR=minValue;
-        }
-    }
-};
-#endif
-
-    /**
-    * @class TemplateBuffer
-    * @brief this class is a simple template memory buffer which contains basic functions to get information on or normalize the buffer content
-    * note that thanks to the parent STL template class "valarray", it is possible to perform easily operations on the full array such as addition, product etc.
-    * @author Alexandre BENOIT (benoit.alexandre.vision@gmail.com), helped by Gelu IONESCU (gelu.ionescu@lis.inpg.fr)
-    * creation date: september 2007
-    */
-    template <class type> class TemplateBuffer : public std::valarray<type>
-    {
-    public:
-
-        /**
-        * constructor for monodimensional array
-        * @param dim: the size of the vector
-        */
-        TemplateBuffer(const size_t dim=0)
-            : std::valarray<type>((type)0, dim)
-        {
-            _NBrows=1;
-            _NBcolumns=dim;
-            _NBdepths=1;
-            _NBpixels=dim;
-            _doubleNBpixels=2*dim;
-        }
-
-        /**
-        * constructor by copy for monodimensional array
-        * @param pVal: the pointer to a buffer to copy
-        * @param dim: the size of the vector
-        */
-        TemplateBuffer(const type* pVal, const size_t dim)
-            : std::valarray<type>(pVal, dim)
-        {
-            _NBrows=1;
-            _NBcolumns=dim;
-            _NBdepths=1;
-            _NBpixels=dim;
-            _doubleNBpixels=2*dim;
-        }
-
-        /**
-        * constructor for bidimensional array
-        * @param dimRows: the size of the vector
-        * @param dimColumns: the size of the vector
-        * @param depth: the number of layers of the buffer in its third dimension (3 of color images, 1 for gray images.
-        */
-        TemplateBuffer(const size_t dimRows, const size_t dimColumns, const size_t depth=1)
-            : std::valarray<type>((type)0, dimRows*dimColumns*depth)
-        {
-#ifdef TEMPLATEBUFFERDEBUG
-            std::cout<<"TemplateBuffer::TemplateBuffer: new buffer, size="<<dimRows<<", "<<dimColumns<<", "<<depth<<"valarraySize="<<this->size()<<std::endl;
-#endif
-            _NBrows=dimRows;
-            _NBcolumns=dimColumns;
-            _NBdepths=depth;
-            _NBpixels=dimRows*dimColumns;
-            _doubleNBpixels=2*dimRows*dimColumns;
-            //_createTableIndex();
-#ifdef TEMPLATEBUFFERDEBUG
-            std::cout<<"TemplateBuffer::TemplateBuffer: construction successful"<<std::endl;
-#endif
-
-        }
-
-        /**
-        * copy constructor
-        * @param toCopy
-        * @return thenconstructed instance
-        *emplateBuffer(const TemplateBuffer &toCopy)
-        :_NBrows(toCopy.getNBrows()),_NBcolumns(toCopy.getNBcolumns()),_NBdepths(toCopy.getNBdephs()), _NBpixels(toCopy.getNBpixels()), _doubleNBpixels(toCopy.getNBpixels()*2)
-        //std::valarray<type>(toCopy)
-        {
-        memcpy(Buffer(), toCopy.Buffer(), this->size());
-        }*/
-        /**
-        * destructor
-        */
-        virtual ~TemplateBuffer()
-        {
-#ifdef TEMPLATEBUFFERDEBUG
-            std::cout<<"~TemplateBuffer"<<std::endl;
-#endif
-        }
-
-        /**
-        * delete the buffer content (set zeros)
-        */
-        inline void setZero(){std::valarray<type>::operator=(0);};//memset(Buffer(), 0, sizeof(type)*_NBpixels);};
-
-        /**
-        * @return the numbers of rows (height) of the images used by the object
-        */
-        inline unsigned int getNBrows(){return (unsigned int)_NBrows;};
-
-        /**
-        * @return the numbers of columns (width) of the images used by the object
-        */
-        inline unsigned int getNBcolumns(){return (unsigned int)_NBcolumns;};
-
-        /**
-        * @return the numbers of pixels (width*height) of the images used by the object
-        */
-        inline unsigned int getNBpixels(){return (unsigned int)_NBpixels;};
-
-        /**
-        * @return the numbers of pixels (width*height) of the images used by the object
-        */
-        inline unsigned int getDoubleNBpixels(){return (unsigned int)_doubleNBpixels;};
-
-        /**
-        * @return the numbers of depths (3rd dimension: 1 for gray images, 3 for rgb images) of the images used by the object
-        */
-        inline unsigned int getDepthSize(){return (unsigned int)_NBdepths;};
-
-        /**
-        * resize the buffer and recompute table index etc.
-        */
-        void resizeBuffer(const size_t dimRows, const size_t dimColumns, const size_t depth=1)
-        {
-            this->resize(dimRows*dimColumns*depth);
-            _NBrows=dimRows;
-            _NBcolumns=dimColumns;
-            _NBdepths=depth;
-            _NBpixels=dimRows*dimColumns;
-            _doubleNBpixels=2*dimRows*dimColumns;
-        }
-
-        inline TemplateBuffer<type> & operator=(const std::valarray<type> &b)
-        {
-            //std::cout<<"TemplateBuffer<type> & operator= affect vector: "<<std::endl;
-            std::valarray<type>::operator=(b);
-            return *this;
-        }
-
-        inline TemplateBuffer<type> & operator=(const type &b)
-        {
-            //std::cout<<"TemplateBuffer<type> & operator= affect value: "<<b<<std::endl;
-            std::valarray<type>::operator=(b);
-            return *this;
-        }
-
-        /*  inline const type  &operator[](const unsigned int &b)
-        {
-        return (*this)[b];
-        }
-        */
-        /**
-        * @return the buffer adress in non const mode
-        */
-        inline type*    Buffer()            {    return &(*this)[0];    }
-
-        ///////////////////////////////////////////////////////
-        // Standard Image manipulation functions
-
-        /**
-        * standard 0 to 255 image normalization function
-        * @param inputOutputBuffer: the image to be normalized (rewrites the input), if no parameter, then, the built in buffer reachable by getOutput() function is normalized
-        * @param nbPixels: specifies the number of pixel on which the normalization should be performed, if 0, then all pixels specified in the constructor are processed
-        * @param maxOutputValue: the maximum output value
-        */
-        static void normalizeGrayOutput_0_maxOutputValue(type *inputOutputBuffer, const size_t nbPixels, const type maxOutputValue=(type)255.0);
-
-        /**
-        * standard 0 to 255 image normalization function
-        * @param inputOutputBuffer: the image to be normalized (rewrites the input), if no parameter, then, the built in buffer reachable by getOutput() function is normalized
-        * @param nbPixels: specifies the number of pixel on which the normalization should be performed, if 0, then all pixels specified in the constructor are processed
-        * @param maxOutputValue: the maximum output value
-        */
-        void normalizeGrayOutput_0_maxOutputValue(const type maxOutputValue=(type)255.0){normalizeGrayOutput_0_maxOutputValue(this->Buffer(), this->size(), maxOutputValue);};
-
-        /**
-        * sigmoide image normalization function (saturates min and max values)
-        * @param meanValue: specifies the mean value of th pixels to be processed
-        * @param sensitivity: strenght of the sigmoide
-        * @param inputPicture: the image to be normalized if no parameter, then, the built in buffer reachable by getOutput() function is normalized
-        * @param outputBuffer: the ouput buffer on which the result is writed, if no parameter, then, the built in buffer reachable by getOutput() function is normalized
-        * @param maxOutputValue: the maximum output value
-        */
-        static void normalizeGrayOutputCentredSigmoide(const type meanValue, const type sensitivity, const type maxOutputValue, type *inputPicture, type *outputBuffer, const unsigned int nbPixels);
-
-        /**
-        * sigmoide image normalization function on the current buffer (saturates min and max values)
-        * @param meanValue: specifies the mean value of th pixels to be processed
-        * @param sensitivity: strenght of the sigmoide
-        * @param maxOutputValue: the maximum output value
-        */
-        inline void normalizeGrayOutputCentredSigmoide(const type meanValue=(type)0.0, const type sensitivity=(type)2.0, const type maxOutputValue=(type)255.0){ (void)maxOutputValue; normalizeGrayOutputCentredSigmoide(meanValue, sensitivity, 255.0, this->Buffer(), this->Buffer(), this->getNBpixels());};
-
-        /**
-        * sigmoide image normalization function (saturates min and max values), in this function, the sigmoide is centered on low values (high saturation of the medium and high values
-        * @param inputPicture: the image to be normalized if no parameter, then, the built in buffer reachable by getOutput() function is normalized
-        * @param outputBuffer: the ouput buffer on which the result is writed, if no parameter, then, the built in buffer reachable by getOutput() function is normalized
-        * @param sensitivity: strenght of the sigmoide
-        * @param maxOutputValue: the maximum output value
-        */
-        void normalizeGrayOutputNearZeroCentreredSigmoide(type *inputPicture=(type*)NULL, type *outputBuffer=(type*)NULL, const type sensitivity=(type)40, const type maxOutputValue=(type)255.0);
-
-        /**
-        * center and reduct the image (image-mean)/std
-        * @param inputOutputBuffer: the image to be normalized if no parameter, the result is rewrited on it
-        */
-        void centerReductImageLuminance(type *inputOutputBuffer=(type*)NULL);
-
-        /**
-        * @return standard deviation of the buffer
-        */
-        double getStandardDeviation()
-        {
-            double standardDeviation=0;
-            double meanValue=getMean();
-
-            type *bufferPTR=Buffer();
-            for (unsigned int i=0;i<this->size();++i)
-            {
-                double diff=(*(bufferPTR++)-meanValue);
-                standardDeviation+=diff*diff;
-            }
-            return std::sqrt(standardDeviation/this->size());
-        };
-
-        /**
-        * Clip buffer histogram
-        * @param minRatio: the minimum ratio of the lower pixel values, range=[0,1] and lower than maxRatio
-        * @param maxRatio: the aximum ratio of the higher pixel values, range=[0,1] and higher than minRatio
-        */
-        void clipHistogram(double minRatio, double maxRatio, double maxOutputValue)
-        {
-
-            if (minRatio>=maxRatio)
-            {
-                std::cerr<<"TemplateBuffer::clipHistogram: minRatio must be inferior to maxRatio, buffer unchanged"<<std::endl;
-                return;
-            }
-
-            /*    minRatio=min(max(minRatio, 1.0),0.0);
-            maxRatio=max(max(maxRatio, 0.0),1.0);
-            */
-
-            // find the pixel value just above the threshold
-            const double maxThreshold=this->max()*maxRatio;
-            const double minThreshold=(this->max()-this->min())*minRatio+this->min();
-
-            type *bufferPTR=this->Buffer();
-
-            double deltaH=maxThreshold;
-            double deltaL=maxThreshold;
-
-            double updatedHighValue=maxThreshold;
-            double updatedLowValue=maxThreshold;
-
-            for (unsigned int i=0;i<this->size();++i)
-            {
-                double curentValue=(double)*(bufferPTR++);
-
-                // updating "closest to the high threshold" pixel value
-                double highValueTest=maxThreshold-curentValue;
-                if (highValueTest>0)
-                {
-                    if (deltaH>highValueTest)
-                    {
-                        deltaH=highValueTest;
-                        updatedHighValue=curentValue;
-                    }
-                }
-
-                // updating "closest to the low threshold" pixel value
-                double lowValueTest=curentValue-minThreshold;
-                if (lowValueTest>0)
-                {
-                    if (deltaL>lowValueTest)
-                    {
-                        deltaL=lowValueTest;
-                        updatedLowValue=curentValue;
-                    }
-                }
-            }
-
-            std::cout<<"Tdebug"<<std::endl;
-            std::cout<<"deltaL="<<deltaL<<", deltaH="<<deltaH<<std::endl;
-            std::cout<<"this->max()"<<this->max()<<"maxThreshold="<<maxThreshold<<"updatedHighValue="<<updatedHighValue<<std::endl;
-            std::cout<<"this->min()"<<this->min()<<"minThreshold="<<minThreshold<<"updatedLowValue="<<updatedLowValue<<std::endl;
-            // clipping values outside than the updated thresholds
-            bufferPTR=this->Buffer();
-#ifdef MAKE_PARALLEL // call the TemplateBuffer multitreaded clipping method
-            parallel_for_(cv::Range(0,this->size()), Parallel_clipBufferValues<type>(bufferPTR, updatedLowValue, updatedHighValue));
-#else
-
-            for (unsigned int i=0;i<this->size();++i, ++bufferPTR)
-            {
-                if (*bufferPTR<updatedLowValue)
-                    *bufferPTR=updatedLowValue;
-                else if (*bufferPTR>updatedHighValue)
-                    *bufferPTR=updatedHighValue;
-            }
-#endif
-            normalizeGrayOutput_0_maxOutputValue(this->Buffer(), this->size(), maxOutputValue);
-
-        }
-
-        /**
-        * @return the mean value of the vector
-        */
-        inline double getMean(){return this->sum()/this->size();};
-
-    protected:
-        size_t _NBrows;
-        size_t _NBcolumns;
-        size_t _NBdepths;
-        size_t _NBpixels;
-        size_t _doubleNBpixels;
-        // utilities
-        static type _abs(const type x);
-
-    };
-
-    ///////////////////////////////////////////////////////////////////////
-    /// normalize output between 0 and 255, can be applied on images of different size that the declared size if nbPixels parameters is setted up;
-    template <class type>
-    void TemplateBuffer<type>::normalizeGrayOutput_0_maxOutputValue(type *inputOutputBuffer, const size_t processedPixels, const type maxOutputValue)
-    {
-        type maxValue=inputOutputBuffer[0], minValue=inputOutputBuffer[0];
-
-        // get the min and max value
-        register type *inputOutputBufferPTR=inputOutputBuffer;
-        for (register size_t j = 0; j<processedPixels; ++j)
-        {
-            type pixValue = *(inputOutputBufferPTR++);
-            if (maxValue < pixValue)
-                maxValue = pixValue;
-            else if (minValue > pixValue)
-                minValue = pixValue;
-        }
-        // change the range of the data to 0->255
-
-        type factor = maxOutputValue/(maxValue-minValue);
-        type offset = (type)(-minValue*factor);
-
-        inputOutputBufferPTR=inputOutputBuffer;
-        for (register size_t j = 0; j < processedPixels; ++j, ++inputOutputBufferPTR)
-            *inputOutputBufferPTR=*(inputOutputBufferPTR)*factor+offset;
-
-    }
-    // normalize data with a sigmoide close to 0 (saturates values for those superior to 0)
-    template <class type>
-    void TemplateBuffer<type>::normalizeGrayOutputNearZeroCentreredSigmoide(type *inputBuffer, type *outputBuffer, const type sensitivity, const type maxOutputValue)
-    {
-        if (inputBuffer==NULL)
-            inputBuffer=Buffer();
-        if (outputBuffer==NULL)
-            outputBuffer=Buffer();
-
-        type X0cube=sensitivity*sensitivity*sensitivity;
-
-        register type *inputBufferPTR=inputBuffer;
-        register type *outputBufferPTR=outputBuffer;
-
-        for (register size_t j = 0; j < _NBpixels; ++j, ++inputBufferPTR)
-        {
-
-            type currentCubeLuminance=*inputBufferPTR**inputBufferPTR**inputBufferPTR;
-            *(outputBufferPTR++)=maxOutputValue*currentCubeLuminance/(currentCubeLuminance+X0cube);
-        }
-    }
-
-    // normalize and adjust luminance with a centered to 128 sigmode
-    template <class type>
-    void TemplateBuffer<type>::normalizeGrayOutputCentredSigmoide(const type meanValue, const type sensitivity, const type maxOutputValue, type *inputBuffer, type *outputBuffer, const unsigned int nbPixels)
-    {
-
-        if (sensitivity==1.0)
-        {
-            std::cerr<<"TemplateBuffer::TemplateBuffer<type>::normalizeGrayOutputCentredSigmoide error: 2nd parameter (sensitivity) must not equal 0, copying original data..."<<std::endl;
-            memcpy(outputBuffer, inputBuffer, sizeof(type)*nbPixels);
-            return;
-        }
-
-        type X0=maxOutputValue/(sensitivity-(type)1.0);
-
-        register type *inputBufferPTR=inputBuffer;
-        register type *outputBufferPTR=outputBuffer;
-
-        for (register size_t j = 0; j < nbPixels; ++j, ++inputBufferPTR)
-            *(outputBufferPTR++)=(meanValue+(meanValue+X0)*(*(inputBufferPTR)-meanValue)/(_abs(*(inputBufferPTR)-meanValue)+X0));
-
-    }
-
-    // center and reduct the image (image-mean)/std
-    template <class type>
-    void TemplateBuffer<type>::centerReductImageLuminance(type *inputOutputBuffer)
-    {
-        // if outputBuffer unsassigned, the rewrite the buffer
-        if (inputOutputBuffer==NULL)
-            inputOutputBuffer=Buffer();
-        type meanValue=0, stdValue=0;
-
-        // compute mean value
-        for (register size_t j = 0; j < _NBpixels; ++j)
-            meanValue+=inputOutputBuffer[j];
-        meanValue/=((type)_NBpixels);
-
-        // compute std value
-        register type *inputOutputBufferPTR=inputOutputBuffer;
-        for (size_t index=0;index<_NBpixels;++index)
-        {
-            type inputMinusMean=*(inputOutputBufferPTR++)-meanValue;
-            stdValue+=inputMinusMean*inputMinusMean;
-        }
-
-        stdValue=std::sqrt(stdValue/((type)_NBpixels));
-        // adjust luminance in regard of mean and std value;
-        inputOutputBufferPTR=inputOutputBuffer;
-        for (size_t index=0;index<_NBpixels;++index, ++inputOutputBufferPTR)
-            *inputOutputBufferPTR=(*(inputOutputBufferPTR)-meanValue)/stdValue;
-    }
-
-
-    template <class type>
-    type TemplateBuffer<type>::_abs(const type x)
-    {
-
-        if (x>0)
-            return x;
-        else
-            return -x;
-    }
-
-    template < >
-    inline int TemplateBuffer<int>::_abs(const int x)
-    {
-        return std::abs(x);
-    }
-    template < >
-    inline double TemplateBuffer<double>::_abs(const double x)
-    {
-        return std::fabs(x);
-    }
-
-    template < >
-    inline float TemplateBuffer<float>::_abs(const float x)
-    {
-        return std::fabs(x);
-    }
-
-}// end of namespace bioinspired
-}// end of namespace cv
-#endif
diff --git a/modules/bioinspired/test/test_main.cpp b/modules/bioinspired/test/test_main.cpp
deleted file mode 100644
index 6b2499344..000000000
--- a/modules/bioinspired/test/test_main.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-#include "test_precomp.hpp"
-
-CV_TEST_MAIN("cv")
diff --git a/modules/bioinspired/test/test_precomp.hpp b/modules/bioinspired/test/test_precomp.hpp
deleted file mode 100644
index b1672149a..000000000
--- a/modules/bioinspired/test/test_precomp.hpp
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wmissing-declarations"
-#  if defined __clang__ || defined __APPLE__
-#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
-#    pragma GCC diagnostic ignored "-Wextra"
-#  endif
-#endif
-
-#ifndef __OPENCV_TEST_PRECOMP_HPP__
-#define __OPENCV_TEST_PRECOMP_HPP__
-
-#include "opencv2/ts.hpp"
-#include "opencv2/bioinspired.hpp"
-#include <iostream>
-
-#endif
diff --git a/modules/bioinspired/test/test_retina_ocl.cpp b/modules/bioinspired/test/test_retina_ocl.cpp
deleted file mode 100644
index bfccdd557..000000000
--- a/modules/bioinspired/test/test_retina_ocl.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-#include "opencv2/opencv_modules.hpp"
-#include "opencv2/bioinspired.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/highgui.hpp"
-
-#include "opencv2/core/ocl.hpp" // cv::ocl::haveOpenCL
-
-#if defined(HAVE_OPENCV_OCL)
-
-#include "opencv2/ocl.hpp"
-#define RETINA_ITERATIONS 5
-
-static double checkNear(const cv::Mat &m1, const cv::Mat &m2)
-{
-    return cv::norm(m1, m2, cv::NORM_INF);
-}
-
-#define PARAM_TEST_CASE(name, ...) struct name : testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
-#define GET_PARAM(k) std::tr1::get< k >(GetParam())
-
-static int oclInit = false;
-static int oclAvailable = false;
-
-PARAM_TEST_CASE(Retina_OCL, bool, int, bool, double, double)
-{
-    bool colorMode;
-    int colorSamplingMethod;
-    bool useLogSampling;
-    double reductionFactor;
-    double samplingStrength;
-
-    virtual void SetUp()
-    {
-        colorMode           = GET_PARAM(0);
-        colorSamplingMethod = GET_PARAM(1);
-        useLogSampling      = GET_PARAM(2);
-        reductionFactor     = GET_PARAM(3);
-        samplingStrength    = GET_PARAM(4);
-
-        if (!oclInit)
-        {
-            if (cv::ocl::haveOpenCL())
-            {
-                try
-                {
-                    const cv::ocl::DeviceInfo& dev = cv::ocl::Context::getContext()->getDeviceInfo();
-                    std::cout << "Device name:" << dev.deviceName << std::endl;
-                    oclAvailable = true;
-                }
-                catch (...)
-                {
-                    std::cout << "Device name: N/A" << std::endl;
-                }
-            }
-            oclInit = true;
-        }
-    }
-};
-
-TEST_P(Retina_OCL, Accuracy)
-{
-    if (!oclAvailable)
-    {
-        std::cout << "SKIP test" << std::endl;
-        return;
-    }
-
-    using namespace cv;
-    Mat input = imread(cvtest::TS::ptr()->get_data_path() + "shared/lena.png", colorMode);
-    CV_Assert(!input.empty());
-    ocl::oclMat ocl_input(input);
-
-    Ptr<bioinspired::Retina> ocl_retina = bioinspired::createRetina_OCL(
-        input.size(),
-        colorMode,
-        colorSamplingMethod,
-        useLogSampling,
-        reductionFactor,
-        samplingStrength);
-
-    Ptr<bioinspired::Retina> gold_retina = bioinspired::createRetina(
-        input.size(),
-        colorMode,
-        colorSamplingMethod,
-        useLogSampling,
-        reductionFactor,
-        samplingStrength);
-
-    Mat gold_parvo;
-    Mat gold_magno;
-    ocl::oclMat ocl_parvo;
-    ocl::oclMat ocl_magno;
-
-    for(int i = 0; i < RETINA_ITERATIONS; i ++)
-    {
-        ocl_retina->run(ocl_input);
-        gold_retina->run(input);
-
-        gold_retina->getParvo(gold_parvo);
-        gold_retina->getMagno(gold_magno);
-
-        ocl_retina->getParvo(ocl_parvo);
-        ocl_retina->getMagno(ocl_magno);
-
-        int eps = colorMode ? 2 : 1;
-
-        EXPECT_LE(checkNear(gold_parvo, (Mat)ocl_parvo), eps);
-        EXPECT_LE(checkNear(gold_magno, (Mat)ocl_magno), eps);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Contrib, Retina_OCL, testing::Combine(
-                            testing::Bool(),
-                            testing::Values((int)cv::bioinspired::RETINA_COLOR_BAYER),
-                            testing::Values(false/*,true*/),
-                            testing::Values(1.0, 0.5),
-                            testing::Values(10.0, 5.0)));
-#endif
diff --git a/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst b/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst
index cb30dc36d..36af8362f 100644
--- a/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst
+++ b/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst
@@ -756,6 +756,7 @@ They are
 :math:`[R_1, -t]`,
 :math:`[R_2, t]`,
 :math:`[R_2, -t]`.
+By decomposing ``E``, you can only get the direction of the translation, so the function returns unit ``t``.
 
 
 recoverPose
@@ -1260,11 +1261,11 @@ stereoCalibrate
 -------------------
 Calibrates the stereo camera.
 
-.. ocv:function:: double stereoCalibrate( InputArrayOfArrays objectPoints, InputArrayOfArrays imagePoints1, InputArrayOfArrays imagePoints2, InputOutputArray cameraMatrix1, InputOutputArray distCoeffs1, InputOutputArray cameraMatrix2, InputOutputArray distCoeffs2, Size imageSize, OutputArray R, OutputArray T, OutputArray E, OutputArray F, TermCriteria criteria=TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 1e-6), int flags=CALIB_FIX_INTRINSIC )
+.. ocv:function:: double stereoCalibrate( InputArrayOfArrays objectPoints, InputArrayOfArrays imagePoints1, InputArrayOfArrays imagePoints2, InputOutputArray cameraMatrix1, InputOutputArray distCoeffs1, InputOutputArray cameraMatrix2, InputOutputArray distCoeffs2, Size imageSize, OutputArray R, OutputArray T, OutputArray E, OutputArray F, int flags=CALIB_FIX_INTRINSIC ,TermCriteria criteria=TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 1e-6))
 
-.. ocv:pyfunction:: cv2.stereoCalibrate(objectPoints, imagePoints1, imagePoints2, cameraMatrix1, distCoeffs1, cameraMatrix2, distCoeffs2, imageSize[, R[, T[, E[, F[, criteria[, flags]]]]]]) -> retval, cameraMatrix1, distCoeffs1, cameraMatrix2, distCoeffs2, R, T, E, F
+.. ocv:pyfunction:: cv2.stereoCalibrate(objectPoints, imagePoints1, imagePoints2, cameraMatrix1, distCoeffs1, cameraMatrix2, distCoeffs2, imageSize[, R[, T[, E[, F[, flags[, criteria]]]]]]) -> retval, cameraMatrix1, distCoeffs1, cameraMatrix2, distCoeffs2, R, T, E, F
 
-.. ocv:cfunction:: double cvStereoCalibrate( const CvMat* object_points, const CvMat* image_points1, const CvMat* image_points2, const CvMat* npoints, CvMat* camera_matrix1, CvMat* dist_coeffs1, CvMat* camera_matrix2, CvMat* dist_coeffs2, CvSize image_size, CvMat* R, CvMat* T, CvMat* E=0, CvMat* F=0, CvTermCriteria term_crit=cvTermCriteria( CV_TERMCRIT_ITER+CV_TERMCRIT_EPS,30,1e-6), int flags=CV_CALIB_FIX_INTRINSIC )
+.. ocv:cfunction:: double cvStereoCalibrate( const CvMat* object_points, const CvMat* image_points1, const CvMat* image_points2, const CvMat* npoints, CvMat* camera_matrix1, CvMat* dist_coeffs1, CvMat* camera_matrix2, CvMat* dist_coeffs2, CvSize image_size, CvMat* R, CvMat* T, CvMat* E=0, CvMat* F=0, int flags=CV_CALIB_FIX_INTRINSIC, CvTermCriteria term_crit=cvTermCriteria( CV_TERMCRIT_ITER+CV_TERMCRIT_EPS,30,1e-6) )
 
     :param objectPoints: Vector of vectors of the calibration pattern points.
 
diff --git a/modules/calib3d/include/opencv2/calib3d.hpp b/modules/calib3d/include/opencv2/calib3d.hpp
index 1a8777b93..8b9b69c3a 100644
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -203,8 +203,8 @@ CV_EXPORTS_W double stereoCalibrate( InputArrayOfArrays objectPoints,
                                      InputOutputArray cameraMatrix1, InputOutputArray distCoeffs1,
                                      InputOutputArray cameraMatrix2, InputOutputArray distCoeffs2,
                                      Size imageSize, OutputArray R,OutputArray T, OutputArray E, OutputArray F,
-                                     TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 1e-6),
-                                     int flags = CALIB_FIX_INTRINSIC );
+                                     int flags = CALIB_FIX_INTRINSIC,
+                                     TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 1e-6) );
 
 
 //! computes the rectification transformation for a stereo camera from its intrinsic and extrinsic parameters
diff --git a/modules/calib3d/include/opencv2/calib3d/calib3d_c.h b/modules/calib3d/include/opencv2/calib3d/calib3d_c.h
index a505d526d..75125f30f 100644
--- a/modules/calib3d/include/opencv2/calib3d/calib3d_c.h
+++ b/modules/calib3d/include/opencv2/calib3d/calib3d_c.h
@@ -276,9 +276,9 @@ CVAPI(double) cvStereoCalibrate( const CvMat* object_points, const CvMat* image_
                                CvMat* camera_matrix2, CvMat* dist_coeffs2,
                                CvSize image_size, CvMat* R, CvMat* T,
                                CvMat* E CV_DEFAULT(0), CvMat* F CV_DEFAULT(0),
+                               int flags CV_DEFAULT(CV_CALIB_FIX_INTRINSIC),
                                CvTermCriteria term_crit CV_DEFAULT(cvTermCriteria(
-                                   CV_TERMCRIT_ITER+CV_TERMCRIT_EPS,30,1e-6)),
-                               int flags CV_DEFAULT(CV_CALIB_FIX_INTRINSIC));
+                                   CV_TERMCRIT_ITER+CV_TERMCRIT_EPS,30,1e-6)) );
 
 #define CV_CALIB_ZERO_DISPARITY 1024
 
diff --git a/modules/calib3d/src/calibinit.cpp b/modules/calib3d/src/calibinit.cpp
index 844fde499..ca2121ba2 100644
--- a/modules/calib3d/src/calibinit.cpp
+++ b/modules/calib3d/src/calibinit.cpp
@@ -1998,7 +1998,7 @@ bool cv::findCirclesGrid( InputArray _image, Size patternSize,
       {
         isFound = boxFinder.findHoles();
       }
-      catch (cv::Exception)
+      catch (const cv::Exception &)
       {
 
       }
diff --git a/modules/calib3d/src/calibration.cpp b/modules/calib3d/src/calibration.cpp
index b93b3f7eb..05ae60b99 100644
--- a/modules/calib3d/src/calibration.cpp
+++ b/modules/calib3d/src/calibration.cpp
@@ -1635,8 +1635,8 @@ double cvStereoCalibrate( const CvMat* _objectPoints, const CvMat* _imagePoints1
                         CvMat* _cameraMatrix2, CvMat* _distCoeffs2,
                         CvSize imageSize, CvMat* matR, CvMat* matT,
                         CvMat* matE, CvMat* matF,
-                        CvTermCriteria termCrit,
-                        int flags )
+                        int flags,
+                        CvTermCriteria termCrit )
 {
     const int NINTRINSIC = 16;
     Ptr<CvMat> npoints, err, J_LR, Je, Ji, imagePoints[2], objectPoints, RT0;
@@ -3278,8 +3278,8 @@ double cv::stereoCalibrate( InputArrayOfArrays _objectPoints,
                           InputOutputArray _cameraMatrix1, InputOutputArray _distCoeffs1,
                           InputOutputArray _cameraMatrix2, InputOutputArray _distCoeffs2,
                           Size imageSize, OutputArray _Rmat, OutputArray _Tmat,
-                          OutputArray _Emat, OutputArray _Fmat, TermCriteria criteria,
-                          int flags )
+                          OutputArray _Emat, OutputArray _Fmat, int flags ,
+                          TermCriteria criteria)
 {
     int rtype = CV_64F;
     Mat cameraMatrix1 = _cameraMatrix1.getMat();
@@ -3322,7 +3322,7 @@ double cv::stereoCalibrate( InputArrayOfArrays _objectPoints,
 
     double err = cvStereoCalibrate(&c_objPt, &c_imgPt, &c_imgPt2, &c_npoints, &c_cameraMatrix1,
         &c_distCoeffs1, &c_cameraMatrix2, &c_distCoeffs2, imageSize,
-        &c_matR, &c_matT, p_matE, p_matF, criteria, flags );
+        &c_matR, &c_matT, p_matE, p_matF, flags, criteria );
 
     cameraMatrix1.copyTo(_cameraMatrix1);
     cameraMatrix2.copyTo(_cameraMatrix2);
diff --git a/modules/calib3d/src/circlesgrid.cpp b/modules/calib3d/src/circlesgrid.cpp
index f44807e47..0b89e181c 100644
--- a/modules/calib3d/src/circlesgrid.cpp
+++ b/modules/calib3d/src/circlesgrid.cpp
@@ -218,6 +218,7 @@ void CirclesGridClusterFinder::findCorners(const std::vector<cv::Point2f> &hull2
 
 void CirclesGridClusterFinder::findOutsideCorners(const std::vector<cv::Point2f> &corners, std::vector<cv::Point2f> &outsideCorners)
 {
+  CV_Assert(!corners.empty());
   outsideCorners.clear();
   //find two pairs of the most nearest corners
   int i, j, n = (int)corners.size();
diff --git a/modules/calib3d/src/compat_ptsetreg.cpp b/modules/calib3d/src/compat_ptsetreg.cpp
index e8f410858..50ba34777 100644
--- a/modules/calib3d/src/compat_ptsetreg.cpp
+++ b/modules/calib3d/src/compat_ptsetreg.cpp
@@ -57,6 +57,7 @@ CvLevMarq::CvLevMarq()
     criteria = cvTermCriteria(0,0,0);
     iters = 0;
     completeSymmFlag = false;
+    errNorm = prevErrNorm = DBL_MAX;
 }
 
 CvLevMarq::CvLevMarq( int nparams, int nerrs, CvTermCriteria criteria0, bool _completeSymmFlag )
@@ -101,7 +102,7 @@ void CvLevMarq::init( int nparams, int nerrs, CvTermCriteria criteria0, bool _co
         J.reset(cvCreateMat( nerrs, nparams, CV_64F ));
         err.reset(cvCreateMat( nerrs, 1, CV_64F ));
     }
-    prevErrNorm = DBL_MAX;
+    errNorm = prevErrNorm = DBL_MAX;
     lambdaLg10 = -3;
     criteria = criteria0;
     if( criteria.type & CV_TERMCRIT_ITER )
diff --git a/modules/calib3d/src/epnp.h b/modules/calib3d/src/epnp.h
index fe0160630..dd42b01ae 100644
--- a/modules/calib3d/src/epnp.h
+++ b/modules/calib3d/src/epnp.h
@@ -74,7 +74,6 @@ class epnp {
   int number_of_correspondences;
 
   double cws[4][3], ccs[4][3];
-  double cws_determinant;
   int max_nr;
   double * A1, * A2;
 };
diff --git a/modules/calib3d/src/levmarq.cpp b/modules/calib3d/src/levmarq.cpp
index 55704132c..d3eb7b556 100644
--- a/modules/calib3d/src/levmarq.cpp
+++ b/modules/calib3d/src/levmarq.cpp
@@ -80,7 +80,7 @@ namespace cv
 class LMSolverImpl : public LMSolver
 {
 public:
-    LMSolverImpl() : maxIters(100) { init(); };
+    LMSolverImpl() : maxIters(100) { init(); }
     LMSolverImpl(const Ptr<LMSolver::Callback>& _cb, int _maxIters) : cb(_cb), maxIters(_maxIters) { init(); }
 
     void init()
@@ -215,7 +215,7 @@ CV_INIT_ALGORITHM(LMSolverImpl, "LMSolver",
                   obj.info()->addParam(obj, "epsx", obj.epsx);
                   obj.info()->addParam(obj, "epsf", obj.epsf);
                   obj.info()->addParam(obj, "maxIters", obj.maxIters);
-                  obj.info()->addParam(obj, "printInterval", obj.printInterval));
+                  obj.info()->addParam(obj, "printInterval", obj.printInterval))
 
 Ptr<LMSolver> createLMSolver(const Ptr<LMSolver::Callback>& cb, int maxIters)
 {
diff --git a/modules/calib3d/src/ptsetreg.cpp b/modules/calib3d/src/ptsetreg.cpp
index aa361a911..3c43624e7 100644
--- a/modules/calib3d/src/ptsetreg.cpp
+++ b/modules/calib3d/src/ptsetreg.cpp
@@ -260,7 +260,6 @@ public:
 
     Ptr<PointSetRegistrator::Callback> cb;
     int modelPoints;
-    int maxBasicSolutions;
     bool checkPartialSubsets;
     double threshold;
     double confidence;
@@ -386,11 +385,11 @@ public:
 CV_INIT_ALGORITHM(RANSACPointSetRegistrator, "PointSetRegistrator.RANSAC",
                   obj.info()->addParam(obj, "threshold", obj.threshold);
                   obj.info()->addParam(obj, "confidence", obj.confidence);
-                  obj.info()->addParam(obj, "maxIters", obj.maxIters));
+                  obj.info()->addParam(obj, "maxIters", obj.maxIters))
 
 CV_INIT_ALGORITHM(LMeDSPointSetRegistrator, "PointSetRegistrator.LMeDS",
                   obj.info()->addParam(obj, "confidence", obj.confidence);
-                  obj.info()->addParam(obj, "maxIters", obj.maxIters));
+                  obj.info()->addParam(obj, "maxIters", obj.maxIters))
 
 Ptr<PointSetRegistrator> createRANSACPointSetRegistrator(const Ptr<PointSetRegistrator::Callback>& _cb,
                                                          int _modelPoints, double _threshold,
diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp
index ee131db76..f16bfe633 100644
--- a/modules/calib3d/src/stereobm.cpp
+++ b/modules/calib3d/src/stereobm.cpp
@@ -252,7 +252,7 @@ static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right,
     int width1 = width - rofs - ndisp + 1;
     int ftzero = state.preFilterCap;
     int textureThreshold = state.textureThreshold;
-    int uniquenessRatio = state.uniquenessRatio*256/100;
+    int uniquenessRatio = state.uniquenessRatio;
     short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);
 
     ushort *sad, *hsad0, *hsad, *hsad_sub;
@@ -274,7 +274,7 @@ static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right,
     sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN);
     hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
     htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
-    cbuf0 = (uchar*)alignPtr(htext + height + wsz2 + 2 + dy0*ndisp, ALIGN);
+    cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);
 
     for( x = 0; x < TABSZ; x++ )
         tab[x] = (uchar)std::abs(x - ftzero);
@@ -427,28 +427,19 @@ static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right,
                 continue;
             }
 
-            __m128i minsad82 = _mm_unpackhi_epi64(minsad8, minsad8);
-            __m128i mind82 = _mm_unpackhi_epi64(mind8, mind8);
-            mask = _mm_cmpgt_epi16(minsad8, minsad82);
-            mind8 = _mm_xor_si128(mind8,_mm_and_si128(_mm_xor_si128(mind82,mind8),mask));
-            minsad8 = _mm_min_epi16(minsad8, minsad82);
-
-            minsad82 = _mm_shufflelo_epi16(minsad8, _MM_SHUFFLE(3,2,3,2));
-            mind82 = _mm_shufflelo_epi16(mind8, _MM_SHUFFLE(3,2,3,2));
-            mask = _mm_cmpgt_epi16(minsad8, minsad82);
-            mind8 = _mm_xor_si128(mind8,_mm_and_si128(_mm_xor_si128(mind82,mind8),mask));
-            minsad8 = _mm_min_epi16(minsad8, minsad82);
-
-            minsad82 = _mm_shufflelo_epi16(minsad8, 1);
-            mind82 = _mm_shufflelo_epi16(mind8, 1);
-            mask = _mm_cmpgt_epi16(minsad8, minsad82);
-            mind8 = _mm_xor_si128(mind8,_mm_and_si128(_mm_xor_si128(mind82,mind8),mask));
-            mind = (short)_mm_cvtsi128_si32(mind8);
-            minsad = sad[mind];
+            ushort CV_DECL_ALIGNED(16) minsad_buf[8], mind_buf[8];
+            _mm_store_si128((__m128i*)minsad_buf, minsad8);
+            _mm_store_si128((__m128i*)mind_buf, mind8);
+            for( d = 0; d < 8; d++ )
+                if(minsad > (int)minsad_buf[d] || (minsad == (int)minsad_buf[d] && mind > mind_buf[d]))
+                {
+                    minsad = minsad_buf[d];
+                    mind = mind_buf[d];
+                }
 
             if( uniquenessRatio > 0 )
             {
-                int thresh = minsad + ((minsad * uniquenessRatio) >> 8);
+                int thresh = minsad + (minsad * uniquenessRatio/100);
                 __m128i thresh8 = _mm_set1_epi16((short)(thresh + 1));
                 __m128i d1 = _mm_set1_epi16((short)(mind-1)), d2 = _mm_set1_epi16((short)(mind+1));
                 __m128i dd_16 = _mm_add_epi16(dd_8, dd_8);
diff --git a/modules/calib3d/test/test_cameracalibration.cpp b/modules/calib3d/test/test_cameracalibration.cpp
index fb8238237..7d0bdaeb1 100644
--- a/modules/calib3d/test/test_cameracalibration.cpp
+++ b/modules/calib3d/test/test_cameracalibration.cpp
@@ -290,8 +290,8 @@ int CV_CameraCalibrationTest::compare(double* val, double* ref_val, int len,
 void CV_CameraCalibrationTest::run( int start_from )
 {
     int code = cvtest::TS::OK;
-    char            filepath[200];
-    char            filename[200];
+    cv::String            filepath;
+    cv::String            filename;
 
     CvSize          imageSize;
     CvSize          etalonSize;
@@ -337,12 +337,12 @@ void CV_CameraCalibrationTest::run( int start_from )
     int progress = 0;
     int values_read = -1;
 
-    sprintf( filepath, "%scameracalibration/", ts->get_data_path().c_str() );
-    sprintf( filename, "%sdatafiles.txt", filepath );
-    datafile = fopen( filename, "r" );
+    filepath = cv::format("%scv/cameracalibration/", ts->get_data_path().c_str() );
+    filename = cv::format("%sdatafiles.txt", filepath.c_str() );
+    datafile = fopen( filename.c_str(), "r" );
     if( datafile == 0 )
     {
-        ts->printf( cvtest::TS::LOG, "Could not open file with list of test files: %s\n", filename );
+        ts->printf( cvtest::TS::LOG, "Could not open file with list of test files: %s\n", filename.c_str() );
         code = cvtest::TS::FAIL_MISSING_TEST_DATA;
         goto _exit_;
     }
@@ -354,15 +354,15 @@ void CV_CameraCalibrationTest::run( int start_from )
     {
         values_read = fscanf(datafile,"%s",i_dat_file);
         CV_Assert(values_read == 1);
-        sprintf(filename, "%s%s", filepath, i_dat_file);
-        file = fopen(filename,"r");
+        filename = cv::format("%s%s", filepath.c_str(), i_dat_file);
+        file = fopen(filename.c_str(),"r");
 
         ts->update_context( this, currTest, true );
 
         if( file == 0 )
         {
             ts->printf( cvtest::TS::LOG,
-                "Can't open current test file: %s\n",filename);
+                "Can't open current test file: %s\n",filename.c_str());
             if( numTests == 1 )
             {
                 code = cvtest::TS::FAIL_MISSING_TEST_DATA;
@@ -480,7 +480,7 @@ void CV_CameraCalibrationTest::run( int start_from )
         values_read = fscanf(file,"%lf",goodDistortion+2); CV_Assert(values_read == 1);
         values_read = fscanf(file,"%lf",goodDistortion+3); CV_Assert(values_read == 1);
 
-        /* Read good Rot matrixes */
+        /* Read good Rot matrices */
         for( currImage = 0; currImage < numImages; currImage++ )
         {
             for( i = 0; i < 3; i++ )
@@ -1382,17 +1382,18 @@ void CV_StereoCalibrationTest::run( int )
 
     for(int testcase = 1; testcase <= ntests; testcase++)
     {
-        char filepath[1000];
+        cv::String filepath;
         char buf[1000];
-        sprintf( filepath, "%sstereo/case%d/stereo_calib.txt", ts->get_data_path().c_str(), testcase );
-        f = fopen(filepath, "rt");
+        filepath = cv::format("%scv/stereo/case%d/stereo_calib.txt", ts->get_data_path().c_str(), testcase );
+        f = fopen(filepath.c_str(), "rt");
         Size patternSize;
         vector<string> imglist;
 
         if( !f || !fgets(buf, sizeof(buf)-3, f) || sscanf(buf, "%d%d", &patternSize.width, &patternSize.height) != 2 )
         {
-            ts->printf( cvtest::TS::LOG, "The file %s can not be opened or has invalid content\n", filepath );
+            ts->printf( cvtest::TS::LOG, "The file %s can not be opened or has invalid content\n", filepath.c_str() );
             ts->set_failed_test_info( f ? cvtest::TS::FAIL_INVALID_TEST_DATA : cvtest::TS::FAIL_MISSING_TEST_DATA );
+            fclose(f);
             return;
         }
 
@@ -1405,7 +1406,7 @@ void CV_StereoCalibrationTest::run( int )
                 buf[--len] = '\0';
             if( buf[0] == '#')
                 continue;
-            sprintf(filepath, "%sstereo/case%d/%s", ts->get_data_path().c_str(), testcase, buf );
+            filepath = cv::format("%scv/stereo/case%d/%s", ts->get_data_path().c_str(), testcase, buf );
             imglist.push_back(string(filepath));
         }
         fclose(f);
@@ -1733,7 +1734,7 @@ double CV_StereoCalibrationTest_C::calibrateStereoCamera( const vector<vector<Po
 
     return cvStereoCalibrate(&_objPt, &_imgPt, &_imgPt2, &_npoints, &_cameraMatrix1,
         &_distCoeffs1, &_cameraMatrix2, &_distCoeffs2, imageSize,
-        &matR, &matT, &matE, &matF, criteria, flags );
+        &matR, &matT, &matE, &matF, flags, criteria );
 }
 
 void CV_StereoCalibrationTest_C::rectify( const Mat& cameraMatrix1, const Mat& distCoeffs1,
@@ -1830,7 +1831,7 @@ double CV_StereoCalibrationTest_CPP::calibrateStereoCamera( const vector<vector<
 {
     return stereoCalibrate( objectPoints, imagePoints1, imagePoints2,
                     cameraMatrix1, distCoeffs1, cameraMatrix2, distCoeffs2,
-                    imageSize, R, T, E, F, criteria, flags );
+                    imageSize, R, T, E, F, flags, criteria );
 }
 
 void CV_StereoCalibrationTest_CPP::rectify( const Mat& cameraMatrix1, const Mat& distCoeffs1,
diff --git a/modules/calib3d/test/test_cameracalibration_artificial.cpp b/modules/calib3d/test/test_cameracalibration_artificial.cpp
index 1ff13c9e8..07e5894b9 100644
--- a/modules/calib3d/test/test_cameracalibration_artificial.cpp
+++ b/modules/calib3d/test/test_cameracalibration_artificial.cpp
@@ -85,7 +85,8 @@ Mat calcRvec(const vector<Point3f>& points, const Size& cornerSize)
 class CV_CalibrateCameraArtificialTest : public cvtest::BaseTest
 {
 public:
-    CV_CalibrateCameraArtificialTest()
+    CV_CalibrateCameraArtificialTest() :
+        r(0)
     {
     }
     ~CV_CalibrateCameraArtificialTest() {}
diff --git a/modules/calib3d/test/test_cameracalibration_badarg.cpp b/modules/calib3d/test/test_cameracalibration_badarg.cpp
index 3edab8bec..f8443d1ec 100644
--- a/modules/calib3d/test/test_cameracalibration_badarg.cpp
+++ b/modules/calib3d/test/test_cameracalibration_badarg.cpp
@@ -55,7 +55,7 @@ public:
     ~CV_CameraCalibrationBadArgTest() {}
 protected:
     void run(int);
-    void run_func(void) {};
+    void run_func(void) {}
 
     const static int M = 1;
 
@@ -334,7 +334,7 @@ public:
     CV_Rodrigues2BadArgTest() {}
     ~CV_Rodrigues2BadArgTest() {}
 protected:
-    void run_func(void) {};
+    void run_func(void) {}
 
     struct C_Caller
     {
@@ -459,10 +459,10 @@ public:
         Size imsSize(800, 600);
         camMat << 300.f, 0.f, imsSize.width/2.f, 0, 300.f, imsSize.height/2.f, 0.f, 0.f, 1.f;
         distCoeffs << 1.2f, 0.2f, 0.f, 0.f, 0.f;
-    };
-    ~CV_ProjectPoints2BadArgTest() {} ;
+    }
+    ~CV_ProjectPoints2BadArgTest() {}
 protected:
-    void run_func(void) {};
+    void run_func(void) {}
 
     Mat_<float> camMat;
     Mat_<float> distCoeffs;
diff --git a/modules/calib3d/test/test_chessboardgenerator.hpp b/modules/calib3d/test/test_chessboardgenerator.hpp
index 97d0fedf5..9898b0735 100644
--- a/modules/calib3d/test/test_chessboardgenerator.hpp
+++ b/modules/calib3d/test/test_chessboardgenerator.hpp
@@ -34,7 +34,7 @@ private:
     Mat rvec, tvec;
 };
 
-};
+}
 
 
 #endif
diff --git a/modules/calib3d/test/test_chesscorners.cpp b/modules/calib3d/test/test_chesscorners.cpp
index 6769e47c6..fd3da2e44 100644
--- a/modules/calib3d/test/test_chesscorners.cpp
+++ b/modules/calib3d/test/test_chesscorners.cpp
@@ -185,13 +185,13 @@ void CV_ChessboardDetectorTest::run_batch( const string& filename )
     switch( pattern )
     {
         case CHESSBOARD:
-            folder = string(ts->get_data_path()) + "cameracalibration/";
+            folder = string(ts->get_data_path()) + "cv/cameracalibration/";
             break;
         case CIRCLES_GRID:
-            folder = string(ts->get_data_path()) + "cameracalibration/circles/";
+            folder = string(ts->get_data_path()) + "cv/cameracalibration/circles/";
             break;
         case ASYMMETRIC_CIRCLES_GRID:
-            folder = string(ts->get_data_path()) + "cameracalibration/asymmetric_circles/";
+            folder = string(ts->get_data_path()) + "cv/cameracalibration/asymmetric_circles/";
             break;
     }
 
@@ -309,8 +309,9 @@ void CV_ChessboardDetectorTest::run_batch( const string& filename )
         progress = update_progress( progress, idx, max_idx, 0 );
     }
 
-    sum_error /= count;
-    ts->printf(cvtest::TS::LOG, "Average error is %f\n", sum_error);
+    if (count != 0)
+        sum_error /= count;
+    ts->printf(cvtest::TS::LOG, "Average error is %f (%d patterns have been found)\n", sum_error, count);
 }
 
 double calcErrorMinError(const Size& cornSz, const vector<Point2f>& corners_found, const vector<Point2f>& corners_generated)
diff --git a/modules/calib3d/test/test_chesscorners_badarg.cpp b/modules/calib3d/test/test_chesscorners_badarg.cpp
index 318912eeb..520d8df63 100644
--- a/modules/calib3d/test/test_chesscorners_badarg.cpp
+++ b/modules/calib3d/test/test_chesscorners_badarg.cpp
@@ -89,7 +89,14 @@ protected:
     }
 };
 
-CV_ChessboardDetectorBadArgTest::CV_ChessboardDetectorBadArgTest() {}
+CV_ChessboardDetectorBadArgTest::CV_ChessboardDetectorBadArgTest()
+{
+    cpp = false;
+    flags = 0;
+    out_corners = NULL;
+    out_corner_count = NULL;
+    drawCorners = was_found = false;
+}
 
 /* ///////////////////// chess_corner_test ///////////////////////// */
 void CV_ChessboardDetectorBadArgTest::run( int /*start_from */)
diff --git a/modules/calib3d/test/test_chesscorners_timing.cpp b/modules/calib3d/test/test_chesscorners_timing.cpp
index 47653f88d..61287ab67 100644
--- a/modules/calib3d/test/test_chesscorners_timing.cpp
+++ b/modules/calib3d/test/test_chesscorners_timing.cpp
@@ -62,8 +62,8 @@ void CV_ChessboardDetectorTimingTest::run( int start_from )
     int code = cvtest::TS::OK;
 
     /* test parameters */
-    char   filepath[1000];
-    char   filename[1000];
+    std::string   filepath;
+    std::string   filename;
 
     CvMat*  _v = 0;
     CvPoint2D32f* v;
@@ -75,9 +75,9 @@ void CV_ChessboardDetectorTimingTest::run( int start_from )
     int  idx, max_idx;
     int  progress = 0;
 
-    sprintf( filepath, "%scameracalibration/", ts->get_data_path().c_str() );
-    sprintf( filename, "%schessboard_timing_list.dat", filepath );
-    CvFileStorage* fs = cvOpenFileStorage( filename, 0, CV_STORAGE_READ );
+    filepath = cv::format("%scv/cameracalibration/", ts->get_data_path().c_str() );
+    filename = cv::format("%schessboard_timing_list.dat", filepath.c_str() );
+    CvFileStorage* fs = cvOpenFileStorage( filename.c_str(), 0, CV_STORAGE_READ );
     CvFileNode* board_list = fs ? cvGetFileNodeByName( fs, 0, "boards" ) : 0;
 
     if( !fs || !board_list || !CV_NODE_IS_SEQ(board_list->tag) ||
@@ -105,14 +105,14 @@ void CV_ChessboardDetectorTimingTest::run( int start_from )
         ts->update_context( this, idx-1, true );
 
         /* read the image */
-        sprintf( filename, "%s%s", filepath, imgname );
+        filename = cv::format("%s%s", filepath.c_str(), imgname );
 
         cv::Mat img2 = cv::imread( filename );
         img = img2;
 
         if( img2.empty() )
         {
-            ts->printf( cvtest::TS::LOG, "one of chessboard images can't be read: %s\n", filename );
+            ts->printf( cvtest::TS::LOG, "one of chessboard images can't be read: %s\n", filename.c_str() );
             if( max_idx == 1 )
             {
                 code = cvtest::TS::FAIL_MISSING_TEST_DATA;
diff --git a/modules/calib3d/test/test_cornerssubpix.cpp b/modules/calib3d/test/test_cornerssubpix.cpp
index 4426d5ea5..d1f077675 100644
--- a/modules/calib3d/test/test_cornerssubpix.cpp
+++ b/modules/calib3d/test/test_cornerssubpix.cpp
@@ -211,6 +211,7 @@ void CV_ChessboardSubpixelTest::run( int )
 
         progress = update_progress( progress, i-1, runs_count, 0 );
     }
+    ASSERT_NE(0, count);
     sum_dist /= count;
     ts->printf(cvtest::TS::LOG, "Average error after findCornerSubpix: %f\n", sum_dist);
 
diff --git a/modules/calib3d/test/test_fundam.cpp b/modules/calib3d/test/test_fundam.cpp
index 7e6f9a8e7..749faf125 100644
--- a/modules/calib3d/test/test_fundam.cpp
+++ b/modules/calib3d/test/test_fundam.cpp
@@ -808,6 +808,7 @@ CV_FundamentalMatTest::CV_FundamentalMatTest()
     method = 0;
     img_size = 10;
     cube_size = 10;
+    dims = 0;
     min_f = 1;
     max_f = 3;
     sigma = 0;//0.1;
@@ -1086,7 +1087,6 @@ protected:
     int img_size;
     int cube_size;
     int dims;
-    int e_result;
     double min_f, max_f;
     double sigma;
 };
@@ -1124,9 +1124,10 @@ CV_EssentialMatTest::CV_EssentialMatTest()
     method = 0;
     img_size = 10;
     cube_size = 10;
+    dims = 0;
     min_f = 1;
     max_f = 3;
-
+    sigma = 0;
 }
 
 
diff --git a/modules/calib3d/test/test_main.cpp b/modules/calib3d/test/test_main.cpp
index 6b2499344..6f9ac2e0d 100644
--- a/modules/calib3d/test/test_main.cpp
+++ b/modules/calib3d/test/test_main.cpp
@@ -1,3 +1,3 @@
 #include "test_precomp.hpp"
 
-CV_TEST_MAIN("cv")
+CV_TEST_MAIN("")
diff --git a/modules/calib3d/test/test_stereomatching.cpp b/modules/calib3d/test/test_stereomatching.cpp
index 8e1120e47..8beb9f905 100644
--- a/modules/calib3d/test/test_stereomatching.cpp
+++ b/modules/calib3d/test/test_stereomatching.cpp
@@ -398,7 +398,7 @@ protected:
 
 void CV_StereoMatchingTest::run(int)
 {
-    string dataPath = ts->get_data_path();
+    string dataPath = ts->get_data_path() + "cv/";
     string algorithmName = name;
     assert( !algorithmName.empty() );
     if( dataPath.empty() )
diff --git a/modules/calib3d/test/test_undistort.cpp b/modules/calib3d/test/test_undistort.cpp
index 959d8bec7..cd6af1295 100644
--- a/modules/calib3d/test/test_undistort.cpp
+++ b/modules/calib3d/test/test_undistort.cpp
@@ -75,6 +75,9 @@ CV_DefaultNewCameraMatrixTest::CV_DefaultNewCameraMatrixTest()
     test_array[INPUT].push_back(NULL);
     test_array[OUTPUT].push_back(NULL);
     test_array[REF_OUTPUT].push_back(NULL);
+
+    matrix_type = 0;
+    center_principal_point = false;
 }
 
 void CV_DefaultNewCameraMatrixTest::get_test_array_types_and_sizes( int test_case_idx, vector<vector<Size> >& sizes, vector<vector<int> >& types )
@@ -200,6 +203,9 @@ CV_UndistortPointsTest::CV_UndistortPointsTest()
     test_array[OUTPUT].push_back(NULL); // distorted dst points
     test_array[TEMP].push_back(NULL); // dst points
     test_array[REF_OUTPUT].push_back(NULL);
+
+    useCPlus = useDstMat = false;
+    zero_new_cam = zero_distortion = zero_R = false;
 }
 
 void CV_UndistortPointsTest::get_test_array_types_and_sizes( int test_case_idx, vector<vector<Size> >& sizes, vector<vector<int> >& types )
@@ -605,6 +611,11 @@ CV_InitUndistortRectifyMapTest::CV_InitUndistortRectifyMapTest()
     test_array[INPUT].push_back(NULL); // new camera matrix
     test_array[OUTPUT].push_back(NULL); // distorted dst points
     test_array[REF_OUTPUT].push_back(NULL);
+
+    useCPlus = false;
+    zero_distortion = zero_new_cam = zero_R = false;
+    _mapx = _mapy = NULL;
+    mat_type = 0;
 }
 
 void CV_InitUndistortRectifyMapTest::get_test_array_types_and_sizes( int test_case_idx, vector<vector<Size> >& sizes, vector<vector<int> >& types )
diff --git a/modules/calib3d/test/test_undistort_badarg.cpp b/modules/calib3d/test/test_undistort_badarg.cpp
index 60460a55a..f3f762fa6 100644
--- a/modules/calib3d/test/test_undistort_badarg.cpp
+++ b/modules/calib3d/test/test_undistort_badarg.cpp
@@ -78,6 +78,8 @@ private:
 
 CV_UndistortPointsBadArgTest::CV_UndistortPointsBadArgTest ()
 {
+    useCPlus = false;
+    _camera_mat = matR = matP = _distortion_coeffs = _src_points = _dst_points = NULL;
 }
 
 void CV_UndistortPointsBadArgTest::run_func()
@@ -311,6 +313,8 @@ private:
 
 CV_InitUndistortRectifyMapBadArgTest::CV_InitUndistortRectifyMapBadArgTest ()
 {
+    useCPlus = false;
+    _camera_mat = matR = _new_camera_mat = _distortion_coeffs = _mapx = _mapy = NULL;
 }
 
 void CV_InitUndistortRectifyMapBadArgTest::run_func()
@@ -431,6 +435,8 @@ private:
 
 CV_UndistortBadArgTest::CV_UndistortBadArgTest ()
 {
+    useCPlus = false;
+    _camera_mat = _new_camera_mat = _distortion_coeffs = _src = _dst = NULL;
 }
 
 void CV_UndistortBadArgTest::run_func()
diff --git a/modules/contrib/include/opencv2/contrib.hpp b/modules/contrib/include/opencv2/contrib.hpp
index 75c6f3db0..1770c0b53 100644
--- a/modules/contrib/include/opencv2/contrib.hpp
+++ b/modules/contrib/include/opencv2/contrib.hpp
@@ -55,7 +55,7 @@ class CV_EXPORTS Octree
 public:
     struct Node
     {
-        Node() {}
+        Node() { memset(this, 0, sizeof(Node)); }
         int begin, end;
         float x_min, x_max, y_min, y_max, z_min, z_max;
         int maxLevels;
@@ -523,7 +523,7 @@ public:
     // Initializes a LDA with num_components (default 0) and specifies how
     // samples are aligned (default dataAsRow=true).
     LDA(int num_components = 0) :
-        _num_components(num_components) {};
+        _num_components(num_components) { }
 
     // Initializes and performs a Discriminant Analysis with Fisher's
     // Optimization Criterion on given data in src and corresponding labels
@@ -561,7 +561,7 @@ public:
     Mat reconstruct(InputArray src);
 
     // Returns the eigenvectors of this LDA.
-    Mat eigenvectors() const { return _eigenvectors; };
+    Mat eigenvectors() const { return _eigenvectors; }
 
     // Returns the eigenvalues of this LDA.
     Mat eigenvalues() const { return _eigenvalues; }
diff --git a/modules/contrib/src/adaptiveskindetector.cpp b/modules/contrib/src/adaptiveskindetector.cpp
index 092e48cb4..c4fbbe000 100644
--- a/modules/contrib/src/adaptiveskindetector.cpp
+++ b/modules/contrib/src/adaptiveskindetector.cpp
@@ -55,7 +55,7 @@ void CvAdaptiveSkinDetector::initData(IplImage *src, int widthDivider, int heigh
     imgGrayFrame = cvCreateImage(imageSize, IPL_DEPTH_8U, 1);
     imgLastGrayFrame = cvCreateImage(imageSize, IPL_DEPTH_8U, 1);
     imgHSVFrame = cvCreateImage(imageSize, IPL_DEPTH_8U, 3);
-};
+}
 
 CvAdaptiveSkinDetector::CvAdaptiveSkinDetector(int samplingDivider, int morphingMethod)
 {
@@ -80,7 +80,7 @@ CvAdaptiveSkinDetector::CvAdaptiveSkinDetector(int samplingDivider, int morphing
     imgLastGrayFrame = NULL;
     imgSaturationFrame = NULL;
     imgHSVFrame = NULL;
-};
+}
 
 CvAdaptiveSkinDetector::~CvAdaptiveSkinDetector()
 {
@@ -93,7 +93,7 @@ CvAdaptiveSkinDetector::~CvAdaptiveSkinDetector()
     cvReleaseImage(&imgGrayFrame);
     cvReleaseImage(&imgLastGrayFrame);
     cvReleaseImage(&imgHSVFrame);
-};
+}
 
 void CvAdaptiveSkinDetector::process(IplImage *inputBGRImage, IplImage *outputHueMask)
 {
@@ -190,7 +190,7 @@ void CvAdaptiveSkinDetector::process(IplImage *inputBGRImage, IplImage *outputHu
 
     if (outputHueMask != NULL)
         cvCopy(imgFilteredFrame, outputHueMask);
-};
+}
 
 
 //------------------------- Histogram for Adaptive Skin Detector -------------------------//
@@ -202,12 +202,12 @@ CvAdaptiveSkinDetector::Histogram::Histogram()
     float *ranges[] = { range };
     fHistogram = cvCreateHist(1, histogramSize, CV_HIST_ARRAY, ranges, 1);
     cvClearHist(fHistogram);
-};
+}
 
 CvAdaptiveSkinDetector::Histogram::~Histogram()
 {
     cvReleaseHist(&fHistogram);
-};
+}
 
 int CvAdaptiveSkinDetector::Histogram::findCoverageIndex(double surfaceToCover, int defaultValue)
 {
@@ -221,7 +221,7 @@ int CvAdaptiveSkinDetector::Histogram::findCoverageIndex(double surfaceToCover,
         }
     }
     return defaultValue;
-};
+}
 
 void CvAdaptiveSkinDetector::Histogram::findCurveThresholds(int &x1, int &x2, double percent)
 {
@@ -244,7 +244,7 @@ void CvAdaptiveSkinDetector::Histogram::findCurveThresholds(int &x1, int &x2, do
         x2 = GSD_HUE_UT;
     else
         x2 += GSD_HUE_LT;
-};
+}
 
 void CvAdaptiveSkinDetector::Histogram::mergeWith(CvAdaptiveSkinDetector::Histogram *source, double weight)
 {
@@ -285,4 +285,4 @@ void CvAdaptiveSkinDetector::Histogram::mergeWith(CvAdaptiveSkinDetector::Histog
             }
         }
     }
-};
+}
diff --git a/modules/contrib/src/ba.cpp b/modules/contrib/src/ba.cpp
index 9a8002e96..d8ed2f944 100644
--- a/modules/contrib/src/ba.cpp
+++ b/modules/contrib/src/ba.cpp
@@ -940,7 +940,7 @@ static void fjac(int /*i*/, int /*j*/, CvMat *point_params, CvMat* cam_params, C
 
 #endif
 
-};
+}
 static void func(int /*i*/, int /*j*/, CvMat *point_params, CvMat* cam_params, CvMat* estim, void* /*data*/) {
   //just do projections
   CvMat _Mi;
@@ -979,17 +979,17 @@ static void func(int /*i*/, int /*j*/, CvMat *point_params, CvMat* cam_params, C
   cvTranspose( _mp2, estim );
   cvReleaseMat( &_mp );
   cvReleaseMat( &_mp2 );
-};
+}
 
 static void fjac_new(int i, int j, Mat& point_params, Mat& cam_params, Mat& A, Mat& B, void* data) {
   CvMat _point_params = point_params, _cam_params = cam_params, _Al = A, _Bl = B;
   fjac(i,j, &_point_params, &_cam_params, &_Al, &_Bl, data);
-};
+}
 
 static void func_new(int i, int j, Mat& point_params, Mat& cam_params, Mat& estim, void* data)  {
   CvMat _point_params = point_params, _cam_params = cam_params, _estim = estim;
   func(i,j,&_point_params,&_cam_params,&_estim,data);
-};
+}
 
 void LevMarqSparse::bundleAdjust( std::vector<Point3d>& points, //positions of points in global coordinate system (input and output)
           const std::vector<std::vector<Point2d> >& imagePoints, //projections of 3d points for every camera
diff --git a/modules/contrib/src/facerec.cpp b/modules/contrib/src/facerec.cpp
index c9fd97608..c6f154af2 100644
--- a/modules/contrib/src/facerec.cpp
+++ b/modules/contrib/src/facerec.cpp
@@ -833,7 +833,7 @@ void LBPH::predict(InputArray _src, int &minClass, double &minDist) const {
     minDist = DBL_MAX;
     minClass = -1;
     for(size_t sampleIdx = 0; sampleIdx < _histograms.size(); sampleIdx++) {
-        double dist = compareHist(_histograms[sampleIdx], query, HISTCMP_CHISQR);
+        double dist = compareHist(_histograms[sampleIdx], query, HISTCMP_CHISQR_ALT);
         if((dist < minDist) && (dist < _threshold)) {
             minDist = dist;
             minClass = _labels.at<int>((int) sampleIdx);
@@ -872,7 +872,7 @@ CV_INIT_ALGORITHM(Eigenfaces, "FaceRecognizer.Eigenfaces",
                   obj.info()->addParam(obj, "labels", obj._labels, true);
                   obj.info()->addParam(obj, "eigenvectors", obj._eigenvectors, true);
                   obj.info()->addParam(obj, "eigenvalues", obj._eigenvalues, true);
-                  obj.info()->addParam(obj, "mean", obj._mean, true));
+                  obj.info()->addParam(obj, "mean", obj._mean, true))
 
 CV_INIT_ALGORITHM(Fisherfaces, "FaceRecognizer.Fisherfaces",
                   obj.info()->addParam(obj, "ncomponents", obj._num_components);
@@ -881,7 +881,7 @@ CV_INIT_ALGORITHM(Fisherfaces, "FaceRecognizer.Fisherfaces",
                   obj.info()->addParam(obj, "labels", obj._labels, true);
                   obj.info()->addParam(obj, "eigenvectors", obj._eigenvectors, true);
                   obj.info()->addParam(obj, "eigenvalues", obj._eigenvalues, true);
-                  obj.info()->addParam(obj, "mean", obj._mean, true));
+                  obj.info()->addParam(obj, "mean", obj._mean, true))
 
 CV_INIT_ALGORITHM(LBPH, "FaceRecognizer.LBPH",
                   obj.info()->addParam(obj, "radius", obj._radius);
@@ -890,7 +890,7 @@ CV_INIT_ALGORITHM(LBPH, "FaceRecognizer.LBPH",
                   obj.info()->addParam(obj, "grid_y", obj._grid_y);
                   obj.info()->addParam(obj, "threshold", obj._threshold);
                   obj.info()->addParam(obj, "histograms", obj._histograms, true);
-                  obj.info()->addParam(obj, "labels", obj._labels, true));
+                  obj.info()->addParam(obj, "labels", obj._labels, true))
 
 bool initModule_contrib()
 {
diff --git a/modules/contrib/src/fuzzymeanshifttracker.cpp b/modules/contrib/src/fuzzymeanshifttracker.cpp
index 2ae6b7195..ce247d96a 100644
--- a/modules/contrib/src/fuzzymeanshifttracker.cpp
+++ b/modules/contrib/src/fuzzymeanshifttracker.cpp
@@ -41,7 +41,7 @@ CvFuzzyPoint::CvFuzzyPoint(double _x, double _y)
 {
     x = _x;
     y = _y;
-};
+}
 
 bool CvFuzzyCurve::between(double x, double x1, double x2)
 {
@@ -51,37 +51,37 @@ bool CvFuzzyCurve::between(double x, double x1, double x2)
         return true;
 
     return false;
-};
+}
 
 CvFuzzyCurve::CvFuzzyCurve()
 {
     value = 0;
-};
+}
 
 CvFuzzyCurve::~CvFuzzyCurve()
 {
     // nothing to do
-};
+}
 
 void CvFuzzyCurve::setCentre(double _centre)
 {
     centre = _centre;
-};
+}
 
 double CvFuzzyCurve::getCentre()
 {
     return centre;
-};
+}
 
 void CvFuzzyCurve::clear()
 {
     points.clear();
-};
+}
 
 void CvFuzzyCurve::addPoint(double x, double y)
 {
     points.push_back(CvFuzzyPoint(x, y));
-};
+}
 
 double CvFuzzyCurve::calcValue(double param)
 {
@@ -102,41 +102,41 @@ double CvFuzzyCurve::calcValue(double param)
         }
     }
     return 0;
-};
+}
 
 double CvFuzzyCurve::getValue()
 {
     return value;
-};
+}
 
 void CvFuzzyCurve::setValue(double _value)
 {
     value = _value;
-};
+}
 
 
 CvFuzzyFunction::CvFuzzyFunction()
 {
     // nothing to do
-};
+}
 
 CvFuzzyFunction::~CvFuzzyFunction()
 {
     curves.clear();
-};
+}
 
 void CvFuzzyFunction::addCurve(CvFuzzyCurve *curve, double value)
 {
     curves.push_back(*curve);
     curve->setValue(value);
-};
+}
 
 void CvFuzzyFunction::resetValues()
 {
     int numCurves = (int)curves.size();
     for (int i = 0; i < numCurves; i++)
         curves[i].setValue(0);
-};
+}
 
 double CvFuzzyFunction::calcValue()
 {
@@ -153,7 +153,7 @@ double CvFuzzyFunction::calcValue()
         return s1/s2;
     else
         return 0;
-};
+}
 
 CvFuzzyCurve *CvFuzzyFunction::newCurve()
 {
@@ -161,14 +161,14 @@ CvFuzzyCurve *CvFuzzyFunction::newCurve()
     c = new CvFuzzyCurve();
     addCurve(c);
     return c;
-};
+}
 
 CvFuzzyRule::CvFuzzyRule()
 {
     fuzzyInput1 = NULL;
     fuzzyInput2 = NULL;
     fuzzyOutput = NULL;
-};
+}
 
 CvFuzzyRule::~CvFuzzyRule()
 {
@@ -180,14 +180,14 @@ CvFuzzyRule::~CvFuzzyRule()
 
     if (fuzzyOutput != NULL)
         delete fuzzyOutput;
-};
+}
 
 void CvFuzzyRule::setRule(CvFuzzyCurve *c1, CvFuzzyCurve *c2, CvFuzzyCurve *o1)
 {
     fuzzyInput1 = c1;
     fuzzyInput2 = c2;
     fuzzyOutput = o1;
-};
+}
 
 double CvFuzzyRule::calcValue(double param1, double param2)
 {
@@ -203,31 +203,31 @@ double CvFuzzyRule::calcValue(double param1, double param2)
     }
     else
         return v1;
-};
+}
 
 CvFuzzyCurve *CvFuzzyRule::getOutputCurve()
 {
     return fuzzyOutput;
-};
+}
 
 CvFuzzyController::CvFuzzyController()
 {
     // nothing to do
-};
+}
 
 CvFuzzyController::~CvFuzzyController()
 {
     int size = (int)rules.size();
     for(int i = 0; i < size; i++)
         delete rules[i];
-};
+}
 
 void CvFuzzyController::addRule(CvFuzzyCurve *c1, CvFuzzyCurve *c2, CvFuzzyCurve *o1)
 {
     CvFuzzyRule *f = new CvFuzzyRule();
     rules.push_back(f);
     f->setRule(c1, c2, o1);
-};
+}
 
 double CvFuzzyController::calcOutput(double param1, double param2)
 {
@@ -243,7 +243,7 @@ double CvFuzzyController::calcOutput(double param1, double param2)
     }
     v = list.calcValue();
     return v;
-};
+}
 
 CvFuzzyMeanShiftTracker::FuzzyResizer::FuzzyResizer()
 {
@@ -299,12 +299,12 @@ CvFuzzyMeanShiftTracker::FuzzyResizer::FuzzyResizer()
     fuzzyController.addRule(i1L, NULL, oS);
     fuzzyController.addRule(i1M, NULL, oZE);
     fuzzyController.addRule(i1H, NULL, oE);
-};
+}
 
 int CvFuzzyMeanShiftTracker::FuzzyResizer::calcOutput(double edgeDensity, double density)
 {
     return (int)fuzzyController.calcOutput(edgeDensity, density);
-};
+}
 
 CvFuzzyMeanShiftTracker::SearchWindow::SearchWindow()
 {
@@ -329,7 +329,7 @@ CvFuzzyMeanShiftTracker::SearchWindow::SearchWindow()
     depthLow = 0;
     depthHigh = 0;
     fuzzyResizer = NULL;
-};
+}
 
 CvFuzzyMeanShiftTracker::SearchWindow::~SearchWindow()
 {
@@ -355,7 +355,7 @@ void CvFuzzyMeanShiftTracker::SearchWindow::setSize(int _x, int _y, int _width,
 
     if (y + height > maxHeight)
         height = maxHeight - y;
-};
+}
 
 void CvFuzzyMeanShiftTracker::SearchWindow::initDepthValues(IplImage *maskImage, IplImage *depthMap)
 {
@@ -409,7 +409,7 @@ void CvFuzzyMeanShiftTracker::SearchWindow::initDepthValues(IplImage *maskImage,
         depthHigh = 32000;
         depthLow = 0;
     }
-};
+}
 
 bool CvFuzzyMeanShiftTracker::SearchWindow::shift()
 {
@@ -422,7 +422,7 @@ bool CvFuzzyMeanShiftTracker::SearchWindow::shift()
     {
         return false;
     }
-};
+}
 
 void CvFuzzyMeanShiftTracker::SearchWindow::extractInfo(IplImage *maskImage, IplImage *depthMap, bool initDepth)
 {
@@ -528,7 +528,7 @@ void CvFuzzyMeanShiftTracker::SearchWindow::extractInfo(IplImage *maskImage, Ipl
         ellipseAngle = 0;
         density = 0;
     }
-};
+}
 
 void CvFuzzyMeanShiftTracker::SearchWindow::getResizeAttribsEdgeDensityLinear(int &resizeDx, int &resizeDy, int &resizeDw, int &resizeDh) {
     int x1 = horizontalEdgeTop;
@@ -572,7 +572,7 @@ void CvFuzzyMeanShiftTracker::SearchWindow::getResizeAttribsEdgeDensityLinear(in
     } else {
         resizeDw = - resizeDx;
     }
-};
+}
 
 void CvFuzzyMeanShiftTracker::SearchWindow::getResizeAttribsInnerDensity(int &resizeDx, int &resizeDy, int &resizeDw, int &resizeDh)
 {
@@ -588,7 +588,7 @@ void CvFuzzyMeanShiftTracker::SearchWindow::getResizeAttribsInnerDensity(int &re
     resizeDy = (int)(py*dy);
     resizeDw = (int)((1-px)*dx);
     resizeDh = (int)((1-py)*dy);
-};
+}
 
 void CvFuzzyMeanShiftTracker::SearchWindow::getResizeAttribsEdgeDensityFuzzy(int &resizeDx, int &resizeDy, int &resizeDw, int &resizeDh)
 {
@@ -627,7 +627,7 @@ void CvFuzzyMeanShiftTracker::SearchWindow::getResizeAttribsEdgeDensityFuzzy(int
         resizeDy = int(-dy1);
         resizeDh = int(dy1+dy2);
     }
-};
+}
 
 bool CvFuzzyMeanShiftTracker::SearchWindow::meanShift(IplImage *maskImage, IplImage *depthMap, int maxIteration, bool initDepth)
 {
@@ -640,7 +640,7 @@ bool CvFuzzyMeanShiftTracker::SearchWindow::meanShift(IplImage *maskImage, IplIm
     } while (++numShifts < maxIteration);
 
     return false;
-};
+}
 
 void CvFuzzyMeanShiftTracker::findOptimumSearchWindow(SearchWindow &searchWindow, IplImage *maskImage, IplImage *depthMap, int maxIteration, int resizeMethod, bool initDepth)
 {
@@ -680,17 +680,17 @@ void CvFuzzyMeanShiftTracker::findOptimumSearchWindow(SearchWindow &searchWindow
 
         searchWindow.setSize(searchWindow.x + resizeDx, searchWindow.y + resizeDy, searchWindow.width + resizeDw, searchWindow.height + resizeDh);
     }
-};
+}
 
 CvFuzzyMeanShiftTracker::CvFuzzyMeanShiftTracker()
 {
     searchMode = tsSetWindow;
-};
+}
 
 CvFuzzyMeanShiftTracker::~CvFuzzyMeanShiftTracker()
 {
     // nothing to do
-};
+}
 
 void CvFuzzyMeanShiftTracker::track(IplImage *maskImage, IplImage *depthMap, int resizeMethod, bool resetSearch, int minKernelMass)
 {
@@ -718,4 +718,4 @@ void CvFuzzyMeanShiftTracker::track(IplImage *maskImage, IplImage *depthMap, int
             else
                 searchMode = tsTracking;
     }
-};
+}
diff --git a/modules/contrib/src/rgbdodometry.cpp b/modules/contrib/src/rgbdodometry.cpp
index 6f86f17ec..a4e64f993 100644
--- a/modules/contrib/src/rgbdodometry.cpp
+++ b/modules/contrib/src/rgbdodometry.cpp
@@ -114,7 +114,7 @@ void computeProjectiveMatrix( const Mat& ksi, Mat& Rt )
 {
     CV_Assert( ksi.size() == Size(1,6) && ksi.type() == CV_64FC1 );
 
-#if defined(HAVE_EIGEN) && EIGEN_WORLD_VERSION == 3
+#if defined(HAVE_EIGEN) && EIGEN_WORLD_VERSION == 3 && (!defined _MSC_VER || !defined _M_X64 || _MSC_VER > 1500)
     const double* ksi_ptr = reinterpret_cast<const double*>(ksi.ptr(0));
     Eigen::Matrix<double,4,4> twist, g;
     twist << 0.,          -ksi_ptr[2], ksi_ptr[1],  ksi_ptr[3],
diff --git a/modules/contrib/src/spinimages.cpp b/modules/contrib/src/spinimages.cpp
index 4e58472bf..63cfcb5ab 100644
--- a/modules/contrib/src/spinimages.cpp
+++ b/modules/contrib/src/spinimages.cpp
@@ -709,7 +709,7 @@ void cv::SpinImageModel::defaultParams()
 
     T_GeometriccConsistency = 0.25f;
     T_GroupingCorespondances = 0.25f;
-};
+}
 
 Mat cv::SpinImageModel::packRandomScaledSpins(bool separateScale, size_t xCount, size_t yCount) const
 {
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 2e57d2ed9..e5898023f 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -2,12 +2,15 @@ set(the_description "The Core Functionality")
 ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES} "${OPENCL_LIBRARIES}" OPTIONAL opencv_cudev)
 ocv_module_include_directories(${ZLIB_INCLUDE_DIRS})
 
+if(HAVE_WINRT_CX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW")
+endif()
 if(HAVE_WINRT)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
 endif()
 
 if(HAVE_CUDA)
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wenum-compare -Wunused-function)
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wenum-compare -Wunused-function -Wshadow)
 endif()
 
 file(GLOB lib_cuda_hdrs        "include/opencv2/${name}/cuda/*.hpp"        "include/opencv2/${name}/cuda/*.h")
diff --git a/modules/core/doc/basic_structures.rst b/modules/core/doc/basic_structures.rst
index 886a886df..bd29b829c 100644
--- a/modules/core/doc/basic_structures.rst
+++ b/modules/core/doc/basic_structures.rst
@@ -316,6 +316,7 @@ RotatedRect
         RotatedRect();
         RotatedRect(const Point2f& center, const Size2f& size, float angle);
         RotatedRect(const CvBox2D& box);
+        RotatedRect(const Point2f& point1, const Point2f& point2, const Point2f& point3);
 
         //! returns 4 vertices of the rectangle
         void points(Point2f pts[]) const;
@@ -338,7 +339,11 @@ The class represents rotated (i.e. not up-right) rectangles on a plane. Each rec
         :param size: Width and height of the rectangle.
         :param angle: The rotation angle in a clockwise direction. When the angle is 0, 90, 180, 270 etc., the rectangle becomes an up-right rectangle.
         :param box: The rotated rectangle parameters as the obsolete CvBox2D structure.
+    .. ocv:function:: RotatedRect::RotatedRect(const Point2f& point1, const Point2f& point2, const Point2f& point3)
 
+        :param point1:
+        :param point2:
+        :param point3: Any 3 end points of the RotatedRect. They must be given in order (either clockwise or anticlockwise).
     .. ocv:function:: void RotatedRect::points( Point2f pts[] ) const
     .. ocv:function:: Rect RotatedRect::boundingRect() const
 
@@ -1615,7 +1620,7 @@ The method copies the matrix data to another matrix. Before copying the data, th
 
 so that the destination matrix is reallocated if needed. While ``m.copyTo(m);`` works flawlessly, the function does not handle the case of a partial overlap between the source and the destination matrices.
 
-When the operation mask is specified, and the ``Mat::create`` call shown above reallocated the matrix, the newly allocated matrix is initialized with all zeros before copying the data.
+When the operation mask is specified, if the ``Mat::create`` call shown above reallocates the matrix, the newly allocated matrix is initialized with all zeros before copying the data.
 
 .. _Mat::convertTo:
 
diff --git a/modules/core/doc/drawing_functions.rst b/modules/core/doc/drawing_functions.rst
index 6968d580e..06e51cf41 100644
--- a/modules/core/doc/drawing_functions.rst
+++ b/modules/core/doc/drawing_functions.rst
@@ -34,7 +34,7 @@ circle
 ----------
 Draws a circle.
 
-.. ocv:function:: void circle( Mat& img, Point center, int radius, const Scalar& color, int thickness=1, int lineType=LINE_8, int shift=0 )
+.. ocv:function:: void circle( InputOutputArray img, Point center, int radius, const Scalar& color, int thickness=1, int lineType=LINE_8, int shift=0 )
 
 .. ocv:pyfunction:: cv2.circle(img, center, radius, color[, thickness[, lineType[, shift]]]) -> img
 
@@ -83,9 +83,9 @@ ellipse
 -----------
 Draws a simple or thick elliptic arc or fills an ellipse sector.
 
-.. ocv:function:: void ellipse( Mat& img, Point center, Size axes, double angle, double startAngle, double endAngle, const Scalar& color, int thickness=1, int lineType=LINE_8, int shift=0 )
+.. ocv:function:: void ellipse( InputOutputArray img, Point center, Size axes, double angle, double startAngle, double endAngle, const Scalar& color, int thickness=1, int lineType=LINE_8, int shift=0 )
 
-.. ocv:function:: void ellipse( Mat& img, const RotatedRect& box, const Scalar& color, int thickness=1, int lineType=LINE_8 )
+.. ocv:function:: void ellipse( InputOutputArray img, const RotatedRect& box, const Scalar& color, int thickness=1, int lineType=LINE_8 )
 
 .. ocv:pyfunction:: cv2.ellipse(img, center, axes, angle, startAngle, endAngle, color[, thickness[, lineType[, shift]]]) -> img
 
@@ -331,7 +331,7 @@ line
 --------
 Draws a line segment connecting two points.
 
-.. ocv:function:: void line( Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness=1, int lineType=LINE_8, int shift=0 )
+.. ocv:function:: void line( InputOutputArray img, Point pt1, Point pt2, const Scalar& color, int thickness=1, int lineType=LINE_8, int shift=0 )
 
 .. ocv:pyfunction:: cv2.line(img, pt1, pt2, color[, thickness[, lineType[, shift]]]) -> img
 
@@ -417,7 +417,7 @@ rectangle
 -------------
 Draws a simple, thick, or filled up-right rectangle.
 
-.. ocv:function:: void rectangle( Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness=1, int lineType=LINE_8, int shift=0 )
+.. ocv:function:: void rectangle( InputOutputArray img, Point pt1, Point pt2, const Scalar& color, int thickness=1, int lineType=LINE_8, int shift=0 )
 
 .. ocv:function:: void rectangle( Mat& img, Rect rec, const Scalar& color, int thickness=1, int lineType=LINE_8, int shift=0 )
 
@@ -570,7 +570,7 @@ putText
 -----------
 Draws a text string.
 
-.. ocv:function:: void putText( Mat& img, const String& text, Point org, int fontFace, double fontScale, Scalar color, int thickness=1, int lineType=LINE_8, bool bottomLeftOrigin=false )
+.. ocv:function:: void putText( InputOutputArray img, const String& text, Point org, int fontFace, double fontScale, Scalar color, int thickness=1, int lineType=LINE_8, bool bottomLeftOrigin=false )
 
 .. ocv:pyfunction:: cv2.putText(img, text, org, fontFace, fontScale, color[, thickness[, lineType[, bottomLeftOrigin]]]) -> None
 
diff --git a/modules/core/doc/operations_on_arrays.rst b/modules/core/doc/operations_on_arrays.rst
index a894d0768..c936457af 100644
--- a/modules/core/doc/operations_on_arrays.rst
+++ b/modules/core/doc/operations_on_arrays.rst
@@ -903,7 +903,7 @@ So, the function chooses an operation mode depending on the flags and size of th
 
     * When ``DFT_COMPLEX_OUTPUT`` is set, the output is a complex matrix of the same size as input.
 
-    * When ``DFT_COMPLEX_OUTPUT`` is not set, the output is a real matrix of the same size as input. In case of 2D transform, it uses the packed format as shown above. In case of a single 1D transform, it looks like the first row of the matrix above. In case of multiple 1D transforms (when using the ``DCT_ROWS``         flag), each row of the output matrix looks like the first row of the matrix above.
+    * When ``DFT_COMPLEX_OUTPUT`` is not set, the output is a real matrix of the same size as input. In case of 2D transform, it uses the packed format as shown above. In case of a single 1D transform, it looks like the first row of the matrix above. In case of multiple 1D transforms (when using the ``DFT_ROWS``         flag), each row of the output matrix looks like the first row of the matrix above.
 
  * If the input array is complex and either ``DFT_INVERSE``     or ``DFT_REAL_OUTPUT``     are not set, the output is a complex array of the same size as input. The function performs a forward or inverse 1D or 2D transform of the whole input array or each row of the input array independently, depending on the flags ``DFT_INVERSE`` and ``DFT_ROWS``.
 
diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp
index 5e72764cb..12d11b006 100644
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -507,11 +507,11 @@ CV_EXPORTS_W void randn(InputOutputArray dst, InputArray mean, InputArray stddev
 CV_EXPORTS_W void randShuffle(InputOutputArray dst, double iterFactor = 1., RNG* rng = 0);
 
 //! draws the line segment (pt1, pt2) in the image
-CV_EXPORTS_W void line(CV_IN_OUT Mat& img, Point pt1, Point pt2, const Scalar& color,
+CV_EXPORTS_W void line(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
                      int thickness = 1, int lineType = LINE_8, int shift = 0);
 
 //! draws the rectangle outline or a solid rectangle with the opposite corners pt1 and pt2 in the image
-CV_EXPORTS_W void rectangle(CV_IN_OUT Mat& img, Point pt1, Point pt2,
+CV_EXPORTS_W void rectangle(InputOutputArray img, Point pt1, Point pt2,
                           const Scalar& color, int thickness = 1,
                           int lineType = LINE_8, int shift = 0);
 
@@ -521,18 +521,18 @@ CV_EXPORTS void rectangle(CV_IN_OUT Mat& img, Rect rec,
                           int lineType = LINE_8, int shift = 0);
 
 //! draws the circle outline or a solid circle in the image
-CV_EXPORTS_W void circle(CV_IN_OUT Mat& img, Point center, int radius,
+CV_EXPORTS_W void circle(InputOutputArray img, Point center, int radius,
                        const Scalar& color, int thickness = 1,
                        int lineType = LINE_8, int shift = 0);
 
 //! draws an elliptic arc, ellipse sector or a rotated ellipse in the image
-CV_EXPORTS_W void ellipse(CV_IN_OUT Mat& img, Point center, Size axes,
+CV_EXPORTS_W void ellipse(InputOutputArray img, Point center, Size axes,
                         double angle, double startAngle, double endAngle,
                         const Scalar& color, int thickness = 1,
                         int lineType = LINE_8, int shift = 0);
 
 //! draws a rotated ellipse in the image
-CV_EXPORTS_W void ellipse(CV_IN_OUT Mat& img, const RotatedRect& box, const Scalar& color,
+CV_EXPORTS_W void ellipse(InputOutputArray img, const RotatedRect& box, const Scalar& color,
                         int thickness = 1, int lineType = LINE_8);
 
 //! draws a filled convex polygon in the image
@@ -582,7 +582,7 @@ CV_EXPORTS_W void ellipse2Poly( Point center, Size axes, int angle,
                                 CV_OUT std::vector<Point>& pts );
 
 //! renders text string in the image
-CV_EXPORTS_W void putText( Mat& img, const String& text, Point org,
+CV_EXPORTS_W void putText( InputOutputArray img, const String& text, Point org,
                          int fontFace, double fontScale, Scalar color,
                          int thickness = 1, int lineType = LINE_8,
                          bool bottomLeftOrigin = false );
diff --git a/modules/core/include/opencv2/core/affine.hpp b/modules/core/include/opencv2/core/affine.hpp
index fefcef8f1..7284525c8 100644
--- a/modules/core/include/opencv2/core/affine.hpp
+++ b/modules/core/include/opencv2/core/affine.hpp
@@ -55,9 +55,9 @@ namespace cv
     {
     public:
         typedef T float_type;
-        typedef cv::Matx<float_type, 3, 3> Mat3;
-        typedef cv::Matx<float_type, 4, 4> Mat4;
-        typedef cv::Vec<float_type, 3> Vec3;
+        typedef Matx<float_type, 3, 3> Mat3;
+        typedef Matx<float_type, 4, 4> Mat4;
+        typedef Vec<float_type, 3> Vec3;
 
         Affine3();
 
@@ -70,11 +70,11 @@ namespace cv
         //Rodrigues vector
         Affine3(const Vec3& rvec, const Vec3& t = Vec3::all(0));
 
-        //Combines all contructors above. Supports 4x4, 3x3, 1x3, 3x1 sizes of data matrix
-        explicit Affine3(const cv::Mat& data, const Vec3& t = Vec3::all(0));
+        //Combines all contructors above. Supports 4x4, 4x3, 3x3, 1x3, 3x1 sizes of data matrix
+        explicit Affine3(const Mat& data, const Vec3& t = Vec3::all(0));
 
-        //Euler angles
-        Affine3(float_type alpha, float_type beta, float_type gamma, const Vec3& t = Vec3::all(0));
+        //From 16th element array
+        explicit Affine3(const float_type* vals);
 
         static Affine3 Identity();
 
@@ -87,9 +87,6 @@ namespace cv
         //Combines rotation methods above. Suports 3x3, 1x3, 3x1 sizes of data matrix;
         void rotation(const Mat& data);
 
-        //Euler angles
-        void rotation(float_type alpha, float_type beta, float_type gamma);
-
         void linear(const Mat3& L);
         void translation(const Vec3& t);
 
@@ -105,6 +102,9 @@ namespace cv
         // a.rotate(R) is equivalent to Affine(R, 0) * a;
         Affine3 rotate(const Mat3& R) const;
 
+        // a.rotate(R) is equivalent to Affine(rvec, 0) * a;
+        Affine3 rotate(const Vec3& rvec) const;
+
         // a.translate(t) is equivalent to Affine(E, t) * a;
         Affine3 translate(const Vec3& t) const;
 
@@ -113,6 +113,8 @@ namespace cv
 
         template <typename Y> operator Affine3<Y>() const;
 
+        template <typename Y> Affine3<Y> cast() const;
+
         Mat4 matrix;
 
 #if defined EIGEN_WORLD_VERSION && defined EIGEN_GEOMETRY_MODULE_H
@@ -132,10 +134,26 @@ namespace cv
     typedef Affine3<float> Affine3f;
     typedef Affine3<double> Affine3d;
 
-    static cv::Vec3f operator*(const cv::Affine3f& affine, const cv::Vec3f& vector);
-    static cv::Vec3d operator*(const cv::Affine3d& affine, const cv::Vec3d& vector);
-}
+    static Vec3f operator*(const Affine3f& affine, const Vec3f& vector);
+    static Vec3d operator*(const Affine3d& affine, const Vec3d& vector);
 
+    template<typename _Tp> class DataType< Affine3<_Tp> >
+    {
+    public:
+        typedef Affine3<_Tp>                               value_type;
+        typedef Affine3<typename DataType<_Tp>::work_type> work_type;
+        typedef _Tp                                        channel_type;
+
+        enum { generic_type = 0,
+               depth        = DataType<channel_type>::depth,
+               channels     = 16,
+               fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8),
+               type         = CV_MAKETYPE(depth, channels)
+             };
+
+        typedef Vec<channel_type, channels> vec_type;
+    };
+}
 
 
 ///////////////////////////////////////////////////////////////////////////////////
@@ -179,6 +197,12 @@ cv::Affine3<T>::Affine3(const cv::Mat& data, const Vec3& t)
         data.copyTo(matrix);
         return;
     }
+    else if (data.cols == 4 && data.rows == 3)
+    {
+        rotation(data(Rect(0, 0, 3, 3)));
+        translation(data(Rect(3, 0, 1, 3)));
+        return;
+    }
 
     rotation(data);
     translation(t);
@@ -187,13 +211,8 @@ cv::Affine3<T>::Affine3(const cv::Mat& data, const Vec3& t)
 }
 
 template<typename T> inline
-cv::Affine3<T>::Affine3(float_type alpha, float_type beta, float_type gamma, const Vec3& t)
-{
-    rotation(alpha, beta, gamma);
-    translation(t);
-    matrix.val[12] = matrix.val[13] = matrix.val[14] = 0;
-    matrix.val[15] = 1;
-}
+cv::Affine3<T>::Affine3(const float_type* vals) : matrix(vals)
+{}
 
 template<typename T> inline
 cv::Affine3<T> cv::Affine3<T>::Identity()
@@ -261,12 +280,6 @@ void cv::Affine3<T>::rotation(const cv::Mat& data)
         CV_Assert(!"Input marix can be 3x3, 1x3 or 3x1");
 }
 
-template<typename T> inline
-void cv::Affine3<T>::rotation(float_type alpha, float_type beta, float_type gamma)
-{
-    rotation(Vec3(alpha, beta, gamma));
-}
-
 template<typename T> inline
 void cv::Affine3<T>::linear(const Mat3& L)
 {
@@ -382,6 +395,12 @@ cv::Affine3<T> cv::Affine3<T>::rotate(const Mat3& R) const
     return result;
 }
 
+template<typename T> inline
+cv::Affine3<T> cv::Affine3<T>::rotate(const Vec3& _rvec) const
+{
+    return rotate(Affine3f(_rvec).rotation());
+}
+
 template<typename T> inline
 cv::Affine3<T> cv::Affine3<T>::translate(const Vec3& t) const
 {
@@ -404,6 +423,12 @@ cv::Affine3<T>::operator Affine3<Y>() const
     return Affine3<Y>(matrix);
 }
 
+template<typename T> template <typename Y> inline
+cv::Affine3<Y> cv::Affine3<T>::cast() const
+{
+    return Affine3<Y>(matrix);
+}
+
 template<typename T> inline
 cv::Affine3<T> cv::operator*(const cv::Affine3<T>& affine1, const cv::Affine3<T>& affine2)
 {
diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index 6e783005f..31cae3916 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -502,7 +502,6 @@ class CV_EXPORTS Mat;
 class CV_EXPORTS MatExpr;
 
 class CV_EXPORTS UMat;
-class CV_EXPORTS UMatExpr;
 
 class CV_EXPORTS SparseMat;
 typedef Mat MatND;
diff --git a/modules/core/include/opencv2/core/bufferpool.hpp b/modules/core/include/opencv2/core/bufferpool.hpp
new file mode 100644
index 000000000..c2de95a9f
--- /dev/null
+++ b/modules/core/include/opencv2/core/bufferpool.hpp
@@ -0,0 +1,26 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+
+#ifndef __OPENCV_CORE_BUFFER_POOL_HPP__
+#define __OPENCV_CORE_BUFFER_POOL_HPP__
+
+namespace cv
+{
+
+class BufferPoolController
+{
+protected:
+    ~BufferPoolController() { }
+public:
+    virtual size_t getReservedSize() const = 0;
+    virtual size_t getMaxReservedSize() const = 0;
+    virtual void setMaxReservedSize(size_t size) = 0;
+    virtual void freeAllReservedBuffers() = 0;
+};
+
+}
+
+#endif // __OPENCV_CORE_BUFFER_POOL_HPP__
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index fa3fbd681..8108a61e6 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -444,7 +444,7 @@ CV_INLINE int cvIsInf( double value )
    // atomic increment on the linux version of the Intel(tm) compiler
 #  define CV_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
 #elif defined __GNUC__
-#  if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__
+#  if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
 #    ifdef __ATOMIC_ACQ_REL
 #      define CV_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
 #    else
@@ -459,6 +459,7 @@ CV_INLINE int cvIsInf( double value )
 #    endif
 #  endif
 #elif defined _MSC_VER && !defined RC_INVOKED
+#  include <intrin.h>
 #  define CV_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
 #else
    CV_INLINE CV_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }
diff --git a/modules/core/include/opencv2/core/directx.hpp b/modules/core/include/opencv2/core/directx.hpp
index 2a8991ad5..d7e38a12a 100644
--- a/modules/core/include/opencv2/core/directx.hpp
+++ b/modules/core/include/opencv2/core/directx.hpp
@@ -67,10 +67,10 @@ namespace ocl {
 using namespace cv::ocl;
 
 // TODO static functions in the Context class
-CV_EXPORTS Context2& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device);
-CV_EXPORTS Context2& initializeContextFromD3D10Device(ID3D10Device* pD3D10Device);
-CV_EXPORTS Context2& initializeContextFromDirect3DDevice9Ex(IDirect3DDevice9Ex* pDirect3DDevice9Ex);
-CV_EXPORTS Context2& initializeContextFromDirect3DDevice9(IDirect3DDevice9* pDirect3DDevice9);
+CV_EXPORTS Context& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device);
+CV_EXPORTS Context& initializeContextFromD3D10Device(ID3D10Device* pD3D10Device);
+CV_EXPORTS Context& initializeContextFromDirect3DDevice9Ex(IDirect3DDevice9Ex* pDirect3DDevice9Ex);
+CV_EXPORTS Context& initializeContextFromDirect3DDevice9(IDirect3DDevice9* pDirect3DDevice9);
 
 } // namespace cv::directx::ocl
 
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 401467534..eb206fc2f 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -51,6 +51,7 @@
 #include "opencv2/core/matx.hpp"
 #include "opencv2/core/types.hpp"
 
+#include "opencv2/core/bufferpool.hpp"
 
 namespace cv
 {
@@ -84,10 +85,8 @@ public:
         OPENGL_BUFFER     = 7 << KIND_SHIFT,
         CUDA_MEM          = 8 << KIND_SHIFT,
         GPU_MAT           = 9 << KIND_SHIFT,
-        OCL_MAT           =10 << KIND_SHIFT,
-        UMAT              =11 << KIND_SHIFT,
-        STD_VECTOR_UMAT   =12 << KIND_SHIFT,
-        UEXPR             =13 << KIND_SHIFT
+        UMAT              =10 << KIND_SHIFT,
+        STD_VECTOR_UMAT   =11 << KIND_SHIFT
     };
 
     _InputArray();
@@ -108,11 +107,11 @@ public:
     template<typename _Tp> _InputArray(const cudev::GpuMat_<_Tp>& m);
     _InputArray(const UMat& um);
     _InputArray(const std::vector<UMat>& umv);
-    _InputArray(const UMatExpr& uexpr);
 
     virtual Mat getMat(int idx=-1) const;
     virtual UMat getUMat(int idx=-1) const;
     virtual void getMatVector(std::vector<Mat>& mv) const;
+    virtual void getUMatVector(std::vector<UMat>& umv) const;
     virtual cuda::GpuMat getGpuMat() const;
     virtual ogl::Buffer getOGlBuffer() const;
     void* getObj() const;
@@ -127,13 +126,14 @@ public:
     virtual int depth(int i=-1) const;
     virtual int channels(int i=-1) const;
     virtual bool isContinuous(int i=-1) const;
+    virtual bool isSubmatrix(int i=-1) const;
     virtual bool empty() const;
     virtual void copyTo(const _OutputArray& arr) const;
     virtual size_t offset(int i=-1) const;
     virtual size_t step(int i=-1) const;
     bool isMat() const;
     bool isUMat() const;
-    bool isMatVectot() const;
+    bool isMatVector() const;
     bool isUMatVector() const;
     bool isMatx();
 
@@ -205,6 +205,7 @@ public:
     virtual bool fixedType() const;
     virtual bool needed() const;
     virtual Mat& getMatRef(int i=-1) const;
+    virtual UMat& getUMatRef(int i=-1) const;
     virtual cuda::GpuMat& getGpuMatRef() const;
     virtual ogl::Buffer& getOGlBufferRef() const;
     virtual cuda::CudaMem& getCudaMemRef() const;
@@ -214,7 +215,7 @@ public:
     virtual void createSameSize(const _InputArray& arr, int mtype) const;
     virtual void release() const;
     virtual void clear() const;
-    virtual void setTo(const _InputArray& value) const;
+    virtual void setTo(const _InputArray& value, const _InputArray & mask = _InputArray()) const;
 };
 
 
@@ -265,6 +266,18 @@ CV_EXPORTS InputOutputArray noArray();
 
 /////////////////////////////////// MatAllocator //////////////////////////////////////
 
+//! Usage flags for allocator
+enum UMatUsageFlags
+{
+    USAGE_DEFAULT = 0,
+
+    // default allocation policy is platform and usage specific
+    USAGE_ALLOCATE_HOST_MEMORY = 1 << 0,
+    USAGE_ALLOCATE_DEVICE_MEMORY = 1 << 1,
+
+    __UMAT_USAGE_FLAGS_32BIT = 0x7fffffff // Binary compatibility hint
+};
+
 struct CV_EXPORTS UMatData;
 
 /*!
@@ -282,8 +295,8 @@ public:
     //                      uchar*& datastart, uchar*& data, size_t* step) = 0;
     //virtual void deallocate(int* refcount, uchar* datastart, uchar* data) = 0;
     virtual UMatData* allocate(int dims, const int* sizes, int type,
-                               void* data, size_t* step, int flags) const = 0;
-    virtual bool allocate(UMatData* data, int accessflags) const = 0;
+                               void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const = 0;
+    virtual bool allocate(UMatData* data, int accessflags, UMatUsageFlags usageFlags) const = 0;
     virtual void deallocate(UMatData* data) const = 0;
     virtual void map(UMatData* data, int accessflags) const;
     virtual void unmap(UMatData* data) const;
@@ -296,6 +309,9 @@ public:
     virtual void copy(UMatData* srcdata, UMatData* dstdata, int dims, const size_t sz[],
                       const size_t srcofs[], const size_t srcstep[],
                       const size_t dstofs[], const size_t dststep[], bool sync) const;
+
+    // default implementation returns DummyBufferPoolController
+    virtual BufferPoolController* getBufferPoolController() const;
 };
 
 
@@ -360,11 +376,12 @@ struct CV_EXPORTS UMatData
     int refcount;
     uchar* data;
     uchar* origdata;
-    size_t size;
+    size_t size, capacity;
 
     int flags;
     void* handle;
     void* userdata;
+    int allocatorFlags_;
 };
 
 
@@ -667,7 +684,7 @@ public:
     Mat& operator = (const MatExpr& expr);
 
     //! retrieve UMat from Mat
-    UMat getUMat(int accessFlags) const;
+    UMat getUMat(int accessFlags, UMatUsageFlags usageFlags = USAGE_DEFAULT) const;
 
     //! returns a new matrix header for the specified row
     Mat row(int y) const;
@@ -1128,25 +1145,22 @@ typedef Mat_<Vec2d> Mat2d;
 typedef Mat_<Vec3d> Mat3d;
 typedef Mat_<Vec4d> Mat4d;
 
-
-class CV_EXPORTS UMatExpr;
-
 class CV_EXPORTS UMat
 {
 public:
     //! default constructor
-    UMat();
+    UMat(UMatUsageFlags usageFlags = USAGE_DEFAULT);
     //! constructs 2D matrix of the specified size and type
     // (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
-    UMat(int rows, int cols, int type);
-    UMat(Size size, int type);
+    UMat(int rows, int cols, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    UMat(Size size, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
     //! constucts 2D matrix and fills it with the specified value _s.
-    UMat(int rows, int cols, int type, const Scalar& s);
-    UMat(Size size, int type, const Scalar& s);
+    UMat(int rows, int cols, int type, const Scalar& s, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    UMat(Size size, int type, const Scalar& s, UMatUsageFlags usageFlags = USAGE_DEFAULT);
 
     //! constructs n-dimensional matrix
-    UMat(int ndims, const int* sizes, int type);
-    UMat(int ndims, const int* sizes, int type, const Scalar& s);
+    UMat(int ndims, const int* sizes, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    UMat(int ndims, const int* sizes, int type, const Scalar& s, UMatUsageFlags usageFlags = USAGE_DEFAULT);
 
     //! copy constructor
     UMat(const UMat& m);
@@ -1172,7 +1186,6 @@ public:
     ~UMat();
     //! assignment operators
     UMat& operator = (const UMat& m);
-    UMat& operator = (const UMatExpr& expr);
 
     Mat getMat(int flags) const;
 
@@ -1216,32 +1229,30 @@ public:
     UMat reshape(int cn, int newndims, const int* newsz) const;
 
     //! matrix transposition by means of matrix expressions
-    UMatExpr t() const;
+    UMat t() const;
     //! matrix inversion by means of matrix expressions
-    UMatExpr inv(int method=DECOMP_LU) const;
+    UMat inv(int method=DECOMP_LU) const;
     //! per-element matrix multiplication by means of matrix expressions
-    UMatExpr mul(InputArray m, double scale=1) const;
+    UMat mul(InputArray m, double scale=1) const;
 
-    //! computes cross-product of 2 3D vectors
-    UMat cross(InputArray m) const;
     //! computes dot-product
     double dot(InputArray m) const;
 
     //! Matlab-style matrix initialization
-    static UMatExpr zeros(int rows, int cols, int type);
-    static UMatExpr zeros(Size size, int type);
-    static UMatExpr zeros(int ndims, const int* sz, int type);
-    static UMatExpr ones(int rows, int cols, int type);
-    static UMatExpr ones(Size size, int type);
-    static UMatExpr ones(int ndims, const int* sz, int type);
-    static UMatExpr eye(int rows, int cols, int type);
-    static UMatExpr eye(Size size, int type);
+    static UMat zeros(int rows, int cols, int type);
+    static UMat zeros(Size size, int type);
+    static UMat zeros(int ndims, const int* sz, int type);
+    static UMat ones(int rows, int cols, int type);
+    static UMat ones(Size size, int type);
+    static UMat ones(int ndims, const int* sz, int type);
+    static UMat eye(int rows, int cols, int type);
+    static UMat eye(Size size, int type);
 
     //! allocates new matrix data unless the matrix already has specified size and type.
     // previous data is unreferenced if needed.
-    void create(int rows, int cols, int type);
-    void create(Size size, int type);
-    void create(int ndims, const int* sizes, int type);
+    void create(int rows, int cols, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    void create(Size size, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    void create(int ndims, const int* sizes, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
 
     //! increases the reference counter; use with care to avoid memleaks
     void addref();
@@ -1313,6 +1324,7 @@ public:
 
     //! custom allocator
     MatAllocator* allocator;
+    UMatUsageFlags usageFlags; // usage flags for allocator
     //! and the standard allocator
     static MatAllocator* getStdAllocator();
 
diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp
index 9c2f595b6..d289e3a2f 100644
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -60,7 +60,7 @@ inline void _InputArray::init(int _flags, const void* _obj, Size _sz)
 
 inline void* _InputArray::getObj() const { return obj; }
 
-inline _InputArray::_InputArray() { init(0, 0); }
+inline _InputArray::_InputArray() { init(NONE, 0); }
 inline _InputArray::_InputArray(int _flags, void* _obj) { init(_flags, _obj); }
 inline _InputArray::_InputArray(const Mat& m) { init(MAT+ACCESS_READ, &m); }
 inline _InputArray::_InputArray(const std::vector<Mat>& vec) { init(STD_VECTOR_MAT+ACCESS_READ, &vec); }
@@ -110,7 +110,7 @@ inline _InputArray::~_InputArray() {}
 
 inline bool _InputArray::isMat() const { return kind() == _InputArray::MAT; }
 inline bool _InputArray::isUMat() const  { return kind() == _InputArray::UMAT; }
-inline bool _InputArray::isMatVectot() const { return kind() == _InputArray::STD_VECTOR_MAT; }
+inline bool _InputArray::isMatVector() const { return kind() == _InputArray::STD_VECTOR_MAT; }
 inline bool _InputArray::isUMatVector() const  { return kind() == _InputArray::STD_VECTOR_UMAT; }
 inline bool _InputArray::isMatx()  { return kind() == _InputArray::MATX; }
 
@@ -186,6 +186,12 @@ inline _OutputArray::_OutputArray(const Mat& m)
 inline _OutputArray::_OutputArray(const std::vector<Mat>& vec)
 { init(FIXED_SIZE + STD_VECTOR_MAT + ACCESS_WRITE, &vec); }
 
+inline _OutputArray::_OutputArray(const UMat& m)
+{ init(FIXED_TYPE + FIXED_SIZE + UMAT + ACCESS_WRITE, &m); }
+
+inline _OutputArray::_OutputArray(const std::vector<UMat>& vec)
+{ init(FIXED_SIZE + STD_VECTOR_UMAT + ACCESS_WRITE, &vec); }
+
 inline _OutputArray::_OutputArray(const cuda::GpuMat& d_mat)
 { init(FIXED_TYPE + FIXED_SIZE + GPU_MAT + ACCESS_WRITE, &d_mat); }
 
@@ -267,6 +273,12 @@ inline _InputOutputArray::_InputOutputArray(const Mat& m)
 inline _InputOutputArray::_InputOutputArray(const std::vector<Mat>& vec)
 { init(FIXED_SIZE + STD_VECTOR_MAT + ACCESS_RW, &vec); }
 
+inline _InputOutputArray::_InputOutputArray(const UMat& m)
+{ init(FIXED_TYPE + FIXED_SIZE + UMAT + ACCESS_RW, &m); }
+
+inline _InputOutputArray::_InputOutputArray(const std::vector<UMat>& vec)
+{ init(FIXED_SIZE + STD_VECTOR_UMAT + ACCESS_RW, &vec); }
+
 inline _InputOutputArray::_InputOutputArray(const cuda::GpuMat& d_mat)
 { init(FIXED_TYPE + FIXED_SIZE + GPU_MAT + ACCESS_RW, &d_mat); }
 
@@ -360,7 +372,7 @@ Mat::Mat(int _rows, int _cols, int _type, void* _data, size_t _step)
       data((uchar*)_data), datastart((uchar*)_data), dataend(0), datalimit(0),
       allocator(0), u(0), size(&rows)
 {
-    size_t esz = CV_ELEM_SIZE(_type);
+    size_t esz = CV_ELEM_SIZE(_type), esz1 = CV_ELEM_SIZE1(_type);
     size_t minstep = cols * esz;
     if( _step == AUTO_STEP )
     {
@@ -371,6 +383,12 @@ Mat::Mat(int _rows, int _cols, int _type, void* _data, size_t _step)
     {
         if( rows == 1 ) _step = minstep;
         CV_DbgAssert( _step >= minstep );
+
+        if (_step % esz1 != 0)
+        {
+            CV_Error(Error::BadStep, "Step must be a multiple of esz1");
+        }
+
         flags |= _step == minstep ? CONTINUOUS_FLAG : 0;
     }
     step[0] = _step;
@@ -385,7 +403,7 @@ Mat::Mat(Size _sz, int _type, void* _data, size_t _step)
       data((uchar*)_data), datastart((uchar*)_data), dataend(0), datalimit(0),
       allocator(0), u(0), size(&rows)
 {
-    size_t esz = CV_ELEM_SIZE(_type);
+    size_t esz = CV_ELEM_SIZE(_type), esz1 = CV_ELEM_SIZE1(_type);
     size_t minstep = cols*esz;
     if( _step == AUTO_STEP )
     {
@@ -396,6 +414,12 @@ Mat::Mat(Size _sz, int _type, void* _data, size_t _step)
     {
         if( rows == 1 ) _step = minstep;
         CV_DbgAssert( _step >= minstep );
+
+        if (_step % esz1 != 0)
+        {
+            CV_Error(Error::BadStep, "Step must be a multiple of esz1");
+        }
+
         flags |= _step == minstep ? CONTINUOUS_FLAG : 0;
     }
     step[0] = _step;
@@ -1906,7 +1930,7 @@ SparseMat_<_Tp>::SparseMat_(const SparseMat& m)
     if( m.type() == DataType<_Tp>::type )
         *this = (const SparseMat_<_Tp>&)m;
     else
-        m.convertTo(this, DataType<_Tp>::type);
+        m.convertTo(*this, DataType<_Tp>::type);
 }
 
 template<typename _Tp> inline
@@ -3046,50 +3070,50 @@ const Mat_<_Tp>& operator /= (const Mat_<_Tp>& a, const MatExpr& b)
 //////////////////////////////// UMat ////////////////////////////////
 
 inline
-UMat::UMat()
-: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), u(0), offset(0), size(&rows)
+UMat::UMat(UMatUsageFlags _usageFlags)
+: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
 {}
 
 inline
-UMat::UMat(int _rows, int _cols, int _type)
-: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), u(0), offset(0), size(&rows)
+UMat::UMat(int _rows, int _cols, int _type, UMatUsageFlags _usageFlags)
+: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
 {
     create(_rows, _cols, _type);
 }
 
 inline
-UMat::UMat(int _rows, int _cols, int _type, const Scalar& _s)
-: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), u(0), offset(0), size(&rows)
+UMat::UMat(int _rows, int _cols, int _type, const Scalar& _s, UMatUsageFlags _usageFlags)
+: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
 {
     create(_rows, _cols, _type);
     *this = _s;
 }
 
 inline
-UMat::UMat(Size _sz, int _type)
-: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), u(0), offset(0), size(&rows)
+UMat::UMat(Size _sz, int _type, UMatUsageFlags _usageFlags)
+: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
 {
     create( _sz.height, _sz.width, _type );
 }
 
 inline
-UMat::UMat(Size _sz, int _type, const Scalar& _s)
-: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), u(0), offset(0), size(&rows)
+UMat::UMat(Size _sz, int _type, const Scalar& _s, UMatUsageFlags _usageFlags)
+: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
 {
     create(_sz.height, _sz.width, _type);
     *this = _s;
 }
 
 inline
-UMat::UMat(int _dims, const int* _sz, int _type)
-: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), u(0), offset(0), size(&rows)
+UMat::UMat(int _dims, const int* _sz, int _type, UMatUsageFlags _usageFlags)
+: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
 {
     create(_dims, _sz, _type);
 }
 
 inline
-UMat::UMat(int _dims, const int* _sz, int _type, const Scalar& _s)
-: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), u(0), offset(0), size(&rows)
+UMat::UMat(int _dims, const int* _sz, int _type, const Scalar& _s, UMatUsageFlags _usageFlags)
+: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
 {
     create(_dims, _sz, _type);
     *this = _s;
@@ -3098,10 +3122,9 @@ UMat::UMat(int _dims, const int* _sz, int _type, const Scalar& _s)
 inline
 UMat::UMat(const UMat& m)
 : flags(m.flags), dims(m.dims), rows(m.rows), cols(m.cols), allocator(m.allocator),
-u(m.u), offset(m.offset), size(&rows)
+  usageFlags(m.usageFlags), u(m.u), offset(m.offset), size(&rows)
 {
-    if( u )
-        CV_XADD(&(u->urefcount), 1);
+    addref();
     if( m.dims <= 2 )
     {
         step[0] = m.step[0]; step[1] = m.step[1];
@@ -3117,7 +3140,7 @@ u(m.u), offset(m.offset), size(&rows)
 template<typename _Tp> inline
 UMat::UMat(const std::vector<_Tp>& vec, bool copyData)
 : flags(MAGIC_VAL | DataType<_Tp>::type | CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()),
-cols(1), allocator(0), u(0), offset(0), size(&rows)
+cols(1), allocator(0), usageFlags(USAGE_DEFAULT), u(0), offset(0), size(&rows)
 {
     if(vec.empty())
         return;
@@ -3136,8 +3159,7 @@ UMat& UMat::operator = (const UMat& m)
 {
     if( this != &m )
     {
-        if( m.u )
-            CV_XADD(&(m.u->urefcount), 1);
+        const_cast<UMat&>(m).addref();
         release();
         flags = m.flags;
         if( dims <= 2 && m.dims <= 2 )
@@ -3151,6 +3173,8 @@ UMat& UMat::operator = (const UMat& m)
         else
             copySize(m);
         allocator = m.allocator;
+        if (usageFlags == USAGE_DEFAULT)
+            usageFlags = m.usageFlags;
         u = m.u;
         offset = m.offset;
     }
@@ -3211,19 +3235,19 @@ void UMat::assignTo( UMat& m, int _type ) const
 }
 
 inline
-void UMat::create(int _rows, int _cols, int _type)
+void UMat::create(int _rows, int _cols, int _type, UMatUsageFlags _usageFlags)
 {
     _type &= TYPE_MASK;
     if( dims <= 2 && rows == _rows && cols == _cols && type() == _type && u )
         return;
     int sz[] = {_rows, _cols};
-    create(2, sz, _type);
+    create(2, sz, _type, _usageFlags);
 }
 
 inline
-void UMat::create(Size _sz, int _type)
+void UMat::create(Size _sz, int _type, UMatUsageFlags _usageFlags)
 {
-    create(_sz.height, _sz.width, _type);
+    create(_sz.height, _sz.width, _type, _usageFlags);
 }
 
 inline
diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
index 7caf4c28d..c6b0cf2d0 100644
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -51,14 +51,16 @@ CV_EXPORTS bool useOpenCL();
 CV_EXPORTS bool haveAmdBlas();
 CV_EXPORTS bool haveAmdFft();
 CV_EXPORTS void setUseOpenCL(bool flag);
-CV_EXPORTS void finish2();
+CV_EXPORTS void finish();
 
-class CV_EXPORTS Context2;
+class CV_EXPORTS Context;
 class CV_EXPORTS Device;
 class CV_EXPORTS Kernel;
 class CV_EXPORTS Program;
-class CV_EXPORTS ProgramSource2;
+class CV_EXPORTS ProgramSource;
 class CV_EXPORTS Queue;
+class CV_EXPORTS PlatformInfo;
+class CV_EXPORTS Image2D;
 
 class CV_EXPORTS Device
 {
@@ -84,9 +86,12 @@ public:
 
     String name() const;
     String extensions() const;
+    String version() const;
     String vendor() const;
     String OpenCL_C_Version() const;
     String OpenCLVersion() const;
+    int deviceVersionMajor() const;
+    int deviceVersionMinor() const;
     String driverVersion() const;
     void* ptr() const;
 
@@ -201,34 +206,31 @@ protected:
 };
 
 
-class CV_EXPORTS Context2
+class CV_EXPORTS Context
 {
 public:
-    Context2();
-    explicit Context2(int dtype);
-    ~Context2();
-    Context2(const Context2& c);
-    Context2& operator = (const Context2& c);
+    Context();
+    explicit Context(int dtype);
+    ~Context();
+    Context(const Context& c);
+    Context& operator = (const Context& c);
 
+    bool create();
     bool create(int dtype);
     size_t ndevices() const;
     const Device& device(size_t idx) const;
-    Program getProg(const ProgramSource2& prog,
+    Program getProg(const ProgramSource& prog,
                     const String& buildopt, String& errmsg);
 
-    static Context2& getDefault(bool initialize = true);
+    static Context& getDefault(bool initialize = true);
     void* ptr() const;
 
-    struct Impl;
-    inline struct Impl* _getImpl() const { return p; };
+    friend void initializeContextFromHandle(Context& ctx, void* platform, void* context, void* device);
 protected:
+    struct Impl;
     Impl* p;
 };
 
-
-// TODO Move to internal header
-void initializeContextFromHandle(Context2& ctx, void* platform, void* context, void* device);
-
 class CV_EXPORTS Platform
 {
 public:
@@ -240,23 +242,25 @@ public:
     void* ptr() const;
     static Platform& getDefault();
 
-    struct Impl;
-    inline struct Impl* _getImpl() const { return p; };
+    friend void initializeContextFromHandle(Context& ctx, void* platform, void* context, void* device);
 protected:
+    struct Impl;
     Impl* p;
 };
 
+// TODO Move to internal header
+void initializeContextFromHandle(Context& ctx, void* platform, void* context, void* device);
 
 class CV_EXPORTS Queue
 {
 public:
     Queue();
-    explicit Queue(const Context2& c, const Device& d=Device());
+    explicit Queue(const Context& c, const Device& d=Device());
     ~Queue();
     Queue(const Queue& q);
     Queue& operator = (const Queue& q);
 
-    bool create(const Context2& c=Context2(), const Device& d=Device());
+    bool create(const Context& c=Context(), const Device& d=Device());
     void finish();
     void* ptr() const;
     static Queue& getDefault();
@@ -310,7 +314,7 @@ class CV_EXPORTS Kernel
 public:
     Kernel();
     Kernel(const char* kname, const Program& prog);
-    Kernel(const char* kname, const ProgramSource2& prog,
+    Kernel(const char* kname, const ProgramSource& prog,
            const String& buildopts = String(), String* errmsg=0);
     ~Kernel();
     Kernel(const Kernel& k);
@@ -318,10 +322,11 @@ public:
 
     bool empty() const;
     bool create(const char* kname, const Program& prog);
-    bool create(const char* kname, const ProgramSource2& prog,
+    bool create(const char* kname, const ProgramSource& prog,
                 const String& buildopts, String* errmsg=0);
 
     int set(int i, const void* value, size_t sz);
+    int set(int i, const Image2D& image2D);
     int set(int i, const UMat& m);
     int set(int i, const KernelArg& arg);
     template<typename _Tp> int set(int i, const _Tp& value)
@@ -488,6 +493,7 @@ public:
     bool runTask(bool sync, const Queue& q=Queue());
 
     size_t workGroupSize() const;
+    size_t preferedWorkGroupSizeMultiple() const;
     bool compileWorkGroupSize(size_t wsz[]) const;
     size_t localMemSize() const;
 
@@ -502,7 +508,7 @@ class CV_EXPORTS Program
 {
 public:
     Program();
-    Program(const ProgramSource2& src,
+    Program(const ProgramSource& src,
             const String& buildflags, String& errmsg);
     explicit Program(const String& buf);
     Program(const Program& prog);
@@ -510,12 +516,12 @@ public:
     Program& operator = (const Program& prog);
     ~Program();
 
-    bool create(const ProgramSource2& src,
+    bool create(const ProgramSource& src,
                 const String& buildflags, String& errmsg);
     bool read(const String& buf, const String& buildflags);
     bool write(String& buf) const;
 
-    const ProgramSource2& source() const;
+    const ProgramSource& source() const;
     void* ptr() const;
 
     String getPrefix() const;
@@ -527,17 +533,17 @@ protected:
 };
 
 
-class CV_EXPORTS ProgramSource2
+class CV_EXPORTS ProgramSource
 {
 public:
     typedef uint64 hash_t;
 
-    ProgramSource2();
-    explicit ProgramSource2(const String& prog);
-    explicit ProgramSource2(const char* prog);
-    ~ProgramSource2();
-    ProgramSource2(const ProgramSource2& prog);
-    ProgramSource2& operator = (const ProgramSource2& prog);
+    ProgramSource();
+    explicit ProgramSource(const String& prog);
+    explicit ProgramSource(const char* prog);
+    ~ProgramSource();
+    ProgramSource(const ProgramSource& prog);
+    ProgramSource& operator = (const ProgramSource& prog);
 
     const String& source() const;
     hash_t hash() const;
@@ -547,9 +553,51 @@ protected:
     Impl* p;
 };
 
+class CV_EXPORTS PlatformInfo
+{
+public:
+    PlatformInfo();
+    explicit PlatformInfo(void* id);
+    ~PlatformInfo();
+
+    PlatformInfo(const PlatformInfo& i);
+    PlatformInfo& operator =(const PlatformInfo& i);
+
+    String name() const;
+    String vendor() const;
+    String version() const;
+    int deviceNumber() const;
+    void getDevice(Device& device, int d) const;
+
+protected:
+    struct Impl;
+    Impl* p;
+};
+
 CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf);
 CV_EXPORTS const char* typeToStr(int t);
 CV_EXPORTS const char* memopTypeToStr(int t);
+CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1);
+CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo>& platform_info);
+
+class CV_EXPORTS Image2D
+{
+public:
+    Image2D();
+    explicit Image2D(const UMat &src);
+    Image2D(const Image2D & i);
+    ~Image2D();
+
+    Image2D & operator = (const Image2D & i);
+
+    void* ptr() const;
+protected:
+    struct Impl;
+    Impl* p;
+};
+
+
+CV_EXPORTS MatAllocator* getOpenCLAllocator();
 
 }}
 
diff --git a/modules/core/include/opencv2/core/opencl/ocl_defs.hpp b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
new file mode 100644
index 000000000..4acfa7ae6
--- /dev/null
+++ b/modules/core/include/opencv2/core/opencl/ocl_defs.hpp
@@ -0,0 +1,46 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+//#define CV_OPENCL_RUN_VERBOSE
+
+#ifdef HAVE_OPENCL
+
+#ifdef CV_OPENCL_RUN_VERBOSE
+#define CV_OCL_RUN_(condition, func, ...)                                   \
+    {                                                                       \
+        if (cv::ocl::useOpenCL() && (condition) && func)                    \
+        {                                                                   \
+            printf("%s: OpenCL implementation is running\n", CV_Func);      \
+            fflush(stdout);                                                 \
+            return __VA_ARGS__;                                             \
+        }                                                                   \
+        else                                                                \
+        {                                                                   \
+            printf("%s: Plain implementation is running\n", CV_Func);       \
+            fflush(stdout);                                                 \
+        }                                                                   \
+    }
+#elif defined CV_OPENCL_RUN_ASSERT
+#define CV_OCL_RUN_(condition, func, ...)                                   \
+    {                                                                       \
+        if (cv::ocl::useOpenCL() && (condition))                            \
+        {                                                                   \
+            CV_Assert(func);                                                \
+            return __VA_ARGS__;                                             \
+        }                                                                   \
+    }
+#else
+#define CV_OCL_RUN_(condition, func, ...)                                   \
+    if (cv::ocl::useOpenCL() && (condition) && func)                        \
+        return __VA_ARGS__;
+#endif
+
+#else
+#define CV_OCL_RUN_(condition, func, ...)
+#endif
+
+#define CV_OCL_RUN(condition, func) CV_OCL_RUN_(condition, func)
diff --git a/modules/core/include/opencv2/core/opencl/runtime/ocl_runtime.hpp b/modules/core/include/opencv2/core/opencl/runtime/ocl_runtime.hpp
deleted file mode 100644
index 8191e235d..000000000
--- a/modules/core/include/opencv2/core/opencl/runtime/ocl_runtime.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef __OPENCV_CORE_OCL_RUNTIME_HPP__
-#define __OPENCV_CORE_OCL_RUNTIME_HPP__
-
-#ifdef HAVE_OPENCL
-
-#if defined(HAVE_OPENCL_STATIC)
-
-#if defined __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-#else // HAVE_OPENCL_STATIC
-
-#include "ocl_runtime_opencl.hpp"
-
-#endif // HAVE_OPENCL_STATIC
-
-#ifndef CL_DEVICE_DOUBLE_FP_CONFIG
-#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
-#endif
-
-#ifndef CL_DEVICE_HALF_FP_CONFIG
-#define CL_DEVICE_HALF_FP_CONFIG 0x1033
-#endif
-
-#ifndef CL_VERSION_1_2
-#define CV_REQUIRE_OPENCL_1_2_ERROR CV_ErrorNoReturn(cv::Error::OpenCLApiCallError, "OpenCV compiled without OpenCL v1.2 support, so we can't use functionality from OpenCL v1.2")
-#endif
-
-#endif // HAVE_OPENCL
-
-#endif // __OPENCV_CORE_OCL_RUNTIME_HPP__
diff --git a/modules/core/include/opencv2/core/operations.hpp b/modules/core/include/opencv2/core/operations.hpp
index 954604a9d..d2f49d7ee 100644
--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -394,7 +394,9 @@ template<typename _Tp> static inline _Tp randu()
   return (_Tp)theRNG();
 }
 
+///////////////////////////////// Formatted string generation /////////////////////////////////
 
+CV_EXPORTS String format( const char* fmt, ... );
 
 ///////////////////////////////// Formatted output of cv::Mat /////////////////////////////////
 
@@ -421,6 +423,12 @@ int print(const Mat& mtx, FILE* stream = stdout)
     return print(Formatter::get()->format(mtx), stream);
 }
 
+static inline
+int print(const UMat& mtx, FILE* stream = stdout)
+{
+    return print(Formatter::get()->format(mtx.getMat(ACCESS_READ)), stream);
+}
+
 template<typename _Tp> static inline
 int print(const std::vector<Point_<_Tp> >& vec, FILE* stream = stdout)
 {
diff --git a/modules/core/include/opencv2/core/types.hpp b/modules/core/include/opencv2/core/types.hpp
index e2b49749e..0b09c6fb0 100644
--- a/modules/core/include/opencv2/core/types.hpp
+++ b/modules/core/include/opencv2/core/types.hpp
@@ -392,6 +392,7 @@ public:
     //! various constructors
     RotatedRect();
     RotatedRect(const Point2f& center, const Size2f& size, float angle);
+    RotatedRect(const Point2f& point1, const Point2f& point2, const Point2f& point3);
 
     //! returns 4 vertices of the rectangle
     void points(Point2f pts[]) const;
diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h
index 38b0f340a..161a4bd10 100644
--- a/modules/core/include/opencv2/core/types_c.h
+++ b/modules/core/include/opencv2/core/types_c.h
@@ -158,7 +158,7 @@ enum {
  CV_StsVecLengthErr=           -28, /* incorrect vector length */
  CV_StsFilterStructContentErr= -29, /* incorr. filter structure content */
  CV_StsKernelStructContentErr= -30, /* incorr. transform kernel content */
- CV_StsFilterOffsetErr=        -31, /* incorrect filter ofset value */
+ CV_StsFilterOffsetErr=        -31, /* incorrect filter offset value */
  CV_StsBadSize=                -201, /* the input/output structure size is incorrect  */
  CV_StsDivByZero=              -202, /* division by zero */
  CV_StsInplaceNotSupported=    -203, /* in-place operation is not supported */
diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp
index 2d7d3130e..3e844ccf4 100644
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -85,7 +85,7 @@ template<typename _Tp, size_t fixed_size = 1024/sizeof(_Tp)+8> class AutoBuffer
 public:
     typedef _Tp value_type;
 
-    //! the default contructor
+    //! the default constructor
     AutoBuffer();
     //! constructor taking the real buffer size
     AutoBuffer(size_t _size);
@@ -340,6 +340,8 @@ class CV_EXPORTS CommandLineParser
     CommandLineParser(const CommandLineParser& parser);
     CommandLineParser& operator = (const CommandLineParser& parser);
 
+    ~CommandLineParser();
+
     String getPathToApplication() const;
 
     template <typename T>
diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp
index 8ee691a18..ce4482579 100644
--- a/modules/core/perf/opencl/perf_arithm.cpp
+++ b/modules/core/perf/opencl/perf_arithm.cpp
@@ -47,13 +47,81 @@
 namespace cvtest {
 namespace ocl {
 
+///////////// Lut ////////////////////////
+
+typedef Size_MatType LUTFixture;
+
+OCL_PERF_TEST_P(LUTFixture, LUT,
+          ::testing::Combine(OCL_TEST_SIZES,
+                             OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), cn = CV_MAT_CN(type);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, CV_8UC(cn)), lut(1, 256, type);
+    int dstType = CV_MAKETYPE(lut.depth(), src.channels());
+    UMat dst(srcSize, dstType);
+
+    declare.in(src, lut, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::LUT(src, lut, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Exp ////////////////////////
+
+typedef Size_MatType ExpFixture;
+
+OCL_PERF_TEST_P(ExpFixture, Exp, ::testing::Combine(
+                OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src).out(dst);
+    randu(src, 5, 16);
+
+    OCL_TEST_CYCLE() cv::exp(src, dst);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
+
+///////////// Log ////////////////////////
+
+typedef Size_MatType LogFixture;
+
+OCL_PERF_TEST_P(LogFixture, Log, ::testing::Combine(
+                OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    randu(src, 1, 10000);
+    declare.in(src).out(dst);
+
+    OCL_TEST_CYCLE() cv::log(src, dst);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
+
 ///////////// Add ////////////////////////
 
 typedef Size_MatType AddFixture;
 
 OCL_PERF_TEST_P(AddFixture, Add,
-            ::testing::Combine(OCL_TEST_SIZES,
-                               OCL_TEST_TYPES))
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
 {
     const Size srcSize = GET_PARAM(0);
     const int type = GET_PARAM(1);
@@ -61,15 +129,875 @@ OCL_PERF_TEST_P(AddFixture, Add,
     checkDeviceMaxMemoryAllocSize(srcSize, type);
 
     UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
-    randu(src1);
-    randu(src2);
-    declare.in(src1, src2).out(dst);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
 
     OCL_TEST_CYCLE() cv::add(src1, src2, dst);
 
     SANITY_CHECK(dst);
 }
 
+///////////// Subtract ////////////////////////
+
+typedef Size_MatType SubtractFixture;
+
+OCL_PERF_TEST_P(SubtractFixture, Subtract,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::subtract(src1, src2, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Mul ////////////////////////
+
+typedef Size_MatType MulFixture;
+
+OCL_PERF_TEST_P(MulFixture, Multiply, ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::multiply(src1, src2, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Div ////////////////////////
+
+typedef Size_MatType DivFixture;
+
+OCL_PERF_TEST_P(DivFixture, Divide,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::divide(src1, src2, dst);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
+
+///////////// Absdiff ////////////////////////
+
+typedef Size_MatType AbsDiffFixture;
+
+OCL_PERF_TEST_P(AbsDiffFixture, Absdiff,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).in(dst);
+
+    OCL_TEST_CYCLE() cv::absdiff(src1, src2, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// CartToPolar ////////////////////////
+
+typedef Size_MatType CartToPolarFixture;
+
+OCL_PERF_TEST_P(CartToPolarFixture, CartToPolar, ::testing::Combine(
+                OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type),
+            dst1(srcSize, type), dst2(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst1, dst2);
+
+    OCL_TEST_CYCLE() cv::cartToPolar(src1, src2, dst1, dst2);
+
+    SANITY_CHECK(dst1, 8e-3);
+    SANITY_CHECK(dst2, 8e-3);
+}
+
+///////////// PolarToCart ////////////////////////
+
+typedef Size_MatType PolarToCartFixture;
+
+OCL_PERF_TEST_P(PolarToCartFixture, PolarToCart, ::testing::Combine(
+                OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type),
+            dst1(srcSize, type), dst2(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst1, dst2);
+
+    OCL_TEST_CYCLE() cv::polarToCart(src1, src2, dst1, dst2);
+
+    SANITY_CHECK(dst1, 5e-5);
+    SANITY_CHECK(dst2, 5e-5);
+}
+
+///////////// Magnitude ////////////////////////
+
+typedef Size_MatType MagnitudeFixture;
+
+OCL_PERF_TEST_P(MagnitudeFixture, Magnitude, ::testing::Combine(
+                OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type),
+            dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::magnitude(src1, src2, dst);
+
+    SANITY_CHECK(dst, 1e-6);
+}
+
+///////////// Transpose ////////////////////////
+
+typedef Size_MatType TransposeFixture;
+
+OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine(
+                OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::transpose(src, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Flip ////////////////////////
+
+enum
+{
+    FLIP_BOTH = 0, FLIP_ROWS, FLIP_COLS
+};
+
+CV_ENUM(FlipType, FLIP_BOTH, FLIP_ROWS, FLIP_COLS)
+
+typedef std::tr1::tuple<Size, MatType, FlipType> FlipParams;
+typedef TestBaseWithParam<FlipParams> FlipFixture;
+
+OCL_PERF_TEST_P(FlipFixture, Flip,
+            ::testing::Combine(OCL_TEST_SIZES,
+                               OCL_TEST_TYPES, FlipType::all()))
+{
+    const FlipParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+    const int flipType = get<2>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::flip(src, dst, flipType - 1);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// minMaxLoc ////////////////////////
+
+typedef Size_MatType MinMaxLocFixture;
+
+OCL_PERF_TEST_P(MinMaxLocFixture, MinMaxLoc,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+    bool onecn = CV_MAT_CN(type) == 1;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type);;
+    declare.in(src, WARMUP_RNG);
+
+    double min_val = 0.0, max_val = 0.0;
+    Point min_loc, max_loc;
+
+    OCL_TEST_CYCLE() cv::minMaxLoc(src, &min_val, &max_val, onecn ? &min_loc : NULL,
+                                   onecn ? &max_loc : NULL);
+
+    ASSERT_GE(max_val, min_val);
+    SANITY_CHECK(min_val);
+    SANITY_CHECK(max_val);
+
+    int min_loc_x = min_loc.x, min_loc_y = min_loc.y, max_loc_x = max_loc.x,
+            max_loc_y = max_loc.y;
+    SANITY_CHECK(min_loc_x);
+    SANITY_CHECK(min_loc_y);
+    SANITY_CHECK(max_loc_x);
+    SANITY_CHECK(max_loc_y);
+}
+
+///////////// Sum ////////////////////////
+
+typedef Size_MatType SumFixture;
+
+OCL_PERF_TEST_P(SumFixture, Sum,
+            ::testing::Combine(OCL_TEST_SIZES,
+                               OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), depth = CV_MAT_DEPTH(type);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type);
+    Scalar result;
+    randu(src, 0, 60);
+    declare.in(src);
+
+    OCL_TEST_CYCLE() result = cv::sum(src);
+
+    if (depth >= CV_32F)
+        SANITY_CHECK(result, 1e-6, ERROR_RELATIVE);
+    else
+        SANITY_CHECK(result);
+}
+
+///////////// countNonZero ////////////////////////
+
+typedef Size_MatType CountNonZeroFixture;
+
+OCL_PERF_TEST_P(CountNonZeroFixture, CountNonZero,
+                ::testing::Combine(OCL_TEST_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type);
+    int result = 0;
+    randu(src, 0, 10);
+    declare.in(src);
+
+    OCL_TEST_CYCLE() result = cv::countNonZero(src);
+
+    SANITY_CHECK(result);
+}
+
+///////////// Phase ////////////////////////
+
+typedef Size_MatType PhaseFixture;
+
+OCL_PERF_TEST_P(PhaseFixture, Phase, ::testing::Combine(
+                OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type),
+            dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::phase(src1, src2, dst, 1);
+
+    SANITY_CHECK(dst, 1e-2);
+}
+
+///////////// bitwise_and////////////////////////
+
+typedef Size_MatType BitwiseAndFixture;
+
+OCL_PERF_TEST_P(BitwiseAndFixture, Bitwise_and,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::bitwise_and(src1, src2, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// bitwise_xor ////////////////////////
+
+typedef Size_MatType BitwiseXorFixture;
+
+OCL_PERF_TEST_P(BitwiseXorFixture, Bitwise_xor,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::bitwise_xor(src1, src2, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// bitwise_or ////////////////////////
+
+typedef Size_MatType BitwiseOrFixture;
+
+OCL_PERF_TEST_P(BitwiseOrFixture, Bitwise_or,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::bitwise_or(src1, src2, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// bitwise_not ////////////////////////
+
+typedef Size_MatType BitwiseNotFixture;
+
+OCL_PERF_TEST_P(BitwiseNotFixture, Bitwise_not,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::bitwise_not(src, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// compare////////////////////////
+
+CV_ENUM(CmpCode, CMP_LT, CMP_LE, CMP_EQ, CMP_NE, CMP_GE, CMP_GT)
+
+typedef std::tr1::tuple<Size, MatType, CmpCode> CompareParams;
+typedef TestBaseWithParam<CompareParams> CompareFixture;
+
+OCL_PERF_TEST_P(CompareFixture, Compare,
+            ::testing::Combine(OCL_TEST_SIZES,
+                               OCL_TEST_TYPES, CmpCode::all()))
+{
+    const CompareParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+    const int cmpCode = get<2>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, CV_8UC1);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::compare(src1, src2, dst, cmpCode);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// pow ////////////////////////
+
+typedef Size_MatType PowFixture;
+
+OCL_PERF_TEST_P(PowFixture, Pow, ::testing::Combine(
+                OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    randu(src, -100, 100);
+    declare.in(src).out(dst);
+
+    OCL_TEST_CYCLE() cv::pow(src, -2.0, dst);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
+
+///////////// AddWeighted////////////////////////
+
+typedef Size_MatType AddWeightedFixture;
+
+OCL_PERF_TEST_P(AddWeightedFixture, AddWeighted,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), depth = CV_MAT_DEPTH(type);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+    double alpha = 2.0, beta = 1.0, gama = 3.0;
+
+    OCL_TEST_CYCLE() cv::addWeighted(src1, alpha, src2, beta, gama, dst);
+
+    if (depth >= CV_32F)
+        SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+    else
+        SANITY_CHECK(dst);
+}
+
+///////////// Sqrt ///////////////////////
+
+typedef Size_MatType SqrtFixture;
+
+OCL_PERF_TEST_P(SqrtFixture, Sqrt, ::testing::Combine(
+                OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    randu(src, 0, 1000);
+    declare.in(src).out(dst);
+
+    OCL_TEST_CYCLE() cv::sqrt(src, dst);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
+
+///////////// SetIdentity ////////////////////////
+
+typedef Size_MatType SetIdentityFixture;
+
+OCL_PERF_TEST_P(SetIdentityFixture, SetIdentity,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat dst(srcSize, type);
+    declare.out(dst);
+
+    OCL_TEST_CYCLE() cv::setIdentity(dst, cv::Scalar::all(181));
+
+    SANITY_CHECK(dst);
+}
+
+///////////// MeanStdDev ////////////////////////
+
+typedef Size_MatType MeanStdDevFixture;
+
+OCL_PERF_TEST_P(MeanStdDevFixture, MeanStdDev,
+                ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+    const double eps = 2e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type);
+    Scalar mean, stddev;
+    declare.in(src, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() cv::meanStdDev(src, mean, stddev);
+
+    double mean0 = mean[0], mean1 = mean[1], mean2 = mean[2], mean3 = mean[3];
+    double stddev0 = stddev[0], stddev1 = stddev[1], stddev2 = stddev[2], stddev3 = stddev[3];
+
+    SANITY_CHECK(mean0, eps, ERROR_RELATIVE);
+    SANITY_CHECK(mean1, eps, ERROR_RELATIVE);
+    SANITY_CHECK(mean2, eps, ERROR_RELATIVE);
+    SANITY_CHECK(mean3, eps, ERROR_RELATIVE);
+    SANITY_CHECK(stddev0, eps, ERROR_RELATIVE);
+    SANITY_CHECK(stddev1, eps, ERROR_RELATIVE);
+    SANITY_CHECK(stddev2, eps, ERROR_RELATIVE);
+    SANITY_CHECK(stddev3, eps, ERROR_RELATIVE);
+}
+
+///////////// Norm ////////////////////////
+
+CV_ENUM(NormType, NORM_INF, NORM_L1, NORM_L2)
+
+typedef std::tr1::tuple<Size, MatType, NormType> NormParams;
+typedef TestBaseWithParam<NormParams> NormFixture;
+
+OCL_PERF_TEST_P(NormFixture, Norm,
+                ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES, NormType::all()))
+{
+    const NormParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+    const int normType = get<2>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type);
+    double res;
+    declare.in(src1, src2, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() res = cv::norm(src1, src2, normType);
+
+    SANITY_CHECK(res, 1e-5, ERROR_RELATIVE);
+}
+
+///////////// UMat::dot ////////////////////////
+
+typedef Size_MatType UMatDotFixture;
+
+OCL_PERF_TEST_P(UMatDotFixture, UMatDot,
+            ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+    double r = 0.0;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() r = src1.dot(src2);
+
+    SANITY_CHECK(r, 1e-5, ERROR_RELATIVE);
+}
+
+///////////// Repeat ////////////////////////
+
+typedef Size_MatType RepeatFixture;
+
+OCL_PERF_TEST_P(RepeatFixture, Repeat,
+            ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), nx = 2, ny = 2;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(Size(srcSize.width * nx, srcSize.height * ny), type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::repeat(src, nx, ny, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Min ////////////////////////
+
+typedef Size_MatType MinFixture;
+
+OCL_PERF_TEST_P(MinFixture, Min,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::min(src1, src2, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Max ////////////////////////
+
+typedef Size_MatType MaxFixture;
+
+OCL_PERF_TEST_P(MaxFixture, Max,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::max(src1, src2, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// InRange ////////////////////////
+
+typedef Size_MatType InRangeFixture;
+
+OCL_PERF_TEST_P(InRangeFixture, InRange,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), lb(srcSize, type), ub(srcSize, type), dst(srcSize, CV_8UC1);
+    declare.in(src, lb, ub, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::inRange(src, lb, ub, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Normalize ////////////////////////
+
+CV_ENUM(NormalizeModes, CV_MINMAX, CV_L2, CV_L1, CV_C)
+
+typedef tuple<Size, MatType, NormalizeModes> NormalizeParams;
+typedef TestBaseWithParam<NormalizeParams> NormalizeFixture;
+
+OCL_PERF_TEST_P(NormalizeFixture, Normalize,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, NormalizeModes::all()))
+{
+    const NormalizeParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), mode = get<2>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::normalize(src, dst, 10, 110, mode);
+
+    SANITY_CHECK(dst, 5e-2);
+}
+
+///////////// ConvertScaleAbs ////////////////////////
+
+typedef Size_MatType ConvertScaleAbsFixture;
+
+OCL_PERF_TEST_P(ConvertScaleAbsFixture, ConvertScaleAbs,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), cn = CV_MAT_CN(type);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, CV_8UC(cn));
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::convertScaleAbs(src, dst, 0.5, 2);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// PatchNaNs ////////////////////////
+
+typedef Size_MatType PatchNaNsFixture;
+
+OCL_PERF_TEST_P(PatchNaNsFixture, PatchNaNs,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    Size srcSize = get<0>(params);
+    const int type = get<1>(params), cn = CV_MAT_CN(type);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(src);
+
+    // generating NaNs
+    {
+        Mat src_ = src.getMat(ACCESS_RW);
+        srcSize.width *= cn;
+        for (int y = 0; y < srcSize.height; ++y)
+        {
+            float * const ptr = src_.ptr<float>(y);
+            for (int x = 0; x < srcSize.width; ++x)
+                ptr[x] = (x + y) % 2 == 0 ? std::numeric_limits<float>::quiet_NaN() : ptr[x];
+        }
+    }
+
+    OCL_TEST_CYCLE() cv::patchNaNs(src, 17.7);
+
+    SANITY_CHECK(src);
+}
+
+
+///////////// ScaleAdd ////////////////////////
+
+typedef Size_MatType ScaleAddFixture;
+
+OCL_PERF_TEST_P(ScaleAddFixture, ScaleAdd,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::scaleAdd(src1, 0.6, src2, dst);
+
+    SANITY_CHECK(dst, 1e-6);
+}
+
+///////////// PSNR ////////////////////////
+
+typedef Size_MatType PSNRFixture;
+
+OCL_PERF_TEST_P(PSNRFixture, PSNR,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    double psnr = 0;
+    UMat src1(srcSize, type), src2(srcSize, type);
+    declare.in(src1, src2, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() psnr = cv::PSNR(src1, src2);
+
+    SANITY_CHECK(psnr, 1e-4, ERROR_RELATIVE);
+}
+
+///////////// Reduce ////////////////////////
+
+CV_ENUM(ReduceMinMaxOp, CV_REDUCE_MIN, CV_REDUCE_MAX)
+
+typedef tuple<Size, std::pair<MatType, MatType>, int, ReduceMinMaxOp> ReduceMinMaxParams;
+typedef TestBaseWithParam<ReduceMinMaxParams> ReduceMinMaxFixture;
+
+OCL_PERF_TEST_P(ReduceMinMaxFixture, Reduce,
+                ::testing::Combine(OCL_TEST_SIZES,
+                                   OCL_PERF_ENUM(std::make_pair<MatType, MatType>(CV_8UC1, CV_8UC1),
+                                                 std::make_pair<MatType, MatType>(CV_32FC4, CV_32FC4)),
+                                   OCL_PERF_ENUM(0, 1),
+                                   ReduceMinMaxOp::all()))
+{
+    const ReduceMinMaxParams params = GetParam();
+    const std::pair<MatType, MatType> types = get<1>(params);
+    const int stype = types.first, dtype = types.second,
+            dim = get<2>(params), op = get<3>(params);
+    const Size srcSize = get<0>(params),
+            dstSize(dim == 0 ? srcSize.width : 1, dim == 0 ? 1 : srcSize.height);
+    const double eps = CV_MAT_DEPTH(dtype) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, stype);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    UMat src(srcSize, stype), dst(dstSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::reduce(src, dst, dim, op, dtype);
+
+    SANITY_CHECK(dst, eps);
+}
+
+CV_ENUM(ReduceAccOp, CV_REDUCE_SUM, CV_REDUCE_AVG)
+
+typedef tuple<Size, std::pair<MatType, MatType>, int, ReduceAccOp> ReduceAccParams;
+typedef TestBaseWithParam<ReduceAccParams> ReduceAccFixture;
+
+OCL_PERF_TEST_P(ReduceAccFixture, Reduce,
+                ::testing::Combine(OCL_TEST_SIZES,
+                                   OCL_PERF_ENUM(std::make_pair<MatType, MatType>(CV_8UC4, CV_32SC4),
+                                                 std::make_pair<MatType, MatType>(CV_32FC1, CV_32FC1)),
+                                   OCL_PERF_ENUM(0, 1),
+                                   ReduceAccOp::all()))
+{
+    const ReduceAccParams params = GetParam();
+    const std::pair<MatType, MatType> types = get<1>(params);
+    const int stype = types.first, dtype = types.second,
+            dim = get<2>(params), op = get<3>(params);
+    const Size srcSize = get<0>(params),
+            dstSize(dim == 0 ? srcSize.width : 1, dim == 0 ? 1 : srcSize.height);
+    const double eps = CV_MAT_DEPTH(dtype) <= CV_32S ? 1 : 3e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, stype);
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    UMat src(srcSize, stype), dst(dstSize, dtype);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::reduce(src, dst, dim, op, dtype);
+
+    SANITY_CHECK(dst, eps);
+}
+
 } } // namespace cvtest::ocl
 
 #endif // HAVE_OPENCL
diff --git a/modules/core/perf/opencl/perf_bufferpool.cpp b/modules/core/perf/opencl/perf_bufferpool.cpp
new file mode 100644
index 000000000..2e01db404
--- /dev/null
+++ b/modules/core/perf/opencl/perf_bufferpool.cpp
@@ -0,0 +1,132 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+struct BufferPoolState
+{
+    BufferPoolController* controller_;
+    size_t oldMaxReservedSize_;
+
+    BufferPoolState(BufferPoolController* c, bool enable)
+        : controller_(c)
+    {
+        if (!cv::ocl::useOpenCL())
+        {
+            throw ::perf::TestBase::PerfSkipTestException();
+        }
+        oldMaxReservedSize_ = c->getMaxReservedSize();
+        if (oldMaxReservedSize_ == (size_t)-1)
+        {
+            throw ::perf::TestBase::PerfSkipTestException();
+        }
+        if (!enable)
+        {
+            c->setMaxReservedSize(0);
+        }
+        else
+        {
+            c->freeAllReservedBuffers();
+        }
+    }
+
+    ~BufferPoolState()
+    {
+        controller_->setMaxReservedSize(oldMaxReservedSize_);
+    }
+};
+
+typedef TestBaseWithParam<bool> BufferPoolFixture;
+
+OCL_PERF_TEST_P(BufferPoolFixture, BufferPool_UMatCreation100, Bool())
+{
+    BufferPoolState s(cv::ocl::getOpenCLAllocator()->getBufferPoolController(), GetParam());
+
+    Size sz(1920, 1080);
+
+    OCL_TEST_CYCLE()
+    {
+        for (int i = 0; i < 100; i++)
+        {
+            UMat u(sz, CV_8UC1);
+        }
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST_P(BufferPoolFixture, BufferPool_UMatCountNonZero100, Bool())
+{
+    BufferPoolState s(cv::ocl::getOpenCLAllocator()->getBufferPoolController(), GetParam());
+
+    Size sz(1920, 1080);
+
+    OCL_TEST_CYCLE()
+    {
+        for (int i = 0; i < 100; i++)
+        {
+            UMat u(sz, CV_8UC1);
+            countNonZero(u);
+        }
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST_P(BufferPoolFixture, BufferPool_UMatCanny10, Bool())
+{
+    BufferPoolState s(cv::ocl::getOpenCLAllocator()->getBufferPoolController(), GetParam());
+
+    Size sz(1920, 1080);
+
+    int aperture = 3;
+    bool useL2 = false;
+    double thresh_low = 100;
+    double thresh_high = 120;
+
+    OCL_TEST_CYCLE()
+    {
+        for (int i = 0; i < 10; i++)
+        {
+            UMat src(sz, CV_8UC1);
+            UMat dst;
+            Canny(src, dst, thresh_low, thresh_high, aperture, useL2);
+            dst.getMat(ACCESS_READ); // complete async operations
+        }
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+OCL_PERF_TEST_P(BufferPoolFixture, BufferPool_UMatIntegral10, Bool())
+{
+    BufferPoolState s(cv::ocl::getOpenCLAllocator()->getBufferPoolController(), GetParam());
+
+    Size sz(1920, 1080);
+
+    OCL_TEST_CYCLE()
+    {
+        for (int i = 0; i < 10; i++)
+        {
+            UMat src(sz, CV_32FC1);
+            UMat dst;
+            integral(src, dst);
+            dst.getMat(ACCESS_READ); // complete async operations
+        }
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/core/perf/opencl/perf_channels.cpp b/modules/core/perf/opencl/perf_channels.cpp
new file mode 100644
index 000000000..1dbad51a0
--- /dev/null
+++ b/modules/core/perf/opencl/perf_channels.cpp
@@ -0,0 +1,203 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////// Merge////////////////////////
+
+typedef tuple<Size, MatDepth, int> MergeParams;
+typedef TestBaseWithParam<MergeParams> MergeFixture;
+
+OCL_PERF_TEST_P(MergeFixture, Merge,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8U, CV_32F), Values(2, 3)))
+{
+    const MergeParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int depth = get<1>(params), cn = get<2>(params), dtype = CV_MAKE_TYPE(depth, cn);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
+
+    UMat dst(srcSize, dtype);
+    vector<UMat> src(cn);
+    for (vector<UMat>::iterator i = src.begin(), end = src.end(); i != end; ++i)
+    {
+        i->create(srcSize, CV_MAKE_TYPE(depth, 1));
+        declare.in(*i, WARMUP_RNG);
+    }
+    declare.out(dst);
+
+    OCL_TEST_CYCLE() cv::merge(src, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Split ////////////////////////
+
+typedef MergeParams SplitParams;
+typedef TestBaseWithParam<SplitParams> SplitFixture;
+
+OCL_PERF_TEST_P(SplitFixture, Split,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8U, CV_32F), Values(2, 3)))
+{
+    const SplitParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int depth = get<1>(params), cn = get<2>(params), type = CV_MAKE_TYPE(depth, cn);
+
+    ASSERT_TRUE(cn == 3 || cn == 2);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type);
+    std::vector<UMat> dst(cn, UMat(srcSize, CV_MAKE_TYPE(depth, 1)));
+
+    declare.in(src, WARMUP_RNG);
+    for (int i = 0; i < cn; ++i)
+        declare.in(dst[i]);
+
+    OCL_TEST_CYCLE() cv::split(src, dst);
+
+    ASSERT_EQ(cn, (int)dst.size());
+
+    if (cn == 2)
+    {
+        UMat & dst0 = dst[0], & dst1 = dst[1];
+        SANITY_CHECK(dst0);
+        SANITY_CHECK(dst1);
+    }
+    else
+    {
+        UMat & dst0 = dst[0], & dst1 = dst[1], & dst2 = dst[2];
+        SANITY_CHECK(dst0);
+        SANITY_CHECK(dst1);
+        SANITY_CHECK(dst2);
+    }
+}
+
+///////////// MixChannels ////////////////////////
+
+typedef tuple<Size, MatDepth> MixChannelsParams;
+typedef TestBaseWithParam<MixChannelsParams> MixChannelsFixture;
+
+OCL_PERF_TEST_P(MixChannelsFixture, MixChannels,
+                ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
+                                   OCL_PERF_ENUM(CV_8U, CV_32F)))
+{
+    const MixChannelsParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int depth = get<1>(params), type = CV_MAKE_TYPE(depth, 2), n = 2;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    std::vector<UMat> src(n), dst(n);
+    for (int i = 0; i < n; ++i)
+    {
+        src[i] = UMat(srcSize, type);
+        dst[i] = UMat(srcSize, type);
+        declare.in(src[i], WARMUP_RNG).out(dst[i]);
+    }
+
+    int fromTo[] = { 1,2, 2,0, 0,3, 3,1 };
+
+    OCL_TEST_CYCLE() cv::mixChannels(src, dst, fromTo, 4);
+
+    UMat & dst0 = dst[0], & dst1 = dst[1];
+    SANITY_CHECK(dst0);
+    SANITY_CHECK(dst1);
+}
+
+///////////// InsertChannel ////////////////////////
+
+typedef Size_MatDepth InsertChannelFixture;
+
+OCL_PERF_TEST_P(InsertChannelFixture, InsertChannel,
+                ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
+                                   OCL_PERF_ENUM(CV_8U, CV_32F)))
+{
+    const Size_MatDepth_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int depth = get<1>(params), type = CV_MAKE_TYPE(depth, 3);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, depth), dst(srcSize, type, Scalar::all(17));
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::insertChannel(src, dst, 1);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// ExtractChannel ////////////////////////
+
+typedef Size_MatDepth ExtractChannelFixture;
+
+OCL_PERF_TEST_P(ExtractChannelFixture, ExtractChannel,
+                ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
+                                   OCL_PERF_ENUM(CV_8U, CV_32F)))
+{
+    const Size_MatDepth_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int depth = get<1>(params), type = CV_MAKE_TYPE(depth, 3);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, depth);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::extractChannel(src, dst, 1);
+
+    SANITY_CHECK(dst);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/perf_haar.cpp b/modules/core/perf/opencl/perf_dxt.cpp
similarity index 59%
rename from modules/ocl/perf/perf_haar.cpp
rename to modules/core/perf/opencl/perf_dxt.cpp
index 8e69b7648..d0219913b 100644
--- a/modules/ocl/perf/perf_haar.cpp
+++ b/modules/core/perf/opencl/perf_dxt.cpp
@@ -43,45 +43,57 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
 
-#include "opencv2/objdetect/objdetect_c.h"
+#ifdef HAVE_OPENCL
 
-using namespace perf;
+namespace cvtest {
+namespace ocl {
 
-///////////// Haar ////////////////////////
-PERF_TEST(HaarFixture, Haar)
+///////////// dft ////////////////////////
+
+typedef tuple<Size, int> DftParams;
+typedef TestBaseWithParam<DftParams> DftFixture;
+
+OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
+                                                Values((int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE,
+                                                       (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE)))
 {
-    vector<Rect> faces;
+    const DftParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int flags = get<1>(params);
 
-    Mat img = imread(getDataPath("gpu/haarcascade/basketball1.png"), IMREAD_GRAYSCALE);
-    ASSERT_TRUE(!img.empty()) << "can't open basketball1.png";
-    declare.in(img);
+    UMat src(srcSize, CV_32FC2), dst(srcSize, CV_32FC2);
+    declare.in(src, WARMUP_RNG).out(dst);
 
-    if (RUN_PLAIN_IMPL)
-    {
-        CascadeClassifier faceCascade;
-        ASSERT_TRUE(faceCascade.load(getDataPath("gpu/haarcascade/haarcascade_frontalface_alt.xml")))
-                << "can't load haarcascade_frontalface_alt.xml";
+    OCL_TEST_CYCLE() cv::dft(src, dst, flags | DFT_COMPLEX_OUTPUT);
 
-        TEST_CYCLE() faceCascade.detectMultiScale(img, faces,
-                                                     1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
-
-        SANITY_CHECK(faces, 4 + 1e-4);
-    }
-    else if (RUN_OCL_IMPL)
-    {
-        ocl::OclCascadeClassifier faceCascade;
-        ocl::oclMat oclImg(img);
-
-        ASSERT_TRUE(faceCascade.load(getDataPath("gpu/haarcascade/haarcascade_frontalface_alt.xml")))
-                << "can't load haarcascade_frontalface_alt.xml";
-
-        OCL_TEST_CYCLE() faceCascade.detectMultiScale(oclImg, faces,
-                                     1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));
-
-        SANITY_CHECK(faces, 4 + 1e-4);
-    }
-    else
-        OCL_PERF_ELSE
+    SANITY_CHECK(dst, 1e-3);
 }
+
+///////////// MulSpectrums ////////////////////////
+
+typedef tuple<Size, bool> MulSpectrumsParams;
+typedef TestBaseWithParam<MulSpectrumsParams> MulSpectrumsFixture;
+
+OCL_PERF_TEST_P(MulSpectrumsFixture, MulSpectrums,
+                ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
+                                   Bool()))
+{
+    const MulSpectrumsParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const bool conj = get<1>(params);
+
+    UMat src1(srcSize, CV_32FC2), src2(srcSize, CV_32FC2), dst(srcSize, CV_32FC2);
+    declare.in(src1, src2, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::mulSpectrums(src1, src2, dst, 0, conj);
+
+    SANITY_CHECK(dst, 1e-3);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/perf_gemm.cpp b/modules/core/perf/opencl/perf_gemm.cpp
similarity index 75%
rename from modules/ocl/perf/perf_gemm.cpp
rename to modules/core/perf/opencl/perf_gemm.cpp
index 4dcd5d4d6..3aa87d6a1 100644
--- a/modules/ocl/perf/perf_gemm.cpp
+++ b/modules/core/perf/opencl/perf_gemm.cpp
@@ -43,46 +43,40 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#include "perf_precomp.hpp"
 
-using namespace perf;
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
 
 ///////////// gemm ////////////////////////
 
-typedef TestBaseWithParam<Size> gemmFixture;
+typedef tuple<Size, int> GemmParams;
+typedef TestBaseWithParam<GemmParams> GemmFixture;
 
-#ifdef HAVE_CLAMDBLAS
-
-PERF_TEST_P(gemmFixture, gemm, ::testing::Values(OCL_SIZE_1000, OCL_SIZE_2000))
+OCL_PERF_TEST_P(GemmFixture, Gemm, ::testing::Combine(
+                    ::testing::Values(Size(1000, 1000), Size(1500, 1500)),
+            Values((int)cv::GEMM_3_T, (int)cv::GEMM_3_T | (int)cv::GEMM_2_T)))
 {
-    const Size srcSize = GetParam();
+    GemmParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int flags = get<1>(params);
 
-    Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
+    UMat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
             src3(srcSize, CV_32FC1), dst(srcSize, CV_32FC1);
-    declare.in(src1, src2, src3).out(dst).time(srcSize == OCL_SIZE_2000 ? 65 : 8);
+    declare.in(src1, src2, src3).out(dst);
     randu(src1, -10.0f, 10.0f);
     randu(src2, -10.0f, 10.0f);
     randu(src3, -10.0f, 10.0f);
 
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2),
-                oclSrc3(src3), oclDst(srcSize, CV_32FC1);
+    OCL_TEST_CYCLE() cv::gemm(src1, src2, 0.6, src3, 1.5, dst, flags);
 
-        OCL_TEST_CYCLE() cv::ocl::gemm(oclSrc1, oclSrc2, 1.0, oclSrc3, 1.0, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 0.01);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::gemm(src1, src2, 1.0, src3, 1.0, dst);
-
-        SANITY_CHECK(dst, 0.01);
-    }
-    else
-        OCL_PERF_ELSE
+    SANITY_CHECK(dst, 0.01);
 }
 
-#endif
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/core/perf/opencl/perf_usage_flags.cpp b/modules/core/perf/opencl/perf_usage_flags.cpp
new file mode 100644
index 000000000..3f59fec8b
--- /dev/null
+++ b/modules/core/perf/opencl/perf_usage_flags.cpp
@@ -0,0 +1,42 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+typedef TestBaseWithParam<std::tr1::tuple<cv::Size, bool> > UsageFlagsBoolFixture;
+
+OCL_PERF_TEST_P(UsageFlagsBoolFixture, UsageFlags_AllocHostMem, ::testing::Combine(OCL_TEST_SIZES, Bool()))
+{
+    Size sz = get<0>(GetParam());
+    bool allocHostMem = get<1>(GetParam());
+
+    UMat src(sz, CV_8UC1, Scalar::all(128));
+
+    OCL_TEST_CYCLE()
+    {
+        UMat dst(allocHostMem ? USAGE_ALLOCATE_HOST_MEMORY : USAGE_DEFAULT);
+
+        cv::add(src, Scalar::all(1), dst);
+        {
+            Mat canvas = dst.getMat(ACCESS_RW);
+            cv::putText(canvas, "Test", Point(20, 20), FONT_HERSHEY_PLAIN, 1, Scalar::all(255));
+        }
+        UMat final;
+        cv::subtract(dst, Scalar::all(1), final);
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/core/perf/perf_arithm.cpp b/modules/core/perf/perf_arithm.cpp
index 22d7def2c..3598c8639 100644
--- a/modules/core/perf/perf_arithm.cpp
+++ b/modules/core/perf/perf_arithm.cpp
@@ -202,3 +202,43 @@ PERF_TEST_P(Size_MatType, subtractScalar, TYPICAL_MATS_CORE_ARITHM)
 
     SANITY_CHECK(c, 1e-8);
 }
+
+PERF_TEST_P(Size_MatType, multiply, TYPICAL_MATS_CORE_ARITHM)
+{
+    Size sz = get<0>(GetParam());
+    int type = get<1>(GetParam());
+    cv::Mat a(sz, type), b(sz, type), c(sz, type);
+
+    declare.in(a, b, WARMUP_RNG).out(c);
+    if (CV_MAT_DEPTH(type) == CV_32S)
+    {
+        //According to docs, saturation is not applied when result is 32bit integer
+        a /= (2 << 16);
+        b /= (2 << 16);
+    }
+
+    TEST_CYCLE() multiply(a, b, c);
+
+    SANITY_CHECK(c, 1e-8);
+}
+
+PERF_TEST_P(Size_MatType, multiplyScale, TYPICAL_MATS_CORE_ARITHM)
+{
+    Size sz = get<0>(GetParam());
+    int type = get<1>(GetParam());
+    cv::Mat a(sz, type), b(sz, type), c(sz, type);
+    double scale = 0.5;
+
+    declare.in(a, b, WARMUP_RNG).out(c);
+
+    if (CV_MAT_DEPTH(type) == CV_32S)
+    {
+        //According to docs, saturation is not applied when result is 32bit integer
+        a /= (2 << 16);
+        b /= (2 << 16);
+    }
+
+    TEST_CYCLE() multiply(a, b, c, scale);
+
+    SANITY_CHECK(c, 1e-8);
+}
diff --git a/modules/core/src/algorithm.cpp b/modules/core/src/algorithm.cpp
index ff67a5df1..9f9493e8a 100644
--- a/modules/core/src/algorithm.cpp
+++ b/modules/core/src/algorithm.cpp
@@ -628,7 +628,7 @@ void AlgorithmInfo::set(Algorithm* algo, const char* parameter, int argType, con
             || argType == Param::FLOAT || argType == Param::UNSIGNED_INT || argType == Param::UINT64 || argType == Param::UCHAR)
     {
         if ( !( p->type == Param::INT || p->type == Param::REAL || p->type == Param::BOOLEAN
-                || p->type == Param::UNSIGNED_INT || p->type == Param::UINT64 || p->type == Param::FLOAT || argType == Param::UCHAR) )
+                || p->type == Param::UNSIGNED_INT || p->type == Param::UINT64 || p->type == Param::FLOAT || p->type == Param::UCHAR) )
         {
             String message = getErrorMessageForWrongArgumentInSetter(algo->name(), parameter, p->type, argType);
             CV_Error(CV_StsBadArg, message);
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 359d27222..a8374521f 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -251,16 +251,16 @@ void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
     template <>                                                                                  \
     struct name<template_arg>{                                                                   \
         typedef register_type reg_type;                                                          \
-        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p);}; \
-        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v);};       \
+        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
+        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
     }
 
 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
     template <>                                                                \
     struct name<template_arg>{                                                 \
         typedef register_type reg_type;                                        \
-        static reg_type load(const template_arg * p) { return load_body (p);}; \
-        static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
+        static reg_type load(const template_arg * p) { return load_body (p); } \
+        static void store(template_arg * p, reg_type v) { store_body (p, v); } \
     }
 
 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
@@ -915,11 +915,14 @@ void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t bl
 
 enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4,
        OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8,
-       OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14 };
+       OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14,
+       OCL_OP_RDIV_SCALE=15 };
+
+#ifdef HAVE_OPENCL
 
 static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF",
     "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE",
-    "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", 0 };
+    "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 };
 
 static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                           InputArray _mask, bool bitwise, int oclop, bool haveScalar )
@@ -931,16 +934,23 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 
-    if( oclop < 0 || ((haveMask || haveScalar) && (cn > 4 || cn == 3)) ||
+    if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) ||
             (!doubleSupport && srcdepth == CV_64F))
         return false;
 
     char opts[1024];
     int kercn = haveMask || haveScalar ? cn : 1;
-    sprintf(opts, "-D %s%s -D %s -D dstT=%s%s",
+    int scalarcn = kercn == 3 ? 4 : kercn;
+
+    sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d",
             (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop],
             bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
-            ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+            ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+            bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) :
+            ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)),
+            bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) :
+            ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)),
+            kercn);
 
     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
     if( k.empty() )
@@ -957,7 +967,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
     if( haveScalar )
     {
-        size_t esz = CV_ELEM_SIZE(srctype);
+        size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn;
         double buf[4] = {0,0,0,0};
 
         if( oclop != OCL_OP_NOT )
@@ -988,6 +998,7 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
     return k.run(2, globalsize, 0, false);
 }
 
+#endif
 
 static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
                        InputArray _mask, const BinaryFunc* tab,
@@ -1000,16 +1011,19 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
     int dims1 = psrc1->dims(), dims2 = psrc2->dims();
     Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
     Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
+#ifdef HAVE_OPENCL
     bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
-                        ocl::useOpenCL() && dims1 <= 2 && dims2 <= 2;
+            dims1 <= 2 && dims2 <= 2;
+#endif
     bool haveMask = !_mask.empty(), haveScalar = false;
     BinaryFunc func;
 
     if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask )
     {
         _dst.create(sz1, type1);
-        if( use_opencl && ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false) )
-            return;
+        CV_OCL_RUN(use_opencl,
+                   ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false))
+
         if( bitwise )
         {
             func = *tab;
@@ -1076,8 +1090,9 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
     if( haveMask && reallocate )
         _dst.setTo(0.);
 
-    if( use_opencl && ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar ))
-        return;
+    CV_OCL_RUN(use_opencl,
+               ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar))
+
 
     Mat src1 = psrc1->getMat(), src2 = psrc2->getMat();
     Mat dst = _dst.getMat(), mask = _mask.getMat();
@@ -1088,9 +1103,7 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
         cn = (int)esz;
     }
     else
-    {
         func = tab[depth1];
-    }
 
     if( !haveScalar )
     {
@@ -1277,6 +1290,7 @@ static int actualScalarDepth(const double* data, int len)
         CV_32S;
 }
 
+#ifdef HAVE_OPENCL
 
 static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                           InputArray _mask, int wtype,
@@ -1287,7 +1301,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
     int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
     bool haveMask = !_mask.empty();
 
-    if( ((haveMask || haveScalar) && (cn > 4 || cn == 3)) )
+    if( ((haveMask || haveScalar) && cn > 4) )
         return false;
 
     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype));
@@ -1300,26 +1314,33 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         return false;
 
     int kercn = haveMask || haveScalar ? cn : 1;
+    int scalarcn = kercn == 3 ? 4 : kercn;
 
-    char cvtstr[3][32], opts[1024];
-    sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT2=%s "
-            "-D dstT=%s -D workT=%s -D convertToWT1=%s "
-            "-D convertToWT2=%s -D convertToDT=%s%s",
+    char cvtstr[4][32], opts[1024];
+    sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
+            "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D convertToWT1=%s "
+            "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d",
             (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
             oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
+            ocl::typeToStr(CV_MAKETYPE(depth1, 1)),
             ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
+            ocl::typeToStr(CV_MAKETYPE(depth2, 1)),
             ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
+            ocl::typeToStr(CV_MAKETYPE(ddepth, 1)),
             ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
+            ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)),
+            ocl::typeToStr(CV_MAKETYPE(wdepth, 1)),
             ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
             ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
             ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
-            doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+            doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn);
 
+    size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
     const uchar* usrdata_p = (const uchar*)usrdata;
     const double* usrdata_d = (const double*)usrdata;
     float usrdata_f[3];
     int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE ||
-        oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0;
+        oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0;
     if( n > 0 && wdepth == CV_32F )
     {
         for( i = 0; i < n; i++ )
@@ -1343,7 +1364,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
 
     if( haveScalar )
     {
-        size_t esz = CV_ELEM_SIZE(wtype);
+        size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn;
         double buf[4]={0,0,0,0};
         Mat src2sc = _src2.getMat();
 
@@ -1352,13 +1373,20 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz);
 
         if( !haveMask )
-            k.args(src1arg, dstarg, scalararg);
+        {
+            if(n == 0)
+                k.args(src1arg, dstarg, scalararg);
+            else if(n == 1)
+                k.args(src1arg, dstarg, scalararg,
+                       ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz));
+            else
+                CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
+        }
         else
             k.args(src1arg, maskarg, dstarg, scalararg);
     }
     else
     {
-        size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
         src2 = _src2.getUMat();
         ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale);
 
@@ -1385,6 +1413,7 @@ static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
     return k.run(2, globalsize, NULL, false);
 }
 
+#endif
 
 static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                       InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false,
@@ -1399,7 +1428,9 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
     int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims();
     Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
     Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
-    bool use_opencl = _dst.kind() == _OutputArray::UMAT && ocl::useOpenCL() && dims1 <= 2 && dims2 <= 2;
+#ifdef HAVE_OPENCL
+    bool use_opencl = _dst.isUMat() && dims1 <= 2 && dims2 <= 2;
+#endif
     bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2);
     bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1);
 
@@ -1409,11 +1440,10 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
         ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) )
     {
         _dst.createSameSize(*psrc1, type1);
-        if( use_opencl &&
+        CV_OCL_RUN(use_opencl,
             ocl_arithm_op(*psrc1, *psrc2, _dst, _mask,
                           (!usrdata ? type1 : std::max(depth1, CV_32F)),
                           usrdata, oclop, false))
-            return;
 
         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
         Size sz = getContinuousSize(src1, src2, dst, src1.channels());
@@ -1424,8 +1454,8 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
     bool haveScalar = false, swapped12 = false;
 
     if( dims1 != dims2 || sz1 != sz2 || cn != cn2 ||
-        ((kind1 == _InputArray::MATX || kind2 == _InputArray::MATX) &&
-         (sz1 == Size(1,4) || sz2 == Size(1,4))) )
+        (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) ||
+        (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) )
     {
         if( checkScalar(*psrc1, type2, kind1, kind2) )
         {
@@ -1439,6 +1469,8 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
             swapped12 = true;
             if( oclop == OCL_OP_SUB )
                 oclop = OCL_OP_RSUB;
+            if ( oclop == OCL_OP_DIV_SCALE )
+                oclop = OCL_OP_RDIV_SCALE;
         }
         else if( !checkScalar(*psrc2, type1, kind2, kind1) )
             CV_Error( CV_StsUnmatchedSizes,
@@ -1508,10 +1540,9 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
     if( reallocate )
         _dst.setTo(0.);
 
-    if( use_opencl &&
-        ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
-                      usrdata, oclop, haveScalar))
-        return;
+    CV_OCL_RUN(use_opencl,
+               ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
+               usrdata, oclop, haveScalar))
 
     BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
     BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
@@ -2588,6 +2619,8 @@ static double getMaxVal(int depth)
     return tab[depth];
 }
 
+#ifdef HAVE_OPENCL
+
 static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
 {
     if ( !((_src1.isMat() || _src1.isUMat()) && (_src2.isMat() || _src2.isUMat())) )
@@ -2600,7 +2633,7 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in
 
     const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" };
     ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
-                  format("-D BINARY_OP -D srcT1=%s -D workT=srcT1"
+                  format("-D BINARY_OP -D srcT1=%s -D workT=srcT1 -D cn=1"
                          " -D OP_CMP -D CMP_OPERATOR=%s%s",
                          ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
                          operationMap[op],
@@ -2624,6 +2657,8 @@ static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, in
     return k.run(2, globalsize, NULL, false);
 }
 
+#endif
+
 }
 
 void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
@@ -2631,9 +2666,8 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
     CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
                op == CMP_NE || op == CMP_GE || op == CMP_GT );
 
-    if (ocl::useOpenCL() && _src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat() &&
-            ocl_compare(_src1, _src2, _dst, op))
-        return;
+    CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat(),
+               ocl_compare(_src1, _src2, _dst, op))
 
     int kind1 = _src1.kind(), kind2 = _src2.kind();
     Mat src1 = _src1.getMat(), src2 = _src2.getMat();
@@ -2865,11 +2899,125 @@ static InRangeFunc getInRangeFunc(int depth)
     return inRangeTab[depth];
 }
 
+#ifdef HAVE_OPENCL
+
+static bool ocl_inRange( InputArray _src, InputArray _lowerb,
+                         InputArray _upperb, OutputArray _dst )
+{
+    int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
+    Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size();
+    int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type();
+    int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype);
+    int cn = CV_MAT_CN(stype);
+    bool lbScalar = false, ubScalar = false;
+
+    if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
+        ssize != lsize || stype != ltype )
+    {
+        if( !checkScalar(_lowerb, stype, lkind, skind) )
+            CV_Error( CV_StsUnmatchedSizes,
+                     "The lower bounary is neither an array of the same size and same type as src, nor a scalar");
+        lbScalar = true;
+    }
+
+    if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
+        ssize != usize || stype != utype )
+    {
+        if( !checkScalar(_upperb, stype, ukind, skind) )
+            CV_Error( CV_StsUnmatchedSizes,
+                     "The upper bounary is neither an array of the same size and same type as src, nor a scalar");
+        ubScalar = true;
+    }
+
+    if (lbScalar != ubScalar)
+        return false;
+
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
+            haveScalar = lbScalar && ubScalar;
+
+    if ( (!doubleSupport && sdepth == CV_64F) ||
+         (!haveScalar && (sdepth != ldepth || sdepth != udepth)) )
+        return false;
+
+    ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc,
+                    format("%s-D cn=%d -D T=%s%s", haveScalar ? "-D HAVE_SCALAR " : "",
+                           cn, ocl::typeToStr(sdepth), doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (ker.empty())
+        return false;
+
+    _dst.create(ssize, CV_8UC1);
+    UMat src = _src.getUMat(), dst = _dst.getUMat(), lscalaru, uscalaru;
+    Mat lscalar, uscalar;
+
+    if (lbScalar && ubScalar)
+    {
+        lscalar = _lowerb.getMat();
+        uscalar = _upperb.getMat();
+
+        size_t esz = src.elemSize();
+        size_t blocksize = 36;
+
+        AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
+        uchar *buf = alignPtr(_buf + blocksize*cn, 16);
+
+        if( ldepth != sdepth && sdepth < CV_32S )
+        {
+            int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
+            int* iubuf = ilbuf + cn;
+
+            BinaryFunc sccvtfunc = getConvertFunc(ldepth, CV_32S);
+            sccvtfunc(lscalar.data, 0, 0, 0, (uchar*)ilbuf, 0, Size(cn, 1), 0);
+            sccvtfunc(uscalar.data, 0, 0, 0, (uchar*)iubuf, 0, Size(cn, 1), 0);
+            int minval = cvRound(getMinVal(sdepth)), maxval = cvRound(getMaxVal(sdepth));
+
+            for( int k = 0; k < cn; k++ )
+            {
+                if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
+                    ilbuf[k] = minval+1, iubuf[k] = minval;
+            }
+            lscalar = Mat(cn, 1, CV_32S, ilbuf);
+            uscalar = Mat(cn, 1, CV_32S, iubuf);
+        }
+
+        lscalar.convertTo(lscalar, stype);
+        uscalar.convertTo(uscalar, stype);
+    }
+    else
+    {
+        lscalaru = _lowerb.getUMat();
+        uscalaru = _upperb.getUMat();
+    }
+
+    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+            dstarg = ocl::KernelArg::WriteOnly(dst);
+
+    if (haveScalar)
+    {
+        lscalar.copyTo(lscalaru);
+        uscalar.copyTo(uscalaru);
+
+        ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru),
+               ocl::KernelArg::PtrReadOnly(uscalaru));
+    }
+    else
+        ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru),
+               ocl::KernelArg::ReadOnlyNoSize(uscalaru));
+
+    size_t globalsize[2] = { ssize.width, ssize.height };
+    return ker.run(2, globalsize, NULL, false);
+}
+
+#endif
+
 }
 
 void cv::inRange(InputArray _src, InputArray _lowerb,
                  InputArray _upperb, OutputArray _dst)
 {
+    CV_OCL_RUN(_src.dims() <= 2 && _lowerb.dims() <= 2 &&
+               _upperb.dims() <= 2 && _dst.isUMat(),
+               ocl_inRange(_src, _lowerb, _upperb, _dst))
+
     int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
     Mat src = _src.getMat(), lb = _lowerb.getMat(), ub = _upperb.getMat();
 
@@ -2893,14 +3041,14 @@ void cv::inRange(InputArray _src, InputArray _lowerb,
         ubScalar = true;
     }
 
-    CV_Assert( ((int)lbScalar ^ (int)ubScalar) == 0 );
+    CV_Assert(lbScalar == ubScalar);
 
     int cn = src.channels(), depth = src.depth();
 
     size_t esz = src.elemSize();
     size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
 
-    _dst.create(src.dims, src.size, CV_8U);
+    _dst.create(src.dims, src.size, CV_8UC1);
     Mat dst = _dst.getMat();
     InRangeFunc func = getInRangeFunc(depth);
 
diff --git a/modules/core/src/bufferpool.impl.hpp b/modules/core/src/bufferpool.impl.hpp
new file mode 100644
index 000000000..18a90e069
--- /dev/null
+++ b/modules/core/src/bufferpool.impl.hpp
@@ -0,0 +1,28 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+
+#ifndef __OPENCV_CORE_BUFFER_POOL_IMPL_HPP__
+#define __OPENCV_CORE_BUFFER_POOL_IMPL_HPP__
+
+#include "opencv2/core/bufferpool.hpp"
+
+namespace cv {
+
+class DummyBufferPoolController : public BufferPoolController
+{
+public:
+    DummyBufferPoolController() { }
+    virtual ~DummyBufferPoolController() { }
+
+    virtual size_t getReservedSize() const { return (size_t)-1; }
+    virtual size_t getMaxReservedSize() const { return (size_t)-1; }
+    virtual void setMaxReservedSize(size_t size) { (void)size; }
+    virtual void freeAllReservedBuffers() { }
+};
+
+} // namespace
+
+#endif // __OPENCV_CORE_BUFFER_POOL_IMPL_HPP__
diff --git a/modules/core/src/command_line_parser.cpp b/modules/core/src/command_line_parser.cpp
index 7a0284f75..0238a9972 100644
--- a/modules/core/src/command_line_parser.cpp
+++ b/modules/core/src/command_line_parser.cpp
@@ -41,6 +41,8 @@ static String get_type_name(int type)
 {
     if( type == Param::INT )
         return "int";
+    if( type == Param::BOOLEAN )
+        return "bool";
     if( type == Param::UNSIGNED_INT )
         return "unsigned";
     if( type == Param::UINT64 )
@@ -59,6 +61,12 @@ static void from_str(const String& str, int type, void* dst)
     std::stringstream ss(str.c_str());
     if( type == Param::INT )
         ss >> *(int*)dst;
+    else if( type == Param::BOOLEAN )
+    {
+        std::string temp;
+        ss >> temp;
+        *(bool*) dst = temp == "true";
+    }
     else if( type == Param::UNSIGNED_INT )
         ss >> *(unsigned*)dst;
     else if( type == Param::UINT64 )
@@ -229,6 +237,11 @@ CommandLineParser::CommandLineParser(int argc, const char* const argv[], const S
     impl->sort_params();
 }
 
+CommandLineParser::~CommandLineParser()
+{
+    if (CV_XADD(&impl->refcount, -1) == 1)
+        delete impl;
+}
 
 CommandLineParser::CommandLineParser(const CommandLineParser& parser)
 {
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 05c1a6e40..e64d09976 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -264,6 +264,8 @@ void cv::split(const Mat& src, Mat* mv)
     }
 }
 
+#ifdef HAVE_OPENCL
+
 namespace cv {
 
 static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
@@ -287,10 +289,12 @@ static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
         return false;
 
     Size size = _m.size();
-    std::vector<UMat> & dst = *(std::vector<UMat> *)_mv.getObj();
-    dst.resize(cn);
+    _mv.create(cn, 1, depth);
     for (int i = 0; i < cn; ++i)
-        dst[i].create(size, depth);
+        _mv.create(size, depth, i);
+
+    std::vector<UMat> dst;
+    _mv.getUMatVector(dst);
 
     int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat()));
     for (int i = 0; i < cn; ++i)
@@ -302,11 +306,12 @@ static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
 
 }
 
+#endif
+
 void cv::split(InputArray _m, OutputArrayOfArrays _mv)
 {
-    if (ocl::useOpenCL() && _m.dims() <= 2 && _mv.isUMatVector() &&
-            ocl_split(_m, _mv))
-        return;
+    CV_OCL_RUN(_m.dims() <= 2 && _mv.isUMatVector(),
+               ocl_split(_m, _mv))
 
     Mat m = _m.getMat();
     if( m.empty() )
@@ -314,10 +319,19 @@ void cv::split(InputArray _m, OutputArrayOfArrays _mv)
         _mv.release();
         return;
     }
+
     CV_Assert( !_mv.fixedType() || _mv.empty() || _mv.type() == m.depth() );
-    _mv.create(m.channels(), 1, m.depth());
-    Mat* dst = &_mv.getMatRef(0);
-    split(m, dst);
+
+    Size size = m.size();
+    int depth = m.depth(), cn = m.channels();
+    _mv.create(cn, 1, depth);
+    for (int i = 0; i < cn; ++i)
+        _mv.create(size, depth, i);
+
+    std::vector<Mat> dst;
+    _mv.getMatVector(dst);
+
+    split(m, &dst[0]);
 }
 
 void cv::merge(const Mat* mv, size_t n, OutputArray _dst)
@@ -395,11 +409,14 @@ void cv::merge(const Mat* mv, size_t n, OutputArray _dst)
     }
 }
 
+#ifdef HAVE_OPENCL
+
 namespace cv {
 
 static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
 {
-    const std::vector<UMat> & src = *(const std::vector<UMat> *)(_mv.getObj());
+    std::vector<UMat> src;
+    _mv.getUMatVector(src);
     CV_Assert(!src.empty());
 
     int type = src[0].type(), depth = CV_MAT_DEPTH(type);
@@ -442,10 +459,12 @@ static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
 
 }
 
+#endif
+
 void cv::merge(InputArrayOfArrays _mv, OutputArray _dst)
 {
-    if (ocl::useOpenCL() && _mv.isUMatVector() && _dst.isUMat() && ocl_merge(_mv, _dst))
-        return;
+    CV_OCL_RUN(_mv.isUMatVector() && _dst.isUMat(),
+               ocl_merge(_mv, _dst))
 
     std::vector<Mat> mv;
     _mv.getMatVector(mv);
@@ -612,16 +631,115 @@ void cv::mixChannels( const Mat* src, size_t nsrcs, Mat* dst, size_t ndsts, cons
     }
 }
 
+#ifdef HAVE_OPENCL
+
+namespace cv {
+
+static void getUMatIndex(const std::vector<UMat> & um, int cn, int & idx, int & cnidx)
+{
+    int totalChannels = 0;
+    for (size_t i = 0, size = um.size(); i < size; ++i)
+    {
+        int ccn = um[i].channels();
+        totalChannels += ccn;
+
+        if (totalChannels == cn)
+        {
+            idx = (int)(i + 1);
+            cnidx = 0;
+            return;
+        }
+        else if (totalChannels > cn)
+        {
+            idx = (int)i;
+            cnidx = i == 0 ? cn : (cn - totalChannels + ccn);
+            return;
+        }
+    }
+
+    idx = cnidx = -1;
+}
+
+static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _dst,
+                            const int* fromTo, size_t npairs)
+{
+    std::vector<UMat> src, dst;
+    _src.getUMatVector(src);
+    _dst.getUMatVector(dst);
+
+    size_t nsrc = src.size(), ndst = dst.size();
+    CV_Assert(nsrc > 0 && ndst > 0);
+
+    Size size = src[0].size();
+    int depth = src[0].depth(), esz = CV_ELEM_SIZE(depth);
+
+    for (size_t i = 1, ssize = src.size(); i < ssize; ++i)
+        CV_Assert(src[i].size() == size && src[i].depth() == depth);
+    for (size_t i = 0, dsize = dst.size(); i < dsize; ++i)
+        CV_Assert(dst[i].size() == size && dst[i].depth() == depth);
+
+    String declsrc, decldst, declproc, declcn;
+    std::vector<UMat> srcargs(npairs), dstargs(npairs);
+
+    for (size_t i = 0; i < npairs; ++i)
+    {
+        int scn = fromTo[i<<1], dcn = fromTo[(i<<1) + 1];
+        int src_idx, src_cnidx, dst_idx, dst_cnidx;
+
+        getUMatIndex(src, scn, src_idx, src_cnidx);
+        getUMatIndex(dst, dcn, dst_idx, dst_cnidx);
+
+        CV_Assert(dst_idx >= 0 && src_idx >= 0);
+
+        srcargs[i] = src[src_idx];
+        srcargs[i].offset += src_cnidx * esz;
+
+        dstargs[i] = dst[dst_idx];
+        dstargs[i].offset += dst_cnidx * esz;
+
+        declsrc += format("DECLARE_INPUT_MAT(%d)", i);
+        decldst += format("DECLARE_OUTPUT_MAT(%d)", i);
+        declproc += format("PROCESS_ELEM(%d)", i);
+        declcn += format(" -D scn%d=%d -D dcn%d=%d", i, src[src_idx].channels(), i, dst[dst_idx].channels());
+    }
+
+    ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc,
+                  format("-D T=%s -D DECLARE_INPUT_MATS=%s -D DECLARE_OUTPUT_MATS=%s"
+                         " -D PROCESS_ELEMS=%s%s", ocl::memopTypeToStr(depth),
+                         declsrc.c_str(), decldst.c_str(), declproc.c_str(), declcn.c_str()));
+    if (k.empty())
+        return false;
+
+    int argindex = 0;
+    for (size_t i = 0; i < npairs; ++i)
+        argindex = k.set(argindex, ocl::KernelArg::ReadOnlyNoSize(srcargs[i]));
+    for (size_t i = 0; i < npairs; ++i)
+        argindex = k.set(argindex, ocl::KernelArg::WriteOnlyNoSize(dstargs[i]));
+    k.set(k.set(argindex, size.height), size.width);
+
+    size_t globalsize[2] = { size.width, size.height };
+    return k.run(2, globalsize, NULL, false);
+}
+
+}
+
+#endif
 
 void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
                  const int* fromTo, size_t npairs)
 {
-    if(npairs == 0)
+    if (npairs == 0 || fromTo == NULL)
         return;
+
+    CV_OCL_RUN(dst.isUMatVector(),
+               ocl_mixChannels(src, dst, fromTo, npairs))
+
     bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
-                      src.kind() != _InputArray::STD_VECTOR_VECTOR;
+            src.kind() != _InputArray::STD_VECTOR_VECTOR &&
+            src.kind() != _InputArray::STD_VECTOR_UMAT;
     bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
-                      dst.kind() != _InputArray::STD_VECTOR_VECTOR;
+            dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
+            dst.kind() != _InputArray::STD_VECTOR_UMAT;
     int i;
     int nsrc = src_is_mat ? 1 : (int)src.total();
     int ndst = dst_is_mat ? 1 : (int)dst.total();
@@ -639,12 +757,18 @@ void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
 void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
                      const std::vector<int>& fromTo)
 {
-    if(fromTo.empty())
+    if (fromTo.empty())
         return;
+
+    CV_OCL_RUN(dst.isUMatVector(),
+               ocl_mixChannels(src, dst, &fromTo[0], fromTo.size()>>1))
+
     bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
-                      src.kind() != _InputArray::STD_VECTOR_VECTOR;
+            src.kind() != _InputArray::STD_VECTOR_VECTOR &&
+            src.kind() != _InputArray::STD_VECTOR_UMAT;
     bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
-                      dst.kind() != _InputArray::STD_VECTOR_VECTOR;
+            dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
+            dst.kind() != _InputArray::STD_VECTOR_UMAT;
     int i;
     int nsrc = src_is_mat ? 1 : (int)src.total();
     int ndst = dst_is_mat ? 1 : (int)dst.total();
@@ -661,20 +785,41 @@ void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
 
 void cv::extractChannel(InputArray _src, OutputArray _dst, int coi)
 {
-    Mat src = _src.getMat();
-    CV_Assert( 0 <= coi && coi < src.channels() );
-    _dst.create(src.dims, &src.size[0], src.depth());
-    Mat dst = _dst.getMat();
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    CV_Assert( 0 <= coi && coi < cn );
     int ch[] = { coi, 0 };
+
+    if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
+    {
+        UMat src = _src.getUMat();
+        _dst.create(src.dims, &src.size[0], depth);
+        UMat dst = _dst.getUMat();
+        mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
+        return;
+    }
+
+    Mat src = _src.getMat();
+    _dst.create(src.dims, &src.size[0], depth);
+    Mat dst = _dst.getMat();
     mixChannels(&src, 1, &dst, 1, ch, 1);
 }
 
 void cv::insertChannel(InputArray _src, InputOutputArray _dst, int coi)
 {
-    Mat src = _src.getMat(), dst = _dst.getMat();
-    CV_Assert( src.size == dst.size && src.depth() == dst.depth() );
-    CV_Assert( 0 <= coi && coi < dst.channels() && src.channels() == 1 );
+    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
+    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
+    CV_Assert( _src.sameSize(_dst) && sdepth == ddepth );
+    CV_Assert( 0 <= coi && coi < dcn && scn == 1 );
+
     int ch[] = { 0, coi };
+    if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
+    {
+        UMat src = _src.getUMat(), dst = _dst.getUMat();
+        mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
+        return;
+    }
+
+    Mat src = _src.getMat(), dst = _dst.getMat();
     mixChannels(&src, 1, &dst, 1, ch, 1);
 }
 
@@ -938,122 +1083,122 @@ stype* dst, size_t dstep, Size size, double*) \
 }
 
 
-DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float);
-DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float);
-DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float);
-DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float);
-DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float);
-DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float);
-DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float);
+DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float)
+DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float)
 
-DEF_CVT_SCALE_FUNC(8u,     uchar, uchar, float);
-DEF_CVT_SCALE_FUNC(8s8u,   schar, uchar, float);
-DEF_CVT_SCALE_FUNC(16u8u,  ushort, uchar, float);
-DEF_CVT_SCALE_FUNC(16s8u,  short, uchar, float);
-DEF_CVT_SCALE_FUNC(32s8u,  int, uchar, float);
-DEF_CVT_SCALE_FUNC(32f8u,  float, uchar, float);
-DEF_CVT_SCALE_FUNC(64f8u,  double, uchar, float);
+DEF_CVT_SCALE_FUNC(8u,     uchar, uchar, float)
+DEF_CVT_SCALE_FUNC(8s8u,   schar, uchar, float)
+DEF_CVT_SCALE_FUNC(16u8u,  ushort, uchar, float)
+DEF_CVT_SCALE_FUNC(16s8u,  short, uchar, float)
+DEF_CVT_SCALE_FUNC(32s8u,  int, uchar, float)
+DEF_CVT_SCALE_FUNC(32f8u,  float, uchar, float)
+DEF_CVT_SCALE_FUNC(64f8u,  double, uchar, float)
 
-DEF_CVT_SCALE_FUNC(8u8s,   uchar, schar, float);
-DEF_CVT_SCALE_FUNC(8s,     schar, schar, float);
-DEF_CVT_SCALE_FUNC(16u8s,  ushort, schar, float);
-DEF_CVT_SCALE_FUNC(16s8s,  short, schar, float);
-DEF_CVT_SCALE_FUNC(32s8s,  int, schar, float);
-DEF_CVT_SCALE_FUNC(32f8s,  float, schar, float);
-DEF_CVT_SCALE_FUNC(64f8s,  double, schar, float);
+DEF_CVT_SCALE_FUNC(8u8s,   uchar, schar, float)
+DEF_CVT_SCALE_FUNC(8s,     schar, schar, float)
+DEF_CVT_SCALE_FUNC(16u8s,  ushort, schar, float)
+DEF_CVT_SCALE_FUNC(16s8s,  short, schar, float)
+DEF_CVT_SCALE_FUNC(32s8s,  int, schar, float)
+DEF_CVT_SCALE_FUNC(32f8s,  float, schar, float)
+DEF_CVT_SCALE_FUNC(64f8s,  double, schar, float)
 
-DEF_CVT_SCALE_FUNC(8u16u,  uchar, ushort, float);
-DEF_CVT_SCALE_FUNC(8s16u,  schar, ushort, float);
-DEF_CVT_SCALE_FUNC(16u,    ushort, ushort, float);
-DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float);
-DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float);
-DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float);
-DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float);
+DEF_CVT_SCALE_FUNC(8u16u,  uchar, ushort, float)
+DEF_CVT_SCALE_FUNC(8s16u,  schar, ushort, float)
+DEF_CVT_SCALE_FUNC(16u,    ushort, ushort, float)
+DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float)
+DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float)
+DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float)
+DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float)
 
-DEF_CVT_SCALE_FUNC(8u16s,  uchar, short, float);
-DEF_CVT_SCALE_FUNC(8s16s,  schar, short, float);
-DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float);
-DEF_CVT_SCALE_FUNC(16s,    short, short, float);
-DEF_CVT_SCALE_FUNC(32s16s, int, short, float);
-DEF_CVT_SCALE_FUNC(32f16s, float, short, float);
-DEF_CVT_SCALE_FUNC(64f16s, double, short, float);
+DEF_CVT_SCALE_FUNC(8u16s,  uchar, short, float)
+DEF_CVT_SCALE_FUNC(8s16s,  schar, short, float)
+DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float)
+DEF_CVT_SCALE_FUNC(16s,    short, short, float)
+DEF_CVT_SCALE_FUNC(32s16s, int, short, float)
+DEF_CVT_SCALE_FUNC(32f16s, float, short, float)
+DEF_CVT_SCALE_FUNC(64f16s, double, short, float)
 
-DEF_CVT_SCALE_FUNC(8u32s,  uchar, int, float);
-DEF_CVT_SCALE_FUNC(8s32s,  schar, int, float);
-DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float);
-DEF_CVT_SCALE_FUNC(16s32s, short, int, float);
-DEF_CVT_SCALE_FUNC(32s,    int, int, double);
-DEF_CVT_SCALE_FUNC(32f32s, float, int, float);
-DEF_CVT_SCALE_FUNC(64f32s, double, int, double);
+DEF_CVT_SCALE_FUNC(8u32s,  uchar, int, float)
+DEF_CVT_SCALE_FUNC(8s32s,  schar, int, float)
+DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float)
+DEF_CVT_SCALE_FUNC(16s32s, short, int, float)
+DEF_CVT_SCALE_FUNC(32s,    int, int, double)
+DEF_CVT_SCALE_FUNC(32f32s, float, int, float)
+DEF_CVT_SCALE_FUNC(64f32s, double, int, double)
 
-DEF_CVT_SCALE_FUNC(8u32f,  uchar, float, float);
-DEF_CVT_SCALE_FUNC(8s32f,  schar, float, float);
-DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float);
-DEF_CVT_SCALE_FUNC(16s32f, short, float, float);
-DEF_CVT_SCALE_FUNC(32s32f, int, float, double);
-DEF_CVT_SCALE_FUNC(32f,    float, float, float);
-DEF_CVT_SCALE_FUNC(64f32f, double, float, double);
+DEF_CVT_SCALE_FUNC(8u32f,  uchar, float, float)
+DEF_CVT_SCALE_FUNC(8s32f,  schar, float, float)
+DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float)
+DEF_CVT_SCALE_FUNC(16s32f, short, float, float)
+DEF_CVT_SCALE_FUNC(32s32f, int, float, double)
+DEF_CVT_SCALE_FUNC(32f,    float, float, float)
+DEF_CVT_SCALE_FUNC(64f32f, double, float, double)
 
-DEF_CVT_SCALE_FUNC(8u64f,  uchar, double, double);
-DEF_CVT_SCALE_FUNC(8s64f,  schar, double, double);
-DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double);
-DEF_CVT_SCALE_FUNC(16s64f, short, double, double);
-DEF_CVT_SCALE_FUNC(32s64f, int, double, double);
-DEF_CVT_SCALE_FUNC(32f64f, float, double, double);
-DEF_CVT_SCALE_FUNC(64f,    double, double, double);
+DEF_CVT_SCALE_FUNC(8u64f,  uchar, double, double)
+DEF_CVT_SCALE_FUNC(8s64f,  schar, double, double)
+DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double)
+DEF_CVT_SCALE_FUNC(16s64f, short, double, double)
+DEF_CVT_SCALE_FUNC(32s64f, int, double, double)
+DEF_CVT_SCALE_FUNC(32f64f, float, double, double)
+DEF_CVT_SCALE_FUNC(64f,    double, double, double)
 
-DEF_CPY_FUNC(8u,     uchar);
-DEF_CVT_FUNC(8s8u,   schar, uchar);
-DEF_CVT_FUNC(16u8u,  ushort, uchar);
-DEF_CVT_FUNC(16s8u,  short, uchar);
-DEF_CVT_FUNC(32s8u,  int, uchar);
-DEF_CVT_FUNC(32f8u,  float, uchar);
-DEF_CVT_FUNC(64f8u,  double, uchar);
+DEF_CPY_FUNC(8u,     uchar)
+DEF_CVT_FUNC(8s8u,   schar, uchar)
+DEF_CVT_FUNC(16u8u,  ushort, uchar)
+DEF_CVT_FUNC(16s8u,  short, uchar)
+DEF_CVT_FUNC(32s8u,  int, uchar)
+DEF_CVT_FUNC(32f8u,  float, uchar)
+DEF_CVT_FUNC(64f8u,  double, uchar)
 
-DEF_CVT_FUNC(8u8s,   uchar, schar);
-DEF_CVT_FUNC(16u8s,  ushort, schar);
-DEF_CVT_FUNC(16s8s,  short, schar);
-DEF_CVT_FUNC(32s8s,  int, schar);
-DEF_CVT_FUNC(32f8s,  float, schar);
-DEF_CVT_FUNC(64f8s,  double, schar);
+DEF_CVT_FUNC(8u8s,   uchar, schar)
+DEF_CVT_FUNC(16u8s,  ushort, schar)
+DEF_CVT_FUNC(16s8s,  short, schar)
+DEF_CVT_FUNC(32s8s,  int, schar)
+DEF_CVT_FUNC(32f8s,  float, schar)
+DEF_CVT_FUNC(64f8s,  double, schar)
 
-DEF_CVT_FUNC(8u16u,  uchar, ushort);
-DEF_CVT_FUNC(8s16u,  schar, ushort);
-DEF_CPY_FUNC(16u,    ushort);
-DEF_CVT_FUNC(16s16u, short, ushort);
-DEF_CVT_FUNC(32s16u, int, ushort);
-DEF_CVT_FUNC(32f16u, float, ushort);
-DEF_CVT_FUNC(64f16u, double, ushort);
+DEF_CVT_FUNC(8u16u,  uchar, ushort)
+DEF_CVT_FUNC(8s16u,  schar, ushort)
+DEF_CPY_FUNC(16u,    ushort)
+DEF_CVT_FUNC(16s16u, short, ushort)
+DEF_CVT_FUNC(32s16u, int, ushort)
+DEF_CVT_FUNC(32f16u, float, ushort)
+DEF_CVT_FUNC(64f16u, double, ushort)
 
-DEF_CVT_FUNC(8u16s,  uchar, short);
-DEF_CVT_FUNC(8s16s,  schar, short);
-DEF_CVT_FUNC(16u16s, ushort, short);
-DEF_CVT_FUNC(32s16s, int, short);
-DEF_CVT_FUNC(32f16s, float, short);
-DEF_CVT_FUNC(64f16s, double, short);
+DEF_CVT_FUNC(8u16s,  uchar, short)
+DEF_CVT_FUNC(8s16s,  schar, short)
+DEF_CVT_FUNC(16u16s, ushort, short)
+DEF_CVT_FUNC(32s16s, int, short)
+DEF_CVT_FUNC(32f16s, float, short)
+DEF_CVT_FUNC(64f16s, double, short)
 
-DEF_CVT_FUNC(8u32s,  uchar, int);
-DEF_CVT_FUNC(8s32s,  schar, int);
-DEF_CVT_FUNC(16u32s, ushort, int);
-DEF_CVT_FUNC(16s32s, short, int);
-DEF_CPY_FUNC(32s,    int);
-DEF_CVT_FUNC(32f32s, float, int);
-DEF_CVT_FUNC(64f32s, double, int);
+DEF_CVT_FUNC(8u32s,  uchar, int)
+DEF_CVT_FUNC(8s32s,  schar, int)
+DEF_CVT_FUNC(16u32s, ushort, int)
+DEF_CVT_FUNC(16s32s, short, int)
+DEF_CPY_FUNC(32s,    int)
+DEF_CVT_FUNC(32f32s, float, int)
+DEF_CVT_FUNC(64f32s, double, int)
 
-DEF_CVT_FUNC(8u32f,  uchar, float);
-DEF_CVT_FUNC(8s32f,  schar, float);
-DEF_CVT_FUNC(16u32f, ushort, float);
-DEF_CVT_FUNC(16s32f, short, float);
-DEF_CVT_FUNC(32s32f, int, float);
-DEF_CVT_FUNC(64f32f, double, float);
+DEF_CVT_FUNC(8u32f,  uchar, float)
+DEF_CVT_FUNC(8s32f,  schar, float)
+DEF_CVT_FUNC(16u32f, ushort, float)
+DEF_CVT_FUNC(16s32f, short, float)
+DEF_CVT_FUNC(32s32f, int, float)
+DEF_CVT_FUNC(64f32f, double, float)
 
-DEF_CVT_FUNC(8u64f,  uchar, double);
-DEF_CVT_FUNC(8s64f,  schar, double);
-DEF_CVT_FUNC(16u64f, ushort, double);
-DEF_CVT_FUNC(16s64f, short, double);
-DEF_CVT_FUNC(32s64f, int, double);
-DEF_CVT_FUNC(32f64f, float, double);
-DEF_CPY_FUNC(64s,    int64);
+DEF_CVT_FUNC(8u64f,  uchar, double)
+DEF_CVT_FUNC(8s64f,  schar, double)
+DEF_CVT_FUNC(16u64f, ushort, double)
+DEF_CVT_FUNC(16s64f, short, double)
+DEF_CVT_FUNC(32s64f, int, double)
+DEF_CVT_FUNC(32f64f, float, double)
+DEF_CPY_FUNC(64s,    int64)
 
 static BinaryFunc getCvtScaleAbsFunc(int depth)
 {
@@ -1161,10 +1306,52 @@ static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
     return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
 }
 
+#ifdef HAVE_OPENCL
+
+static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
+{
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    if (!doubleSupport && depth == CV_64F)
+        return false;
+
+    char cvt[2][50];
+    int wdepth = std::max(depth, CV_32F);
+    ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
+                  format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=uchar -D srcT1=%s"
+                         " -D workT=%s -D convertToWT1=%s -D convertToDT=%s%s",
+                         ocl::typeToStr(depth), ocl::typeToStr(wdepth),
+                         ocl::convertTypeStr(depth, wdepth, 1, cvt[0]),
+                         ocl::convertTypeStr(wdepth, CV_8U, 1, cvt[1]),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (k.empty())
+        return false;
+
+    _dst.createSameSize(_src, CV_8UC(cn));
+    UMat src = _src.getUMat(), dst = _dst.getUMat();
+
+    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+            dstarg = ocl::KernelArg::WriteOnly(dst, cn);
+
+    if (wdepth == CV_32F)
+        k.args(srcarg, dstarg, (float)alpha, (float)beta);
+    else if (wdepth == CV_64F)
+        k.args(srcarg, dstarg, alpha, beta);
+
+    size_t globalsize[2] = { src.cols * cn, src.rows };
+    return k.run(2, globalsize, NULL, false);
+}
+
+#endif
+
 }
 
 void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
 {
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_convertScaleAbs(_src, _dst, alpha, beta))
+
     Mat src = _src.getMat();
     int cn = src.channels();
     double scale[] = {alpha, beta};
@@ -1300,9 +1487,7 @@ static LUTFunc lutTab[] =
     (LUTFunc)LUT8u_32s, (LUTFunc)LUT8u_32f, (LUTFunc)LUT8u_64f, 0
 };
 
-}
-
-namespace cv {
+#ifdef HAVE_OPENCL
 
 static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
 {
@@ -1320,6 +1505,9 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
                   format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s%s", dcn, lcn,
                          ocl::typeToStr(src.depth()), ocl::typeToStr(ddepth),
                          doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (k.empty())
+        return false;
+
     k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
            ocl::KernelArg::WriteOnly(dst));
 
@@ -1327,7 +1515,9 @@ static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
     return k.run(2, globalSize, NULL, false);
 }
 
-} // cv
+#endif
+
+}
 
 void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
 {
@@ -1338,8 +1528,8 @@ void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
         _lut.total() == 256 && _lut.isContinuous() &&
         (depth == CV_8U || depth == CV_8S) );
 
-    if (ocl::useOpenCL() && _dst.isUMat() && ocl_LUT(_src, _lut, _dst))
-        return;
+    CV_OCL_RUN(_dst.isUMat(),
+               ocl_LUT(_src, _lut, _dst))
 
     Mat src = _src.getMat(), lut = _lut.getMat();
     _dst.create(src.dims, src.size, CV_MAKETYPE(_lut.depth(), cn));
@@ -1357,43 +1547,68 @@ void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
         func(ptrs[0], lut.data, ptrs[1], len, cn, lutcn);
 }
 
+namespace cv {
+
+#ifdef HAVE_OPENCL
+
+static bool ocl_normalize( InputArray _src, OutputArray _dst, InputArray _mask, int rtype,
+                           double scale, double shift )
+{
+    UMat src = _src.getUMat(), dst = _dst.getUMat();
+
+    if( _mask.empty() )
+        src.convertTo( dst, rtype, scale, shift );
+    else
+    {
+        UMat temp;
+        src.convertTo( temp, rtype, scale, shift );
+        temp.copyTo( dst, _mask );
+    }
+
+    return true;
+}
+
+#endif
+
+}
 
 void cv::normalize( InputArray _src, OutputArray _dst, double a, double b,
                     int norm_type, int rtype, InputArray _mask )
 {
-    Mat src = _src.getMat(), mask = _mask.getMat();
-
     double scale = 1, shift = 0;
     if( norm_type == CV_MINMAX )
     {
         double smin = 0, smax = 0;
         double dmin = MIN( a, b ), dmax = MAX( a, b );
-        minMaxLoc( _src, &smin, &smax, 0, 0, mask );
+        minMaxLoc( _src, &smin, &smax, 0, 0, _mask );
         scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
         shift = dmin - smin*scale;
     }
     else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
     {
-        scale = norm( src, norm_type, mask );
+        scale = norm( _src, norm_type, _mask );
         scale = scale > DBL_EPSILON ? a/scale : 0.;
         shift = 0;
     }
     else
         CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
 
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     if( rtype < 0 )
-        rtype = _dst.fixedType() ? _dst.depth() : src.depth();
+        rtype = _dst.fixedType() ? _dst.depth() : depth;
+    _dst.createSameSize(_src, CV_MAKETYPE(rtype, cn));
 
-    _dst.create(src.dims, src.size, CV_MAKETYPE(rtype, src.channels()));
-    Mat dst = _dst.getMat();
+    CV_OCL_RUN(_dst.isUMat(),
+               ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
 
-    if( !mask.data )
+    Mat src = _src.getMat(), dst = _dst.getMat();
+    if( _mask.empty() )
         src.convertTo( dst, rtype, scale, shift );
     else
     {
         Mat temp;
         src.convertTo( temp, rtype, scale, shift );
-        temp.copyTo( dst, mask );
+        temp.copyTo( dst, _mask );
     }
 }
 
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 7d54ebc0b..3c051e68f 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -166,16 +166,16 @@ static void copyMask##suffix(const uchar* src, size_t sstep, const uchar* mask,
 }
 
 
-DEF_COPY_MASK(8u, uchar);
-DEF_COPY_MASK(16u, ushort);
-DEF_COPY_MASK(8uC3, Vec3b);
-DEF_COPY_MASK(32s, int);
-DEF_COPY_MASK(16uC3, Vec3s);
-DEF_COPY_MASK(32sC2, Vec2i);
-DEF_COPY_MASK(32sC3, Vec3i);
-DEF_COPY_MASK(32sC4, Vec4i);
-DEF_COPY_MASK(32sC6, Vec6i);
-DEF_COPY_MASK(32sC8, Vec8i);
+DEF_COPY_MASK(8u, uchar)
+DEF_COPY_MASK(16u, ushort)
+DEF_COPY_MASK(8uC3, Vec3b)
+DEF_COPY_MASK(32s, int)
+DEF_COPY_MASK(16uC3, Vec3s)
+DEF_COPY_MASK(32sC2, Vec2i)
+DEF_COPY_MASK(32sC3, Vec3i)
+DEF_COPY_MASK(32sC4, Vec4i)
+DEF_COPY_MASK(32sC6, Vec6i)
+DEF_COPY_MASK(32sC8, Vec8i)
 
 BinaryFunc copyMaskTab[] =
 {
@@ -247,10 +247,7 @@ void Mat::copyTo( OutputArray _dst ) const
             const uchar* sptr = data;
             uchar* dptr = dst.data;
 
-            // to handle the copying 1xn matrix => nx1 std vector.
-            Size sz = size() == dst.size() ?
-                getContinuousSize(*this, dst) :
-                getContinuousSize(*this);
+            Size sz = getContinuousSize(*this, dst);
             size_t len = sz.width*elemSize();
 
             for( ; sz.height--; sptr += step, dptr += dst.step )
@@ -301,6 +298,7 @@ void Mat::copyTo( OutputArray _dst, InputArray _mask ) const
 
     if( dims <= 2 )
     {
+        CV_Assert( size() == mask.size() );
         Size sz = getContinuousSize(*this, dst, mask, mcn);
         copymask(data, step, mask.data, mask.step, dst.data, dst.step, sz, &esz);
         return;
@@ -355,7 +353,7 @@ Mat& Mat::operator = (const Scalar& s)
 
 Mat& Mat::setTo(InputArray _value, InputArray _mask)
 {
-    if( !data )
+    if( empty() )
         return *this;
 
     Mat value = _value.getMat(), mask = _mask.getMat();
@@ -477,6 +475,8 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
     }
 }
 
+#ifdef HAVE_OPENCL
+
 enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS };
 
 static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
@@ -521,13 +521,13 @@ static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
     return k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), rows, cols).run(2, globalsize, NULL, false);
 }
 
+#endif
+
 void flip( InputArray _src, OutputArray _dst, int flip_mode )
 {
     CV_Assert( _src.dims() <= 2 );
 
-    bool use_opencl = ocl::useOpenCL() && _dst.isUMat();
-    if ( use_opencl && ocl_flip(_src,_dst, flip_mode))
-        return;
+    CV_OCL_RUN( _dst.isUMat(), ocl_flip(_src,_dst, flip_mode))
 
     Mat src = _src.getMat();
     _dst.create( src.size(), src.type() );
@@ -543,6 +543,7 @@ void flip( InputArray _src, OutputArray _dst, int flip_mode )
         flipHoriz( dst.data, dst.step, dst.data, dst.step, dst.size(), esz );
 }
 
+#ifdef HAVE_OPENCL
 
 static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
 {
@@ -558,6 +559,8 @@ static bool ocl_repeat(InputArray _src, int ny, int nx, OutputArray _dst)
     return true;
 }
 
+#endif
+
 void repeat(InputArray _src, int ny, int nx, OutputArray _dst)
 {
     CV_Assert( _src.dims() <= 2 );
@@ -566,11 +569,8 @@ void repeat(InputArray _src, int ny, int nx, OutputArray _dst)
     Size ssize = _src.size();
     _dst.create(ssize.height*ny, ssize.width*nx, _src.type());
 
-    if (ocl::useOpenCL() && _src.isUMat())
-    {
-        CV_Assert(ocl_repeat(_src, ny, nx, _dst));
-        return;
-    }
+    CV_OCL_RUN(_dst.isUMat(),
+               ocl_repeat(_src, ny, nx, _dst))
 
     Mat src = _src.getMat(), dst = _dst.getMat();
     Size dsize = dst.size();
@@ -632,6 +632,7 @@ int cv::borderInterpolate( int p, int len, int borderType )
     }
     else if( borderType == BORDER_WRAP )
     {
+        CV_Assert(len > 0);
         if( p < 0 )
             p -= ((p-len+1)/len)*len;
         if( p >= len )
@@ -770,6 +771,8 @@ void copyMakeConstBorder_8u( const uchar* src, size_t srcstep, cv::Size srcroi,
 
 }
 
+#ifdef HAVE_OPENCL
+
 namespace cv {
 
 static bool ocl_copyMakeBorder( InputArray _src, OutputArray _dst, int top, int bottom,
@@ -826,14 +829,15 @@ static bool ocl_copyMakeBorder( InputArray _src, OutputArray _dst, int top, int
 
 }
 
+#endif
+
 void cv::copyMakeBorder( InputArray _src, OutputArray _dst, int top, int bottom,
                          int left, int right, int borderType, const Scalar& value )
 {
     CV_Assert( top >= 0 && bottom >= 0 && left >= 0 && right >= 0 );
 
-    if (ocl::useOpenCL() && _dst.isUMat() && _src.dims() <= 2 &&
-            ocl_copyMakeBorder(_src, _dst, top, bottom, left, right, borderType, value))
-        return;
+    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
+               ocl_copyMakeBorder(_src, _dst, top, bottom, left, right, borderType, value))
 
     Mat src = _src.getMat();
 
diff --git a/modules/core/src/cuda/gpu_mat.cu b/modules/core/src/cuda/gpu_mat.cu
index f4c9bbdca..9dc8aa56f 100644
--- a/modules/core/src/cuda/gpu_mat.cu
+++ b/modules/core/src/cuda/gpu_mat.cu
@@ -272,9 +272,15 @@ void cv::cuda::GpuMat::copyTo(OutputArray _dst, InputArray _mask, Stream& stream
     GpuMat mask = _mask.getGpuMat();
     CV_DbgAssert( size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == channels()) );
 
+    uchar* data0 = _dst.getGpuMat().data;
+
     _dst.create(size(), type());
     GpuMat dst = _dst.getGpuMat();
 
+    // do not leave dst uninitialized
+    if (dst.data != data0)
+        dst.setTo(Scalar::all(0), stream);
+
     typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, const GpuMat& mask, Stream& stream);
     static const func_t funcs[9][4] =
     {
diff --git a/modules/core/src/cuda_host_mem.cpp b/modules/core/src/cuda_host_mem.cpp
index 0eb73606a..15a0d9a93 100644
--- a/modules/core/src/cuda_host_mem.cpp
+++ b/modules/core/src/cuda_host_mem.cpp
@@ -46,6 +46,7 @@
 using namespace cv;
 using namespace cv::cuda;
 
+#ifdef HAVE_CUDA
 namespace
 {
     size_t alignUpStep(size_t what, size_t alignment)
@@ -56,6 +57,7 @@ namespace
         return res;
     }
 }
+#endif
 
 void cv::cuda::CudaMem::create(int rows_, int cols_, int type_)
 {
diff --git a/modules/core/src/directx.cpp b/modules/core/src/directx.cpp
index 071df0352..4d9fd6c9e 100644
--- a/modules/core/src/directx.cpp
+++ b/modules/core/src/directx.cpp
@@ -236,7 +236,7 @@ namespace ocl {
 static bool g_isDirect3DDevice9Ex = false; // Direct3DDevice9Ex or Direct3DDevice9 was used
 #endif
 
-Context2& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device)
+Context& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device)
 {
     (void)pD3D11Device;
 #if !defined(HAVE_DIRECTX)
@@ -338,13 +338,13 @@ Context2& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device)
     }
 
 
-    Context2& ctx = Context2::getDefault(false);
+    Context& ctx = Context::getDefault(false);
     initializeContextFromHandle(ctx, platforms[found], context, device);
     return ctx;
 #endif
 }
 
-Context2& initializeContextFromD3D10Device(ID3D10Device* pD3D10Device)
+Context& initializeContextFromD3D10Device(ID3D10Device* pD3D10Device)
 {
     (void)pD3D10Device;
 #if !defined(HAVE_DIRECTX)
@@ -446,13 +446,13 @@ Context2& initializeContextFromD3D10Device(ID3D10Device* pD3D10Device)
     }
 
 
-    Context2& ctx = Context2::getDefault(false);
+    Context& ctx = Context::getDefault(false);
     initializeContextFromHandle(ctx, platforms[found], context, device);
     return ctx;
 #endif
 }
 
-Context2& initializeContextFromDirect3DDevice9Ex(IDirect3DDevice9Ex* pDirect3DDevice9Ex)
+Context& initializeContextFromDirect3DDevice9Ex(IDirect3DDevice9Ex* pDirect3DDevice9Ex)
 {
     (void)pDirect3DDevice9Ex;
 #if !defined(HAVE_DIRECTX)
@@ -555,14 +555,14 @@ Context2& initializeContextFromDirect3DDevice9Ex(IDirect3DDevice9Ex* pDirect3DDe
             CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for DirectX interop");
     }
 
-    Context2& ctx = Context2::getDefault(false);
+    Context& ctx = Context::getDefault(false);
     initializeContextFromHandle(ctx, platforms[found], context, device);
     g_isDirect3DDevice9Ex = true;
     return ctx;
 #endif
 }
 
-Context2& initializeContextFromDirect3DDevice9(IDirect3DDevice9* pDirect3DDevice9)
+Context& initializeContextFromDirect3DDevice9(IDirect3DDevice9* pDirect3DDevice9)
 {
     (void)pDirect3DDevice9;
 #if !defined(HAVE_DIRECTX)
@@ -665,7 +665,7 @@ Context2& initializeContextFromDirect3DDevice9(IDirect3DDevice9* pDirect3DDevice
             CV_Error(cv::Error::OpenCLInitError, "OpenCL: Can't create context for DirectX interop");
     }
 
-    Context2& ctx = Context2::getDefault(false);
+    Context& ctx = Context::getDefault(false);
     initializeContextFromHandle(ctx, platforms[found], context, device);
     g_isDirect3DDevice9Ex = false;
     return ctx;
@@ -720,7 +720,7 @@ void convertToD3D11Texture2D(InputArray src, ID3D11Texture2D* pD3D11Texture2D)
     CV_Assert(srcSize.width == (int)desc.Width && srcSize.height == (int)desc.Height);
 
     using namespace cv::ocl;
-    Context2& ctx = Context2::getDefault();
+    Context& ctx = Context::getDefault();
     cl_context context = (cl_context)ctx.ptr();
 
     UMat u = src.getUMat();
@@ -777,7 +777,7 @@ void convertFromD3D11Texture2D(ID3D11Texture2D* pD3D11Texture2D, OutputArray dst
     CV_Assert(textureType >= 0);
 
     using namespace cv::ocl;
-    Context2& ctx = Context2::getDefault();
+    Context& ctx = Context::getDefault();
     cl_context context = (cl_context)ctx.ptr();
 
     // TODO Need to specify ACCESS_WRITE here somehow to prevent useless data copying!
@@ -868,7 +868,7 @@ void convertToD3D10Texture2D(InputArray src, ID3D10Texture2D* pD3D10Texture2D)
     CV_Assert(srcSize.width == (int)desc.Width && srcSize.height == (int)desc.Height);
 
     using namespace cv::ocl;
-    Context2& ctx = Context2::getDefault();
+    Context& ctx = Context::getDefault();
     cl_context context = (cl_context)ctx.ptr();
 
     UMat u = src.getUMat();
@@ -925,7 +925,7 @@ void convertFromD3D10Texture2D(ID3D10Texture2D* pD3D10Texture2D, OutputArray dst
     CV_Assert(textureType >= 0);
 
     using namespace cv::ocl;
-    Context2& ctx = Context2::getDefault();
+    Context& ctx = Context::getDefault();
     cl_context context = (cl_context)ctx.ptr();
 
     // TODO Need to specify ACCESS_WRITE here somehow to prevent useless data copying!
@@ -1019,7 +1019,7 @@ void convertToDirect3DSurface9(InputArray src, IDirect3DSurface9* pDirect3DSurfa
     CV_Assert(srcSize.width == (int)desc.Width && srcSize.height == (int)desc.Height);
 
     using namespace cv::ocl;
-    Context2& ctx = Context2::getDefault();
+    Context& ctx = Context::getDefault();
     cl_context context = (cl_context)ctx.ptr();
 
     UMat u = src.getUMat();
@@ -1083,7 +1083,7 @@ void convertFromDirect3DSurface9(IDirect3DSurface9* pDirect3DSurface9, OutputArr
     CV_Assert(surfaceType >= 0);
 
     using namespace cv::ocl;
-    Context2& ctx = Context2::getDefault();
+    Context& ctx = Context::getDefault();
     cl_context context = (cl_context)ctx.ptr();
 
     // TODO Need to specify ACCESS_WRITE here somehow to prevent useless data copying!
diff --git a/modules/core/src/drawing.cpp b/modules/core/src/drawing.cpp
index 5cc498256..0ba932163 100644
--- a/modules/core/src/drawing.cpp
+++ b/modules/core/src/drawing.cpp
@@ -1568,9 +1568,11 @@ PolyLine( Mat& img, const Point* v, int count, bool is_closed,
 *                              External functions                                        *
 \****************************************************************************************/
 
-void line( Mat& img, Point pt1, Point pt2, const Scalar& color,
+void line( InputOutputArray _img, Point pt1, Point pt2, const Scalar& color,
            int thickness, int line_type, int shift )
 {
+    Mat img = _img.getMat();
+
     if( line_type == CV_AA && img.depth() != CV_8U )
         line_type = 8;
 
@@ -1582,10 +1584,12 @@ void line( Mat& img, Point pt1, Point pt2, const Scalar& color,
     ThickLine( img, pt1, pt2, buf, thickness, line_type, 3, shift );
 }
 
-void rectangle( Mat& img, Point pt1, Point pt2,
+void rectangle( InputOutputArray _img, Point pt1, Point pt2,
                 const Scalar& color, int thickness,
                 int lineType, int shift )
 {
+    Mat img = _img.getMat();
+
     if( lineType == CV_AA && img.depth() != CV_8U )
         lineType = 8;
 
@@ -1622,9 +1626,11 @@ void rectangle( Mat& img, Rect rec,
 }
 
 
-void circle( Mat& img, Point center, int radius,
+void circle( InputOutputArray _img, Point center, int radius,
              const Scalar& color, int thickness, int line_type, int shift )
 {
+    Mat img = _img.getMat();
+
     if( line_type == CV_AA && img.depth() != CV_8U )
         line_type = 8;
 
@@ -1647,10 +1653,12 @@ void circle( Mat& img, Point center, int radius,
 }
 
 
-void ellipse( Mat& img, Point center, Size axes,
+void ellipse( InputOutputArray _img, Point center, Size axes,
               double angle, double start_angle, double end_angle,
               const Scalar& color, int thickness, int line_type, int shift )
 {
+    Mat img = _img.getMat();
+
     if( line_type == CV_AA && img.depth() != CV_8U )
         line_type = 8;
 
@@ -1672,9 +1680,11 @@ void ellipse( Mat& img, Point center, Size axes,
                _end_angle, buf, thickness, line_type );
 }
 
-void ellipse(Mat& img, const RotatedRect& box, const Scalar& color,
+void ellipse(InputOutputArray _img, const RotatedRect& box, const Scalar& color,
              int thickness, int lineType)
 {
+    Mat img = _img.getMat();
+
     if( lineType == CV_AA && img.depth() != CV_8U )
         lineType = 8;
 
@@ -1918,11 +1928,12 @@ static const int* getFontData(int fontFace)
 }
 
 
-void putText( Mat& img, const String& text, Point org,
+void putText( InputOutputArray _img, const String& text, Point org,
               int fontFace, double fontScale, Scalar color,
               int thickness, int line_type, bool bottomLeftOrigin )
 
 {
+    Mat img = _img.getMat();
     const int* ascii = getFontData(fontFace);
 
     double buf[4];
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index c39f11d4f..1d3a67b5e 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -1545,7 +1545,7 @@ class PlanCache
             clStridesOut[2] = dft_rows ? clStridesOut[1] : dft_size.width * clStridesOut[1];
 
             // TODO remove all plans if context changed
-            CLAMDDFT_Assert(clAmdFftCreateDefaultPlan(&plHandle, (cl_context)ocl::Context2::getDefault().ptr(), dim, clLengthsIn))
+            CLAMDDFT_Assert(clAmdFftCreateDefaultPlan(&plHandle, (cl_context)ocl::Context::getDefault().ptr(), dim, clLengthsIn))
 
             // setting plan properties
             CLAMDDFT_Assert(clAmdFftSetPlanPrecision(plHandle, doubleFP ? CLFFT_DOUBLE : CLFFT_SINGLE));
@@ -1560,8 +1560,8 @@ class PlanCache
             CLAMDDFT_Assert(clAmdFftSetPlanScale(plHandle, dft_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale))
 
             // ready to bake
-            cl_command_queue commandQueue = (cl_command_queue)ocl::Queue::getDefault().ptr();
-            CLAMDDFT_Assert(clAmdFftBakePlan(plHandle, 1, &commandQueue, NULL, NULL))
+            cl_command_queue queue = (cl_command_queue)ocl::Queue::getDefault().ptr();
+            CLAMDDFT_Assert(clAmdFftBakePlan(plHandle, 1, &queue, NULL, NULL))
         }
 
         ~FftPlan()
@@ -1593,7 +1593,7 @@ public:
     clAmdFftPlanHandle getPlanHandle(const Size & dft_size, int src_step, int dst_step, bool doubleFP,
                                      bool inplace, int flags, FftType fftType)
     {
-        cl_context currentContext = (cl_context)ocl::Context2::getDefault().ptr();
+        cl_context currentContext = (cl_context)ocl::Context::getDefault().ptr();
 
         for (size_t i = 0, size = planStorage.size(); i < size; i ++)
         {
@@ -1704,11 +1704,11 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags)
     cl_mem srcarg = (cl_mem)src.handle(ACCESS_READ);
     cl_mem dstarg = (cl_mem)dst.handle(ACCESS_RW);
 
-    cl_command_queue commandQueue = (cl_command_queue)ocl::Queue::getDefault().ptr();
+    cl_command_queue queue = (cl_command_queue)ocl::Queue::getDefault().ptr();
     cl_event e = 0;
 
     CLAMDDFT_Assert(clAmdFftEnqueueTransform(plHandle, dft_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD,
-                                       1, &commandQueue, 0, NULL, &e,
+                                       1, &queue, 0, NULL, &e,
                                        &srcarg, &dstarg, (cl_mem)tmpBuffer.handle(ACCESS_RW)))
 
     tmpBuffer.addref();
@@ -1726,9 +1726,9 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags)
 void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
 {
 #ifdef HAVE_CLAMDFFT
-    if (ocl::useOpenCL() && ocl::haveAmdFft() && _dst.isUMat() && _src0.dims() <= 2
-            && nonzero_rows == 0 && ocl_dft(_src0, _dst, flags))
-        return;
+    CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
+            _dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0,
+               ocl_dft(_src0, _dst, flags))
 #endif
 
     static DFTFunc dft_tbl[6] =
@@ -2135,6 +2135,8 @@ void cv::idft( InputArray src, OutputArray dst, int flags, int nonzero_rows )
     dft( src, dst, flags | DFT_INVERSE, nonzero_rows );
 }
 
+#ifdef HAVE_OPENCL
+
 namespace cv {
 
 static bool ocl_mulSpectrums( InputArray _srcA, InputArray _srcB,
@@ -2168,12 +2170,13 @@ static bool ocl_mulSpectrums( InputArray _srcA, InputArray _srcB,
 
 }
 
+#endif
+
 void cv::mulSpectrums( InputArray _srcA, InputArray _srcB,
                        OutputArray _dst, int flags, bool conjB )
 {
-    if (ocl::useOpenCL() && _dst.isUMat() &&
+    CV_OCL_RUN(_dst.isUMat() && _srcA.dims() <= 2 && _srcB.dims() <= 2,
             ocl_mulSpectrums(_srcA, _srcB, _dst, flags, conjB))
-        return;
 
     Mat srcA = _srcA.getMat(), srcB = _srcB.getMat();
     int depth = srcA.depth(), cn = srcA.channels(), type = srcA.type();
@@ -2577,7 +2580,7 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags )
 
     DCTFunc dct_func = dct_tbl[(int)inv + (depth == CV_64F)*2];
 
-    if( (flags & DFT_ROWS) || src.rows == 1 ||
+    if( (flags & DCT_ROWS) || src.rows == 1 ||
         (src.cols == 1 && (src.isContinuous() && dst.isContinuous())))
     {
         stage = end_stage = 0;
@@ -2597,7 +2600,7 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags )
         {
             len = src.cols;
             count = src.rows;
-            if( len == 1 && !(flags & DFT_ROWS) )
+            if( len == 1 && !(flags & DCT_ROWS) )
             {
                 len = src.rows;
                 count = 1;
diff --git a/modules/core/src/gl_core_3_1.cpp b/modules/core/src/gl_core_3_1.cpp
index 48201b4b7..318eb50e1 100644
--- a/modules/core/src/gl_core_3_1.cpp
+++ b/modules/core/src/gl_core_3_1.cpp
@@ -44,22 +44,27 @@
 #include "gl_core_3_1.hpp"
 
 #ifdef HAVE_OPENGL
-    #if defined(__APPLE__)
-        #include <mach-o/dyld.h>
+
+    #ifdef __APPLE__
+        #include <dlfcn.h>
 
         static void* AppleGLGetProcAddress (const char* name)
         {
-            static const struct mach_header* image = 0;
-            if (!image)
-                image = NSAddImage("/System/Library/Frameworks/OpenGL.framework/Versions/Current/OpenGL", NSADDIMAGE_OPTION_RETURN_ON_ERROR);
+            static bool initialized = false;
+            static void * handle = NULL;
+            if (!handle)
+            {
+                if (!initialized)
+                {
+                    initialized = true;
+                    const char * const path = "/System/Library/Frameworks/OpenGL.framework/Versions/Current/OpenGL";
 
-            // prepend a '_' for the Unix C symbol mangling convention
-            String symbolName = "_";
-            symbolName += String(name);
-
-            NSSymbol symbol = image ? NSLookupSymbolInImage(image, &symbolName[0], NSLOOKUPSYMBOLINIMAGE_OPTION_BIND | NSLOOKUPSYMBOLINIMAGE_OPTION_RETURN_ON_ERROR) : 0;
-
-            return symbol ? NSAddressOfSymbol(symbol) : 0;
+                    handle = dlopen(path, RTLD_LAZY | RTLD_GLOBAL);
+                }
+                if (!handle)
+                    return NULL;
+            }
+            return dlsym(handle, name);
         }
     #endif // __APPLE__
 
diff --git a/modules/core/src/glob.cpp b/modules/core/src/glob.cpp
index c75bd2e66..93dc72ff8 100644
--- a/modules/core/src/glob.cpp
+++ b/modules/core/src/glob.cpp
@@ -64,7 +64,7 @@ namespace
         HANDLE handle;
         dirent ent;
 #ifdef HAVE_WINRT
-        DIR() {};
+        DIR() { }
         ~DIR()
         {
             if (ent.d_name)
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 0b596071a..f81e83553 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -54,6 +54,7 @@ static const float atan2_p3 = -0.3258083974640975f*(float)(180/CV_PI);
 static const float atan2_p5 = 0.1555786518463281f*(float)(180/CV_PI);
 static const float atan2_p7 = -0.04432655554792128f*(float)(180/CV_PI);
 
+#ifdef HAVE_OPENCL
 
 enum { OCL_OP_LOG=0, OCL_OP_EXP=1, OCL_OP_MAG=2, OCL_OP_PHASE_DEGREES=3, OCL_OP_PHASE_RADIANS=4 };
 
@@ -98,6 +99,8 @@ static bool ocl_math_op(InputArray _src1, InputArray _src2, OutputArray _dst, in
     return k.run(2, globalsize, 0, false);
 }
 
+#endif
+
 float fastAtan2( float y, float x )
 {
     float ax = std::abs(x), ay = std::abs(y);
@@ -401,11 +404,8 @@ void magnitude( InputArray src1, InputArray src2, OutputArray dst )
     int type = src1.type(), depth = src1.depth(), cn = src1.channels();
     CV_Assert( src1.size() == src2.size() && type == src2.type() && (depth == CV_32F || depth == CV_64F));
 
-    bool use_opencl = dst.isUMat() && ocl::useOpenCL()
-        && src1.dims() <= 2 && src2.dims() <= 2;
-
-    if(use_opencl && ocl_math_op(src1, src2, dst, OCL_OP_MAG) )
-        return;
+    CV_OCL_RUN(dst.isUMat() && src1.dims() <= 2 && src2.dims() <= 2,
+               ocl_math_op(src1, src2, dst, OCL_OP_MAG))
 
     Mat X = src1.getMat(), Y = src2.getMat();
     dst.create(X.dims, X.size, X.type());
@@ -439,11 +439,8 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
     int type = src1.type(), depth = src1.depth(), cn = src1.channels();
     CV_Assert( src1.size() == src2.size() && type == src2.type() && (depth == CV_32F || depth == CV_64F));
 
-    bool use_opencl = dst.isUMat() && ocl::useOpenCL()
-        && src1.dims() <= 2 && src2.dims() <= 2;
-
-    if(use_opencl && ocl_math_op(src1, src2, dst, angleInDegrees ? OCL_OP_PHASE_DEGREES : OCL_OP_PHASE_RADIANS) )
-        return;
+    CV_OCL_RUN(dst.isUMat() && src1.dims() <= 2 && src2.dims() <= 2,
+               ocl_math_op(src1, src2, dst, angleInDegrees ? OCL_OP_PHASE_DEGREES : OCL_OP_PHASE_RADIANS))
 
     Mat X = src1.getMat(), Y = src2.getMat();
     dst.create( X.dims, X.size, type );
@@ -497,6 +494,8 @@ void phase( InputArray src1, InputArray src2, OutputArray dst, bool angleInDegre
     }
 }
 
+#ifdef HAVE_OPENCL
+
 static bool ocl_cartToPolar( InputArray _src1, InputArray _src2,
                              OutputArray _dst1, OutputArray _dst2, bool angleInDegrees )
 {
@@ -533,12 +532,13 @@ static bool ocl_cartToPolar( InputArray _src1, InputArray _src2,
     return k.run(2, globalsize, NULL, false);
 }
 
+#endif
+
 void cartToPolar( InputArray src1, InputArray src2,
                   OutputArray dst1, OutputArray dst2, bool angleInDegrees )
 {
-    if (ocl::useOpenCL() && dst1.isUMat() && dst2.isUMat() &&
+    CV_OCL_RUN(dst1.isUMat() && dst2.isUMat(),
             ocl_cartToPolar(src1, src2, dst1, dst2, angleInDegrees))
-        return;
 
     Mat X = src1.getMat(), Y = src2.getMat();
     int type = X.type(), depth = X.depth(), cn = X.channels();
@@ -683,6 +683,8 @@ static void SinCos_32f( const float *angle, float *sinval, float* cosval,
 }
 
 
+#ifdef HAVE_OPENCL
+
 static bool ocl_polarToCart( InputArray _mag, InputArray _angle,
                              OutputArray _dst1, OutputArray _dst2, bool angleInDegrees )
 {
@@ -715,15 +717,16 @@ static bool ocl_polarToCart( InputArray _mag, InputArray _angle,
     return k.run(2, globalsize, NULL, false);
 }
 
+#endif
+
 void polarToCart( InputArray src1, InputArray src2,
                   OutputArray dst1, OutputArray dst2, bool angleInDegrees )
 {
     int type = src2.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     CV_Assert((depth == CV_32F || depth == CV_64F) && (src1.empty() || src1.type() == type));
 
-    if (ocl::useOpenCL() && !src1.empty() && src2.dims() <= 2 && dst1.isUMat() && dst2.isUMat() &&
-            ocl_polarToCart(src1, src2, dst1, dst2, angleInDegrees))
-        return;
+    CV_OCL_RUN(!src1.empty() && src2.dims() <= 2 && dst1.isUMat() && dst2.isUMat(),
+               ocl_polarToCart(src1, src2, dst1, dst2, angleInDegrees))
 
     Mat Mag = src1.getMat(), Angle = src2.getMat();
     CV_Assert( Mag.empty() || Angle.size == Mag.size);
@@ -1289,10 +1292,8 @@ void exp( InputArray _src, OutputArray _dst )
     int type = _src.type(), depth = _src.depth(), cn = _src.channels();
     CV_Assert( depth == CV_32F || depth == CV_64F );
 
-    bool use_opencl = _dst.isUMat() && ocl::useOpenCL() && _src.dims() <= 2;
-
-    if(use_opencl && ocl_math_op(_src, noArray(), _dst, OCL_OP_EXP) )
-        return;
+    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
+               ocl_math_op(_src, noArray(), _dst, OCL_OP_EXP))
 
     Mat src = _src.getMat();
     _dst.create( src.dims, src.size, type );
@@ -1938,10 +1939,8 @@ void log( InputArray _src, OutputArray _dst )
     int type = _src.type(), depth = _src.depth(), cn = _src.channels();
     CV_Assert( depth == CV_32F || depth == CV_64F );
 
-    bool use_opencl = _dst.isUMat() && ocl::useOpenCL() && _src.dims() <= 2;
-
-    if(use_opencl && ocl_math_op(_src, noArray(), _dst, OCL_OP_LOG) )
-        return;
+    CV_OCL_RUN( _dst.isUMat() && _src.dims() <= 2,
+                ocl_math_op(_src, noArray(), _dst, OCL_OP_LOG))
 
     Mat src = _src.getMat();
     _dst.create( src.dims, src.size, type );
@@ -2032,17 +2031,19 @@ static IPowFunc ipowTab[] =
     (IPowFunc)iPow32s, (IPowFunc)iPow32f, (IPowFunc)iPow64f, 0
 };
 
-static bool ocl_pow(InputArray _src, double power, OutputArray _dst)
+#ifdef HAVE_OPENCL
+
+static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
+                    bool is_ipower, int ipower)
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 
-    if ( !(_src.dims() <= 2 && (depth == CV_32F || depth == CV_64F)) ||
-         (depth == CV_64F && !doubleSupport) )
+    if (depth == CV_64F && !doubleSupport)
         return false;
 
     bool issqrt = std::abs(power - 0.5) < DBL_EPSILON;
-    const char * const op = issqrt ? "OP_SQRT" : "OP_POW";
+    const char * const op = issqrt ? "OP_SQRT" : is_ipower ? "OP_POWN" : "OP_POW";
 
     ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
                   format("-D dstT=%s -D %s -D UNARY_OP%s", ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
@@ -2057,50 +2058,55 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst)
     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
             dstarg = ocl::KernelArg::WriteOnly(dst, cn);
 
-    if (depth == CV_32F)
-        k.args(srcarg, dstarg, (float)power);
+    if (issqrt)
+        k.args(srcarg, dstarg);
+    else if (is_ipower)
+        k.args(srcarg, dstarg, ipower);
     else
-        k.args(srcarg, dstarg, power);
+    {
+        if (depth == CV_32F)
+            k.args(srcarg, dstarg, (float)power);
+        else
+            k.args(srcarg, dstarg, power);
+    }
 
     size_t globalsize[2] = { dst.cols *  cn, dst.rows };
     return k.run(2, globalsize, NULL, false);
 }
 
+#endif
+
 void pow( InputArray _src, double power, OutputArray _dst )
 {
-    if (ocl::useOpenCL() && _dst.isUMat() && ocl_pow(_src, power, _dst))
-        return;
-
-    Mat src = _src.getMat();
-    int type = src.type(), depth = src.depth(), cn = src.channels();
-
-    _dst.create( src.dims, src.size, type );
-    Mat dst = _dst.getMat();
-
-    int ipower = cvRound(power);
-    bool is_ipower = false;
+    bool is_ipower = false, same = false;
+    int type = _src.type(), depth = CV_MAT_DEPTH(type),
+            cn = CV_MAT_CN(type), ipower = cvRound(power);
 
     if( fabs(ipower - power) < DBL_EPSILON )
     {
         if( ipower < 0 )
         {
-            divide( 1., src, dst );
+            divide( 1., _src, _dst );
             if( ipower == -1 )
                 return;
             ipower = -ipower;
-            src = dst;
+            same = true;
         }
 
         switch( ipower )
         {
         case 0:
-            dst = Scalar::all(1);
+            _dst.createSameSize(_src, type);
+            _dst.setTo(Scalar::all(1));
             return;
         case 1:
-            src.copyTo(dst);
+            _src.copyTo(_dst);
             return;
         case 2:
-            multiply(src, src, dst);
+            if (same)
+                multiply(_dst, _dst, _dst);
+            else
+                multiply(_src, _src, _dst);
             return;
         default:
             is_ipower = true;
@@ -2109,6 +2115,22 @@ void pow( InputArray _src, double power, OutputArray _dst )
     else
         CV_Assert( depth == CV_32F || depth == CV_64F );
 
+    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
+               ocl_pow(same ? _dst : _src, power, _dst, is_ipower, ipower))
+
+    Mat src, dst;
+    if (same)
+    {
+        dst = _dst.getMat();
+        src = dst;
+    }
+    else
+    {
+        src = _src.getMat();
+        _dst.create( src.dims, src.size, type );
+        dst = _dst.getMat();
+    }
+
     const Mat* arrays[] = {&src, &dst, 0};
     uchar* ptrs[2];
     NAryMatIterator it(arrays, ptrs);
@@ -2364,12 +2386,35 @@ bool checkRange(InputArray _src, bool quiet, Point* pt, double minVal, double ma
     return badPt.x < 0;
 }
 
+#ifdef HAVE_OPENCL
+
+static bool ocl_patchNaNs( InputOutputArray _a, float value )
+{
+    ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
+                     format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=int"));
+    if (k.empty())
+        return false;
+
+    UMat a = _a.getUMat();
+    int cn = a.channels();
+
+    k.args(ocl::KernelArg::ReadOnlyNoSize(a),
+           ocl::KernelArg::WriteOnly(a, cn), (float)value);
+
+    size_t globalsize[2] = { a.cols * cn, a.rows };
+    return k.run(2, globalsize, NULL, false);
+}
+
+#endif
 
 void patchNaNs( InputOutputArray _a, double _val )
 {
-    Mat a = _a.getMat();
-    CV_Assert( a.depth() == CV_32F );
+    CV_Assert( _a.depth() == CV_32F );
 
+    CV_OCL_RUN(_a.isUMat() && _a.dims() <= 2,
+               ocl_patchNaNs(_a, (float)_val))
+
+    Mat a = _a.getMat();
     const Mat* arrays[] = {&a, 0};
     int* ptrs[1];
     NAryMatIterator it(arrays, (uchar**)ptrs);
diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp
index 16eb6e087..c6dde6509 100644
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
 
 #ifdef HAVE_IPP
@@ -724,7 +725,7 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
 
     UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
     if (haveC)
-        ctrans ? transpose(matC, D) : matC.getMat().copyTo(D); // TODO fix it as soon as .copyTo works as expected
+        ctrans ? transpose(matC, D) : matC.copyTo(D);
     else
         D.setTo(Scalar::all(0));
 
@@ -784,10 +785,8 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha,
            InputArray matC, double beta, OutputArray _matD, int flags )
 {
 #ifdef HAVE_CLAMDBLAS
-    if (ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 &&
-            ocl::useOpenCL() && _matD.isUMat() &&
+    CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat(),
             ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags))
-        return;
 #endif
 
     const int block_lin_size = 128;
@@ -2154,20 +2153,64 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
 
 typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha);
 
+#ifdef HAVE_OPENCL
+
+static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type )
+{
+    int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F);
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    Size size = _src1.size();
+
+    if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() )
+        return false;
+
+    char cvt[2][50];
+    ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
+                  format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D workT=%s -D convertToWT1=%s"
+                         " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s%s", ocl::typeToStr(depth),
+                         ocl::typeToStr(wdepth), ocl::convertTypeStr(depth, wdepth, 1, cvt[0]),
+                         ocl::convertTypeStr(wdepth, depth, 1, cvt[1]),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (k.empty())
+        return false;
+
+    _dst.create(size, type);
+    UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat();
+
+    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
+            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
+            dstarg = ocl::KernelArg::WriteOnly(dst, cn);
+
+    if (wdepth == CV_32F)
+        k.args(src1arg, src2arg, dstarg, (float)alpha);
+    else
+        k.args(src1arg, src2arg, dstarg, alpha);
+
+    size_t globalsize[2] = { dst.cols * cn, dst.rows };
+    return k.run(2, globalsize, NULL, false);
+}
+
+#endif
+
 }
 
 void cv::scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst )
 {
-    Mat src1 = _src1.getMat(), src2 = _src2.getMat();
-    int depth = src1.depth(), cn = src1.channels();
+    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    CV_Assert( type == _src2.type() );
+
+    CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat(),
+            ocl_scaleAdd(_src1, alpha, _src2, _dst, type))
 
-    CV_Assert( src1.type() == src2.type() );
     if( depth < CV_32F )
     {
         addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth);
         return;
     }
 
+    Mat src1 = _src1.getMat(), src2 = _src2.getMat();
+    CV_Assert(src1.size == src2.size);
+
     _dst.create(src1.dims, src1.size, src1.type());
     Mat dst = _dst.getMat();
 
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index 6f2580498..a1d6044d5 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -43,6 +43,8 @@
 #include "precomp.hpp"
 #include "opencl_kernels.hpp"
 
+#include "bufferpool.impl.hpp"
+
 /****************************************************************************************\
 *                           [scaled] Identity matrix initialization                      *
 \****************************************************************************************/
@@ -157,11 +159,17 @@ void MatAllocator::copy(UMatData* usrc, UMatData* udst, int dims, const size_t s
         memcpy(ptrs[1], ptrs[0], planesz);
 }
 
+BufferPoolController* MatAllocator::getBufferPoolController() const
+{
+    static DummyBufferPoolController dummy;
+    return &dummy;
+}
+
 class StdMatAllocator : public MatAllocator
 {
 public:
     UMatData* allocate(int dims, const int* sizes, int type,
-                       void* data0, size_t* step, int /*flags*/) const
+                       void* data0, size_t* step, int /*flags*/, UMatUsageFlags /*usageFlags*/) const
     {
         size_t total = CV_ELEM_SIZE(type);
         for( int i = dims-1; i >= 0; i-- )
@@ -188,10 +196,9 @@ public:
         return u;
     }
 
-    bool allocate(UMatData* u, int /*accessFlags*/) const
+    bool allocate(UMatData* u, int /*accessFlags*/, UMatUsageFlags /*usageFlags*/) const
     {
         if(!u) return false;
-        CV_XADD(&u->urefcount, 1);
         return true;
     }
 
@@ -214,8 +221,8 @@ public:
 
 MatAllocator* Mat::getStdAllocator()
 {
-    static StdMatAllocator allocator;
-    return &allocator;
+    static MatAllocator * allocator = new StdMatAllocator();
+    return allocator;
 }
 
 void swap( Mat& a, Mat& b )
@@ -275,7 +282,7 @@ static inline void setSize( Mat& m, int _dims, const int* _sz,
     if( !_sz )
         return;
 
-    size_t esz = CV_ELEM_SIZE(m.flags), total = esz;
+    size_t esz = CV_ELEM_SIZE(m.flags), esz1 = CV_ELEM_SIZE1(m.flags), total = esz;
     int i;
     for( i = _dims-1; i >= 0; i-- )
     {
@@ -284,7 +291,14 @@ static inline void setSize( Mat& m, int _dims, const int* _sz,
         m.size.p[i] = s;
 
         if( _steps )
+        {
+            if (_steps[i] % esz1 != 0)
+            {
+                CV_Error(Error::BadStep, "Step must be a multiple of esz1");
+            }
+
             m.step.p[i] = i < _dims-1 ? _steps[i] : esz;
+        }
         else if( autoSteps )
         {
             m.step.p[i] = total;
@@ -384,13 +398,13 @@ void Mat::create(int d, const int* _sizes, int _type)
             a = a0;
         try
         {
-            u = a->allocate(dims, size, _type, 0, step.p, 0);
+            u = a->allocate(dims, size, _type, 0, step.p, 0, USAGE_DEFAULT);
             CV_Assert(u != 0);
         }
         catch(...)
         {
             if(a != a0)
-                u = a0->allocate(dims, size, _type, 0, step.p, 0);
+                u = a0->allocate(dims, size, _type, 0, step.p, 0, USAGE_DEFAULT);
             CV_Assert(u != 0);
         }
         CV_Assert( step[dims-1] == (size_t)CV_ELEM_SIZE(flags) );
@@ -1193,7 +1207,6 @@ Mat _InputArray::getMat(int i) const
     return Mat();
 }
 
-
 UMat _InputArray::getUMat(int i) const
 {
     int k = kind();
@@ -1226,7 +1239,6 @@ UMat _InputArray::getUMat(int i) const
     return getMat(i).getUMat(accessFlags);
 }
 
-
 void _InputArray::getMatVector(std::vector<Mat>& mv) const
 {
     int k = kind();
@@ -1324,6 +1336,42 @@ void _InputArray::getMatVector(std::vector<Mat>& mv) const
     CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
 }
 
+void _InputArray::getUMatVector(std::vector<UMat>& umv) const
+{
+    int k = kind();
+    int accessFlags = flags & ACCESS_MASK;
+
+    if( k == NONE )
+    {
+        umv.clear();
+        return;
+    }
+
+    if( k == STD_VECTOR_MAT )
+    {
+        const std::vector<Mat>& v = *(const std::vector<Mat>*)obj;
+        size_t i, n = v.size();
+        umv.resize(n);
+
+        for( i = 0; i < n; i++ )
+            umv[i] = v[i].getUMat(accessFlags);
+        return;
+    }
+
+    if( k == STD_VECTOR_UMAT )
+    {
+        const std::vector<UMat>& v = *(const std::vector<UMat>*)obj;
+        size_t i, n = v.size();
+        umv.resize(n);
+
+        for( i = 0; i < n; i++ )
+            umv[i] = v[i];
+        return;
+    }
+
+    CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
+}
+
 cuda::GpuMat _InputArray::getGpuMat() const
 {
     int k = kind();
@@ -1430,6 +1478,16 @@ Size _InputArray::size(int i) const
         return vv[i].size();
     }
 
+    if( k == STD_VECTOR_UMAT )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        if( i < 0 )
+            return vv.empty() ? Size() : Size((int)vv.size(), 1);
+        CV_Assert( i < (int)vv.size() );
+
+        return vv[i].size();
+    }
+
     if( k == OPENGL_BUFFER )
     {
         CV_Assert( i < 0 );
@@ -1444,11 +1502,6 @@ Size _InputArray::size(int i) const
         return d_mat->size();
     }
 
-    if( k == OCL_MAT )
-    {
-        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
-    }
-
     CV_Assert( k == CUDA_MEM );
     //if( k == CUDA_MEM )
     {
@@ -1458,7 +1511,6 @@ Size _InputArray::size(int i) const
     }
 }
 
-
 int _InputArray::sizend(int* arrsz, int i) const
 {
     int j, d=0, k = kind();
@@ -1517,7 +1569,6 @@ int _InputArray::sizend(int* arrsz, int i) const
     return d;
 }
 
-
 bool _InputArray::sameSize(const _InputArray& arr) const
 {
     int k1 = kind(), k2 = arr.kind();
@@ -1608,6 +1659,16 @@ int _InputArray::dims(int i) const
         return vv[i].dims;
     }
 
+    if( k == STD_VECTOR_UMAT )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        if( i < 0 )
+            return 1;
+        CV_Assert( i < (int)vv.size() );
+
+        return vv[i].dims;
+    }
+
     if( k == OPENGL_BUFFER )
     {
         CV_Assert( i < 0 );
@@ -1620,11 +1681,6 @@ int _InputArray::dims(int i) const
         return 2;
     }
 
-    if( k == OCL_MAT )
-    {
-        return 2;
-    }
-
     CV_Assert( k == CUDA_MEM );
     //if( k == CUDA_MEM )
     {
@@ -1659,6 +1715,16 @@ size_t _InputArray::total(int i) const
         return vv[i].total();
     }
 
+    if( k == STD_VECTOR_UMAT )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        if( i < 0 )
+            return vv.size();
+
+        CV_Assert( i < (int)vv.size() );
+        return vv[i].total();
+    }
+
     return size(i).area();
 }
 
@@ -1681,6 +1747,18 @@ int _InputArray::type(int i) const
     if( k == NONE )
         return -1;
 
+    if( k == STD_VECTOR_UMAT )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        if( vv.empty() )
+        {
+            CV_Assert((flags & FIXED_TYPE) != 0);
+            return CV_MAT_TYPE(flags);
+        }
+        CV_Assert( i < (int)vv.size() );
+        return vv[i >= 0 ? i : 0].type();
+    }
+
     if( k == STD_VECTOR_MAT )
     {
         const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
@@ -1751,14 +1829,15 @@ bool _InputArray::empty() const
         return vv.empty();
     }
 
+    if( k == STD_VECTOR_UMAT )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        return vv.empty();
+    }
+
     if( k == OPENGL_BUFFER )
         return ((const ogl::Buffer*)obj)->empty();
 
-    if( k == OCL_MAT )
-    {
-        CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
-    }
-
     if( k == GPU_MAT )
         return ((const cuda::GpuMat*)obj)->empty();
 
@@ -1794,7 +1873,38 @@ bool _InputArray::isContinuous(int i) const
         return vv[i].isContinuous();
     }
 
-    CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    CV_Error(CV_StsNotImplemented, "Unknown/unsupported array type");
+    return false;
+}
+
+bool _InputArray::isSubmatrix(int i) const
+{
+    int k = kind();
+
+    if( k == MAT )
+        return i < 0 ? ((const Mat*)obj)->isSubmatrix() : false;
+
+    if( k == UMAT )
+        return i < 0 ? ((const UMat*)obj)->isSubmatrix() : false;
+
+    if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR)
+        return false;
+
+    if( k == STD_VECTOR_MAT )
+    {
+        const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
+        CV_Assert((size_t)i < vv.size());
+        return vv[i].isSubmatrix();
+    }
+
+    if( k == STD_VECTOR_UMAT )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        CV_Assert((size_t)i < vv.size());
+        return vv[i].isSubmatrix();
+    }
+
+    CV_Error(CV_StsNotImplemented, "");
     return false;
 }
 
@@ -2228,7 +2338,66 @@ void _OutputArray::create(int d, const int* sizes, int mtype, int i,
             if(CV_MAT_CN(mtype) == m.channels() && ((1 << CV_MAT_TYPE(flags)) & fixedDepthMask) != 0 )
                 mtype = m.type();
             else
-                CV_Assert(!fixedType() || (CV_MAT_CN(mtype) == m.channels() && ((1 << CV_MAT_TYPE(flags)) & fixedDepthMask) != 0));
+                CV_Assert(CV_MAT_TYPE(mtype) == m.type());
+        }
+        if(fixedSize())
+        {
+            CV_Assert(m.dims == d);
+            for(int j = 0; j < d; ++j)
+                CV_Assert(m.size[j] == sizes[j]);
+        }
+
+        m.create(d, sizes, mtype);
+        return;
+    }
+
+    if( k == STD_VECTOR_UMAT )
+    {
+        std::vector<UMat>& v = *(std::vector<UMat>*)obj;
+
+        if( i < 0 )
+        {
+            CV_Assert( d == 2 && (sizes[0] == 1 || sizes[1] == 1 || sizes[0]*sizes[1] == 0) );
+            size_t len = sizes[0]*sizes[1] > 0 ? sizes[0] + sizes[1] - 1 : 0, len0 = v.size();
+
+            CV_Assert(!fixedSize() || len == len0);
+            v.resize(len);
+            if( fixedType() )
+            {
+                int _type = CV_MAT_TYPE(flags);
+                for( size_t j = len0; j < len; j++ )
+                {
+                    if( v[j].type() == _type )
+                        continue;
+                    CV_Assert( v[j].empty() );
+                    v[j].flags = (v[j].flags & ~CV_MAT_TYPE_MASK) | _type;
+                }
+            }
+            return;
+        }
+
+        CV_Assert( i < (int)v.size() );
+        UMat& m = v[i];
+
+        if( allowTransposed )
+        {
+            if( !m.isContinuous() )
+            {
+                CV_Assert(!fixedType() && !fixedSize());
+                m.release();
+            }
+
+            if( d == 2 && m.dims == 2 && m.u &&
+                m.type() == mtype && m.rows == sizes[1] && m.cols == sizes[0] )
+                return;
+        }
+
+        if(fixedType())
+        {
+            if(CV_MAT_CN(mtype) == m.channels() && ((1 << CV_MAT_TYPE(flags)) & fixedDepthMask) != 0 )
+                mtype = m.type();
+            else
+                CV_Assert(CV_MAT_TYPE(mtype) == m.type());
         }
         if(fixedSize())
         {
@@ -2262,6 +2431,12 @@ void _OutputArray::release() const
         return;
     }
 
+    if( k == UMAT )
+    {
+        ((UMat*)obj)->release();
+        return;
+    }
+
     if( k == GPU_MAT )
     {
         ((cuda::GpuMat*)obj)->release();
@@ -2301,6 +2476,12 @@ void _OutputArray::release() const
         return;
     }
 
+    if( k == STD_VECTOR_UMAT )
+    {
+        ((std::vector<UMat>*)obj)->clear();
+        return;
+    }
+
     CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
 }
 
@@ -2340,6 +2521,23 @@ Mat& _OutputArray::getMatRef(int i) const
     }
 }
 
+UMat& _OutputArray::getUMatRef(int i) const
+{
+    int k = kind();
+    if( i < 0 )
+    {
+        CV_Assert( k == UMAT );
+        return *(UMat*)obj;
+    }
+    else
+    {
+        CV_Assert( k == STD_VECTOR_UMAT );
+        std::vector<UMat>& v = *(std::vector<UMat>*)obj;
+        CV_Assert( i < (int)v.size() );
+        return v[i];
+    }
+}
+
 cuda::GpuMat& _OutputArray::getGpuMatRef() const
 {
     int k = kind();
@@ -2361,7 +2559,7 @@ cuda::CudaMem& _OutputArray::getCudaMemRef() const
     return *(cuda::CudaMem*)obj;
 }
 
-void _OutputArray::setTo(const _InputArray& arr) const
+void _OutputArray::setTo(const _InputArray& arr, const _InputArray & mask) const
 {
     int k = kind();
 
@@ -2370,10 +2568,16 @@ void _OutputArray::setTo(const _InputArray& arr) const
     else if( k == MAT || k == MATX || k == STD_VECTOR )
     {
         Mat m = getMat();
-        m.setTo(arr);
+        m.setTo(arr, mask);
     }
     else if( k == UMAT )
-        ((UMat*)obj)->setTo(arr);
+        ((UMat*)obj)->setTo(arr, mask);
+    else if( k == GPU_MAT )
+    {
+        Mat value = arr.getMat();
+        CV_Assert( checkScalar(value, type(), arr.kind(), _InputArray::GPU_MAT) );
+        ((cuda::GpuMat*)obj)->setTo(Scalar(Vec<double, 4>((double *)value.data)), mask);
+    }
     else
         CV_Error(Error::StsNotImplemented, "");
 }
@@ -2469,6 +2673,8 @@ void cv::vconcat(InputArray _src, OutputArray dst)
 
 //////////////////////////////////////// set identity ////////////////////////////////////////////
 
+#ifdef HAVE_OPENCL
+
 namespace cv {
 
 static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
@@ -2491,12 +2697,14 @@ static bool ocl_setIdentity( InputOutputArray _m, const Scalar& s )
 
 }
 
+#endif
+
 void cv::setIdentity( InputOutputArray _m, const Scalar& s )
 {
     CV_Assert( _m.dims() <= 2 );
 
-    if (ocl::useOpenCL() && _m.isUMat() && ocl_setIdentity(_m, s))
-        return;
+    CV_OCL_RUN(_m.isUMat(),
+               ocl_setIdentity(_m, s))
 
     Mat m = _m.getMat();
     int i, j, rows = m.rows, cols = m.cols, type = m.type();
@@ -2675,6 +2883,8 @@ static TransposeInplaceFunc transposeInplaceTab[] =
     0, 0, 0, 0, 0, 0, 0, transposeI_32sC6, 0, 0, 0, 0, 0, 0, 0, transposeI_32sC8
 };
 
+#ifdef HAVE_OPENCL
+
 static inline int divUp(int a, int b)
 {
     return (a + b - 1) / b;
@@ -2704,6 +2914,9 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst )
     ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc,
                   format("-D T=%s -D TILE_DIM=%d -D BLOCK_ROWS=%d",
                          ocl::memopTypeToStr(type), TILE_DIM, BLOCK_ROWS));
+    if (k.empty())
+        return false;
+
     if (inplace)
         k.args(ocl::KernelArg::ReadWriteNoSize(dst), dst.rows);
     else
@@ -2716,6 +2929,8 @@ static bool ocl_transpose( InputArray _src, OutputArray _dst )
     return k.run(2, globalsize, localsize, false);
 }
 
+#endif
+
 }
 
 void cv::transpose( InputArray _src, OutputArray _dst )
@@ -2723,8 +2938,8 @@ void cv::transpose( InputArray _src, OutputArray _dst )
     int type = _src.type(), esz = CV_ELEM_SIZE(type);
     CV_Assert( _src.dims() <= 2 && esz <= 32 );
 
-    if (ocl::useOpenCL() && _dst.isUMat() && ocl_transpose(_src, _dst))
-        return;
+    CV_OCL_RUN(_dst.isUMat(),
+               ocl_transpose(_src, _dst))
 
     Mat src = _src.getMat();
     if( src.empty() )
@@ -2760,39 +2975,24 @@ void cv::transpose( InputArray _src, OutputArray _dst )
 }
 
 
+////////////////////////////////////// completeSymm /////////////////////////////////////////
+
 void cv::completeSymm( InputOutputArray _m, bool LtoR )
 {
     Mat m = _m.getMat();
-    CV_Assert( m.dims <= 2 );
+    size_t step = m.step, esz = m.elemSize();
+    CV_Assert( m.dims <= 2 && m.rows == m.cols );
 
-    int i, j, nrows = m.rows, type = m.type();
-    int j0 = 0, j1 = nrows;
-    CV_Assert( m.rows == m.cols );
+    int rows = m.rows;
+    int j0 = 0, j1 = rows;
 
-    if( type == CV_32FC1 || type == CV_32SC1 )
+    uchar* data = m.data;
+    for( int i = 0; i < rows; i++ )
     {
-        int* data = (int*)m.data;
-        size_t step = m.step/sizeof(data[0]);
-        for( i = 0; i < nrows; i++ )
-        {
-            if( !LtoR ) j1 = i; else j0 = i+1;
-            for( j = j0; j < j1; j++ )
-                data[i*step + j] = data[j*step + i];
-        }
+        if( !LtoR ) j1 = i; else j0 = i+1;
+        for( int j = j0; j < j1; j++ )
+            memcpy(data + (i*step + j*esz), data + (j*step + i*esz), esz);
     }
-    else if( type == CV_64FC1 )
-    {
-        double* data = (double*)m.data;
-        size_t step = m.step/sizeof(data[0]);
-        for( i = 0; i < nrows; i++ )
-        {
-            if( !LtoR ) j1 = i; else j0 = i+1;
-            for( j = j0; j < j1; j++ )
-                data[i*step + j] = data[j*step + i];
-        }
-    }
-    else
-        CV_Error( CV_StsUnsupportedFormat, "" );
 }
 
 
@@ -2969,23 +3169,83 @@ typedef void (*ReduceFunc)( const Mat& src, Mat& dst );
 #define reduceMinC32f reduceC_<float, float, OpMin<float> >
 #define reduceMinC64f reduceC_<double,double,OpMin<double> >
 
+#ifdef HAVE_OPENCL
+
+namespace cv {
+
+static bool ocl_reduce(InputArray _src, OutputArray _dst,
+                       int dim, int op, int op0, int stype, int dtype)
+{
+    int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
+            ddepth = CV_MAT_DEPTH(dtype), ddepth0 = ddepth;
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
+        return false;
+
+    if (op == CV_REDUCE_AVG)
+    {
+        op = CV_REDUCE_SUM;
+        if (sdepth < CV_32S && ddepth < CV_32S)
+            ddepth = CV_32S;
+    }
+
+    const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
+                                  "OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
+    char cvt[40];
+    ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc,
+                  format("-D %s -D dim=%d -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
+                         ops[op], dim, cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
+                         ocl::convertTypeStr(sdepth, ddepth, 1, cvt),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (k.empty())
+        return false;
+
+    UMat src = _src.getUMat();
+    Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
+    _dst.create(dsize, dtype);
+    UMat dst = _dst.getUMat(), temp = dst;
+
+    if (op0 == CV_REDUCE_AVG && sdepth < CV_32S && ddepth0 < CV_32S)
+        temp.create(dsize, CV_32SC(cn));
+
+    size_t globalsize = std::max(dsize.width, dsize.height);
+
+    k.args(ocl::KernelArg::ReadOnly(src),
+           ocl::KernelArg::WriteOnlyNoSize(temp));
+    if (!k.run(1, &globalsize, NULL, false))
+        return false;
+
+    if (op0 == CV_REDUCE_AVG)
+        temp.convertTo(dst, ddepth0, 1. / (dim == 0 ? src.rows : src.cols));
+
+    return true;
+}
+
+}
+
+#endif
+
 void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
 {
-    Mat src = _src.getMat();
-    CV_Assert( src.dims <= 2 );
+    CV_Assert( _src.dims() <= 2 );
     int op0 = op;
-    int stype = src.type(), sdepth = src.depth(), cn = src.channels();
+    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype);
     if( dtype < 0 )
         dtype = _dst.fixedType() ? _dst.type() : stype;
+    dtype = CV_MAKETYPE(dtype >= 0 ? dtype : stype, cn);
     int ddepth = CV_MAT_DEPTH(dtype);
 
-    _dst.create(dim == 0 ? 1 : src.rows, dim == 0 ? src.cols : 1,
-                CV_MAKETYPE(dtype >= 0 ? dtype : stype, cn));
-    Mat dst = _dst.getMat(), temp = dst;
-
+    CV_Assert( cn == CV_MAT_CN(dtype) );
     CV_Assert( op == CV_REDUCE_SUM || op == CV_REDUCE_MAX ||
                op == CV_REDUCE_MIN || op == CV_REDUCE_AVG );
-    CV_Assert( src.channels() == dst.channels() );
+
+    CV_OCL_RUN(_dst.isUMat(),
+               ocl_reduce(_src, _dst, dim, op, op0, stype, dtype))
+
+    Mat src = _src.getMat();
+    _dst.create(dim == 0 ? 1 : src.rows, dim == 0 ? src.cols : 1, dtype);
+    Mat dst = _dst.getMat(), temp = dst;
 
     if( op == CV_REDUCE_AVG )
     {
@@ -4944,6 +5204,30 @@ void normalize( const SparseMat& src, SparseMat& dst, double a, int norm_type )
 
 ////////////////////// RotatedRect //////////////////////
 
+RotatedRect::RotatedRect(const Point2f& _point1, const Point2f& _point2, const Point2f& _point3)
+{
+    Point2f _center = 0.5f * (_point1 + _point3);
+    Vec2f vecs[2];
+    vecs[0] = Vec2f(_point1 - _point2);
+    vecs[1] = Vec2f(_point2 - _point3);
+    // check that given sides are perpendicular
+    CV_Assert( abs(vecs[0].dot(vecs[1])) / (norm(vecs[0]) * norm(vecs[1])) <= FLT_EPSILON );
+
+    // wd_i stores which vector (0,1) or (1,2) will make the width
+    // One of them will definitely have slope within -1 to 1
+    int wd_i = 0;
+    if( abs(vecs[1][1]) < abs(vecs[1][0]) ) wd_i = 1;
+    int ht_i = (wd_i + 1) % 2;
+
+    float _angle = atan(vecs[wd_i][1] / vecs[wd_i][0]) * 180.0f / (float) CV_PI;
+    float _width = (float) norm(vecs[wd_i]);
+    float _height = (float) norm(vecs[ht_i]);
+
+    center = _center;
+    size = Size2f(_width, _height);
+    angle = _angle;
+}
+
 void RotatedRect::points(Point2f pt[]) const
 {
     double _angle = angle*CV_PI/180.;
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 67e54234c..9c92b8381 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -40,7 +40,49 @@
 //M*/
 
 #include "precomp.hpp"
+#include <list>
 #include <map>
+#include <string>
+#include <sstream>
+#include <iostream> // std::cerr
+
+#define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
+
+#include "opencv2/core/bufferpool.hpp"
+#ifndef LOG_BUFFER_POOL
+# if 0
+#   define LOG_BUFFER_POOL printf
+# else
+#   define LOG_BUFFER_POOL(...)
+# endif
+#endif
+
+// TODO Move to some common place
+static size_t getConfigurationParameterForSize(const char* name, size_t defaultValue)
+{
+    const char* envValue = getenv(name);
+    if (envValue == NULL)
+    {
+        return defaultValue;
+    }
+    cv::String value = envValue;
+    size_t pos = 0;
+    for (; pos < value.size(); pos++)
+    {
+        if (!isdigit(value[pos]))
+            break;
+    }
+    cv::String valueStr = value.substr(0, pos);
+    cv::String suffixStr = value.substr(pos, value.length() - pos);
+    int v = atoi(valueStr.c_str());
+    if (suffixStr.length() == 0)
+        return v;
+    else if (suffixStr == "MB" || suffixStr == "Mb" || suffixStr == "mb")
+        return v * 1024 * 1024;
+    else if (suffixStr == "KB" || suffixStr == "Kb" || suffixStr == "kb")
+        return v * 1024;
+    CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
+}
 
 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
@@ -612,7 +654,7 @@ static void* initOpenCLAndLoad(const char* funcname)
             initialized = true;
             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
             if( g_haveOpenCL )
-                fprintf(stderr, "Succesffuly loaded OpenCL v1.1+ runtime from %s\n", oclpath);
+                fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
             else
                 fprintf(stderr, "Failed to load OpenCL runtime\n");
         }
@@ -818,6 +860,7 @@ OCL_FUNC_P(cl_mem, clCreateSubBuffer,
     const void * buffer_create_info,
     cl_int * errcode_ret),
     (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret))
+*/
 
 OCL_FUNC_P(cl_mem, clCreateImage,
     (cl_context context,
@@ -828,6 +871,18 @@ OCL_FUNC_P(cl_mem, clCreateImage,
     cl_int * errcode_ret),
     (context, flags, image_format, image_desc, host_ptr, errcode_ret))
 
+OCL_FUNC_P(cl_mem, clCreateImage2D,
+    (cl_context context,
+    cl_mem_flags flags,
+    const cl_image_format * image_format,
+    size_t image_width,
+    size_t image_height,
+    size_t image_row_pitch,
+    void * host_ptr,
+    cl_int *errcode_ret),
+    (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret))
+
+/*
 OCL_FUNC(cl_int, clGetSupportedImageFormats,
  (cl_context context,
  cl_mem_flags flags,
@@ -942,21 +997,26 @@ OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer,
  cl_event * event),
  (command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
  num_events_in_wait_list, event_wait_list, event))
+*/
 
 OCL_FUNC(cl_int, clEnqueueCopyBufferToImage,
  (cl_command_queue command_queue,
  cl_mem src_buffer,
  cl_mem dst_image,
  size_t src_offset,
- const size_t * dst_origin[3],
- const size_t * region[3],
+ const size_t dst_origin[3],
+ const size_t region[3],
  cl_uint num_events_in_wait_list,
  const cl_event * event_wait_list,
  cl_event * event),
  (command_queue, src_buffer, dst_image, src_offset, dst_origin,
  region, num_events_in_wait_list, event_wait_list, event))
 
+ OCL_FUNC(cl_int, clFlush,
+ (cl_command_queue command_queue),
+ (command_queue))
 
+/*
 OCL_FUNC_P(void*, clEnqueueMapImage,
  (cl_command_queue command_queue,
  cl_mem image,
@@ -973,7 +1033,9 @@ OCL_FUNC_P(void*, clEnqueueMapImage,
  (command_queue, image, blocking_map, map_flags, origin, region,
  image_row_pitch, image_slice_pitch, num_events_in_wait_list,
  event_wait_list, event, errcode_ret))
+*/
 
+/*
 OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program))
 
 OCL_FUNC(cl_int, clGetKernelInfo,
@@ -1234,6 +1296,12 @@ OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
 
 #endif
 
+#ifdef _DEBUG
+#define CV_OclDbgAssert CV_DbgAssert
+#else
+#define CV_OclDbgAssert(expr) (void)(expr)
+#endif
+
 namespace cv { namespace ocl {
 
 struct UMat2D
@@ -1312,11 +1380,13 @@ inline bool operator < (const HashKey& h1, const HashKey& h2)
     return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
 }
 
-static bool g_isOpenCLInitialized = false;
-static bool g_isOpenCLAvailable = false;
 
 bool haveOpenCL()
 {
+#ifdef HAVE_OPENCL
+    static bool g_isOpenCLInitialized = false;
+    static bool g_isOpenCLAvailable = false;
+
     if (!g_isOpenCLInitialized)
     {
         try
@@ -1331,6 +1401,9 @@ bool haveOpenCL()
         g_isOpenCLInitialized = true;
     }
     return g_isOpenCLAvailable;
+#else
+    return false;
+#endif
 }
 
 bool useOpenCL()
@@ -1501,7 +1574,7 @@ bool haveAmdFft()
 
 #endif
 
-void finish2()
+void finish()
 {
     Queue::getDefault().finish();
 }
@@ -1511,6 +1584,8 @@ void finish2()
     void release() { if( CV_XADD(&refcount, -1) == 1 ) delete this; } \
     int refcount
 
+/////////////////////////////////////////// Platform /////////////////////////////////////////////
+
 struct Platform::Impl
 {
     Impl()
@@ -1528,13 +1603,13 @@ struct Platform::Impl
         {
             //cl_uint num_entries
             cl_uint n = 0;
-            if( clGetPlatformIDs(1, &handle, &n) < 0 || n == 0 )
+            if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
                 handle = 0;
             if( handle != 0 )
             {
                 char buf[1000];
                 size_t len = 0;
-                clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len);
+                CV_OclDbgAssert(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len) == CL_SUCCESS);
                 buf[len] = '\0';
                 vendor = String(buf);
             }
@@ -1595,7 +1670,29 @@ Platform& Platform::getDefault()
     return p;
 }
 
-///////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////// Device ////////////////////////////////////////////
+
+// deviceVersion has format
+//   OpenCL<space><major_version.minor_version><space><vendor-specific information>
+// by specification
+//   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
+//   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
+static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
+{
+    major = minor = 0;
+    if (10 >= deviceVersion.length())
+        return;
+    const char *pstr = deviceVersion.c_str();
+    if (0 != strncmp(pstr, "OpenCL ", 7))
+        return;
+    size_t ppos = deviceVersion.find('.', 7);
+    if (String::npos == ppos)
+        return;
+    String temp = deviceVersion.substr(7, ppos - 7);
+    major = atoi(temp.c_str());
+    temp = deviceVersion.substr(ppos + 1);
+    minor = atoi(temp.c_str());
+}
 
 struct Device::Impl
 {
@@ -1603,6 +1700,18 @@ struct Device::Impl
     {
         handle = (cl_device_id)d;
         refcount = 1;
+
+        name_ = getStrProp(CL_DEVICE_NAME);
+        version_ = getStrProp(CL_DEVICE_VERSION);
+        doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
+        hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
+        maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
+        maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
+        type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
+        driverVersion_ = getStrProp(CL_DRIVER_VERSION);
+
+        String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
+        parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
     }
 
     template<typename _TpCL, typename _TpOut>
@@ -1611,7 +1720,7 @@ struct Device::Impl
         _TpCL temp=_TpCL();
         size_t sz = 0;
 
-        return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) >= 0 &&
+        return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
     }
 
@@ -1620,7 +1729,7 @@ struct Device::Impl
         cl_bool temp = CL_FALSE;
         size_t sz = 0;
 
-        return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) >= 0 &&
+        return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
             sz == sizeof(temp) ? temp != 0 : false;
     }
 
@@ -1628,12 +1737,23 @@ struct Device::Impl
     {
         char buf[1024];
         size_t sz=0;
-        return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) >= 0 &&
+        return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
             sz < sizeof(buf) ? String(buf) : String();
     }
 
     IMPLEMENT_REFCOUNTABLE();
     cl_device_id handle;
+
+    String name_;
+    String version_;
+    int doubleFPConfig_;
+    bool hostUnifiedMemory_;
+    int maxComputeUnits_;
+    size_t maxWorkGroupSize_;
+    int type_;
+    int deviceVersionMajor_;
+    int deviceVersionMinor_;
+    String driverVersion_;
 };
 
 
@@ -1685,11 +1805,14 @@ void* Device::ptr() const
 }
 
 String Device::name() const
-{ return p ? p->getStrProp(CL_DEVICE_NAME) : String(); }
+{ return p ? p->name_ : String(); }
 
 String Device::extensions() const
 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
 
+String Device::version() const
+{ return p ? p->version_ : String(); }
+
 String Device::vendor() const
 { return p ? p->getStrProp(CL_DEVICE_VENDOR) : String(); }
 
@@ -1699,11 +1822,17 @@ String Device::OpenCL_C_Version() const
 String Device::OpenCLVersion() const
 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
 
+int Device::deviceVersionMajor() const
+{ return p ? p->deviceVersionMajor_ : 0; }
+
+int Device::deviceVersionMinor() const
+{ return p ? p->deviceVersionMinor_ : 0; }
+
 String Device::driverVersion() const
-{ return p ? p->getStrProp(CL_DRIVER_VERSION) : String(); }
+{ return p ? p->driverVersion_ : String(); }
 
 int Device::type() const
-{ return p ? p->getProp<cl_device_type, int>(CL_DEVICE_TYPE) : 0; }
+{ return p ? p->type_ : 0; }
 
 int Device::addressBits() const
 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS) : 0; }
@@ -1722,7 +1851,7 @@ bool Device::linkerAvailable() const
 #endif
 
 int Device::doubleFPConfig() const
-{ return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG) : 0; }
+{ return p ? p->doubleFPConfig_ : 0; }
 
 int Device::singleFPConfig() const
 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
@@ -1762,7 +1891,7 @@ int Device::localMemType() const
 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
 
 bool Device::hostUnifiedMemory() const
-{ return p ? p->getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY) : false; }
+{ return p ? p->hostUnifiedMemory_ : false; }
 
 bool Device::imageSupport() const
 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
@@ -1800,7 +1929,7 @@ int Device::maxClockFrequency() const
 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
 
 int Device::maxComputeUnits() const
-{ return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS) : 0; }
+{ return p ? p->maxComputeUnits_ : 0; }
 
 int Device::maxConstantArgs() const
 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
@@ -1824,7 +1953,7 @@ int Device::maxSamplers() const
 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
 
 size_t Device::maxWorkGroupSize() const
-{ return p ? p->getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE) : 0; }
+{ return p ? p->maxWorkGroupSize_ : 0; }
 
 int Device::maxWorkItemDims() const
 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
@@ -1835,8 +1964,8 @@ void Device::maxWorkItemSizes(size_t* sizes) const
     {
         const int MAX_DIMS = 32;
         size_t retsz = 0;
-        clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
-                MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz);
+        CV_OclDbgAssert(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz) == CL_SUCCESS);
     }
 }
 
@@ -1898,14 +2027,212 @@ size_t Device::profilingTimerResolution() const
 
 const Device& Device::getDefault()
 {
-    const Context2& ctx = Context2::getDefault();
+    const Context& ctx = Context::getDefault();
     int idx = coreTlsData.get()->device;
     return ctx.device(idx);
 }
 
-/////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////// Context ///////////////////////////////////////////////////
 
-struct Context2::Impl
+template <typename Functor, typename ObjectType>
+inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
+{
+    ::size_t required;
+    cl_int err = f(obj, name, 0, NULL, &required);
+    if (err != CL_SUCCESS)
+        return err;
+
+    param.clear();
+    if (required > 0)
+    {
+        AutoBuffer<char> buf(required + 1);
+        char* ptr = (char*)buf; // cleanup is not needed
+        err = f(obj, name, required, ptr, NULL);
+        if (err != CL_SUCCESS)
+            return err;
+        param = ptr;
+    }
+
+    return CL_SUCCESS;
+}
+
+static void split(const std::string &s, char delim, std::vector<std::string> &elems)
+{
+    elems.clear();
+    if (s.size() == 0)
+        return;
+    std::istringstream ss(s);
+    std::string item;
+    while (!ss.eof())
+    {
+        std::getline(ss, item, delim);
+        elems.push_back(item);
+    }
+}
+
+// Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
+// Sample: AMD:GPU:
+// Sample: AMD:GPU:Tahiti
+// Sample: :GPU|CPU: = '' = ':' = '::'
+static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
+        std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
+{
+    std::vector<std::string> parts;
+    split(configurationStr, ':', parts);
+    if (parts.size() > 3)
+    {
+        std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
+        return false;
+    }
+    if (parts.size() > 2)
+        deviceNameOrID = parts[2];
+    if (parts.size() > 1)
+    {
+        split(parts[1], '|', deviceTypes);
+    }
+    if (parts.size() > 0)
+    {
+        platform = parts[0];
+    }
+    return true;
+}
+
+static cl_device_id selectOpenCLDevice()
+{
+    std::string platform, deviceName;
+    std::vector<std::string> deviceTypes;
+
+    const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
+    if (configuration && !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName))
+        return NULL;
+
+    bool isID = false;
+    int deviceID = -1;
+    if (deviceName.length() == 1)
+    // We limit ID range to 0..9, because we want to write:
+    // - '2500' to mean i5-2500
+    // - '8350' to mean AMD FX-8350
+    // - '650' to mean GeForce 650
+    // To extend ID range change condition to '> 0'
+    {
+        isID = true;
+        for (size_t i = 0; i < deviceName.length(); i++)
+        {
+            if (!isdigit(deviceName[i]))
+            {
+                isID = false;
+                break;
+            }
+        }
+        if (isID)
+        {
+            deviceID = atoi(deviceName.c_str());
+            if (deviceID < 0)
+                return NULL;
+        }
+    }
+
+    std::vector<cl_platform_id> platforms;
+    {
+        cl_uint numPlatforms = 0;
+        CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
+
+        if (numPlatforms == 0)
+            return NULL;
+        platforms.resize((size_t)numPlatforms);
+        CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
+        platforms.resize(numPlatforms);
+    }
+
+    int selectedPlatform = -1;
+    if (platform.length() > 0)
+    {
+        for (size_t i = 0; i < platforms.size(); i++)
+        {
+            std::string name;
+            CV_OclDbgAssert(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name) == CL_SUCCESS);
+            if (name.find(platform) != std::string::npos)
+            {
+                selectedPlatform = (int)i;
+                break;
+            }
+        }
+        if (selectedPlatform == -1)
+        {
+            std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
+            goto not_found;
+        }
+    }
+
+    if (deviceTypes.size() == 0)
+    {
+        if (!isID)
+        {
+            deviceTypes.push_back("GPU");
+            deviceTypes.push_back("CPU");
+        }
+        else
+            deviceTypes.push_back("ALL");
+    }
+    for (size_t t = 0; t < deviceTypes.size(); t++)
+    {
+        int deviceType = 0;
+        if (deviceTypes[t] == "GPU")
+            deviceType = Device::TYPE_GPU;
+        else if (deviceTypes[t] == "CPU")
+            deviceType = Device::TYPE_CPU;
+        else if (deviceTypes[t] == "ACCELERATOR")
+            deviceType = Device::TYPE_ACCELERATOR;
+        else if (deviceTypes[t] == "ALL")
+            deviceType = Device::TYPE_ALL;
+        else
+        {
+            std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
+            goto not_found;
+        }
+
+        std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
+        for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
+                (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
+                i++)
+        {
+            cl_uint count = 0;
+            cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
+            CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
+            if (count == 0)
+                continue;
+            size_t base = devices.size();
+            devices.resize(base + count);
+            status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
+            CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
+        }
+
+        for (size_t i = (isID ? deviceID : 0);
+             (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
+             i++)
+        {
+            std::string name;
+            CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
+            if (isID || name.find(deviceName) != std::string::npos)
+            {
+                // TODO check for OpenCL 1.1
+                return devices[i];
+            }
+        }
+    }
+
+not_found:
+    std::cerr << "ERROR: Required OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
+            << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
+            << "    Device types: ";
+    for (size_t t = 0; t < deviceTypes.size(); t++)
+        std::cerr << deviceTypes[t] << " ";
+
+    std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
+    return NULL;
+}
+
+struct Context::Impl
 {
     Impl()
     {
@@ -1913,6 +2240,40 @@ struct Context2::Impl
         handle = 0;
     }
 
+    void setDefault()
+    {
+        CV_Assert(handle == NULL);
+
+        cl_device_id d = selectOpenCLDevice();
+
+        if (d == NULL)
+            return;
+
+        cl_platform_id pl = NULL;
+        CV_OclDbgAssert(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL) == CL_SUCCESS);
+
+        cl_context_properties prop[] =
+        {
+            CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
+            0
+        };
+
+        // !!! in the current implementation force the number of devices to 1 !!!
+        cl_uint nd = 1;
+        cl_int status;
+
+        handle = clCreateContext(prop, nd, &d, 0, 0, &status);
+
+        bool ok = handle != 0 && status == CL_SUCCESS;
+        if( ok )
+        {
+            devices.resize(nd);
+            devices[0].set(d);
+        }
+        else
+            handle = NULL;
+    }
+
     Impl(int dtype0)
     {
         refcount = 1;
@@ -1928,13 +2289,12 @@ struct Context2::Impl
 
         cl_uint i, nd0 = 0, nd = 0;
         int dtype = dtype0 & 15;
-        clGetDeviceIDs( pl, dtype, 0, 0, &nd0 );
-        if(retval < 0)
-            return;
+        CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ) == CL_SUCCESS);
+
         AutoBuffer<void*> dlistbuf(nd0*2+1);
         cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
         cl_device_id* dlist_new = dlist + nd0;
-        clGetDeviceIDs(	pl, dtype, nd0, dlist, &nd0 );
+        CV_OclDbgAssert(clGetDeviceIDs(	pl, dtype, nd0, dlist, &nd0 ) == CL_SUCCESS);
         String name0;
 
         for(i = 0; i < nd0; i++)
@@ -1960,7 +2320,7 @@ struct Context2::Impl
         nd = 1;
 
         handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
-        bool ok = handle != 0 && retval >= 0;
+        bool ok = handle != 0 && retval == CL_SUCCESS;
         if( ok )
         {
             devices.resize(nd);
@@ -1972,11 +2332,14 @@ struct Context2::Impl
     ~Impl()
     {
         if(handle)
+        {
             clReleaseContext(handle);
+            handle = NULL;
+        }
         devices.clear();
     }
 
-    Program getProg(const ProgramSource2& src,
+    Program getProg(const ProgramSource& src,
                     const String& buildflags, String& errmsg)
     {
         String prefix = Program::getPrefix(buildflags);
@@ -1996,7 +2359,7 @@ struct Context2::Impl
     cl_context handle;
     std::vector<Device> devices;
 
-    typedef ProgramSource2::hash_t hash_t;
+    typedef ProgramSource::hash_t hash_t;
 
     struct HashKey
     {
@@ -2011,18 +2374,33 @@ struct Context2::Impl
 };
 
 
-Context2::Context2()
+Context::Context()
 {
     p = 0;
 }
 
-Context2::Context2(int dtype)
+Context::Context(int dtype)
 {
     p = 0;
     create(dtype);
 }
 
-bool Context2::create(int dtype0)
+bool Context::create()
+{
+    if( !haveOpenCL() )
+        return false;
+    if(p)
+        p->release();
+    p = new Impl();
+    if(!p->handle)
+    {
+        delete p;
+        p = 0;
+    }
+    return p != 0;
+}
+
+bool Context::create(int dtype0)
 {
     if( !haveOpenCL() )
         return false;
@@ -2037,19 +2415,23 @@ bool Context2::create(int dtype0)
     return p != 0;
 }
 
-Context2::~Context2()
+Context::~Context()
 {
-    p->release();
+    if (p)
+    {
+        p->release();
+        p = NULL;
+    }
 }
 
-Context2::Context2(const Context2& c)
+Context::Context(const Context& c)
 {
     p = (Impl*)c.p;
     if(p)
         p->addref();
 }
 
-Context2& Context2::operator = (const Context2& c)
+Context& Context::operator = (const Context& c)
 {
     Impl* newp = (Impl*)c.p;
     if(newp)
@@ -2060,67 +2442,59 @@ Context2& Context2::operator = (const Context2& c)
     return *this;
 }
 
-void* Context2::ptr() const
+void* Context::ptr() const
 {
-    return p->handle;
+    return p == NULL ? NULL : p->handle;
 }
 
-size_t Context2::ndevices() const
+size_t Context::ndevices() const
 {
     return p ? p->devices.size() : 0;
 }
 
-const Device& Context2::device(size_t idx) const
+const Device& Context::device(size_t idx) const
 {
     static Device dummy;
     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
 }
 
-Context2& Context2::getDefault(bool initialize)
+Context& Context::getDefault(bool initialize)
 {
-    static Context2 ctx;
+    static Context ctx;
     if(!ctx.p && haveOpenCL())
     {
+        if (!ctx.p)
+            ctx.p = new Impl();
         if (initialize)
         {
-            // do not create new Context2 right away.
+            // do not create new Context right away.
             // First, try to retrieve existing context of the same type.
-            // In its turn, Platform::getContext() may call Context2::create()
+            // In its turn, Platform::getContext() may call Context::create()
             // if there is no such context.
-            ctx.create(Device::TYPE_ACCELERATOR);
-            if(!ctx.p)
-                ctx.create(Device::TYPE_DGPU);
-            if(!ctx.p)
-                ctx.create(Device::TYPE_IGPU);
-            if(!ctx.p)
-                ctx.create(Device::TYPE_CPU);
-        }
-        else
-        {
-            ctx.p = new Impl();
+            if (ctx.p->handle == NULL)
+                ctx.p->setDefault();
         }
     }
 
     return ctx;
 }
 
-Program Context2::getProg(const ProgramSource2& prog,
+Program Context::getProg(const ProgramSource& prog,
                          const String& buildopts, String& errmsg)
 {
     return p ? p->getProg(prog, buildopts, errmsg) : Program();
 }
 
-void initializeContextFromHandle(Context2& ctx, void* platform, void* _context, void* _device)
+void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
 {
     cl_context context = (cl_context)_context;
     cl_device_id device = (cl_device_id)_device;
 
     // cleanup old context
-    Context2::Impl* impl = ctx._getImpl();
+    Context::Impl * impl = ctx.p;
     if (impl->handle)
     {
-        cl_int status = clReleaseContext(impl->handle);
-        (void)status;
+        CV_OclDbgAssert(clReleaseContext(impl->handle) == CL_SUCCESS);
     }
     impl->devices.clear();
 
@@ -2129,21 +2503,22 @@ void initializeContextFromHandle(Context2& ctx, void* platform, void* _context,
     impl->devices[0].set(device);
 
     Platform& p = Platform::getDefault();
-    Platform::Impl* pImpl = p._getImpl();
+    Platform::Impl* pImpl = p.p;
     pImpl->handle = (cl_platform_id)platform;
 }
 
+/////////////////////////////////////////// Queue /////////////////////////////////////////////
 
 struct Queue::Impl
 {
-    Impl(const Context2& c, const Device& d)
+    Impl(const Context& c, const Device& d)
     {
         refcount = 1;
-        const Context2* pc = &c;
+        const Context* pc = &c;
         cl_context ch = (cl_context)pc->ptr();
         if( !ch )
         {
-            pc = &Context2::getDefault();
+            pc = &Context::getDefault();
             ch = (cl_context)pc->ptr();
         }
         cl_device_id dh = (cl_device_id)d.ptr();
@@ -2151,6 +2526,7 @@ struct Queue::Impl
             dh = (cl_device_id)pc->device(0).ptr();
         cl_int retval = 0;
         handle = clCreateCommandQueue(ch, dh, 0, &retval);
+        CV_OclDbgAssert(retval == CL_SUCCESS);
     }
 
     ~Impl()
@@ -2163,6 +2539,7 @@ struct Queue::Impl
             {
                 clFinish(handle);
                 clReleaseCommandQueue(handle);
+                handle = NULL;
             }
         }
     }
@@ -2170,7 +2547,6 @@ struct Queue::Impl
     IMPLEMENT_REFCOUNTABLE();
 
     cl_command_queue handle;
-    bool initialized;
 };
 
 Queue::Queue()
@@ -2178,7 +2554,7 @@ Queue::Queue()
     p = 0;
 }
 
-Queue::Queue(const Context2& c, const Device& d)
+Queue::Queue(const Context& c, const Device& d)
 {
     p = 0;
     create(c, d);
@@ -2208,7 +2584,7 @@ Queue::~Queue()
         p->release();
 }
 
-bool Queue::create(const Context2& c, const Device& d)
+bool Queue::create(const Context& c, const Device& d)
 {
     if(p)
         p->release();
@@ -2219,7 +2595,9 @@ bool Queue::create(const Context2& c, const Device& d)
 void Queue::finish()
 {
     if(p && p->handle)
-        clFinish(p->handle);
+    {
+        CV_OclDbgAssert(clFinish(p->handle) == CL_SUCCESS);
+    }
 }
 
 void* Queue::ptr() const
@@ -2231,7 +2609,7 @@ Queue& Queue::getDefault()
 {
     Queue& q = coreTlsData.get()->oclQueue;
     if( !q.p && haveOpenCL() )
-        q.create(Context2::getDefault());
+        q.create(Context::getDefault());
     return q;
 }
 
@@ -2243,6 +2621,8 @@ static cl_command_queue getQueue(const Queue& q)
     return qq;
 }
 
+/////////////////////////////////////////// KernelArg /////////////////////////////////////////////
+
 KernelArg::KernelArg()
     : flags(0), m(0), obj(0), sz(0), wscale(1)
 {
@@ -2259,16 +2639,18 @@ KernelArg KernelArg::Constant(const Mat& m)
     return KernelArg(CONSTANT, 0, 1, m.data, m.total()*m.elemSize());
 }
 
+/////////////////////////////////////////// Kernel /////////////////////////////////////////////
 
 struct Kernel::Impl
 {
-    Impl(const char* kname, const Program& prog)
+    Impl(const char* kname, const Program& prog) :
+        refcount(1), e(0), nu(0)
     {
-        e = 0; refcount = 1;
         cl_program ph = (cl_program)prog.ptr();
         cl_int retval = 0;
         handle = ph != 0 ?
             clCreateKernel(ph, kname, &retval) : 0;
+        CV_OclDbgAssert(retval == CL_SUCCESS);
         for( int i = 0; i < MAX_ARRS; i++ )
             u[i] = 0;
         haveTempDstUMats = false;
@@ -2344,7 +2726,7 @@ Kernel::Kernel(const char* kname, const Program& prog)
     create(kname, prog);
 }
 
-Kernel::Kernel(const char* kname, const ProgramSource2& src,
+Kernel::Kernel(const char* kname, const ProgramSource& src,
                const String& buildopts, String* errmsg)
 {
     p = 0;
@@ -2388,7 +2770,7 @@ bool Kernel::create(const char* kname, const Program& prog)
     return p != 0;
 }
 
-bool Kernel::create(const char* kname, const ProgramSource2& src,
+bool Kernel::create(const char* kname, const ProgramSource& src,
                     const String& buildopts, String* errmsg)
 {
     if(p)
@@ -2398,7 +2780,7 @@ bool Kernel::create(const char* kname, const ProgramSource2& src,
     }
     String tempmsg;
     if( !errmsg ) errmsg = &tempmsg;
-    const Program& prog = Context2::getDefault().getProg(src, buildopts, *errmsg);
+    const Program& prog = Context::getDefault().getProg(src, buildopts, *errmsg);
     return create(kname, prog);
 }
 
@@ -2414,14 +2796,25 @@ bool Kernel::empty() const
 
 int Kernel::set(int i, const void* value, size_t sz)
 {
+    if (!p || !p->handle)
+        return -1;
     CV_Assert(i >= 0);
     if( i == 0 )
         p->cleanupUMats();
-    if( !p || !p->handle || clSetKernelArg(p->handle, (cl_uint)i, sz, value) < 0 )
+
+    cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
+    CV_OclDbgAssert(retval == CL_SUCCESS);
+    if (retval != CL_SUCCESS)
         return -1;
     return i+1;
 }
 
+int Kernel::set(int i, const Image2D& image2D)
+{
+    cl_mem h = (cl_mem)image2D.ptr();
+    return set(i, &h, sizeof(h));
+}
+
 int Kernel::set(int i, const UMat& m)
 {
     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
@@ -2429,9 +2822,9 @@ int Kernel::set(int i, const UMat& m)
 
 int Kernel::set(int i, const KernelArg& arg)
 {
-    CV_Assert( i >= 0 );
     if( !p || !p->handle )
         return -1;
+    CV_Assert( i >= 0 );
     if( i == 0 )
         p->cleanupUMats();
     if( arg.m )
@@ -2441,45 +2834,52 @@ int Kernel::set(int i, const KernelArg& arg)
         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
 
+        if (!h)
+        {
+            p->release();
+            p = 0;
+            return -1;
+        }
+
         if (ptronly)
-            clSetKernelArg(p->handle, (cl_uint)i++, sizeof(h), &h);
+            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i++, sizeof(h), &h) == CL_SUCCESS);
         else if( arg.m->dims <= 2 )
         {
             UMat2D u2d(*arg.m);
-            clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
-            clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
-            clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
+            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
+            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step) == CL_SUCCESS);
+            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset) == CL_SUCCESS);
             i += 3;
 
             if( !(arg.flags & KernelArg::NO_SIZE) )
             {
                 int cols = u2d.cols*arg.wscale;
-                clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
-                clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
+                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
+                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
                 i += 2;
             }
         }
         else
         {
             UMat3D u3d(*arg.m);
-            clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
-            clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
-            clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
-            clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
+            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
+            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep) == CL_SUCCESS);
+            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step) == CL_SUCCESS);
+            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset) == CL_SUCCESS);
             i += 4;
             if( !(arg.flags & KernelArg::NO_SIZE) )
             {
                 int cols = u3d.cols*arg.wscale;
-                clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows);
-                clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
-                clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
+                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS);
+                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
+                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
                 i += 3;
             }
         }
         p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
         return i;
     }
-    clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
+    CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj) == CL_SUCCESS);
     return i+1;
 }
 
@@ -2509,17 +2909,17 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
     cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
                                            offset, globalsize, _localsize, 0, 0,
                                            sync ? 0 : &p->e);
-    if( sync || retval < 0 )
+    if( sync || retval != CL_SUCCESS )
     {
-        clFinish(qq);
+        CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
         p->cleanupUMats();
     }
     else
     {
         p->addref();
-        clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
+        CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
     }
-    return retval >= 0;
+    return retval == CL_SUCCESS;
 }
 
 bool Kernel::runTask(bool sync, const Queue& q)
@@ -2529,60 +2929,70 @@ bool Kernel::runTask(bool sync, const Queue& q)
 
     cl_command_queue qq = getQueue(q);
     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
-    if( sync || retval < 0 )
+    if( sync || retval != CL_SUCCESS )
     {
-        clFinish(qq);
+        CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
         p->cleanupUMats();
     }
     else
     {
         p->addref();
-        clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
+        CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
     }
-    return retval >= 0;
+    return retval == CL_SUCCESS;
 }
 
 
 size_t Kernel::workGroupSize() const
 {
-    if(!p)
+    if(!p || !p->handle)
         return 0;
     size_t val = 0, retsz = 0;
     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
-                                    sizeof(val), &val, &retsz) >= 0 ? val : 0;
+                                    sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
+}
+
+size_t Kernel::preferedWorkGroupSizeMultiple() const
+{
+    if(!p || !p->handle)
+        return 0;
+    size_t val = 0, retsz = 0;
+    cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
+    return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+                                    sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
 }
 
 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
 {
-    if(!p || !wsz)
+    if(!p || !p->handle || !wsz)
         return 0;
     size_t retsz = 0;
     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
-                                    sizeof(wsz[0]*3), wsz, &retsz) >= 0;
+                                    sizeof(wsz[0]*3), wsz, &retsz) == CL_SUCCESS;
 }
 
 size_t Kernel::localMemSize() const
 {
-    if(!p)
+    if(!p || !p->handle)
         return 0;
     size_t retsz = 0;
     cl_ulong val = 0;
     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE,
-                                    sizeof(val), &val, &retsz) >= 0 ? (size_t)val : 0;
+                                    sizeof(val), &val, &retsz) == CL_SUCCESS ? (size_t)val : 0;
 }
 
-////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////// Program /////////////////////////////////////////////
 
 struct Program::Impl
 {
-    Impl(const ProgramSource2& _src,
+    Impl(const ProgramSource& _src,
          const String& _buildflags, String& errmsg)
     {
         refcount = 1;
-        const Context2& ctx = Context2::getDefault();
+        const Context& ctx = Context::getDefault();
         src = _src;
         buildflags = _buildflags;
         const String& srcstr = src.source();
@@ -2591,7 +3001,7 @@ struct Program::Impl
         cl_int retval = 0;
 
         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
-        if( handle && retval >= 0 )
+        if( handle && retval == CL_SUCCESS )
         {
             int i, n = (int)ctx.ndevices();
             AutoBuffer<void*> deviceListBuf(n+1);
@@ -2602,25 +3012,33 @@ struct Program::Impl
             retval = clBuildProgram(handle, n,
                                     (const cl_device_id*)deviceList,
                                     buildflags.c_str(), 0, 0);
-            if( retval < 0 )
+#if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
+            if( retval != CL_SUCCESS )
+#endif
             {
                 size_t retsz = 0;
-                retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
+                cl_int buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
                                                CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
-                if( retval >= 0 && retsz > 1 )
+                if (buildInfo_retval == CL_SUCCESS && retsz > 1)
                 {
                     AutoBuffer<char> bufbuf(retsz + 16);
                     char* buf = bufbuf;
-                    retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
+                    buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
                                                    CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz);
-                    if( retval >= 0 )
+                    if (buildInfo_retval == CL_SUCCESS)
                     {
+                        // TODO It is useful to see kernel name & program file name also
                         errmsg = String(buf);
-                        CV_Error_(Error::StsAssert, ("OpenCL program can not be built: %s", errmsg.c_str()));
+                        printf("OpenCL program build log: %s\n%s\n", buildflags.c_str(), errmsg.c_str());
+                        fflush(stdout);
                     }
                 }
+                if (retval != CL_SUCCESS && handle)
+                {
+                    clReleaseProgram(handle);
+                    handle = NULL;
+                }
             }
-            CV_Assert(retval >= 0);
         }
     }
 
@@ -2632,7 +3050,7 @@ struct Program::Impl
         if(_buf.empty())
             return;
         String prefix0 = Program::getPrefix(buildflags);
-        const Context2& ctx = Context2::getDefault();
+        const Context& ctx = Context::getDefault();
         const Device& dev = Device::getDefault();
         const char* pos0 = _buf.c_str();
         const char* pos1 = strchr(pos0, '\n');
@@ -2654,6 +3072,7 @@ struct Program::Impl
         cl_int binstatus = 0, retval = 0;
         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid,
                                            &codelen, &bin, &binstatus, &retval);
+        CV_OclDbgAssert(retval == CL_SUCCESS);
     }
 
     String store()
@@ -2663,13 +3082,13 @@ struct Program::Impl
         size_t progsz = 0, retsz = 0;
         String prefix = Program::getPrefix(buildflags);
         size_t prefixlen = prefix.length();
-        if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) < 0)
+        if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) != CL_SUCCESS)
             return String();
         AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16);
         uchar* buf = bufbuf;
         memcpy(buf, prefix.c_str(), prefixlen);
         buf += prefixlen;
-        if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) < 0)
+        if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) != CL_SUCCESS)
             return String();
         buf[progsz] = (uchar)'\0';
         return String((const char*)(uchar*)bufbuf, prefixlen + progsz);
@@ -2678,12 +3097,15 @@ struct Program::Impl
     ~Impl()
     {
         if( handle )
+        {
             clReleaseProgram(handle);
+            handle = NULL;
+        }
     }
 
     IMPLEMENT_REFCOUNTABLE();
 
-    ProgramSource2 src;
+    ProgramSource src;
     String buildflags;
     cl_program handle;
 };
@@ -2691,7 +3113,7 @@ struct Program::Impl
 
 Program::Program() { p = 0; }
 
-Program::Program(const ProgramSource2& src,
+Program::Program(const ProgramSource& src,
         const String& buildflags, String& errmsg)
 {
     p = 0;
@@ -2722,7 +3144,7 @@ Program::~Program()
         p->release();
 }
 
-bool Program::create(const ProgramSource2& src,
+bool Program::create(const ProgramSource& src,
             const String& buildflags, String& errmsg)
 {
     if(p)
@@ -2736,9 +3158,9 @@ bool Program::create(const ProgramSource2& src,
     return p != 0;
 }
 
-const ProgramSource2& Program::source() const
+const ProgramSource& Program::source() const
 {
-    static ProgramSource2 dummy;
+    static ProgramSource dummy;
     return p ? p->src : dummy;
 }
 
@@ -2772,15 +3194,15 @@ String Program::getPrefix() const
 
 String Program::getPrefix(const String& buildflags)
 {
-    const Context2& ctx = Context2::getDefault();
+    const Context& ctx = Context::getDefault();
     const Device& dev = ctx.device(0);
     return format("name=%s\ndriver=%s\nbuildflags=%s\n",
                   dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
 }
 
-////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
 
-struct ProgramSource2::Impl
+struct ProgramSource::Impl
 {
     Impl(const char* _src)
     {
@@ -2799,39 +3221,39 @@ struct ProgramSource2::Impl
 
     IMPLEMENT_REFCOUNTABLE();
     String src;
-    ProgramSource2::hash_t h;
+    ProgramSource::hash_t h;
 };
 
 
-ProgramSource2::ProgramSource2()
+ProgramSource::ProgramSource()
 {
     p = 0;
 }
 
-ProgramSource2::ProgramSource2(const char* prog)
+ProgramSource::ProgramSource(const char* prog)
 {
     p = new Impl(prog);
 }
 
-ProgramSource2::ProgramSource2(const String& prog)
+ProgramSource::ProgramSource(const String& prog)
 {
     p = new Impl(prog);
 }
 
-ProgramSource2::~ProgramSource2()
+ProgramSource::~ProgramSource()
 {
     if(p)
         p->release();
 }
 
-ProgramSource2::ProgramSource2(const ProgramSource2& prog)
+ProgramSource::ProgramSource(const ProgramSource& prog)
 {
     p = prog.p;
     if(p)
         p->addref();
 }
 
-ProgramSource2& ProgramSource2::operator = (const ProgramSource2& prog)
+ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
 {
     Impl* newp = (Impl*)prog.p;
     if(newp)
@@ -2842,34 +3264,300 @@ ProgramSource2& ProgramSource2::operator = (const ProgramSource2& prog)
     return *this;
 }
 
-const String& ProgramSource2::source() const
+const String& ProgramSource::source() const
 {
     static String dummy;
     return p ? p->src : dummy;
 }
 
-ProgramSource2::hash_t ProgramSource2::hash() const
+ProgramSource::hash_t ProgramSource::hash() const
 {
     return p ? p->h : 0;
 }
 
-//////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
+
+class OpenCLBufferPool
+{
+protected:
+    ~OpenCLBufferPool() { }
+public:
+    virtual cl_mem allocate(size_t size, CV_OUT size_t& capacity) = 0;
+    virtual void release(cl_mem handle, size_t capacity) = 0;
+};
+
+class OpenCLBufferPoolImpl : public BufferPoolController, public OpenCLBufferPool
+{
+public:
+    struct BufferEntry
+    {
+        cl_mem clBuffer_;
+        size_t capacity_;
+    };
+protected:
+    Mutex mutex_;
+
+    size_t currentReservedSize;
+    size_t maxReservedSize;
+
+    std::list<BufferEntry> reservedEntries_; // LRU order
+
+    // synchronized
+    bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
+    {
+        if (reservedEntries_.empty())
+            return false;
+        std::list<BufferEntry>::iterator i = reservedEntries_.begin();
+        std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
+        BufferEntry result = {NULL, 0};
+        size_t minDiff = (size_t)(-1);
+        for (; i != reservedEntries_.end(); ++i)
+        {
+            BufferEntry& e = *i;
+            if (e.capacity_ >= size)
+            {
+                size_t diff = e.capacity_ - size;
+                if (diff < size / 8 && (result_pos == reservedEntries_.end() || diff < minDiff))
+                {
+                    minDiff = diff;
+                    result_pos = i;
+                    result = e;
+                    if (diff == 0)
+                        break;
+                }
+            }
+        }
+        if (result_pos != reservedEntries_.end())
+        {
+            //CV_DbgAssert(result == *result_pos);
+            reservedEntries_.erase(result_pos);
+            entry = result;
+            currentReservedSize -= entry.capacity_;
+            return true;
+        }
+        return false;
+    }
+
+    // synchronized
+    void _checkSizeOfReservedEntries()
+    {
+        while (currentReservedSize > maxReservedSize)
+        {
+            CV_DbgAssert(!reservedEntries_.empty());
+            const BufferEntry& entry = reservedEntries_.back();
+            CV_DbgAssert(currentReservedSize >= entry.capacity_);
+            currentReservedSize -= entry.capacity_;
+            _releaseBufferEntry(entry);
+            reservedEntries_.pop_back();
+        }
+    }
+
+    inline size_t _allocationGranularity(size_t size)
+    {
+        // heuristic values
+        if (size < 1024)
+            return 16;
+        else if (size < 64*1024)
+            return 64;
+        else if (size < 1024*1024)
+            return 4096;
+        else if (size < 16*1024*1024)
+            return 64*1024;
+        else
+            return 1024*1024;
+    }
+
+    void _allocateBufferEntry(BufferEntry& entry, size_t size)
+    {
+        CV_DbgAssert(entry.clBuffer_ == NULL);
+        entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
+        Context& ctx = Context::getDefault();
+        cl_int retval = CL_SUCCESS;
+        entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE, entry.capacity_, 0, &retval);
+        CV_Assert(retval == CL_SUCCESS);
+        CV_Assert(entry.clBuffer_ != NULL);
+        LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
+                (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
+    }
+
+    void _releaseBufferEntry(const BufferEntry& entry)
+    {
+        CV_Assert(entry.capacity_ != 0);
+        CV_Assert(entry.clBuffer_ != NULL);
+        LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
+                entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
+        clReleaseMemObject(entry.clBuffer_);
+    }
+public:
+    OpenCLBufferPoolImpl()
+        : currentReservedSize(0), maxReservedSize(0)
+    {
+        // Note: Buffer pool is disabled by default,
+        //       because we didn't receive significant performance improvement
+        maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", 0);
+    }
+    virtual ~OpenCLBufferPoolImpl()
+    {
+        freeAllReservedBuffers();
+        CV_Assert(reservedEntries_.empty());
+    }
+public:
+    virtual cl_mem allocate(size_t size, CV_OUT size_t& capacity)
+    {
+        BufferEntry entry = {NULL, 0};
+        if (maxReservedSize > 0)
+        {
+            AutoLock locker(mutex_);
+            if (_findAndRemoveEntryFromReservedList(entry, size))
+            {
+                CV_DbgAssert(size <= entry.capacity_);
+                LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
+                capacity = entry.capacity_;
+                return entry.clBuffer_;
+            }
+        }
+        _allocateBufferEntry(entry, size);
+        capacity = entry.capacity_;
+        return entry.clBuffer_;
+    }
+    virtual void release(cl_mem handle, size_t capacity)
+    {
+        BufferEntry entry = {handle, capacity};
+        if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
+        {
+            _releaseBufferEntry(entry);
+        }
+        else
+        {
+            AutoLock locker(mutex_);
+            reservedEntries_.push_front(entry);
+            currentReservedSize += entry.capacity_;
+            _checkSizeOfReservedEntries();
+        }
+    }
+
+    virtual size_t getReservedSize() const { return currentReservedSize; }
+    virtual size_t getMaxReservedSize() const { return maxReservedSize; }
+    virtual void setMaxReservedSize(size_t size)
+    {
+        AutoLock locker(mutex_);
+        size_t oldMaxReservedSize = maxReservedSize;
+        maxReservedSize = size;
+        if (maxReservedSize < oldMaxReservedSize)
+        {
+            std::list<BufferEntry>::iterator i = reservedEntries_.begin();
+            for (; i != reservedEntries_.end();)
+            {
+                const BufferEntry& entry = *i;
+                if (entry.capacity_ > maxReservedSize / 8)
+                {
+                    CV_DbgAssert(currentReservedSize >= entry.capacity_);
+                    currentReservedSize -= entry.capacity_;
+                    _releaseBufferEntry(entry);
+                    i = reservedEntries_.erase(i);
+                    continue;
+                }
+                ++i;
+            }
+            _checkSizeOfReservedEntries();
+        }
+    }
+    virtual void freeAllReservedBuffers()
+    {
+        AutoLock locker(mutex_);
+        std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
+        for (; i != reservedEntries_.end(); ++i)
+        {
+            const BufferEntry& entry = *i;
+            _releaseBufferEntry(entry);
+        }
+        reservedEntries_.clear();
+    }
+};
+
+#if defined _MSC_VER
+#pragma warning(disable:4127) // conditional expression is constant
+#endif
+template <bool readAccess, bool writeAccess>
+class AlignedDataPtr
+{
+protected:
+    const size_t size_;
+    uchar* const originPtr_;
+    const size_t alignment_;
+    uchar* ptr_;
+    uchar* allocatedPtr_;
+
+public:
+    AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
+        : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
+    {
+        CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
+        if (((size_t)ptr_ & (alignment - 1)) != 0)
+        {
+            allocatedPtr_ = new uchar[size_ + alignment - 1];
+            ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
+            if (readAccess)
+            {
+                memcpy(ptr_, originPtr_, size_);
+            }
+        }
+    }
+
+    uchar* getAlignedPtr() const
+    {
+        CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
+        return ptr_;
+    }
+
+    ~AlignedDataPtr()
+    {
+        if (allocatedPtr_)
+        {
+            if (writeAccess)
+            {
+                memcpy(originPtr_, ptr_, size_);
+            }
+            delete[] allocatedPtr_;
+            allocatedPtr_ = NULL;
+        }
+        ptr_ = NULL;
+    }
+private:
+    AlignedDataPtr(const AlignedDataPtr&); // disabled
+    AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
+};
+#if defined _MSC_VER
+#pragma warning(default:4127) // conditional expression is constant
+#endif
+
+#ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
+#define CV_OPENCL_DATA_PTR_ALIGNMENT 16
+#endif
 
 class OpenCLAllocator : public MatAllocator
 {
+    mutable OpenCLBufferPoolImpl bufferPool;
+    enum AllocatorFlags
+    {
+        ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0
+    };
 public:
     OpenCLAllocator() { matStdAllocator = Mat::getStdAllocator(); }
 
-    UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step, int flags) const
+    UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
+            int flags, UMatUsageFlags usageFlags) const
     {
-        UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags);
+        UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
         return u;
     }
 
-    void getBestFlags(const Context2& ctx, int /*flags*/, int& createFlags, int& flags0) const
+    void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
     {
         const Device& dev = ctx.device(0);
-        createFlags = CL_MEM_READ_WRITE;
+        createFlags = 0;
+        if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
+            createFlags |= CL_MEM_ALLOC_HOST_PTR;
 
         if( dev.hostUnifiedMemory() )
             flags0 = 0;
@@ -2878,10 +3566,10 @@ public:
     }
 
     UMatData* allocate(int dims, const int* sizes, int type,
-                       void* data, size_t* step, int flags) const
+                       void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
     {
         if(!useOpenCL())
-            return defaultAllocate(dims, sizes, type, data, step, flags);
+            return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
         CV_Assert(data == 0);
         size_t total = CV_ELEM_SIZE(type);
         for( int i = dims-1; i >= 0; i-- )
@@ -2891,25 +3579,41 @@ public:
             total *= sizes[i];
         }
 
-        Context2& ctx = Context2::getDefault();
+        Context& ctx = Context::getDefault();
         int createFlags = 0, flags0 = 0;
-        getBestFlags(ctx, flags, createFlags, flags0);
+        getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
 
-        cl_int retval = 0;
-        void* handle = clCreateBuffer((cl_context)ctx.ptr(),
-                                      createFlags, total, 0, &retval);
-        if( !handle || retval < 0 )
-            return defaultAllocate(dims, sizes, type, data, step, flags);
+        size_t capacity = 0;
+        void* handle = NULL;
+        int allocatorFlags = 0;
+        if (createFlags == 0)
+        {
+            handle = bufferPool.allocate(total, capacity);
+            if (!handle)
+                return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
+            allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
+        }
+        else
+        {
+            capacity = total;
+            cl_int retval = 0;
+            handle = clCreateBuffer((cl_context)ctx.ptr(),
+                                          CL_MEM_READ_WRITE|createFlags, total, 0, &retval);
+            if( !handle || retval != CL_SUCCESS )
+                return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
+        }
         UMatData* u = new UMatData(this);
         u->data = 0;
         u->size = total;
+        u->capacity = capacity;
         u->handle = handle;
         u->flags = flags0;
-
+        u->allocatorFlags_ = allocatorFlags;
+        CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
         return u;
     }
 
-    bool allocate(UMatData* u, int accessFlags) const
+    bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
     {
         if(!u)
             return false;
@@ -2919,22 +3623,22 @@ public:
         if(u->handle == 0)
         {
             CV_Assert(u->origdata != 0);
-            Context2& ctx = Context2::getDefault();
+            Context& ctx = Context::getDefault();
             int createFlags = 0, flags0 = 0;
-            getBestFlags(ctx, accessFlags, createFlags, flags0);
+            getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
 
             cl_context ctx_handle = (cl_context)ctx.ptr();
             cl_int retval = 0;
             int tempUMatFlags = UMatData::TEMP_UMAT;
-            u->handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
+            u->handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|CL_MEM_READ_WRITE,
                                        u->size, u->origdata, &retval);
-            if((!u->handle || retval < 0) && !(accessFlags & ACCESS_FAST))
+            if((!u->handle || retval != CL_SUCCESS) && !(accessFlags & ACCESS_FAST))
             {
-                u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|createFlags,
+                u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
                                            u->size, u->origdata, &retval);
                 tempUMatFlags = UMatData::TEMP_COPIED_UMAT;
             }
-            if(!u->handle || retval < 0)
+            if(!u->handle || retval != CL_SUCCESS)
                 return false;
             u->prevAllocator = u->currAllocator;
             u->currAllocator = this;
@@ -2988,14 +3692,15 @@ public:
         CV_Assert(u->handle != 0 && u->urefcount == 0);
         if(u->tempUMat())
         {
-            UMatDataAutoLock lock(u);
+//            UMatDataAutoLock lock(u);
             if( u->hostCopyObsolete() && u->refcount > 0 )
             {
                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
                 if( u->tempCopiedUMat() )
                 {
-                    clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
-                                        u->size, u->origdata, 0, 0, 0);
+                    AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
+                    CV_OclDbgAssert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
+                                        u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS);
                 }
                 else
                 {
@@ -3003,8 +3708,9 @@ public:
                     void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
                                                     (CL_MAP_READ | CL_MAP_WRITE),
                                                     0, u->size, 0, 0, 0, &retval);
-                    clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
-                    clFinish(q);
+                    CV_OclDbgAssert(retval == CL_SUCCESS);
+                    CV_OclDbgAssert(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0) == CL_SUCCESS);
+                    CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
                 }
             }
             u->markHostCopyObsolete(false);
@@ -3025,8 +3731,16 @@ public:
                 fastFree(u->data);
                 u->data = 0;
             }
-            clReleaseMemObject((cl_mem)u->handle);
+            if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
+            {
+                bufferPool.release((cl_mem)u->handle, u->capacity);
+            }
+            else
+            {
+                clReleaseMemObject((cl_mem)u->handle);
+            }
             u->handle = 0;
+            u->capacity = 0;
             delete u;
         }
     }
@@ -3056,7 +3770,7 @@ public:
                 u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
                                                      (CL_MAP_READ | CL_MAP_WRITE),
                                                      0, u->size, 0, 0, 0, &retval);
-                if(u->data && retval >= 0)
+                if(u->data && retval == CL_SUCCESS)
                 {
                     u->markHostCopyObsolete(false);
                     return;
@@ -3075,8 +3789,9 @@ public:
 
         if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
         {
+            AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
             CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
-                                           u->size, u->data, 0, 0, 0) >= 0 );
+                                           u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
             u->markHostCopyObsolete(false);
         }
     }
@@ -3095,14 +3810,15 @@ public:
         if( !u->copyOnMap() && u->data )
         {
             CV_Assert( (retval = clEnqueueUnmapMemObject(q,
-                                (cl_mem)u->handle, u->data, 0, 0, 0)) >= 0 );
-            clFinish(q);
+                                (cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
+            CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
             u->data = 0;
         }
         else if( u->copyOnMap() && u->deviceCopyObsolete() )
         {
+            AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
             CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
-                                u->size, u->data, 0, 0, 0)) >= 0 );
+                                u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)) == CL_SUCCESS );
         }
         u->markDeviceCopyObsolete(false);
         u->markHostCopyObsolete(false);
@@ -3207,16 +3923,18 @@ public:
                                             total, new_sz,
                                             srcrawofs, new_srcofs, new_srcstep,
                                             dstrawofs, new_dstofs, new_dststep);
+
+        AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, sz[0] * dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
         if( iscontinuous )
         {
             CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
-                                           srcrawofs, total, dstptr, 0, 0, 0) >= 0 );
+                                           srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
         }
         else
         {
             CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
                             new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
-                            new_dststep[0], new_dststep[1], dstptr, 0, 0, 0) >= 0 );
+                            new_dststep[0], new_dststep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
         }
     }
 
@@ -3246,7 +3964,7 @@ public:
         // we can do it in 2 cases:
         //    1. we overwrite the whole content
         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
-        if( u->data && (u->hostCopyObsolete() <= u->deviceCopyObsolete() || total == u->size))
+        if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
         {
             Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
             u->markHostCopyObsolete(false);
@@ -3257,25 +3975,21 @@ public:
         CV_Assert( u->handle != 0 );
         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
 
+        AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, sz[0] * srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
         if( iscontinuous )
         {
-            int crc = 0;
-            for( size_t i = 0; i < total; i++ )
-                crc ^= ((uchar*)srcptr)[i];
             CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
-                CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) >= 0 );
+                CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) == CL_SUCCESS );
         }
         else
         {
             CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
                 new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1],
-                new_srcstep[0], new_srcstep[1], srcptr, 0, 0, 0) >= 0 );
+                new_srcstep[0], new_srcstep[1], srcptr, 0, 0, 0) == CL_SUCCESS );
         }
 
         u->markHostCopyObsolete(true);
         u->markDeviceCopyObsolete(false);
-
-        clFinish(q);
     }
 
     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
@@ -3297,12 +4011,12 @@ public:
         UMatDataAutoLock src_autolock(src);
         UMatDataAutoLock dst_autolock(dst);
 
-        if( !src->handle || (src->data && src->hostCopyObsolete() <= src->deviceCopyObsolete()) )
+        if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
         {
             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
             return;
         }
-        if( !dst->handle || (dst->data && dst->hostCopyObsolete() <= dst->deviceCopyObsolete()) )
+        if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
         {
             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
             dst->markHostCopyObsolete(false);
@@ -3317,7 +4031,7 @@ public:
         if( iscontinuous )
         {
             CV_Assert( clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
-                                           srcrawofs, dstrawofs, total, 0, 0, 0) >= 0 );
+                                           srcrawofs, dstrawofs, total, 0, 0, 0) == CL_SUCCESS );
         }
         else
         {
@@ -3326,26 +4040,158 @@ public:
                                                new_srcofs, new_dstofs, new_sz,
                                                new_srcstep[0], new_srcstep[1],
                                                new_dststep[0], new_dststep[1],
-                                               0, 0, 0)) >= 0 );
+                                               0, 0, 0)) == CL_SUCCESS );
         }
 
         dst->markHostCopyObsolete(true);
         dst->markDeviceCopyObsolete(false);
 
         if( _sync )
-            clFinish(q);
+        {
+            CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
+        }
     }
 
+    BufferPoolController* getBufferPoolController() const { return &bufferPool; }
+
     MatAllocator* matStdAllocator;
 };
 
 MatAllocator* getOpenCLAllocator()
 {
-    static OpenCLAllocator allocator;
-    return &allocator;
+    static MatAllocator * allocator = new OpenCLAllocator();
+    return allocator;
 }
 
-const char* typeToStr(int t)
+///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
+
+static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
+{
+    cl_uint numDevices = 0;
+    CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
+                                0, NULL, &numDevices) == CL_SUCCESS);
+
+    if (numDevices == 0)
+    {
+        devices.clear();
+        return;
+    }
+
+    devices.resize((size_t)numDevices);
+    CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
+                                numDevices, &devices[0], &numDevices) == CL_SUCCESS);
+}
+
+struct PlatformInfo::Impl
+{
+    Impl(void* id)
+    {
+        refcount = 1;
+        handle = *(cl_platform_id*)id;
+        getDevices(devices, handle);
+    }
+
+    String getStrProp(cl_device_info prop) const
+    {
+        char buf[1024];
+        size_t sz=0;
+        return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
+            sz < sizeof(buf) ? String(buf) : String();
+    }
+
+    IMPLEMENT_REFCOUNTABLE();
+    std::vector<cl_device_id> devices;
+    cl_platform_id handle;
+};
+
+PlatformInfo::PlatformInfo()
+{
+    p = 0;
+}
+
+PlatformInfo::PlatformInfo(void* platform_id)
+{
+    p = new Impl(platform_id);
+}
+
+PlatformInfo::~PlatformInfo()
+{
+    if(p)
+        p->release();
+}
+
+PlatformInfo::PlatformInfo(const PlatformInfo& i)
+{
+    if (i.p)
+        i.p->addref();
+    p = i.p;
+}
+
+PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
+{
+    if (i.p != p)
+    {
+        if (i.p)
+            i.p->addref();
+        if (p)
+            p->release();
+        p = i.p;
+    }
+    return *this;
+}
+
+int PlatformInfo::deviceNumber() const
+{
+    return p ? (int)p->devices.size() : 0;
+}
+
+void PlatformInfo::getDevice(Device& device, int d) const
+{
+    CV_Assert(p && d < (int)p->devices.size() );
+    if(p)
+        device.set(p->devices[d]);
+}
+
+String PlatformInfo::name() const
+{
+    return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
+}
+
+String PlatformInfo::vendor() const
+{
+    return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
+}
+
+String PlatformInfo::version() const
+{
+    return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
+}
+
+static void getPlatforms(std::vector<cl_platform_id>& platforms)
+{
+    cl_uint numPlatforms = 0;
+    CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
+
+    if (numPlatforms == 0)
+    {
+        platforms.clear();
+        return;
+    }
+
+    platforms.resize((size_t)numPlatforms);
+    CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
+}
+
+void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
+{
+    std::vector<cl_platform_id> platforms;
+    getPlatforms(platforms);
+
+    for (size_t i = 0; i < platforms.size(); i++)
+        platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
+}
+
+const char* typeToStr(int type)
 {
     static const char* tab[]=
     {
@@ -3358,13 +4204,13 @@ const char* typeToStr(int t)
         "double", "double2", "double3", "double4",
         "?", "?", "?", "?"
     };
-    int cn = CV_MAT_CN(t);
-    return cn > 4 ? "?" : tab[CV_MAT_DEPTH(t)*4 + cn-1];
+    int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
+    return cn > 4 ? "?" : tab[depth*4 + cn-1];
 }
 
-const char* memopTypeToStr(int t)
+const char* memopTypeToStr(int type)
 {
-    static const char* tab[]=
+    static const char* tab[] =
     {
         "uchar", "uchar2", "uchar3", "uchar4",
         "uchar", "uchar2", "uchar3", "uchar4",
@@ -3372,11 +4218,11 @@ const char* memopTypeToStr(int t)
         "ushort", "ushort2", "ushort3", "ushort4",
         "int", "int2", "int3", "int4",
         "int", "int2", "int3", "int4",
-        "int2", "int4", "?", "int8",
+        "ulong", "ulong2", "ulong3", "ulong4",
         "?", "?", "?", "?"
     };
-    int cn = CV_MAT_CN(t);
-    return cn > 4 ? "?" : tab[CV_MAT_DEPTH(t)*4 + cn-1];
+    int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
+    return cn > 4 ? "?" : tab[depth*4 + cn-1];
 }
 
 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
@@ -3392,14 +4238,201 @@ const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
         sprintf(buf, "convert_%s", typestr);
     }
     else if( sdepth >= CV_32F )
-    {
         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
-    }
     else
-    {
         sprintf(buf, "convert_%s_sat", typestr);
-    }
+
     return buf;
 }
 
+template <typename T>
+static std::string kerToStr(const Mat & k)
+{
+    int width = k.cols - 1, depth = k.depth();
+    const T * const data = reinterpret_cast<const T *>(k.data);
+
+    std::ostringstream stream;
+    stream.precision(10);
+
+    if (depth <= CV_8S)
+    {
+        for (int i = 0; i < width; ++i)
+            stream << "DIG(" << (int)data[i] << ")";
+        stream << "DIG(" << (int)data[width] << ")";
+    }
+    else if (depth == CV_32F)
+    {
+        stream.setf(std::ios_base::showpoint);
+        for (int i = 0; i < width; ++i)
+            stream << "DIG(" << data[i] << "f)";
+        stream << "DIG(" << data[width] << "f)";
+    }
+    else
+    {
+        for (int i = 0; i < width; ++i)
+            stream << "DIG(" << data[i] << ")";
+        stream << "DIG(" << data[width] << ")";
+    }
+
+    return stream.str();
+}
+
+String kernelToStr(InputArray _kernel, int ddepth)
+{
+    Mat kernel = _kernel.getMat().reshape(1, 1);
+
+    int depth = kernel.depth();
+    if (ddepth < 0)
+        ddepth = depth;
+
+    if (ddepth != depth)
+        kernel.convertTo(kernel, ddepth);
+
+    typedef std::string (*func_t)(const Mat &);
+    static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>,kerToStr<short>,
+                                    kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
+    const func_t func = funcs[depth];
+    CV_Assert(func != 0);
+
+    return cv::format(" -D COEFF=%s", func(kernel).c_str());
+}
+
+/////////////////////////////////////////// Image2D ////////////////////////////////////////////////////
+
+struct Image2D::Impl
+{
+    Impl(const UMat &src)
+    {
+        handle = 0;
+        refcount = 1;
+        init(src);
+    }
+
+    ~Impl()
+    {
+        if (handle)
+            clReleaseMemObject(handle);
+    }
+
+    void init(const UMat &src)
+    {
+        CV_Assert(ocl::Device::getDefault().imageSupport());
+
+        cl_image_format format;
+        int err, depth = src.depth(), cn = src.channels();
+        CV_Assert(cn <= 4);
+
+        static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
+                                       CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
+        static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
+
+        int channelType = channelTypes[depth], channelOrder = channelOrders[cn];
+        if (channelType < 0 || channelOrder < 0)
+            CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
+
+        format.image_channel_data_type = (cl_channel_type)channelType;
+        format.image_channel_order = (cl_channel_order)channelOrder;
+
+        cl_context context = (cl_context)Context::getDefault().ptr();
+        cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
+
+#ifdef CL_VERSION_1_2
+        // this enables backwards portability to
+        // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
+        const Device & d = ocl::Device::getDefault();
+        int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
+        if (1 < major || (1 == major && 2 <= minor))
+        {
+            cl_image_desc desc;
+            desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
+            desc.image_width      = src.cols;
+            desc.image_height     = src.rows;
+            desc.image_depth      = 0;
+            desc.image_array_size = 1;
+            desc.image_row_pitch  = 0;
+            desc.image_slice_pitch = 0;
+            desc.buffer           = NULL;
+            desc.num_mip_levels   = 0;
+            desc.num_samples      = 0;
+            handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
+        }
+        else
+#endif
+        {
+            handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
+        }
+        CV_OclDbgAssert(err == CL_SUCCESS);
+
+        size_t origin[] = { 0, 0, 0 };
+        size_t region[] = { src.cols, src.rows, 1 };
+
+        cl_mem devData;
+        if (!src.isContinuous())
+        {
+            devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
+            CV_OclDbgAssert(err == CL_SUCCESS);
+
+            const size_t roi[3] = {src.cols * src.elemSize(), src.rows, 1};
+            CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
+                roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
+            CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
+        }
+        else
+            devData = (cl_mem)src.handle(ACCESS_READ);
+        CV_Assert(devData != NULL);
+
+        CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
+        if (!src.isContinuous())
+        {
+            CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
+            CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
+        }
+    }
+
+    IMPLEMENT_REFCOUNTABLE();
+
+    cl_mem handle;
+};
+
+Image2D::Image2D()
+{
+    p = NULL;
+}
+
+Image2D::Image2D(const UMat &src)
+{
+    p = new Impl(src);
+}
+
+Image2D::Image2D(const Image2D & i)
+{
+    p = i.p;
+    if (p)
+        p->addref();
+}
+
+Image2D & Image2D::operator = (const Image2D & i)
+{
+    if (i.p != p)
+    {
+        if (i.p)
+            i.p->addref();
+        if (p)
+            p->release();
+        p = i.p;
+    }
+    return *this;
+}
+
+Image2D::~Image2D()
+{
+    if (p)
+        p->release();
+}
+
+void* Image2D::ptr() const
+{
+    return p ? p->handle : 0;
+}
+
 }}
diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl
index 9c86057ca..a7dacc428 100644
--- a/modules/core/src/opencl/arithm.cl
+++ b/modules/core/src/opencl/arithm.cl
@@ -70,147 +70,218 @@
 #define CV_PI M_PI_F
 #endif
 
-#define dstelem *(__global dstT*)(dstptr + dst_index)
-#define dstelem2 *(__global dstT*)(dstptr2 + dst_index2)
+#ifndef cn
+#define cn 1
+#endif
+
+#if cn == 1
+#undef srcT1_C1
+#undef srcT2_C1
+#undef dstT_C1
+#define srcT1_C1 srcT1
+#define srcT2_C1 srcT2
+#define dstT_C1 dstT
+#endif
+
+#if cn != 3
+    #define storedst(val) *(__global dstT*)(dstptr + dst_index) = val
+    #define storedst2(val) *(__global dstT*)(dstptr2 + dst_index2) = val
+#else
+    #define storedst(val) vstore3(val, 0, (__global dstT_C1*)(dstptr + dst_index))
+    #define storedst2(val) vstore3(val, 0, (__global dstT_C1*)(dstptr2 + dst_index2))
+#endif
+
 #define noconvert
 
 #ifndef workT
 
     #ifndef srcT1
     #define srcT1 dstT
+    #define srcT1_C1 dstT_C1
     #endif
     #ifndef srcT2
     #define srcT2 dstT
+    #define srcT2_C1 dstT_C1
     #endif
     #define workT dstT
-    #define srcelem1 *(__global srcT1*)(srcptr1 + src1_index)
-    #define srcelem2 *(__global srcT2*)(srcptr2 + src2_index)
+    #if cn != 3
+        #define srcelem1 *(__global srcT1*)(srcptr1 + src1_index)
+        #define srcelem2 *(__global srcT2*)(srcptr2 + src2_index)
+    #else
+        #define srcelem1 vload3(0, (__global srcT1_C1*)(srcptr1 + src1_index))
+        #define srcelem2 vload3(0, (__global srcT2_C1*)(srcptr2 + src2_index))
+    #endif
     #ifndef convertToDT
     #define convertToDT noconvert
     #endif
 
 #else
 
-    #define srcelem1 convertToWT1(*(__global srcT1*)(srcptr1 + src1_index))
-    #define srcelem2 convertToWT2(*(__global srcT2*)(srcptr2 + src2_index))
+    #ifndef convertToWT2
+    #define convertToWT2 convertToWT1
+    #endif
+    #if cn != 3
+        #define srcelem1 convertToWT1(*(__global srcT1*)(srcptr1 + src1_index))
+        #define srcelem2 convertToWT2(*(__global srcT2*)(srcptr2 + src2_index))
+    #else
+        #define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1*)(srcptr1 + src1_index)))
+        #define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1*)(srcptr2 + src2_index)))
+    #endif
 
 #endif
 
+#ifndef workST
+#define workST workT
+#endif
+
 #define EXTRA_PARAMS
 #define EXTRA_INDEX
 
 #if defined OP_ADD
-#define PROCESS_ELEM dstelem = convertToDT(srcelem1 + srcelem2)
+#define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2))
 
 #elif defined OP_SUB
-#define PROCESS_ELEM dstelem = convertToDT(srcelem1 - srcelem2)
+#define PROCESS_ELEM storedst(convertToDT(srcelem1 - srcelem2))
 
 #elif defined OP_RSUB
-#define PROCESS_ELEM dstelem = convertToDT(srcelem2 - srcelem1)
+#define PROCESS_ELEM storedst(convertToDT(srcelem2 - srcelem1))
 
 #elif defined OP_ABSDIFF
 #define PROCESS_ELEM \
     workT v = srcelem1 - srcelem2; \
-    dstelem = convertToDT(v >= (workT)(0) ? v : -v);
+    storedst(convertToDT(v >= (workT)(0) ? v : -v))
 
 #elif defined OP_AND
-#define PROCESS_ELEM dstelem = srcelem1 & srcelem2
+#define PROCESS_ELEM storedst(srcelem1 & srcelem2)
 
 #elif defined OP_OR
-#define PROCESS_ELEM dstelem = srcelem1 | srcelem2
+#define PROCESS_ELEM storedst(srcelem1 | srcelem2)
 
 #elif defined OP_XOR
-#define PROCESS_ELEM dstelem = srcelem1 ^ srcelem2
+#define PROCESS_ELEM storedst(srcelem1 ^ srcelem2)
 
 #elif defined OP_NOT
-#define PROCESS_ELEM dstelem = ~srcelem1
+#define PROCESS_ELEM storedst(~srcelem1)
 
 #elif defined OP_MIN
-#define PROCESS_ELEM dstelem = min(srcelem1, srcelem2)
+#define PROCESS_ELEM storedst(min(srcelem1, srcelem2))
 
 #elif defined OP_MAX
-#define PROCESS_ELEM dstelem = max(srcelem1, srcelem2)
+#define PROCESS_ELEM storedst(max(srcelem1, srcelem2))
 
 #elif defined OP_MUL
-#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2)
+#define PROCESS_ELEM storedst(convertToDT(srcelem1 * srcelem2))
 
 #elif defined OP_MUL_SCALE
 #undef EXTRA_PARAMS
-#define EXTRA_PARAMS , workT scale
-#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2 * scale)
+#ifdef UNARY_OP
+#define EXTRA_PARAMS , workST srcelem2_, scaleT scale
+#undef srcelem2
+#define srcelem2 srcelem2_
+#else
+#define EXTRA_PARAMS , scaleT scale
+#endif
+#define PROCESS_ELEM storedst(convertToDT(srcelem1 * scale * srcelem2))
 
 #elif defined OP_DIV
 #define PROCESS_ELEM \
         workT e2 = srcelem2, zero = (workT)(0); \
-        dstelem = convertToDT(e2 != zero ? srcelem1 / e2 : zero)
+        storedst(convertToDT(e2 != zero ? srcelem1 / e2 : zero))
 
 #elif defined OP_DIV_SCALE
 #undef EXTRA_PARAMS
-#define EXTRA_PARAMS , workT scale
+#ifdef UNARY_OP
+#define EXTRA_PARAMS , workST srcelem2_, scaleT scale
+#undef srcelem2
+#define srcelem2 srcelem2_
+#else
+#define EXTRA_PARAMS , scaleT scale
+#endif
 #define PROCESS_ELEM \
         workT e2 = srcelem2, zero = (workT)(0); \
-        dstelem = convertToDT(e2 != zero ? srcelem1 * scale / e2 : zero)
+        storedst(convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2)))
+
+#elif defined OP_RDIV_SCALE
+#undef EXTRA_PARAMS
+#ifdef UNARY_OP
+#define EXTRA_PARAMS , workST srcelem2_, scaleT scale
+#undef srcelem2
+#define srcelem2 srcelem2_
+#else
+#define EXTRA_PARAMS , scaleT scale
+#endif
+#define PROCESS_ELEM \
+        workT e1 = srcelem1, zero = (workT)(0); \
+        storedst(convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1)))
 
 #elif defined OP_RECIP_SCALE
 #undef EXTRA_PARAMS
-#define EXTRA_PARAMS , workT scale
+#define EXTRA_PARAMS , scaleT scale
 #define PROCESS_ELEM \
         workT e1 = srcelem1, zero = (workT)(0); \
-        dstelem = convertToDT(e1 != zero ? scale / e1 : zero)
+        storedst(convertToDT(e1 != zero ? scale / e1 : zero))
 
 #elif defined OP_ADDW
 #undef EXTRA_PARAMS
-#define EXTRA_PARAMS , workT alpha, workT beta, workT gamma
-#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + srcelem2*beta + gamma)
+#define EXTRA_PARAMS , scaleT alpha, scaleT beta, scaleT gamma
+#define PROCESS_ELEM storedst(convertToDT(srcelem1*alpha + srcelem2*beta + gamma))
 
 #elif defined OP_MAG
-#define PROCESS_ELEM dstelem = hypot(srcelem1, srcelem2)
+#define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2))
 
 #elif defined OP_ABS_NOSAT
 #define PROCESS_ELEM \
     dstT v = convertToDT(srcelem1); \
-    dstelem = v >= 0 ? v : -v
+    storedst(v >= 0 ? v : -v)
 
 #elif defined OP_PHASE_RADIANS
 #define PROCESS_ELEM \
         workT tmp = atan2(srcelem2, srcelem1); \
-        if(tmp < 0) tmp += 6.283185307179586232; \
-        dstelem = tmp
+        if(tmp < 0) tmp += 6.283185307179586232f; \
+        storedst(tmp)
 
 #elif defined OP_PHASE_DEGREES
     #define PROCESS_ELEM \
-    workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465; \
+    workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465f; \
     if(tmp < 0) tmp += 360; \
-    dstelem = tmp
+    storedst(tmp)
 
 #elif defined OP_EXP
-#define PROCESS_ELEM dstelem = exp(srcelem1)
+#define PROCESS_ELEM storedst(exp(srcelem1))
 
 #elif defined OP_POW
-#define PROCESS_ELEM dstelem = pow(srcelem1, srcelem2)
+#define PROCESS_ELEM storedst(pow(srcelem1, srcelem2))
+
+#elif defined OP_POWN
+#undef workT
+#define workT int
+#define PROCESS_ELEM storedst(pown(srcelem1, srcelem2))
 
 #elif defined OP_SQRT
-#define PROCESS_ELEM dstelem = sqrt(srcelem1)
+#define PROCESS_ELEM storedst(sqrt(srcelem1))
 
 #elif defined OP_LOG
 #define PROCESS_ELEM \
-dstT v = (dstT)(srcelem1);\
-dstelem = v > (dstT)(0) ? log(v) : log(-v)
+    dstT v = (dstT)(srcelem1);\
+    storedst(v > (dstT)(0) ? log(v) : log(-v))
 
 #elif defined OP_CMP
 #define dstT uchar
 #define srcT2 srcT1
 #define convertToWT1
-#define convertToWT2
-#define PROCESS_ELEM dstelem = convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0)
+#define PROCESS_ELEM storedst(convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0))
 
-#elif defined OP_CONVERT
-#define PROCESS_ELEM dstelem = convertToDT(srcelem1)
-
-#elif defined OP_CONVERT_SCALE
+#elif defined OP_CONVERT_SCALE_ABS
 #undef EXTRA_PARAMS
 #define EXTRA_PARAMS , workT alpha, workT beta
-#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + beta)
+#define PROCESS_ELEM \
+    workT value = srcelem1 * alpha + beta; \
+    storedst(convertToDT(value >= 0 ? value : -value))
+
+#elif defined OP_SCALE_ADD
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT alpha
+#define PROCESS_ELEM storedst(convertToDT(srcelem1 * alpha + srcelem2))
 
 #elif defined OP_CTP_AD || defined OP_CTP_AR
 #ifdef OP_CTP_AD
@@ -227,8 +298,8 @@ dstelem = v > (dstT)(0) ? log(v) : log(-v)
     dstT tmp1 = y >= 0 ? CV_PI * 0.5f : CV_PI * 1.5f; \
     dstT cartToPolar = y2 <= x2 ? x * y / (x2 + 0.28f * y2 + CV_EPSILON) + tmp : (tmp1 - x * y / (y2 + 0.28f * x2 + CV_EPSILON)); \
     TO_DEGREE \
-    dstelem = magnitude; \
-    dstelem2 = cartToPolar
+    storedst(magnitude); \
+    storedst2(cartToPolar)
 
 #elif defined OP_PTC_AD || defined OP_PTC_AR
 #ifdef OP_PTC_AD
@@ -242,8 +313,15 @@ dstelem = v > (dstT)(0) ? log(v) : log(-v)
 #define PROCESS_ELEM \
     dstT x = srcelem1, y = srcelem2; \
     FROM_DEGREE; \
-    dstelem = cos(alpha) * x; \
-    dstelem2 = sin(alpha) * x
+    storedst(cos(alpha) * x); \
+    storedst2(sin(alpha) * x)
+
+#elif defined OP_PATCH_NANS
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , int val
+#define PROCESS_ELEM \
+    if (( srcelem1 & 0x7fffffff) > 0x7f800000 ) \
+        storedst(val)
 
 #else
 #error "unknown op type"
@@ -253,17 +331,26 @@ dstelem = v > (dstT)(0) ? log(v) : log(-v)
     #undef EXTRA_PARAMS
     #define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2
     #undef EXTRA_INDEX
-    #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, x*(int)sizeof(dstT) + dstoffset2)
+    #define EXTRA_INDEX int dst_index2 = mad24(y, dststep2, x*(int)sizeof(dstT_C1)*cn + dstoffset2)
 #endif
 
 #if defined UNARY_OP || defined MASK_UNARY_OP
-#undef srcelem2
+
 #if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_SAT_ADD || \
     defined OP_SUB || defined OP_SAT_SUB || defined OP_RSUB || defined OP_SAT_RSUB || \
-    defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX || defined OP_POW
+    defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX || defined OP_POW || \
+    defined OP_MUL || defined OP_DIV || defined OP_POWN
     #undef EXTRA_PARAMS
-    #define EXTRA_PARAMS , workT srcelem2
+    #define EXTRA_PARAMS , workST srcelem2_
+    #undef srcelem2
+    #define srcelem2 srcelem2_
 #endif
+
+#if cn == 3
+#undef srcelem2
+#define srcelem2 (workT)(srcelem2_.x, srcelem2_.y, srcelem2_.z)
+#endif
+
 #endif
 
 #if defined BINARY_OP
@@ -278,9 +365,11 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
 
     if (x < cols && y < rows)
     {
-        int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
-        int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2);
-        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+        int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1);
+#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
+        int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2_C1)*cn + srcoffset2);
+#endif
+        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset);
         EXTRA_INDEX;
 
         PROCESS_ELEM;
@@ -303,9 +392,9 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
         int mask_index = mad24(y, maskstep, x + maskoffset);
         if( mask[mask_index] )
         {
-            int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
-            int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2);
-            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+            int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1);
+            int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2_C1)*cn + srcoffset2);
+            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset);
 
             PROCESS_ELEM;
         }
@@ -323,9 +412,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
 
     if (x < cols && y < rows)
     {
-        int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
-        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
-        EXTRA_INDEX;
+        int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1);
+        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset);
 
         PROCESS_ELEM;
     }
@@ -346,8 +434,8 @@ __kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
         int mask_index = mad24(y, maskstep, x + maskoffset);
         if( mask[mask_index] )
         {
-            int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
-            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+            int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1_C1)*cn + srcoffset1);
+            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT_C1)*cn + dstoffset);
 
             PROCESS_ELEM;
         }
diff --git a/modules/core/src/opencl/copyset.cl b/modules/core/src/opencl/copyset.cl
index 8fb5a00cf..cbafe6705 100644
--- a/modules/core/src/opencl/copyset.cl
+++ b/modules/core/src/opencl/copyset.cl
@@ -41,9 +41,67 @@
 //
 //M*/
 
+#ifdef COPY_TO_MASK
+
+#define DEFINE_DATA \
+    int src_index = mad24(y, src_step, x*(int)sizeof(T)*scn + src_offset); \
+    int dst_index = mad24(y, dst_step, x*(int)sizeof(T)*scn + dst_offset); \
+     \
+    __global const T * src = (__global const T *)(srcptr + src_index); \
+    __global T * dst = (__global T *)(dstptr + dst_index)
+
+__kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_offset,
+                         __global const uchar * maskptr, int mask_step, int mask_offset,
+                         __global uchar * dstptr, int dst_step, int dst_offset,
+                         int dst_rows, int dst_cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < dst_cols && y < dst_rows)
+    {
+        int mask_index = mad24(y, mask_step, x * mcn + mask_offset);
+        __global const uchar * mask = (__global const uchar *)(maskptr + mask_index);
+
+#if mcn == 1
+        if (mask[0])
+        {
+            DEFINE_DATA;
+
+            #pragma unroll
+            for (int c = 0; c < scn; ++c)
+                dst[c] = src[c];
+        }
+#elif scn == mcn
+        DEFINE_DATA;
+
+        #pragma unroll
+        for (int c = 0; c < scn; ++c)
+            if (mask[c])
+                dst[c] = src[c];
+#else
+#error "(mcn == 1 || mcn == scn) should be true"
+#endif
+    }
+}
+
+#else
+
+#ifndef dstST
+#define dstST dstT
+#endif
+
+#if cn != 3
+#define value value_
+#define storedst(val) *(__global dstT*)(dstptr + dst_index) = val
+#else
+#define value (dstT)(value_.x, value_.y, value_.z)
+#define storedst(val) vstore3(val, 0, (__global dstT1*)(dstptr + dst_index))
+#endif
+
 __kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,
                       __global uchar* dstptr, int dststep, int dstoffset,
-                      int rows, int cols, dstT value )
+                      int rows, int cols, dstST value_ )
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -53,21 +111,23 @@ __kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,
         int mask_index = mad24(y, maskstep, x + maskoffset);
         if( mask[mask_index] )
         {
-            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
-            *(__global dstT*)(dstptr + dst_index) = value;
+            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT1)*cn + dstoffset);
+            storedst(value);
         }
     }
 }
 
 __kernel void set(__global uchar* dstptr, int dststep, int dstoffset,
-                  int rows, int cols, dstT value )
+                  int rows, int cols, dstST value_ )
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
-        *(__global dstT*)(dstptr + dst_index) = value;
+        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT1)*cn + dstoffset);
+        storedst(value);
     }
 }
+
+#endif
diff --git a/modules/ocl/src/opencl/arithm_compare.cl b/modules/core/src/opencl/inrange.cl
similarity index 61%
rename from modules/ocl/src/opencl/arithm_compare.cl
rename to modules/core/src/opencl/inrange.cl
index 73e6299bb..7549cf394 100644
--- a/modules/ocl/src/opencl/arithm_compare.cl
+++ b/modules/core/src/opencl/inrange.cl
@@ -12,11 +12,9 @@
 //
 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -33,7 +31,7 @@
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
+// In no event shall the copyright holders or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
@@ -51,24 +49,41 @@
 #endif
 #endif
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_compare(__global T * src1, int src1_step1, int src1_offset1,
-                              __global T * src2, int src2_step1, int src2_offset1,
-                              __global uchar * dst, int dst_step1, int dst_offset1,
-                              int cols1, int rows)
+__kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+#ifdef HAVE_SCALAR
+                      __global const T * src2, __global const T * src3
+#else
+                      __global const uchar * src2ptr, int src2_step, int src2_offset,
+                      __global const uchar * src3ptr, int src3_step, int src3_offset
+#endif
+                      )
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if (x < cols1 && y < rows)
+    if (x < dst_cols && y < dst_rows)
     {
-        int src1_index = mad24(y, src1_step1, x + src1_offset1);
-        int src2_index = mad24(y, src2_step1, x + src2_offset1);
-        int dst_index = mad24(y, dst_step1, x + dst_offset1);
+        int src1_index = mad24(y, src1_step, x*(int)sizeof(T)*cn + src1_offset);
+        int dst_index = mad24(y, dst_step, x + dst_offset);
+        __global const T * src1 = (__global const T *)(src1ptr + src1_index);
+        __global uchar * dst = dstptr + dst_index;
 
-        dst[dst_index] = convert_uchar(src1[src1_index] Operation src2[src2_index] ? 255 : 0);
+#ifndef HAVE_SCALAR
+        int src2_index = mad24(y, src2_step, x*(int)sizeof(T)*cn + src2_offset);
+        int src3_index = mad24(y, src3_step, x*(int)sizeof(T)*cn + src3_offset);
+        __global const T * src2 = (__global const T *)(src2ptr + src2_index);
+        __global const T * src3 = (__global const T *)(src3ptr + src3_index);
+#endif
+
+        dst[0] = 255;
+
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+            if ( src2[c] > src1[c] || src3[c] < src1[c] )
+            {
+                dst[0] = 0;
+                break;
+            }
     }
 }
diff --git a/modules/ocl/src/opencl/arithm_pow.cl b/modules/core/src/opencl/mixchannels.cl
similarity index 71%
rename from modules/ocl/src/opencl/arithm_pow.cl
rename to modules/core/src/opencl/mixchannels.cl
index 385e4cc15..7abd60af4 100644
--- a/modules/ocl/src/opencl/arithm_pow.cl
+++ b/modules/core/src/opencl/mixchannels.cl
@@ -12,11 +12,9 @@
 //
 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
-// @Authors
-//    Jiang Liyuan, jlyuan001.good@163.com
-//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -33,7 +31,7 @@
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
+// In no event shall the copyright holders or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
@@ -43,31 +41,24 @@
 //
 //M*/
 
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
+#define DECLARE_INPUT_MAT(i) \
+    __global const uchar * src##i##ptr, int src##i##_step, int src##i##_offset,
+#define DECLARE_OUTPUT_MAT(i) \
+    __global uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset,
+#define PROCESS_ELEM(i) \
+    int src##i##_index = mad24(src##i##_step, y, x * (int)sizeof(T) * scn##i + src##i##_offset); \
+    __global const T * src##i = (__global const T *)(src##i##ptr + src##i##_index); \
+    int dst##i##_index = mad24(dst##i##_step, y, x * (int)sizeof(T) * dcn##i + dst##i##_offset); \
+    __global T * dst##i = (__global T *)(dst##i##ptr + dst##i##_index); \
+    dst##i[0] = src##i[0];
 
-/************************************** pow **************************************/
-
-__kernel void arithm_pow(__global VT * src, int src_step, int src_offset,
-                         __global VT * dst, int dst_step, int dst_offset,
-                         int rows, int cols, T p)
+__kernel void mixChannels(DECLARE_INPUT_MATS DECLARE_OUTPUT_MATS int rows, int cols)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        int src_index = mad24(y, src_step, x + src_offset);
-        int dst_index = mad24(y, dst_step, x + dst_offset);
-
-        VT src_data = src[src_index];
-        VT tmp = src_data > 0 ? exp(p * log(src_data)) : (src_data == 0 ? 0 : exp(p * log(fabs(src_data))));
-
-        dst[dst_index] = tmp;
+        PROCESS_ELEMS
     }
 }
diff --git a/modules/core/src/opencl/mulspectrums.cl b/modules/core/src/opencl/mulspectrums.cl
index 65f0edf6a..248ff006f 100644
--- a/modules/core/src/opencl/mulspectrums.cl
+++ b/modules/core/src/opencl/mulspectrums.cl
@@ -25,7 +25,7 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl
index 4f0d80670..febc1cbd2 100644
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -51,25 +51,46 @@
 #endif
 
 #define noconvert
-#define EXTRA_PARAMS
 
-#if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR
-#if OP_SUM
+#ifdef HAVE_MASK
+#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset
+#else
+#define EXTRA_PARAMS
+#endif
+
+#if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT
+#ifdef OP_DOT
+#define FUNC(a, b, c) a += b * c
+#elif defined OP_SUM
 #define FUNC(a, b) a += b
-#elif OP_SUM_ABS
+#elif defined OP_SUM_ABS
 #define FUNC(a, b) a += b >= (dstT)(0) ? b : -b
-#elif OP_SUM_SQR
+#elif defined OP_SUM_SQR
 #define FUNC(a, b) a += b * b
 #endif
 #define DECLARE_LOCAL_MEM \
     __local dstT localmem[WGS2_ALIGNED]
 #define DEFINE_ACCUMULATOR \
     dstT accumulator = (dstT)(0)
+#ifdef HAVE_MASK
+#define REDUCE_GLOBAL \
+    dstT temp = convertToDT(src[0]); \
+    int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \
+    if (mask[mask_index]) \
+        FUNC(accumulator, temp)
+#elif defined OP_DOT
+#define REDUCE_GLOBAL \
+    int src2_index = mad24(id / cols, src2_step, src2_offset + (id % cols) * (int)sizeof(srcT)); \
+    __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index); \
+    dstT temp = convertToDT(src[0]), temp2 = convertToDT(src2[0]); \
+    FUNC(accumulator, temp, temp2)
+#else
 #define REDUCE_GLOBAL \
     dstT temp = convertToDT(src[0]); \
     FUNC(accumulator, temp)
+#endif
 #define SET_LOCAL_1 \
-        localmem[lid] = accumulator
+    localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
     localmem[lid - WGS2_ALIGNED] += accumulator
 #define REDUCE_LOCAL_2 \
@@ -88,7 +109,7 @@
 #define REDUCE_GLOBAL \
     accumulator += src[0] == zero ? zero : one
 #define SET_LOCAL_1 \
-        localmem[lid] = accumulator
+    localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
     localmem[lid - WGS2_ALIGNED] += accumulator
 #define REDUCE_LOCAL_2 \
@@ -99,37 +120,31 @@
 
 #elif defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK
 
-#if defined (DEPTH_0)
+#ifdef DEPTH_0
 #define srcT uchar
 #define MIN_VAL 0
 #define MAX_VAL 255
-#endif
-#if defined (DEPTH_1)
+#elif defined DEPTH_1
 #define srcT char
 #define MIN_VAL -128
 #define MAX_VAL 127
-#endif
-#if defined (DEPTH_2)
+#elif defined DEPTH_2
 #define srcT ushort
 #define MIN_VAL 0
 #define MAX_VAL 65535
-#endif
-#if defined (DEPTH_3)
+#elif defined DEPTH_3
 #define srcT short
 #define MIN_VAL -32768
 #define MAX_VAL 32767
-#endif
-#if defined (DEPTH_4)
+#elif defined DEPTH_4
 #define srcT int
 #define MIN_VAL INT_MIN
 #define MAX_VAL INT_MAX
-#endif
-#if defined (DEPTH_5)
+#elif defined DEPTH_5
 #define srcT float
 #define MIN_VAL (-FLT_MAX)
 #define MAX_VAL FLT_MAX
-#endif
-#if defined (DEPTH_6)
+#elif defined DEPTH_6
 #define srcT double
 #define MIN_VAL (-DBL_MAX)
 #define MAX_VAL DBL_MAX
@@ -220,17 +235,19 @@
 #error "No operation"
 #endif
 
-#if defined OP_MIN_MAX_LOC
+#ifdef OP_MIN_MAX_LOC
 #undef EXTRA_PARAMS
 #define EXTRA_PARAMS , __global uchar * dstptr2, __global int * dstlocptr, __global int * dstlocptr2
-#endif
-#if defined OP_MIN_MAX_LOC_MASK
+#elif defined OP_MIN_MAX_LOC_MASK
 #undef EXTRA_PARAMS
 #define EXTRA_PARAMS , __global uchar * dstptr2, __global int * dstlocptr, __global int * dstlocptr2, \
-    __global const uchar * maskptr, int mask_step, int mask_offset, __global int * test
+    __global const uchar * maskptr, int mask_step, int mask_offset
+#elif defined OP_DOT
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , __global uchar * src2ptr, int src2_step, int src2_offset
 #endif
 
-__kernel void reduce(__global const uchar * srcptr, int step, int offset, int cols,
+__kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int cols,
                      int total, int groupnum, __global uchar * dstptr EXTRA_PARAMS)
 {
     int lid = get_local_id(0);
@@ -242,7 +259,7 @@ __kernel void reduce(__global const uchar * srcptr, int step, int offset, int co
 
     for (int grain = groupnum * WGS; id < total; id += grain)
     {
-        int src_index = mad24(id / cols, step, offset + (id % cols) * (int)sizeof(srcT));
+        int src_index = mad24(id / cols, src_step, src_offset + (id % cols) * (int)sizeof(srcT));
         __global const srcT * src = (__global const srcT *)(srcptr + src_index);
         REDUCE_GLOBAL;
     }
diff --git a/modules/core/src/opencl/reduce2.cl b/modules/core/src/opencl/reduce2.cl
new file mode 100644
index 000000000..f8ff6a2e1
--- /dev/null
+++ b/modules/core/src/opencl/reduce2.cl
@@ -0,0 +1,148 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+#endif
+
+#if ddepth == 0
+#define MIN_VAL 0
+#define MAX_VAL 255
+#elif ddepth == 1
+#define MIN_VAL -128
+#define MAX_VAL 127
+#elif ddepth == 2
+#define MIN_VAL 0
+#define MAX_VAL 65535
+#elif ddepth == 3
+#define MIN_VAL -32768
+#define MAX_VAL 32767
+#elif ddepth == 4
+#define MIN_VAL INT_MIN
+#define MAX_VAL INT_MAX
+#elif ddepth == 5
+#define MIN_VAL (-FLT_MAX)
+#define MAX_VAL FLT_MAX
+#elif ddepth == 6
+#define MIN_VAL (-DBL_MAX)
+#define MAX_VAL DBL_MAX
+#else
+#error "Unsupported depth"
+#endif
+
+#define noconvert
+
+#ifdef OCL_CV_REDUCE_SUM
+#define INIT_VALUE 0
+#define PROCESS_ELEM(acc, value) acc += value
+#elif defined(OCL_CV_REDUCE_MAX)
+#define INIT_VALUE MIN_VAL
+#define PROCESS_ELEM(acc, value) acc = value > acc ? value : acc
+#elif defined(OCL_CV_REDUCE_MIN)
+#define INIT_VALUE MAX_VAL
+#define PROCESS_ELEM(acc, value) acc = value < acc ? value : acc
+#elif defined(OCL_CV_REDUCE_AVG)
+#error "This operation should be implemented through OCL_CV_REDUCE_SUM"
+#else
+#error "No operation is specified"
+#endif
+
+__kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
+                     __global uchar * dstptr, int dst_step, int dst_offset)
+{
+#if dim == 0 // reduce to a single row
+    int x = get_global_id(0);
+    if (x < cols)
+    {
+        int src_index = x * (int)sizeof(srcT) * cn + src_offset;
+        __global dstT * dst = (__global dstT *)(dstptr + dst_offset) + x * cn;
+        dstT tmp[cn] = { INIT_VALUE };
+
+        for (int y = 0; y < rows; ++y, src_index += src_step)
+        {
+            __global const srcT * src = (__global const srcT *)(srcptr + src_index);
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+            {
+                dstT value = convertToDT(src[c]);
+                PROCESS_ELEM(tmp[c], value);
+            }
+        }
+
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+            dst[c] = tmp[c];
+    }
+#elif dim == 1 // reduce to a single column
+    int y = get_global_id(0);
+    if (y < rows)
+    {
+        int src_index = mad24(y, src_step, src_offset);
+        int dst_index = mad24(y, dst_step, dst_offset);
+
+        __global const srcT * src = (__global const srcT *)(srcptr + src_index);
+        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+        dstT tmp[cn] = { INIT_VALUE };
+
+        for (int x = 0; x < cols; ++x, src += cn)
+        {
+            #pragma unroll
+            for (int c = 0; c < cn; ++c)
+            {
+                dstT value = convertToDT(src[c]);
+                PROCESS_ELEM(tmp[c], value);
+            }
+        }
+
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+            dst[c] = tmp[c];
+    }
+#else
+#error "Dims must be either 0 or 1"
+#endif
+}
diff --git a/modules/core/src/opencl/runtime/autogenerated/opencl_clamdblas_impl.hpp b/modules/core/src/opencl/runtime/autogenerated/opencl_clamdblas_impl.hpp
index 2ea58c220..8ff3cec90 100644
--- a/modules/core/src/opencl/runtime/autogenerated/opencl_clamdblas_impl.hpp
+++ b/modules/core/src/opencl/runtime/autogenerated/opencl_clamdblas_impl.hpp
@@ -1,10 +1,6 @@
 //
 // AUTOGENERATED, DO NOT EDIT
 //
-#ifndef ADDITIONAL_FN_DEFINITIONS
-#define ADDITIONAL_FN_DEFINITIONS
-#endif
-
 // generated by parser_clamdblas.py
 enum OPENCLAMDBLAS_FN_ID {
 //    OPENCLAMDBLAS_FN_clAmdBlasAddScratchImage = 0,
@@ -1251,7 +1247,6 @@ static const struct DynamicFnEntry* openclamdblas_fn[] = {
     NULL/*&clAmdBlasiDamax_definition*/,
     NULL/*&clAmdBlasiSamax_definition*/,
     NULL/*&clAmdBlasiZamax_definition*/,
-    ADDITIONAL_FN_DEFINITIONS // macro for custom functions
 };
 
 // number of enabled functions: 6
diff --git a/modules/core/src/opencl/runtime/autogenerated/opencl_clamdfft_impl.hpp b/modules/core/src/opencl/runtime/autogenerated/opencl_clamdfft_impl.hpp
index 1742ab606..d5bdf7e0b 100644
--- a/modules/core/src/opencl/runtime/autogenerated/opencl_clamdfft_impl.hpp
+++ b/modules/core/src/opencl/runtime/autogenerated/opencl_clamdfft_impl.hpp
@@ -1,10 +1,6 @@
 //
 // AUTOGENERATED, DO NOT EDIT
 //
-#ifndef ADDITIONAL_FN_DEFINITIONS
-#define ADDITIONAL_FN_DEFINITIONS
-#endif
-
 // generated by parser_clamdfft.py
 enum OPENCLAMDFFT_FN_ID {
     OPENCLAMDFFT_FN_clAmdFftBakePlan = 0,
@@ -393,7 +389,6 @@ static const struct DynamicFnEntry* openclamdfft_fn[] = {
     &clAmdFftSetResultLocation_definition,
     &clAmdFftSetup_definition,
     &clAmdFftTeardown_definition,
-    ADDITIONAL_FN_DEFINITIONS // macro for custom functions
 };
 
 // number of enabled functions: 15
diff --git a/modules/core/src/opencl/runtime/autogenerated/opencl_core_impl.hpp b/modules/core/src/opencl/runtime/autogenerated/opencl_core_impl.hpp
index a40a5fd91..913b52313 100644
--- a/modules/core/src/opencl/runtime/autogenerated/opencl_core_impl.hpp
+++ b/modules/core/src/opencl/runtime/autogenerated/opencl_core_impl.hpp
@@ -1,11 +1,6 @@
 //
 // AUTOGENERATED, DO NOT EDIT
 //
-
-#ifndef ADDITIONAL_FN_DEFINITIONS
-#define ADDITIONAL_FN_DEFINITIONS
-#endif
-
 // generated by parser_cl.py
 enum OPENCL_FN_ID {
     OPENCL_FN_clBuildProgram = 0,
@@ -666,7 +661,6 @@ static const struct DynamicFnEntry* opencl_fn_list[] = {
     &clUnloadCompiler_definition,
     &clUnloadPlatformCompiler_definition,
     &clWaitForEvents_definition,
-    ADDITIONAL_FN_DEFINITIONS // macro for custom functions
 };
 
 // number of enabled functions: 88
diff --git a/modules/core/src/opencl/runtime/generator/common.py b/modules/core/src/opencl/runtime/generator/common.py
index ed0face06..80c545295 100644
--- a/modules/core/src/opencl/runtime/generator/common.py
+++ b/modules/core/src/opencl/runtime/generator/common.py
@@ -161,7 +161,6 @@ def generateListOfDefinitions(fns, name='opencl_fn_list'):
         else:
             print '    NULL/*&%s_definition*/,' % (fn['name'])
         first = False
-    print '    ADDITIONAL_FN_DEFINITIONS // macro for custom functions'
     print '};'
 
 @outputToString
diff --git a/modules/core/src/opencl/runtime/generator/filter/opencl_clamdfft_functions.list b/modules/core/src/opencl/runtime/generator/filter/opencl_clamdfft_functions.list
index 1f9820a1c..8b78df175 100644
--- a/modules/core/src/opencl/runtime/generator/filter/opencl_clamdfft_functions.list
+++ b/modules/core/src/opencl/runtime/generator/filter/opencl_clamdfft_functions.list
@@ -5,7 +5,7 @@ clAmdFftDestroyPlan
 clAmdFftEnqueueTransform
 //clAmdFftGetLayout
 //clAmdFftGetPlanBatchSize
-clAmdFftGetPlanContext
+//clAmdFftGetPlanContext
 //clAmdFftGetPlanDim
 //clAmdFftGetPlanDistance
 //clAmdFftGetPlanInStride
@@ -22,7 +22,7 @@ clAmdFftSetPlanBatchSize
 //clAmdFftSetPlanDim
 clAmdFftSetPlanDistance
 clAmdFftSetPlanInStride
-clAmdFftSetPlanLength
+//clAmdFftSetPlanLength
 clAmdFftSetPlanOutStride
 clAmdFftSetPlanPrecision
 clAmdFftSetPlanScale
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_clamdblas_runtime.cpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_clamdblas_runtime.cpp.in
deleted file mode 100644
index 8492edda9..000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_clamdblas_runtime.cpp.in
+++ /dev/null
@@ -1,75 +0,0 @@
-#include "precomp.hpp"
-
-#ifdef HAVE_CLAMDBLAS
-
-#include "opencv2/ocl/cl_runtime/cl_runtime.hpp"
-#include "opencv2/ocl/cl_runtime/clamdblas_runtime.hpp"
-
-#if defined(_WIN32)
-    static void* WinGetProcAddress(const char* name)
-    {
-        static HMODULE opencl_module = NULL;
-        if (!opencl_module)
-        {
-            opencl_module = GetModuleHandleA("clAmdBlas.dll");
-            if (!opencl_module)
-            {
-                opencl_module = LoadLibraryA("clAmdBlas.dll");
-                if (!opencl_module)
-                    return NULL;
-            }
-        }
-        return (void*)GetProcAddress(opencl_module, name);
-    }
-    #define CV_CL_GET_PROC_ADDRESS(name) WinGetProcAddress(name)
-#endif // _WIN32
-
-#if defined(linux)
-    #include <dlfcn.h>
-    #include <stdio.h>
-
-    static void* GetProcAddress (const char* name)
-    {
-        static void* h = NULL;
-        if (!h)
-        {
-            h = dlopen("libclAmdBlas.so", RTLD_LAZY | RTLD_GLOBAL);
-            if (!h)
-                return NULL;
-        }
-
-        return dlsym(h, name);
-    }
-    #define CV_CL_GET_PROC_ADDRESS(name) GetProcAddress(name)
-#endif
-
-#ifndef CV_CL_GET_PROC_ADDRESS
-#define CV_CL_GET_PROC_ADDRESS(name) NULL
-#endif
-
-@CL_FN_ENUMS@
-@CL_FN_NAMES@
-
-static void* openclamdblas_check_fn(int ID)
-{
-    void* func = CV_CL_GET_PROC_ADDRESS(openclamdblas_fn_names[ID]);
-    if (!func)
-    {
-        std::ostringstream msg;
-        msg << "OpenCL AMD BLAS function is not available: [" << openclamdblas_fn_names[ID] << "]";
-        CV_Error(CV_StsBadFunc, msg.str());
-    }
-    extern void* openclamdblas_fn_ptrs[];
-    *(void**)(openclamdblas_fn_ptrs[ID]) = func;
-    return func;
-}
-
-namespace {
-@CL_FN_SWITCH@
-}
-
-@CL_FN_DEFINITIONS@
-
-@CL_FN_PTRS@
-
-#endif
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_clamdblas_runtime.hpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_clamdblas_runtime.hpp.in
deleted file mode 100644
index cbffb0861..000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_clamdblas_runtime.hpp.in
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __OPENCV_OCL_CLAMDBLAS_RUNTIME_HPP__
-#define __OPENCV_OCL_CLAMDBLAS_RUNTIME_HPP__
-
-#ifdef HAVE_CLAMDBLAS
-
-@CLAMDBLAS_REMAP_ORIGIN@
-
-#include <clAmdBlas.h>
-
-@CLAMDBLAS_REMAP_DYNAMIC@
-
-#ifndef CL_RUNTIME_EXPORT
-#if (defined(BUILD_SHARED_LIBS) || defined(OPENCV_OCL_SHARED)) && (defined WIN32 || defined _WIN32 || defined WINCE)
-#define CL_RUNTIME_EXPORT __declspec(dllimport)
-#else
-#define CL_RUNTIME_EXPORT
-#endif
-#endif
-
-
-@CLAMDBLAS_FN_DECLARATIONS@
-
-#endif
-
-#endif // __OPENCV_OCL_CLAMDBLAS_RUNTIME_HPP__
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_clamdfft_runtime.cpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_clamdfft_runtime.cpp.in
deleted file mode 100644
index aee6bd8ab..000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_clamdfft_runtime.cpp.in
+++ /dev/null
@@ -1,75 +0,0 @@
-#include "precomp.hpp"
-
-#ifdef HAVE_CLAMDFFT
-
-#include "opencv2/ocl/cl_runtime/cl_runtime.hpp"
-#include "opencv2/ocl/cl_runtime/clamdfft_runtime.hpp"
-
-#if defined(_WIN32)
-    static void* WinGetProcAddress(const char* name)
-    {
-        static HMODULE opencl_module = NULL;
-        if (!opencl_module)
-        {
-            opencl_module = GetModuleHandleA("clAmdFft.Runtime.dll");
-            if (!opencl_module)
-            {
-                opencl_module = LoadLibraryA("clAmdFft.Runtime.dll");
-                if (!opencl_module)
-                    return NULL;
-            }
-        }
-        return (void*)GetProcAddress(opencl_module, name);
-    }
-    #define CV_CL_GET_PROC_ADDRESS(name) WinGetProcAddress(name)
-#endif // _WIN32
-
-#if defined(linux)
-    #include <dlfcn.h>
-    #include <stdio.h>
-
-    static void* GetProcAddress (const char* name)
-    {
-        static void* h = NULL;
-        if (!h)
-        {
-            h = dlopen("libclAmdFft.Runtime.so", RTLD_LAZY | RTLD_GLOBAL);
-            if (!h)
-                return NULL;
-        }
-
-        return dlsym(h, name);
-    }
-    #define CV_CL_GET_PROC_ADDRESS(name) GetProcAddress(name)
-#endif
-
-#ifndef CV_CL_GET_PROC_ADDRESS
-#define CV_CL_GET_PROC_ADDRESS(name) NULL
-#endif
-
-@CL_FN_ENUMS@
-@CL_FN_NAMES@
-
-static void* openclamdfft_check_fn(int ID)
-{
-    void* func = CV_CL_GET_PROC_ADDRESS(openclamdfft_fn_names[ID]);
-    if (!func)
-    {
-        std::ostringstream msg;
-        msg << "OpenCL AMD FFT function is not available: [" << openclamdfft_fn_names[ID] << "]";
-        CV_Error(CV_StsBadFunc, msg.str());
-    }
-    extern void* openclamdfft_fn_ptrs[];
-    *(void**)(openclamdfft_fn_ptrs[ID]) = func;
-    return func;
-}
-
-namespace {
-@CL_FN_SWITCH@
-}
-
-@CL_FN_DEFINITIONS@
-
-@CL_FN_PTRS@
-
-#endif
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_clamdfft_runtime.hpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_clamdfft_runtime.hpp.in
deleted file mode 100644
index 5e26d0154..000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_clamdfft_runtime.hpp.in
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __OPENCV_OCL_CLAMDFFT_RUNTIME_HPP__
-#define __OPENCV_OCL_CLAMDFFT_RUNTIME_HPP__
-
-#ifdef HAVE_CLAMDFFT
-
-@CLAMDFFT_REMAP_ORIGIN@
-
-#include <clAmdFft.h>
-
-@CLAMDFFT_REMAP_DYNAMIC@
-
-#ifndef CL_RUNTIME_EXPORT
-#if (defined(BUILD_SHARED_LIBS) || defined(OPENCV_OCL_SHARED)) && (defined WIN32 || defined _WIN32 || defined WINCE)
-#define CL_RUNTIME_EXPORT __declspec(dllimport)
-#else
-#define CL_RUNTIME_EXPORT
-#endif
-#endif
-
-
-@CLAMDFFT_FN_DECLARATIONS@
-
-#endif
-
-#endif // __OPENCV_OCL_CLAMDFFT_RUNTIME_HPP__
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_runtime_impl_opencl.hpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_runtime_impl_opencl.hpp.in
deleted file mode 100644
index ff0395dcd..000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_runtime_impl_opencl.hpp.in
+++ /dev/null
@@ -1,10 +0,0 @@
-@CL_FN_ENUMS@
-@CL_FN_NAMES@
-
-namespace {
-@CL_FN_SWITCH@
-}
-
-@CL_FN_DEFINITIONS@
-
-@CL_FN_PTRS@
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_runtime_opencl.hpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_runtime_opencl.hpp.in
deleted file mode 100644
index 86690af86..000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_runtime_opencl.hpp.in
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef __OPENCV_OCL_CL_RUNTIME_OPENCL_HPP__
-#define __OPENCV_OCL_CL_RUNTIME_OPENCL_HPP__
-
-@CL_REMAP_ORIGIN@
-
-#if defined __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-@CL_REMAP_DYNAMIC@
-
-#ifndef CL_RUNTIME_EXPORT
-#if (defined(BUILD_SHARED_LIBS) || defined(OPENCV_OCL_SHARED)) && (defined WIN32 || defined _WIN32 || defined WINCE)
-#define CL_RUNTIME_EXPORT __declspec(dllimport)
-#else
-#define CL_RUNTIME_EXPORT
-#endif
-#endif
-
-@CL_FN_DECLARATIONS@
-
-#endif // __OPENCV_OCL_CL_RUNTIME_OPENCL_HPP__
diff --git a/modules/core/src/opencl/runtime/generator/template/ocl_runtime_opencl_wrappers.hpp.in b/modules/core/src/opencl/runtime/generator/template/ocl_runtime_opencl_wrappers.hpp.in
deleted file mode 100644
index d02d4c5ff..000000000
--- a/modules/core/src/opencl/runtime/generator/template/ocl_runtime_opencl_wrappers.hpp.in
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
-#define __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
-
-@CL_FN_INLINE_WRAPPERS@
-
-#endif // __OPENCV_OCL_CL_RUNTIME_OPENCL_WRAPPERS_HPP__
\ No newline at end of file
diff --git a/modules/core/src/opencl/runtime/generator/template/opencl_clamdblas_impl.hpp.in b/modules/core/src/opencl/runtime/generator/template/opencl_clamdblas_impl.hpp.in
index 11c834f22..f3d12558e 100644
--- a/modules/core/src/opencl/runtime/generator/template/opencl_clamdblas_impl.hpp.in
+++ b/modules/core/src/opencl/runtime/generator/template/opencl_clamdblas_impl.hpp.in
@@ -1,7 +1,3 @@
-#ifndef ADDITIONAL_FN_DEFINITIONS
-#define ADDITIONAL_FN_DEFINITIONS
-#endif
-
 @CL_FN_ENUMS@
 
 namespace {
diff --git a/modules/core/src/opencl/runtime/generator/template/opencl_clamdfft_impl.hpp.in b/modules/core/src/opencl/runtime/generator/template/opencl_clamdfft_impl.hpp.in
index 11c834f22..f3d12558e 100644
--- a/modules/core/src/opencl/runtime/generator/template/opencl_clamdfft_impl.hpp.in
+++ b/modules/core/src/opencl/runtime/generator/template/opencl_clamdfft_impl.hpp.in
@@ -1,7 +1,3 @@
-#ifndef ADDITIONAL_FN_DEFINITIONS
-#define ADDITIONAL_FN_DEFINITIONS
-#endif
-
 @CL_FN_ENUMS@
 
 namespace {
diff --git a/modules/core/src/opencl/runtime/generator/template/opencl_core_impl.hpp.in b/modules/core/src/opencl/runtime/generator/template/opencl_core_impl.hpp.in
index f3adb6647..14586017a 100644
--- a/modules/core/src/opencl/runtime/generator/template/opencl_core_impl.hpp.in
+++ b/modules/core/src/opencl/runtime/generator/template/opencl_core_impl.hpp.in
@@ -1,8 +1,3 @@
-
-#ifndef ADDITIONAL_FN_DEFINITIONS
-#define ADDITIONAL_FN_DEFINITIONS
-#endif
-
 @CL_FN_ENUMS@
 
 namespace {
diff --git a/modules/core/src/opencl/runtime/opencl_clamdblas.cpp b/modules/core/src/opencl/runtime/opencl_clamdblas.cpp
index 6296ef674..420fdb97d 100644
--- a/modules/core/src/opencl/runtime/opencl_clamdblas.cpp
+++ b/modules/core/src/opencl/runtime/opencl_clamdblas.cpp
@@ -67,7 +67,7 @@
     #define CV_CL_GET_PROC_ADDRESS(name) WinGetProcAddress(name)
 #endif // _WIN32
 
-#if defined(linux)
+#if defined(__linux__)
     #include <dlfcn.h>
     #include <stdio.h>
 
@@ -87,6 +87,11 @@
 #endif
 
 #ifndef CV_CL_GET_PROC_ADDRESS
+#ifdef __GNUC__
+#warning("OPENCV: OpenCL BLAS dynamic library loader: check configuration")
+#else
+#pragma message("WARNING: OPENCV: OpenCL BLAS dynamic library loader: check configuration")
+#endif
 #define CV_CL_GET_PROC_ADDRESS(name) NULL
 #endif
 
@@ -100,8 +105,6 @@ static void* openclamdblas_check_fn(int ID);
 
 #define CUSTOM_FUNCTION_ID 1000
 
-#undef ADDITIONAL_FN_DEFINITIONS
-
 //
 // END OF CUSTOM FUNCTIONS HERE
 //
@@ -110,13 +113,14 @@ static void* openclamdblas_check_fn(int ID);
 
 static void* openclamdblas_check_fn(int ID)
 {
-    ID = (ID <= CUSTOM_FUNCTION_ID) ? ID : ID - CUSTOM_FUNCTION_ID;
     assert(ID >= 0 && ID < (int)(sizeof(openclamdblas_fn)/sizeof(openclamdblas_fn[0])));
     const struct DynamicFnEntry* e = openclamdblas_fn[ID];
     void* func = CV_CL_GET_PROC_ADDRESS(e->fnName);
     if (!func)
     {
-        CV_Error(cv::Error::OpenCLApiCallError, cv::format("OpenCL AMD BLAS function is not available: [%s]", e->fnName));
+        throw cv::Exception(cv::Error::OpenCLApiCallError,
+                cv::format("OpenCL AMD BLAS function is not available: [%s]", e->fnName),
+                CV_Func, __FILE__, __LINE__);
     }
     *(e->ppFn) = func;
     return func;
diff --git a/modules/core/src/opencl/runtime/opencl_clamdfft.cpp b/modules/core/src/opencl/runtime/opencl_clamdfft.cpp
index 2514b0a57..36a9ed2a7 100644
--- a/modules/core/src/opencl/runtime/opencl_clamdfft.cpp
+++ b/modules/core/src/opencl/runtime/opencl_clamdfft.cpp
@@ -67,7 +67,7 @@
     #define CV_CL_GET_PROC_ADDRESS(name) WinGetProcAddress(name)
 #endif // _WIN32
 
-#if defined(linux)
+#if defined(__linux__)
     #include <dlfcn.h>
     #include <stdio.h>
 
@@ -87,6 +87,11 @@
 #endif
 
 #ifndef CV_CL_GET_PROC_ADDRESS
+#ifdef __GNUC__
+#warning("OPENCV: OpenCL FFT dynamic library loader: check configuration")
+#else
+#pragma message("WARNING: OPENCV: OpenCL FFT dynamic library loader: check configuration")
+#endif
 #define CV_CL_GET_PROC_ADDRESS(name) NULL
 #endif
 
@@ -100,8 +105,6 @@ static void* openclamdfft_check_fn(int ID);
 
 #define CUSTOM_FUNCTION_ID 1000
 
-#undef ADDITIONAL_FN_DEFINITIONS
-
 //
 // END OF CUSTOM FUNCTIONS HERE
 //
@@ -110,13 +113,14 @@ static void* openclamdfft_check_fn(int ID);
 
 static void* openclamdfft_check_fn(int ID)
 {
-    ID = (ID <= CUSTOM_FUNCTION_ID) ? ID : ID - CUSTOM_FUNCTION_ID;
     assert(ID >= 0 && ID < (int)(sizeof(openclamdfft_fn)/sizeof(openclamdfft_fn[0])));
     const struct DynamicFnEntry* e = openclamdfft_fn[ID];
     void* func = CV_CL_GET_PROC_ADDRESS(e->fnName);
     if (!func)
     {
-        CV_Error(cv::Error::OpenCLApiCallError, cv::format("OpenCL AMD FFT function is not available: [%s]", e->fnName));
+        throw cv::Exception(cv::Error::OpenCLApiCallError,
+                cv::format("OpenCL AMD FFT function is not available: [%s]", e->fnName),
+                CV_Func, __FILE__, __LINE__);
     }
     *(e->ppFn) = func;
     return func;
diff --git a/modules/core/src/opencl/runtime/opencl_core.cpp b/modules/core/src/opencl/runtime/opencl_core.cpp
index d8f231b2d..5dbc85ec1 100644
--- a/modules/core/src/opencl/runtime/opencl_core.cpp
+++ b/modules/core/src/opencl/runtime/opencl_core.cpp
@@ -47,7 +47,7 @@
 
 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
 
-static const char* funcToCheckOpenCL1_1 = "clEnqueueReadBufferRect";
+#define OPENCL_FUNC_TO_CHECK_1_1 "clEnqueueReadBufferRect"
 #define ERROR_MSG_CANT_LOAD "Failed to load OpenCL runtime\n"
 #define ERROR_MSG_INVALID_VERSION "Failed to load OpenCL runtime (expected version 1.1+)\n"
 
@@ -72,7 +72,7 @@ static void* AppleCLGetProcAddress(const char* name)
             {
                 fprintf(stderr, ERROR_MSG_CANT_LOAD);
             }
-            else if (dlsym(handle, funcToCheckOpenCL1_1) == NULL)
+            else if (dlsym(handle, OPENCL_FUNC_TO_CHECK_1_1) == NULL)
             {
                 fprintf(stderr, ERROR_MSG_INVALID_VERSION);
                 handle = NULL;
@@ -110,7 +110,7 @@ static void* WinGetProcAddress(const char* name)
                 {
                     fprintf(stderr, ERROR_MSG_CANT_LOAD);
                 }
-                else if (GetProcAddress(handle, funcToCheckOpenCL1_1) == NULL)
+                else if (GetProcAddress(handle, OPENCL_FUNC_TO_CHECK_1_1) == NULL)
                 {
                     fprintf(stderr, ERROR_MSG_INVALID_VERSION);
                     handle = NULL;
@@ -125,7 +125,7 @@ static void* WinGetProcAddress(const char* name)
 #define CV_CL_GET_PROC_ADDRESS(name) WinGetProcAddress(name)
 #endif // _WIN32
 
-#if defined(linux)
+#if defined(__linux__)
 #include <dlfcn.h>
 #include <stdio.h>
 
@@ -147,7 +147,7 @@ static void* GetProcAddress(const char* name)
             {
                 fprintf(stderr, ERROR_MSG_CANT_LOAD);
             }
-            else if (dlsym(handle, funcToCheckOpenCL1_1) == NULL)
+            else if (dlsym(handle, OPENCL_FUNC_TO_CHECK_1_1) == NULL)
             {
                 fprintf(stderr, ERROR_MSG_INVALID_VERSION);
                 handle = NULL;
@@ -162,6 +162,11 @@ static void* GetProcAddress(const char* name)
 #endif
 
 #ifndef CV_CL_GET_PROC_ADDRESS
+#ifdef __GNUC__
+#warning("OPENCV: OpenCL dynamic library loader: check configuration")
+#else
+#pragma message("WARNING: OPENCV: OpenCL dynamic library loader: check configuration")
+#endif
 #define CV_CL_GET_PROC_ADDRESS(name) NULL
 #endif
 
@@ -169,29 +174,36 @@ static void* opencl_check_fn(int ID);
 
 #include "runtime_common.hpp"
 
+#include "autogenerated/opencl_core_impl.hpp"
+
 //
 // BEGIN OF CUSTOM FUNCTIONS
 //
 
 #define CUSTOM_FUNCTION_ID 1000
 
-#undef ADDITIONAL_FN_DEFINITIONS
-
 //
 // END OF CUSTOM FUNCTIONS HERE
 //
 
-#include "autogenerated/opencl_core_impl.hpp"
-
 static void* opencl_check_fn(int ID)
 {
-    ID = (ID <= CUSTOM_FUNCTION_ID) ? ID : ID - CUSTOM_FUNCTION_ID;
-    assert(ID >= 0 && ID < (int)(sizeof(opencl_fn_list)/sizeof(opencl_fn_list[0])));
-    const struct DynamicFnEntry* e = opencl_fn_list[ID];
+    const struct DynamicFnEntry* e = NULL;
+    if (ID < CUSTOM_FUNCTION_ID)
+    {
+        assert(ID >= 0 && ID < (int)(sizeof(opencl_fn_list)/sizeof(opencl_fn_list[0])));
+        e = opencl_fn_list[ID];
+    }
+    else
+    {
+        CV_ErrorNoReturn(cv::Error::StsBadArg, "Invalid function ID");
+    }
     void* func = CV_CL_GET_PROC_ADDRESS(e->fnName);
     if (!func)
     {
-        CV_Error(cv::Error::OpenCLApiCallError, cv::format("OpenCL function is not available: [%s]", e->fnName));
+        throw cv::Exception(cv::Error::OpenCLApiCallError,
+                cv::format("OpenCL function is not available: [%s]", e->fnName),
+                CV_Func, __FILE__, __LINE__);
     }
     *(e->ppFn) = func;
     return func;
diff --git a/modules/core/src/opengl.cpp b/modules/core/src/opengl.cpp
index 7205afff7..e7b2a7627 100644
--- a/modules/core/src/opengl.cpp
+++ b/modules/core/src/opengl.cpp
@@ -58,16 +58,9 @@ namespace
         inline void throw_no_ogl() { CV_Error(cv::Error::OpenGlNotSupported, "The library is compiled without OpenGL support"); }
     #else
         inline void throw_no_ogl() { CV_Error(cv::Error::OpenGlApiCallError, "OpenGL context doesn't exist"); }
-    #endif
 
     bool checkError(const char* file, const int line, const char* func = 0)
     {
-    #ifndef HAVE_OPENGL
-        (void) file;
-        (void) line;
-        (void) func;
-        return true;
-    #else
         GLenum err = gl::GetError();
 
         if (err != gl::NO_ERROR_)
@@ -102,8 +95,8 @@ namespace
         }
 
         return true;
-    #endif
     }
+    #endif
 
     #define CV_CheckGlError() CV_DbgAssert( (checkError(__FILE__, __LINE__, CV_Func)) )
 } // namespace
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index 3727b2f15..ff5943bc6 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -260,11 +260,6 @@ extern TLSData<CoreTLSData> coreTlsData;
 #define CL_RUNTIME_EXPORT
 #endif
 
-namespace ocl
-{
-    MatAllocator* getOpenCLAllocator();
-}
-
 extern bool __termination; // skip some cleanups, because process is terminating
                            // (for example, if ExitProcess() was already called)
 
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 2806efeb3..2830bd165 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -449,6 +449,8 @@ static SumSqrFunc getSumSqrTab(int depth)
     return sumSqrTab[depth];
 }
 
+#ifdef HAVE_OPENCL
+
 template <typename T> Scalar ocl_part_sum(Mat m)
 {
     CV_Assert(m.rows == 1);
@@ -466,20 +468,23 @@ template <typename T> Scalar ocl_part_sum(Mat m)
 
 enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS =  1, OCL_OP_SUM_SQR = 2 };
 
-static bool ocl_sum( InputArray _src, Scalar & res, int sum_op )
+static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray() )
 {
     CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);
 
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 
-    if ( (!doubleSupport && depth == CV_64F) || cn > 4 || cn == 3 || _src.dims() > 2 )
+    if ( (!doubleSupport && depth == CV_64F) || cn > 4 || cn == 3 )
         return false;
 
     int dbsize = ocl::Device::getDefault().maxComputeUnits();
     size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
 
-    int ddepth = std::max(CV_32S, depth), dtype = CV_MAKE_TYPE(ddepth, cn);
+    int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
+            dtype = CV_MAKE_TYPE(ddepth, cn);
+    bool haveMask = _mask.kind() != _InputArray::NONE;
+    CV_Assert(!haveMask || _mask.type() == CV_8UC1);
 
     int wgs2_aligned = 1;
     while (wgs2_aligned < (int)wgs)
@@ -489,19 +494,27 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op )
     static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
     char cvt[40];
     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
-                  format("-D srcT=%s -D dstT=%s -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s",
+                  format("-D srcT=%s -D dstT=%s -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
                          ocl::typeToStr(type), ocl::typeToStr(dtype), ocl::convertTypeStr(depth, ddepth, cn, cvt),
                          opMap[sum_op], (int)wgs, wgs2_aligned,
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                         haveMask ? " -D HAVE_MASK" : ""));
     if (k.empty())
         return false;
 
-    UMat src = _src.getUMat(), db(1, dbsize, dtype);
-    k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
-           dbsize, ocl::KernelArg::PtrWriteOnly(db));
+    UMat src = _src.getUMat(), db(1, dbsize, dtype), mask = _mask.getUMat();
+
+    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+            dbarg = ocl::KernelArg::PtrWriteOnly(db),
+            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
+
+    if (haveMask)
+        k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg, maskarg);
+    else
+        k.args(srcarg, src.cols, (int)src.total(), dbsize, dbarg);
 
     size_t globalsize = dbsize * wgs;
-    if (k.run(1, &globalsize, &wgs, true))
+    if (k.run(1, &globalsize, &wgs, false))
     {
         typedef Scalar (*part_sum)(Mat m);
         part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
@@ -512,13 +525,18 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op )
     return false;
 }
 
+#endif
+
 }
 
 cv::Scalar cv::sum( InputArray _src )
 {
+#ifdef HAVE_OPENCL
     Scalar _res;
-    if (ocl::useOpenCL() && _src.isUMat() && ocl_sum(_src, _res, OCL_OP_SUM))
-        return _res;
+    CV_OCL_RUN_( _src.isUMat() && _src.dims() <= 2,
+                 ocl_sum(_src, _res, OCL_OP_SUM),
+                 _res)
+#endif
 
     Mat src = _src.getMat();
     int k, cn = src.channels(), depth = src.depth();
@@ -530,25 +548,31 @@ cv::Scalar cv::sum( InputArray _src )
     {
         IppiSize sz = { cols, rows };
         int type = src.type();
-        typedef IppStatus (CV_STDCALL* ippiSumFunc)(const void*, int, IppiSize, double *, int);
-        ippiSumFunc ippFunc =
-            type == CV_8UC1 ? (ippiSumFunc)ippiSum_8u_C1R :
-            type == CV_8UC3 ? (ippiSumFunc)ippiSum_8u_C3R :
-            type == CV_8UC4 ? (ippiSumFunc)ippiSum_8u_C4R :
-            type == CV_16UC1 ? (ippiSumFunc)ippiSum_16u_C1R :
-            type == CV_16UC3 ? (ippiSumFunc)ippiSum_16u_C3R :
-            type == CV_16UC4 ? (ippiSumFunc)ippiSum_16u_C4R :
-            type == CV_16SC1 ? (ippiSumFunc)ippiSum_16s_C1R :
-            type == CV_16SC3 ? (ippiSumFunc)ippiSum_16s_C3R :
-            type == CV_16SC4 ? (ippiSumFunc)ippiSum_16s_C4R :
-            type == CV_32FC1 ? (ippiSumFunc)ippiSum_32f_C1R :
-            type == CV_32FC3 ? (ippiSumFunc)ippiSum_32f_C3R :
-            type == CV_32FC4 ? (ippiSumFunc)ippiSum_32f_C4R :
+        typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
+        typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *);
+        ippiSumFuncHint ippFuncHint =
+            type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R :
+            type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R :
+            type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R :
             0;
-        if( ippFunc )
+        ippiSumFuncNoHint ippFuncNoHint =
+            type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R :
+            type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R :
+            type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R :
+            type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R :
+            type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R :
+            type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R :
+            type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R :
+            type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R :
+            type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R :
+            0;
+        CV_Assert(!ippFuncHint || !ippFuncNoHint);
+        if( ippFuncHint || ippFuncNoHint )
         {
             Ipp64f res[4];
-            if( ippFunc(src.data, (int)src.step[0], sz, res, ippAlgHintAccurate) >= 0 )
+            IppStatus ret = ippFuncHint ? ippFuncHint(src.data, (int)src.step[0], sz, res, ippAlgHintAccurate) :
+                            ippFuncNoHint(src.data, (int)src.step[0], sz, res);
+            if( ret >= 0 )
             {
                 Scalar sc;
                 for( int i = 0; i < cn; i++ )
@@ -610,6 +634,8 @@ cv::Scalar cv::sum( InputArray _src )
     return s;
 }
 
+#ifdef HAVE_OPENCL
+
 namespace cv {
 
 static bool ocl_countNonZero( InputArray _src, int & res )
@@ -647,13 +673,18 @@ static bool ocl_countNonZero( InputArray _src, int & res )
 
 }
 
+#endif
+
 int cv::countNonZero( InputArray _src )
 {
     CV_Assert( _src.channels() == 1 );
 
+#ifdef HAVE_OPENCL
     int res = -1;
-    if (ocl::useOpenCL() && _src.isUMat() && ocl_countNonZero(_src, res))
-        return res;
+    CV_OCL_RUN_(_src.isUMat() && _src.dims() <= 2,
+                ocl_countNonZero(_src, res),
+                res)
+#endif
 
     Mat src = _src.getMat();
     CountNonZeroFunc func = getCountNonZeroTab(src.depth());
@@ -720,25 +751,32 @@ cv::Scalar cv::mean( InputArray _src, InputArray _mask )
         }
         else
         {
-            typedef IppStatus (CV_STDCALL* ippiMeanFunc)(const void*, int, IppiSize, double *, int);
-            ippiMeanFunc ippFunc =
-                type == CV_8UC1 ? (ippiMeanFunc)ippiMean_8u_C1R :
-                type == CV_8UC3 ? (ippiMeanFunc)ippiMean_8u_C3R :
-                type == CV_8UC4 ? (ippiMeanFunc)ippiMean_8u_C4R :
-                type == CV_16UC1 ? (ippiMeanFunc)ippiMean_16u_C1R :
-                type == CV_16UC3 ? (ippiMeanFunc)ippiMean_16u_C3R :
-                type == CV_16UC4 ? (ippiMeanFunc)ippiMean_16u_C4R :
-                type == CV_16SC1 ? (ippiMeanFunc)ippiMean_16s_C1R :
-                type == CV_16SC3 ? (ippiMeanFunc)ippiMean_16s_C3R :
-                type == CV_16SC4 ? (ippiMeanFunc)ippiMean_16s_C4R :
-                type == CV_32FC1 ? (ippiMeanFunc)ippiMean_32f_C1R :
-                type == CV_32FC3 ? (ippiMeanFunc)ippiMean_32f_C3R :
-                type == CV_32FC4 ? (ippiMeanFunc)ippiMean_32f_C4R :
+            typedef IppStatus (CV_STDCALL* ippiMeanFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
+            typedef IppStatus (CV_STDCALL* ippiMeanFuncNoHint)(const void*, int, IppiSize, double *);
+            ippiMeanFuncHint ippFuncHint =
+                type == CV_32FC1 ? (ippiMeanFuncHint)ippiMean_32f_C1R :
+                type == CV_32FC3 ? (ippiMeanFuncHint)ippiMean_32f_C3R :
+                type == CV_32FC4 ? (ippiMeanFuncHint)ippiMean_32f_C4R :
                 0;
-            if( ippFunc )
+            ippiMeanFuncNoHint ippFuncNoHint =
+                type == CV_8UC1 ? (ippiMeanFuncNoHint)ippiMean_8u_C1R :
+                type == CV_8UC3 ? (ippiMeanFuncNoHint)ippiMean_8u_C3R :
+                type == CV_8UC4 ? (ippiMeanFuncNoHint)ippiMean_8u_C4R :
+                type == CV_16UC1 ? (ippiMeanFuncNoHint)ippiMean_16u_C1R :
+                type == CV_16UC3 ? (ippiMeanFuncNoHint)ippiMean_16u_C3R :
+                type == CV_16UC4 ? (ippiMeanFuncNoHint)ippiMean_16u_C4R :
+                type == CV_16SC1 ? (ippiMeanFuncNoHint)ippiMean_16s_C1R :
+                type == CV_16SC3 ? (ippiMeanFuncNoHint)ippiMean_16s_C3R :
+                type == CV_16SC4 ? (ippiMeanFuncNoHint)ippiMean_16s_C4R :
+                0;
+            // Make sure only zero or one version of the function pointer is valid
+            CV_Assert(!ippFuncHint || !ippFuncNoHint);
+            if( ippFuncHint || ippFuncNoHint )
             {
                 Ipp64f res[4];
-                if( ippFunc(src.data, (int)src.step[0], sz, res, ippAlgHintAccurate) >= 0 )
+                IppStatus ret = ippFuncHint ? ippFuncHint(src.data, (int)src.step[0], sz, res, ippAlgHintAccurate) :
+                                ippFuncNoHint(src.data, (int)src.step[0], sz, res);
+                if( ret >= 0 )
                 {
                     Scalar sc;
                     for( int i = 0; i < cn; i++ )
@@ -804,17 +842,22 @@ cv::Scalar cv::mean( InputArray _src, InputArray _mask )
     return s*(nz0 ? 1./nz0 : 0);
 }
 
+#ifdef HAVE_OPENCL
+
 namespace cv {
 
-static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv )
+static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
 {
+    bool haveMask = _mask.kind() != _InputArray::NONE;
+
     Scalar mean, stddev;
-    if (!ocl_sum(_src, mean, OCL_OP_SUM))
+    if (!ocl_sum(_src, mean, OCL_OP_SUM, _mask))
         return false;
-    if (!ocl_sum(_src, stddev, OCL_OP_SUM_SQR))
+    if (!ocl_sum(_src, stddev, OCL_OP_SUM_SQR, _mask))
         return false;
 
-    double total = 1.0 / _src.total();
+    int nz = haveMask ? countNonZero(_mask) : (int)_src.total();
+    double total = nz != 0 ? 1.0 / nz : 0;
     int k, j, cn = _src.channels();
     for (int i = 0; i < cn; ++i)
     {
@@ -847,10 +890,12 @@ static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv
 
 }
 
+#endif
+
 void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
 {
-    if (ocl::useOpenCL() && _src.isUMat() && _mask.empty() && ocl_meanStdDev(_src, _mean, _sdv))
-        return;
+    CV_OCL_RUN(_src.isUMat() && _src.dims() <= 2,
+               ocl_meanStdDev(_src, _mean, _sdv, _mask))
 
     Mat src = _src.getMat(), mask = _mask.getMat();
     CV_Assert( mask.empty() || mask.type() == CV_8U );
@@ -1157,10 +1202,7 @@ static void ofs2idx(const Mat& a, size_t ofs, int* idx)
     }
 }
 
-}
-
-namespace cv
-{
+#ifdef HAVE_OPENCL
 
 template <typename T>
 void getMinMaxRes(const Mat &minv, const Mat &maxv, const Mat &minl, const Mat &maxl, double* minVal,
@@ -1224,8 +1266,8 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
         wgs2_aligned <<= 1;
     wgs2_aligned >>= 1;
 
-    String opts = format("-D DEPTH_%d -D OP_MIN_MAX_LOC%s -D WGS=%d -D WGS2_ALIGNED=%d %s",
-        depth, _mask.empty() ? "" : "_MASK", (int)wgs, wgs2_aligned, doubleSupport ? "-D DOUBLE_SUPPORT" : "");
+    String opts = format("-D DEPTH_%d -D OP_MIN_MAX_LOC%s -D WGS=%d -D WGS2_ALIGNED=%d%s",
+        depth, _mask.empty() ? "" : "_MASK", (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "");
 
     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
     if (k.empty())
@@ -1234,13 +1276,13 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
     UMat src = _src.getUMat(), minval(1, groupnum, src.type()),
         maxval(1, groupnum, src.type()), minloc( 1, groupnum, CV_32SC1),
         maxloc( 1, groupnum, CV_32SC1), mask;
-    if(!_mask.empty())
+    if (!_mask.empty())
         mask = _mask.getUMat();
 
-    if(src.channels()>1)
+    if (src.channels() > 1)
         src = src.reshape(1);
 
-    if(mask.empty())
+    if (mask.empty())
         k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
             groupnum, ocl::KernelArg::PtrWriteOnly(minval), ocl::KernelArg::PtrWriteOnly(maxval),
             ocl::KernelArg::PtrWriteOnly(minloc), ocl::KernelArg::PtrWriteOnly(maxloc));
@@ -1250,7 +1292,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
             ocl::KernelArg::PtrWriteOnly(minloc), ocl::KernelArg::PtrWriteOnly(maxloc), ocl::KernelArg::ReadOnlyNoSize(mask));
 
     size_t globalsize = groupnum * wgs;
-    if (!k.run(1, &globalsize, &wgs, true))
+    if (!k.run(1, &globalsize, &wgs, false))
         return false;
 
     Mat minv = minval.getMat(ACCESS_READ), maxv = maxval.getMat(ACCESS_READ),
@@ -1274,6 +1316,9 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
 
     return true;
 }
+
+#endif
+
 }
 
 void cv::minMaxIdx(InputArray _src, double* minVal,
@@ -1283,9 +1328,8 @@ void cv::minMaxIdx(InputArray _src, double* minVal,
     CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
         (_src.channels() >= 1 && _mask.empty() && !minIdx && !maxIdx) );
 
-     if( ocl::useOpenCL() && _src.isUMat() && _src.dims() <= 2  && ( _mask.empty() || _src.size() == _mask.size() )
-         && ocl_minMaxIdx(_src, minVal, maxVal, minIdx, maxIdx, _mask) )
-        return;
+    CV_OCL_RUN(_src.isUMat() && _src.dims() <= 2  && (_mask.empty() || _src.size() == _mask.size()),
+               ocl_minMaxIdx(_src, minVal, maxVal, minIdx, maxIdx, _mask))
 
     Mat src = _src.getMat(), mask = _mask.getMat();
     int depth = src.depth(), cn = src.channels();
@@ -1878,17 +1922,16 @@ static NormDiffFunc getNormDiffFunc(int normType, int depth)
     return normDiffTab[normType][depth];
 }
 
-}
+#ifdef HAVE_OPENCL
 
-namespace cv {
-
-static bool ocl_norm( InputArray _src, int normType, double & result )
+static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double & result )
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
+            haveMask = _mask.kind() != _InputArray::NONE;
 
-    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2) ||
-         (!doubleSupport && depth == CV_64F))
+    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
+         (!doubleSupport && depth == CV_64F) || (normType == NORM_INF && haveMask && cn != 1))
         return false;
 
     UMat src = _src.getUMat();
@@ -1920,21 +1963,32 @@ static bool ocl_norm( InputArray _src, int normType, double & result )
         else
             abssrc = src;
 
-        cv::minMaxIdx(abssrc.reshape(1), NULL, &result);
+        cv::minMaxIdx(haveMask ? abssrc : abssrc.reshape(1), NULL, &result, NULL, NULL, _mask);
     }
-    else if (normType == NORM_L1 || normType == NORM_L2)
+    else if (normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR)
     {
-        Scalar s;
+        Scalar sc;
         bool unstype = depth == CV_8U || depth == CV_16U;
 
-        ocl_sum(src.reshape(1), s, normType == NORM_L2 ?
-                    OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS) );
-        result = normType == NORM_L1 ? s[0] : std::sqrt(s[0]);
+        if ( !ocl_sum(haveMask ? src : src.reshape(1), sc, normType == NORM_L2 || normType == NORM_L2SQR ?
+                    OCL_OP_SUM_SQR : (unstype ? OCL_OP_SUM : OCL_OP_SUM_ABS), _mask) )
+            return false;
+
+        if (!haveMask)
+            cn = 1;
+
+        double s = 0.0;
+        for (int i = 0; i < cn; ++i)
+            s += sc[i];
+
+        result = normType == NORM_L1 || normType == NORM_L2SQR ? s : std::sqrt(s);
     }
 
     return true;
 }
 
+#endif
+
 }
 
 double cv::norm( InputArray _src, int normType, InputArray _mask )
@@ -1944,9 +1998,12 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
                normType == NORM_L2 || normType == NORM_L2SQR ||
                ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && _src.type() == CV_8U) );
 
+#ifdef HAVE_OPENCL
     double _result = 0;
-    if (ocl::useOpenCL() && _mask.empty() && _src.isUMat() && _src.dims() <= 2 && ocl_norm(_src, normType, _result))
-        return _result;
+    CV_OCL_RUN_(_src.isUMat() && _src.dims() <= 2,
+                ocl_norm(_src, normType, _mask, _result),
+                _result)
+#endif
 
     Mat src = _src.getMat(), mask = _mask.getMat();
     int depth = src.depth(), cn = src.channels();
@@ -2029,54 +2086,64 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
         }
         else
         {
-            typedef IppStatus (CV_STDCALL* ippiNormFunc)(const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
-            ippiNormFunc ippFunc =
-                normType == NORM_INF ?
-                (type == CV_8UC1 ? (ippiNormFunc)ippiNorm_Inf_8u_C1R :
-                type == CV_8UC3 ? (ippiNormFunc)ippiNorm_Inf_8u_C3R :
-                type == CV_8UC4 ? (ippiNormFunc)ippiNorm_Inf_8u_C4R :
-                type == CV_16UC1 ? (ippiNormFunc)ippiNorm_Inf_16u_C1R :
-                type == CV_16UC3 ? (ippiNormFunc)ippiNorm_Inf_16u_C3R :
-                type == CV_16UC4 ? (ippiNormFunc)ippiNorm_Inf_16u_C4R :
-                type == CV_16SC1 ? (ippiNormFunc)ippiNorm_Inf_16s_C1R :
-                //type == CV_16SC3 ? (ippiNormFunc)ippiNorm_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
-                //type == CV_16SC4 ? (ippiNormFunc)ippiNorm_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
-                type == CV_32FC1 ? (ippiNormFunc)ippiNorm_Inf_32f_C1R :
-                type == CV_32FC3 ? (ippiNormFunc)ippiNorm_Inf_32f_C3R :
-                type == CV_32FC4 ? (ippiNormFunc)ippiNorm_Inf_32f_C4R :
-                0) :
+            typedef IppStatus (CV_STDCALL* ippiNormFuncHint)(const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
+            typedef IppStatus (CV_STDCALL* ippiNormFuncNoHint)(const void *, int, IppiSize, Ipp64f *);
+            ippiNormFuncHint ippFuncHint =
                 normType == NORM_L1 ?
-                (type == CV_8UC1 ? (ippiNormFunc)ippiNorm_L1_8u_C1R :
-                type == CV_8UC3 ? (ippiNormFunc)ippiNorm_L1_8u_C3R :
-                type == CV_8UC4 ? (ippiNormFunc)ippiNorm_L1_8u_C4R :
-                type == CV_16UC1 ? (ippiNormFunc)ippiNorm_L1_16u_C1R :
-                type == CV_16UC3 ? (ippiNormFunc)ippiNorm_L1_16u_C3R :
-                type == CV_16UC4 ? (ippiNormFunc)ippiNorm_L1_16u_C4R :
-                type == CV_16SC1 ? (ippiNormFunc)ippiNorm_L1_16s_C1R :
-                type == CV_16SC3 ? (ippiNormFunc)ippiNorm_L1_16s_C3R :
-                type == CV_16SC4 ? (ippiNormFunc)ippiNorm_L1_16s_C4R :
-                type == CV_32FC1 ? (ippiNormFunc)ippiNorm_L1_32f_C1R :
-                type == CV_32FC3 ? (ippiNormFunc)ippiNorm_L1_32f_C3R :
-                type == CV_32FC4 ? (ippiNormFunc)ippiNorm_L1_32f_C4R :
+                (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L1_32f_C1R :
+                type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L1_32f_C3R :
+                type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L1_32f_C4R :
                 0) :
                 normType == NORM_L2 || normType == NORM_L2SQR ?
-                (type == CV_8UC1 ? (ippiNormFunc)ippiNorm_L2_8u_C1R :
-                type == CV_8UC3 ? (ippiNormFunc)ippiNorm_L2_8u_C3R :
-                type == CV_8UC4 ? (ippiNormFunc)ippiNorm_L2_8u_C4R :
-                type == CV_16UC1 ? (ippiNormFunc)ippiNorm_L2_16u_C1R :
-                type == CV_16UC3 ? (ippiNormFunc)ippiNorm_L2_16u_C3R :
-                type == CV_16UC4 ? (ippiNormFunc)ippiNorm_L2_16u_C4R :
-                type == CV_16SC1 ? (ippiNormFunc)ippiNorm_L2_16s_C1R :
-                type == CV_16SC3 ? (ippiNormFunc)ippiNorm_L2_16s_C3R :
-                type == CV_16SC4 ? (ippiNormFunc)ippiNorm_L2_16s_C4R :
-                type == CV_32FC1 ? (ippiNormFunc)ippiNorm_L2_32f_C1R :
-                type == CV_32FC3 ? (ippiNormFunc)ippiNorm_L2_32f_C3R :
-                type == CV_32FC4 ? (ippiNormFunc)ippiNorm_L2_32f_C4R :
+                (type == CV_32FC1 ? (ippiNormFuncHint)ippiNorm_L2_32f_C1R :
+                type == CV_32FC3 ? (ippiNormFuncHint)ippiNorm_L2_32f_C3R :
+                type == CV_32FC4 ? (ippiNormFuncHint)ippiNorm_L2_32f_C4R :
                 0) : 0;
-            if( ippFunc )
+            ippiNormFuncNoHint ippFuncNoHint =
+                normType == NORM_INF ?
+                (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C1R :
+                type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C3R :
+                type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_8u_C4R :
+                type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C1R :
+                type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C3R :
+                type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_16u_C4R :
+                type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_16s_C1R :
+                //type == CV_16SC3 ? (ippiNormFunc)ippiNorm_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
+                //type == CV_16SC4 ? (ippiNormFunc)ippiNorm_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
+                type == CV_32FC1 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C1R :
+                type == CV_32FC3 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C3R :
+                type == CV_32FC4 ? (ippiNormFuncNoHint)ippiNorm_Inf_32f_C4R :
+                0) :
+                normType == NORM_L1 ?
+                (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C1R :
+                type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C3R :
+                type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_8u_C4R :
+                type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C1R :
+                type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C3R :
+                type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16u_C4R :
+                type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C1R :
+                type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C3R :
+                type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L1_16s_C4R :
+                0) :
+                normType == NORM_L2 || normType == NORM_L2SQR ?
+                (type == CV_8UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C1R :
+                type == CV_8UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C3R :
+                type == CV_8UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_8u_C4R :
+                type == CV_16UC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C1R :
+                type == CV_16UC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C3R :
+                type == CV_16UC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16u_C4R :
+                type == CV_16SC1 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C1R :
+                type == CV_16SC3 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C3R :
+                type == CV_16SC4 ? (ippiNormFuncNoHint)ippiNorm_L2_16s_C4R :
+                0) : 0;
+            // Make sure only zero or one version of the function pointer is valid
+            CV_Assert(!ippFuncHint || !ippFuncNoHint);
+            if( ippFuncHint || ippFuncNoHint )
             {
                 Ipp64f norm_array[4];
-                if( ippFunc(src.data, (int)src.step[0], sz, norm_array, ippAlgHintAccurate) >= 0 )
+                IppStatus ret = ippFuncHint ? ippFuncHint(src.data, (int)src.step[0], sz, norm_array, ippAlgHintAccurate) :
+                                ippFuncNoHint(src.data, (int)src.step[0], sz, norm_array);
+                if( ret >= 0 )
                 {
                     Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0];
                     for( int i = 1; i < cn; i++ )
@@ -2228,6 +2295,8 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
     return result.d;
 }
 
+#ifdef HAVE_OPENCL
+
 namespace cv {
 
 static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, double & result )
@@ -2237,7 +2306,7 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, double &
     bool relative = (normType & NORM_RELATIVE) != 0;
     normType &= ~NORM_RELATIVE;
 
-    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2) ||
+    if ( !(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR) ||
          (!doubleSupport && depth == CV_64F))
         return false;
 
@@ -2269,22 +2338,25 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, double &
 
 }
 
+#endif
+
 double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
 {
-    CV_Assert( _src1.size() == _src2.size() && _src1.type() == _src2.type() );
+    CV_Assert( _src1.sameSize(_src2) && _src1.type() == _src2.type() );
 
+#ifdef HAVE_OPENCL
     double _result = 0;
-    if (ocl::useOpenCL() && _mask.empty() && _src1.isUMat() && _src2.isUMat() &&
-            _src1.dims() <= 2 && _src2.dims() <= 2 && ocl_norm(_src1, _src2, normType, _result))
-        return _result;
+    CV_OCL_RUN_(_mask.empty() && _src1.isUMat() && _src2.isUMat() &&
+                _src1.dims() <= 2 && _src2.dims() <= 2,
+                ocl_norm(_src1, _src2, normType, _result),
+                _result)
+#endif
 
     if( normType & CV_RELATIVE )
     {
 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
         Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
 
-        CV_Assert( src1.size == src2.size && src1.type() == src2.type() );
-
         normType &= 7;
         CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 || normType == NORM_L2SQR ||
                 ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
@@ -2363,8 +2435,6 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
     int depth = src1.depth(), cn = src1.channels();
 
-    CV_Assert( src1.size == src2.size );
-
     normType &= 7;
     CV_Assert( normType == NORM_INF || normType == NORM_L1 ||
                normType == NORM_L2 || normType == NORM_L2SQR ||
@@ -2446,54 +2516,64 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
         }
         else
         {
-            typedef IppStatus (CV_STDCALL* ippiNormDiffFunc)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
-            ippiNormDiffFunc ippFunc =
-                normType == NORM_INF ?
-                (type == CV_8UC1 ? (ippiNormDiffFunc)ippiNormDiff_Inf_8u_C1R :
-                type == CV_8UC3 ? (ippiNormDiffFunc)ippiNormDiff_Inf_8u_C3R :
-                type == CV_8UC4 ? (ippiNormDiffFunc)ippiNormDiff_Inf_8u_C4R :
-                type == CV_16UC1 ? (ippiNormDiffFunc)ippiNormDiff_Inf_16u_C1R :
-                type == CV_16UC3 ? (ippiNormDiffFunc)ippiNormDiff_Inf_16u_C3R :
-                type == CV_16UC4 ? (ippiNormDiffFunc)ippiNormDiff_Inf_16u_C4R :
-                type == CV_16SC1 ? (ippiNormDiffFunc)ippiNormDiff_Inf_16s_C1R :
-                //type == CV_16SC3 ? (ippiNormDiffFunc)ippiNormDiff_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
-                //type == CV_16SC4 ? (ippiNormDiffFunc)ippiNormDiff_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
-                type == CV_32FC1 ? (ippiNormDiffFunc)ippiNormDiff_Inf_32f_C1R :
-                type == CV_32FC3 ? (ippiNormDiffFunc)ippiNormDiff_Inf_32f_C3R :
-                type == CV_32FC4 ? (ippiNormDiffFunc)ippiNormDiff_Inf_32f_C4R :
-                0) :
+            typedef IppStatus (CV_STDCALL* ippiNormDiffFuncHint)(const void *, int, const void *, int, IppiSize, Ipp64f *, IppHintAlgorithm hint);
+            typedef IppStatus (CV_STDCALL* ippiNormDiffFuncNoHint)(const void *, int, const void *, int, IppiSize, Ipp64f *);
+            ippiNormDiffFuncHint ippFuncHint =
                 normType == NORM_L1 ?
-                (type == CV_8UC1 ? (ippiNormDiffFunc)ippiNormDiff_L1_8u_C1R :
-                type == CV_8UC3 ? (ippiNormDiffFunc)ippiNormDiff_L1_8u_C3R :
-                type == CV_8UC4 ? (ippiNormDiffFunc)ippiNormDiff_L1_8u_C4R :
-                type == CV_16UC1 ? (ippiNormDiffFunc)ippiNormDiff_L1_16u_C1R :
-                type == CV_16UC3 ? (ippiNormDiffFunc)ippiNormDiff_L1_16u_C3R :
-                type == CV_16UC4 ? (ippiNormDiffFunc)ippiNormDiff_L1_16u_C4R :
-                type == CV_16SC1 ? (ippiNormDiffFunc)ippiNormDiff_L1_16s_C1R :
-                type == CV_16SC3 ? (ippiNormDiffFunc)ippiNormDiff_L1_16s_C3R :
-                type == CV_16SC4 ? (ippiNormDiffFunc)ippiNormDiff_L1_16s_C4R :
-                type == CV_32FC1 ? (ippiNormDiffFunc)ippiNormDiff_L1_32f_C1R :
-                type == CV_32FC3 ? (ippiNormDiffFunc)ippiNormDiff_L1_32f_C3R :
-                type == CV_32FC4 ? (ippiNormDiffFunc)ippiNormDiff_L1_32f_C4R :
+                (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C1R :
+                type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C3R :
+                type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L1_32f_C4R :
                 0) :
                 normType == NORM_L2 || normType == NORM_L2SQR ?
-                (type == CV_8UC1 ? (ippiNormDiffFunc)ippiNormDiff_L2_8u_C1R :
-                type == CV_8UC3 ? (ippiNormDiffFunc)ippiNormDiff_L2_8u_C3R :
-                type == CV_8UC4 ? (ippiNormDiffFunc)ippiNormDiff_L2_8u_C4R :
-                type == CV_16UC1 ? (ippiNormDiffFunc)ippiNormDiff_L2_16u_C1R :
-                type == CV_16UC3 ? (ippiNormDiffFunc)ippiNormDiff_L2_16u_C3R :
-                type == CV_16UC4 ? (ippiNormDiffFunc)ippiNormDiff_L2_16u_C4R :
-                type == CV_16SC1 ? (ippiNormDiffFunc)ippiNormDiff_L2_16s_C1R :
-                type == CV_16SC3 ? (ippiNormDiffFunc)ippiNormDiff_L2_16s_C3R :
-                type == CV_16SC4 ? (ippiNormDiffFunc)ippiNormDiff_L2_16s_C4R :
-                type == CV_32FC1 ? (ippiNormDiffFunc)ippiNormDiff_L2_32f_C1R :
-                type == CV_32FC3 ? (ippiNormDiffFunc)ippiNormDiff_L2_32f_C3R :
-                type == CV_32FC4 ? (ippiNormDiffFunc)ippiNormDiff_L2_32f_C4R :
+                (type == CV_32FC1 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C1R :
+                type == CV_32FC3 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C3R :
+                type == CV_32FC4 ? (ippiNormDiffFuncHint)ippiNormDiff_L2_32f_C4R :
                 0) : 0;
-            if( ippFunc )
+            ippiNormDiffFuncNoHint ippFuncNoHint =
+                normType == NORM_INF ?
+                (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C1R :
+                type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C3R :
+                type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_8u_C4R :
+                type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C1R :
+                type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C3R :
+                type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16u_C4R :
+                type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_16s_C1R :
+                //type == CV_16SC3 ? (ippiNormDiffFunc)ippiNormDiff_Inf_16s_C3R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
+                //type == CV_16SC4 ? (ippiNormDiffFunc)ippiNormDiff_Inf_16s_C4R : //Aug 2013: problem in IPP 7.1, 8.0 : -32768
+                type == CV_32FC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C1R :
+                type == CV_32FC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C3R :
+                type == CV_32FC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_Inf_32f_C4R :
+                0) :
+                normType == NORM_L1 ?
+                (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C1R :
+                type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C3R :
+                type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_8u_C4R :
+                type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C1R :
+                type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C3R :
+                type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16u_C4R :
+                type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C1R :
+                type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C3R :
+                type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L1_16s_C4R :
+                0) :
+                normType == NORM_L2 || normType == NORM_L2SQR ?
+                (type == CV_8UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C1R :
+                type == CV_8UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C3R :
+                type == CV_8UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_8u_C4R :
+                type == CV_16UC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C1R :
+                type == CV_16UC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C3R :
+                type == CV_16UC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16u_C4R :
+                type == CV_16SC1 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C1R :
+                type == CV_16SC3 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C3R :
+                type == CV_16SC4 ? (ippiNormDiffFuncNoHint)ippiNormDiff_L2_16s_C4R :
+                0) : 0;
+            // Make sure only zero or one version of the function pointer is valid
+            CV_Assert(!ippFuncHint || !ippFuncNoHint);
+            if( ippFuncHint || ippFuncNoHint )
             {
                 Ipp64f norm_array[4];
-                if( ippFunc(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], sz, norm_array, ippAlgHintAccurate) >= 0 )
+                IppStatus ret = ippFuncHint ? ippFuncHint(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], sz, norm_array, ippAlgHintAccurate) :
+                                ippFuncNoHint(src1.data, (int)src1.step[0], src2.data, (int)src2.step[0], sz, norm_array);
+                if( ret >= 0 )
                 {
                     Ipp64f norm = (normType == NORM_L2 || normType == NORM_L2SQR) ? norm_array[0] * norm_array[0] : norm_array[0];
                     for( int i = 1; i < src1.channels(); i++ )
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index 00fc578f6..d8d8ae632 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -107,7 +107,7 @@ std::wstring GetTempPathWinRT()
     if (FAILED(WindowsCreateStringReference(RuntimeClass_Windows_Storage_ApplicationData,
                                             (UINT32)wcslen(RuntimeClass_Windows_Storage_ApplicationData), &hstrHead, &str)))
         return wstr;
-    if (FAILED(Windows::Foundation::GetActivationFactory(str, appdataFactory.ReleaseAndGetAddressOf())))
+    if (FAILED(RoGetActivationFactory(str, IID_PPV_ARGS(appdataFactory.ReleaseAndGetAddressOf()))))
         return wstr;
     if (FAILED(appdataFactory->get_Current(appdataRef.ReleaseAndGetAddressOf())))
         return wstr;
@@ -426,6 +426,7 @@ String format( const char* fmt, ... )
         String s(len, '\0');
         va_start(va, fmt);
         len = vsnprintf((char*)s.c_str(), len + 1, fmt, va);
+        (void)len;
         va_end(va);
         return s;
     }
@@ -849,7 +850,9 @@ public:
 };
 
 #ifdef WIN32
+#ifdef _MSC_VER
 #pragma warning(disable:4505) // unreferenced local function has been removed
+#endif
 
 #ifdef HAVE_WINRT
     // using C++11 thread attribute for local thread data
@@ -997,17 +1000,24 @@ public:
         }
     }
 };
-static TLSContainerStorage tlsContainerStorage;
+
+// This is a wrapper function that will ensure 'tlsContainerStorage' is constructed on first use.
+// For more information: http://www.parashift.com/c++-faq/static-init-order-on-first-use.html
+static TLSContainerStorage& getTLSContainerStorage()
+{
+    static TLSContainerStorage *tlsContainerStorage = new TLSContainerStorage();
+    return *tlsContainerStorage;
+}
 
 TLSDataContainer::TLSDataContainer()
     : key_(-1)
 {
-    key_ = tlsContainerStorage.allocateKey(this);
+    key_ = getTLSContainerStorage().allocateKey(this);
 }
 
 TLSDataContainer::~TLSDataContainer()
 {
-    tlsContainerStorage.releaseKey(key_, this);
+    getTLSContainerStorage().releaseKey(key_, this);
     key_ = -1;
 }
 
@@ -1032,7 +1042,7 @@ TLSStorage::~TLSStorage()
         void*& data = tlsData_[i];
         if (data)
         {
-            tlsContainerStorage.destroyData(i, data);
+            getTLSContainerStorage().destroyData(i, data);
             data = NULL;
         }
     }
diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp
index 0b6137488..578fefbb9 100644
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -56,10 +56,11 @@ UMatData::UMatData(const MatAllocator* allocator)
     prevAllocator = currAllocator = allocator;
     urefcount = refcount = 0;
     data = origdata = 0;
-    size = 0;
+    size = 0; capacity = 0;
     flags = 0;
     handle = 0;
     userdata = 0;
+    allocatorFlags_ = 0;
 }
 
 UMatData::~UMatData()
@@ -67,10 +68,11 @@ UMatData::~UMatData()
     prevAllocator = currAllocator = 0;
     urefcount = refcount = 0;
     data = origdata = 0;
-    size = 0;
+    size = 0; capacity = 0;
     flags = 0;
     handle = 0;
     userdata = 0;
+    allocatorFlags_ = 0;
 }
 
 void UMatData::lock()
@@ -204,8 +206,7 @@ static void finalizeHdr(UMat& m)
         m.rows = m.cols = -1;
 }
 
-
-UMat Mat::getUMat(int accessFlags) const
+UMat Mat::getUMat(int accessFlags, UMatUsageFlags usageFlags) const
 {
     UMat hdr;
     if(!data)
@@ -216,10 +217,10 @@ UMat Mat::getUMat(int accessFlags) const
         MatAllocator *a = allocator, *a0 = getStdAllocator();
         if(!a)
             a = a0;
-        temp_u = a->allocate(dims, size.p, type(), data, step.p, accessFlags);
+        temp_u = a->allocate(dims, size.p, type(), data, step.p, accessFlags, usageFlags);
         temp_u->refcount = 1;
     }
-    UMat::getStdAllocator()->allocate(temp_u, accessFlags);
+    UMat::getStdAllocator()->allocate(temp_u, accessFlags, usageFlags);
     hdr.flags = flags;
     setSize(hdr, dims, size.p, step.p);
     finalizeHdr(hdr);
@@ -229,8 +230,10 @@ UMat Mat::getUMat(int accessFlags) const
     return hdr;
 }
 
-void UMat::create(int d, const int* _sizes, int _type)
+void UMat::create(int d, const int* _sizes, int _type, UMatUsageFlags _usageFlags)
 {
+    this->usageFlags = _usageFlags;
+
     int i;
     CV_Assert(0 <= d && d <= CV_MAX_DIM && _sizes);
     _type = CV_MAT_TYPE(_type);
@@ -260,13 +263,13 @@ void UMat::create(int d, const int* _sizes, int _type)
             a = a0;
         try
         {
-            u = a->allocate(dims, size, _type, 0, step.p, 0);
+            u = a->allocate(dims, size, _type, 0, step.p, 0, usageFlags);
             CV_Assert(u != 0);
         }
         catch(...)
         {
             if(a != a0)
-                u = a0->allocate(dims, size, _type, 0, step.p, 0);
+                u = a0->allocate(dims, size, _type, 0, step.p, 0, usageFlags);
             CV_Assert(u != 0);
         }
         CV_Assert( step[dims-1] == (size_t)CV_ELEM_SIZE(flags) );
@@ -302,7 +305,7 @@ void UMat::deallocate()
 
 
 UMat::UMat(const UMat& m, const Range& _rowRange, const Range& _colRange)
-    : flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), u(0), offset(0), size(&rows)
+    : flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(USAGE_DEFAULT), u(0), offset(0), size(&rows)
 {
     CV_Assert( m.dims >= 2 );
     if( m.dims > 2 )
@@ -347,7 +350,7 @@ UMat::UMat(const UMat& m, const Range& _rowRange, const Range& _colRange)
 
 UMat::UMat(const UMat& m, const Rect& roi)
     : flags(m.flags), dims(2), rows(roi.height), cols(roi.width),
-    allocator(m.allocator), u(m.u), offset(m.offset + roi.y*m.step[0]), size(&rows)
+    allocator(m.allocator), usageFlags(m.usageFlags), u(m.u), offset(m.offset + roi.y*m.step[0]), size(&rows)
 {
     CV_Assert( m.dims <= 2 );
     flags &= roi.width < m.cols ? ~CONTINUOUS_FLAG : -1;
@@ -373,7 +376,7 @@ UMat::UMat(const UMat& m, const Rect& roi)
 
 
 UMat::UMat(const UMat& m, const Range* ranges)
-    : flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), u(0), offset(0), size(&rows)
+    : flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(USAGE_DEFAULT), u(0), offset(0), size(&rows)
 {
     int i, d = m.dims;
 
@@ -551,14 +554,6 @@ int UMat::checkVector(int _elemChannels, int _depth, bool _requireContinuous) co
     ? (int)(total()*channels()/_elemChannels) : -1;
 }
 
-
-UMat UMat::cross(InputArray) const
-{
-    CV_Error(CV_StsNotImplemented, "");
-    return UMat();
-}
-
-
 UMat UMat::reshape(int _cn, int _newndims, const int* _newsz) const
 {
     if(_newndims == dims)
@@ -644,21 +639,62 @@ void UMat::copyTo(OutputArray _dst) const
     srcofs[dims-1] *= esz;
 
     _dst.create( dims, size.p, type() );
-    if( _dst.kind() == _InputArray::UMAT )
+    if( _dst.isUMat() )
     {
         UMat dst = _dst.getUMat();
         if( u == dst.u && dst.offset == offset )
             return;
-        dst.ndoffset(dstofs);
-        dstofs[dims-1] *= esz;
-        CV_Assert(u->currAllocator == dst.u->currAllocator);
-        u->currAllocator->copy(u, dst.u, dims, sz, srcofs, step.p, dstofs, dst.step.p, false);
+
+        if (u->currAllocator == dst.u->currAllocator)
+        {
+            dst.ndoffset(dstofs);
+            dstofs[dims-1] *= esz;
+            u->currAllocator->copy(u, dst.u, dims, sz, srcofs, step.p, dstofs, dst.step.p, false);
+            return;
+        }
     }
-    else
+
+    Mat dst = _dst.getMat();
+    u->currAllocator->download(u, dst.data, dims, sz, srcofs, step.p, dst.step.p);
+}
+
+void UMat::copyTo(OutputArray _dst, InputArray _mask) const
+{
+    if( _mask.empty() )
     {
-        Mat dst = _dst.getMat();
-        u->currAllocator->download(u, dst.data, dims, sz, srcofs, step.p, dst.step.p);
+        copyTo(_dst);
+        return;
     }
+
+    int cn = channels(), mtype = _mask.type(), mdepth = CV_MAT_DEPTH(mtype), mcn = CV_MAT_CN(mtype);
+    CV_Assert( mdepth == CV_8U && (mcn == 1 || mcn == cn) );
+
+    if (ocl::useOpenCL() && _dst.isUMat() && dims <= 2)
+    {
+        UMatData * prevu = _dst.getUMat().u;
+        _dst.create( dims, size, type() );
+
+        UMat dst = _dst.getUMat();
+
+        if( prevu != dst.u ) // do not leave dst uninitialized
+            dst = Scalar(0);
+
+        ocl::Kernel k("copyToMask", ocl::core::copyset_oclsrc,
+                      format("-D COPY_TO_MASK -D T=%s -D scn=%d -D mcn=%d",
+                             ocl::memopTypeToStr(depth()), cn, mcn));
+        if (!k.empty())
+        {
+            k.args(ocl::KernelArg::ReadOnlyNoSize(*this), ocl::KernelArg::ReadOnlyNoSize(_mask.getUMat()),
+                   ocl::KernelArg::WriteOnly(dst));
+
+            size_t globalsize[2] = { cols, rows };
+            if (k.run(2, globalsize, NULL, false))
+                return;
+        }
+    }
+
+    Mat src = getMat(ACCESS_READ);
+    src.copyTo(_dst, _mask);
 }
 
 void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
@@ -690,11 +726,12 @@ void UMat::convertTo(OutputArray _dst, int _type, double alpha, double beta) con
                              doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
         if (!k.empty())
         {
+            UMat src = *this;
             _dst.create( size(), _type );
             UMat dst = _dst.getUMat();
 
             float alphaf = (float)alpha, betaf = (float)beta;
-            k.args(ocl::KernelArg::ReadOnlyNoSize(*this), ocl::KernelArg::WriteOnly(dst, cn), alphaf, betaf);
+            k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst, cn), alphaf, betaf);
 
             size_t globalsize[2] = { dst.cols * cn, dst.rows };
             if (k.run(2, globalsize, NULL, false))
@@ -710,20 +747,23 @@ UMat& UMat::setTo(InputArray _value, InputArray _mask)
 {
     bool haveMask = !_mask.empty();
     int tp = type(), cn = CV_MAT_CN(tp);
-    if( dims <= 2 && cn <= 4 && cn != 3 && ocl::useOpenCL() )
+    if( dims <= 2 && cn <= 4 && CV_MAT_DEPTH(tp) < CV_64F && ocl::useOpenCL() )
     {
         Mat value = _value.getMat();
         CV_Assert( checkScalar(value, type(), _value.kind(), _InputArray::UMAT) );
-        double buf[4];
+        double buf[4]={0,0,0,0};
         convertAndUnrollScalar(value, tp, (uchar*)buf, 1);
 
+        int scalarcn = cn == 3 ? 4 : cn;
         char opts[1024];
-        sprintf(opts, "-D dstT=%s", ocl::memopTypeToStr(tp));
+        sprintf(opts, "-D dstT=%s -D dstST=%s -D dstT1=%s -D cn=%d", ocl::memopTypeToStr(tp),
+                ocl::memopTypeToStr(CV_MAKETYPE(tp,scalarcn)),
+                ocl::memopTypeToStr(CV_MAT_DEPTH(tp)), cn);
 
         ocl::Kernel setK(haveMask ? "setMask" : "set", ocl::core::copyset_oclsrc, opts);
         if( !setK.empty() )
         {
-            ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE(tp));
+            ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE1(tp)*scalarcn);
             UMat mask;
 
             if( haveMask )
@@ -756,6 +796,127 @@ UMat& UMat::operator = (const Scalar& s)
     return *this;
 }
 
+UMat UMat::t() const
+{
+    UMat m;
+    transpose(*this, m);
+    return m;
+}
+
+UMat UMat::inv(int method) const
+{
+    UMat m;
+    invert(*this, m, method);
+    return m;
+}
+
+UMat UMat::mul(InputArray m, double scale) const
+{
+    UMat dst;
+    multiply(*this, m, dst, scale);
+    return dst;
+}
+
+#ifdef HAVE_OPENCL
+
+static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
+{
+    int type = _src1.type(), depth = CV_MAT_DEPTH(type);
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    if ( !doubleSupport && depth == CV_64F )
+        return false;
+
+    int dbsize = ocl::Device::getDefault().maxComputeUnits();
+    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+    int ddepth = std::max(CV_32F, depth);
+
+    int wgs2_aligned = 1;
+    while (wgs2_aligned < (int)wgs)
+        wgs2_aligned <<= 1;
+    wgs2_aligned >>= 1;
+
+    char cvt[40];
+    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
+                  format("-D srcT=%s -D dstT=%s -D convertToDT=%s -D OP_DOT -D WGS=%d -D WGS2_ALIGNED=%d%s",
+                         ocl::typeToStr(depth), ocl::typeToStr(ddepth), ocl::convertTypeStr(depth, ddepth, 1, cvt),
+                         (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (k.empty())
+        return false;
+
+    UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1), db(1, dbsize, ddepth);
+
+    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
+            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
+            dbarg = ocl::KernelArg::PtrWriteOnly(db);
+
+    k.args(src1arg, src1.cols, (int)src1.total(), dbsize, dbarg, src2arg);
+
+    size_t globalsize = dbsize * wgs;
+    if (k.run(1, &globalsize, &wgs, false))
+    {
+        res = sum(db.getMat(ACCESS_READ))[0];
+        return true;
+    }
+    return false;
+}
+
+#endif
+
+double UMat::dot(InputArray m) const
+{
+    CV_Assert(m.sameSize(*this) && m.type() == type());
+
+#ifdef HAVE_OPENCL
+    double r = 0;
+    CV_OCL_RUN_(dims <= 2, ocl_dot(*this, m, r), r)
+#endif
+
+    return getMat(ACCESS_READ).dot(m);
+}
+
+UMat UMat::zeros(int rows, int cols, int type)
+{
+    return UMat(rows, cols, type, Scalar::all(0));
+}
+
+UMat UMat::zeros(Size size, int type)
+{
+    return UMat(size, type, Scalar::all(0));
+}
+
+UMat UMat::zeros(int ndims, const int* sz, int type)
+{
+    return UMat(ndims, sz, type, Scalar::all(0));
+}
+
+UMat UMat::ones(int rows, int cols, int type)
+{
+    return UMat::ones(Size(cols, rows), type);
+}
+
+UMat UMat::ones(Size size, int type)
+{
+    return UMat(size, type, Scalar(1));
+}
+
+UMat UMat::ones(int ndims, const int* sz, int type)
+{
+    return UMat(ndims, sz, type, Scalar(1));
+}
+
+UMat UMat::eye(int rows, int cols, int type)
+{
+    return UMat::eye(Size(cols, rows), type);
+}
+
+UMat UMat::eye(Size size, int type)
+{
+    UMat m(size, type);
+    setIdentity(m);
+    return m;
+}
+
 }
 
 /* End of file. */
diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp
index 7a24f317a..e6bcf4e78 100644
--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@@ -42,6 +42,8 @@
 #include "test_precomp.hpp"
 #include "opencv2/ts/ocl_test.hpp"
 
+#include <cmath>
+
 #ifdef HAVE_OPENCL
 
 namespace cvtest {
@@ -293,7 +295,7 @@ OCL_TEST_P(Mul, Mat)
     }
 }
 
-OCL_TEST_P(Mul, DISABLED_Scalar)
+OCL_TEST_P(Mul, Scalar)
 {
     for (int j = 0; j < test_loop_times; j++)
     {
@@ -306,7 +308,7 @@ OCL_TEST_P(Mul, DISABLED_Scalar)
     }
 }
 
-OCL_TEST_P(Mul, DISABLED_Mat_Scale)
+OCL_TEST_P(Mul, Mat_Scale)
 {
     for (int j = 0; j < test_loop_times; j++)
     {
@@ -319,6 +321,20 @@ OCL_TEST_P(Mul, DISABLED_Mat_Scale)
     }
 }
 
+OCL_TEST_P(Mul, Mat_Scalar_Scale)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::multiply(src1_roi, val, dst1_roi, val[0]));
+        OCL_ON(cv::multiply(usrc1_roi, val, udst1_roi, val[0]));
+
+        Near(udst1_roi.depth() >= CV_32F ? 1e-2 : 1);
+    }
+}
+
+
 //////////////////////////////// Div /////////////////////////////////////////////////
 
 typedef ArithmTestBase Div;
@@ -335,7 +351,7 @@ OCL_TEST_P(Div, Mat)
     }
 }
 
-OCL_TEST_P(Div, DISABLED_Scalar)
+OCL_TEST_P(Div, Scalar)
 {
     for (int j = 0; j < test_loop_times; j++)
     {
@@ -348,6 +364,19 @@ OCL_TEST_P(Div, DISABLED_Scalar)
     }
 }
 
+OCL_TEST_P(Div, Scalar2)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::divide(src1_roi, val, dst1_roi));
+        OCL_ON(cv::divide(usrc1_roi, val, udst1_roi));
+
+        Near(udst1_roi.depth() >= CV_32F ? 1e-3 : 1);
+    }
+}
+
 OCL_TEST_P(Div, Mat_Scale)
 {
     for (int j = 0; j < test_loop_times; j++)
@@ -361,8 +390,7 @@ OCL_TEST_P(Div, Mat_Scale)
     }
 }
 
-
-OCL_TEST_P(Div, DISABLED_Mat_Scalar_Scale)
+OCL_TEST_P(Div, Mat_Scalar_Scale)
 {
     for (int j = 0; j < test_loop_times; j++)
     {
@@ -375,6 +403,19 @@ OCL_TEST_P(Div, DISABLED_Mat_Scalar_Scale)
     }
 }
 
+OCL_TEST_P(Div, Recip)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::divide(val[0], src1_roi, dst1_roi));
+        OCL_ON(cv::divide(val[0], usrc1_roi, udst1_roi));
+
+        Near(udst1_roi.depth() >= CV_32F ? 1e-3 : 1);
+    }
+}
+
 //////////////////////////////// Min/Max /////////////////////////////////////////////////
 
 typedef ArithmTestBase Min;
@@ -732,16 +773,18 @@ typedef ArithmTestBase Pow;
 
 OCL_TEST_P(Pow, Mat)
 {
+    static const double pows[] = { -4, -1, -2.5, 0, 1, 2, 3.7, 4 };
+
     for (int j = 0; j < test_loop_times; j++)
-    {
-        generateTestData();
-        double p = 4.5;
+        for (int k = 0, size = sizeof(pows) / sizeof(double); k < size; ++k)
+        {
+            generateTestData();
 
-        OCL_OFF(cv::pow(src1_roi, p, dst1_roi));
-        OCL_ON(cv::pow(usrc1_roi, p, udst1_roi));
+            OCL_OFF(cv::pow(src1_roi, pows[k], dst1_roi));
+            OCL_ON(cv::pow(usrc1_roi, pows[k], udst1_roi));
 
-        Near(1);
-    }
+            Near(1);
+        }
 }
 
 //////////////////////////////// AddWeighted /////////////////////////////////////////////////
@@ -883,6 +926,44 @@ OCL_TEST_P(MeanStdDev, Mat)
     }
 }
 
+OCL_TEST_P(MeanStdDev, Mat_Mask)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        Scalar cpu_mean, cpu_stddev;
+        Scalar gpu_mean, gpu_stddev;
+
+        OCL_OFF(cv::meanStdDev(src1_roi, cpu_mean, cpu_stddev, mask_roi));
+        OCL_ON(cv::meanStdDev(usrc1_roi, gpu_mean, gpu_stddev, umask_roi));
+
+        for (int i = 0; i < cn; ++i)
+        {
+            EXPECT_NEAR(cpu_mean[i], gpu_mean[i], 0.1);
+            EXPECT_NEAR(cpu_stddev[i], gpu_stddev[i], 0.1);
+        }
+    }
+}
+
+OCL_TEST(MeanStdDev_, ZeroMask)
+{
+    Size size(5, 5);
+    UMat um(size, CV_32SC1), umask(size, CV_8UC1, Scalar::all(0));
+    Mat m(size, CV_32SC1), mask(size, CV_8UC1, Scalar::all(0));
+
+    Scalar cpu_mean, cpu_stddev;
+    Scalar gpu_mean, gpu_stddev;
+
+    OCL_OFF(cv::meanStdDev(m, cpu_mean, cpu_stddev, mask));
+    OCL_ON(cv::meanStdDev(um, gpu_mean, gpu_stddev, umask));
+
+    for (int i = 0; i < 4; ++i)
+    {
+        EXPECT_NEAR(cpu_mean[i], gpu_mean[i], 0.1);
+        EXPECT_NEAR(cpu_stddev[i], gpu_stddev[i], 0.1);
+    }
+}
 
 //////////////////////////////////////// Log /////////////////////////////////////////
 
@@ -1083,6 +1164,19 @@ OCL_TEST_P(Norm, NORM_INF_1arg)
     }
 }
 
+OCL_TEST_P(Norm, NORM_INF_1arg_mask)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(const double cpuRes = cv::norm(src1_roi, NORM_INF, mask_roi));
+        OCL_ON(const double gpuRes = cv::norm(usrc1_roi, NORM_INF, umask_roi));
+
+        EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+    }
+}
+
 OCL_TEST_P(Norm, NORM_L1_1arg)
 {
     for (int j = 0; j < test_loop_times; j++)
@@ -1096,6 +1190,19 @@ OCL_TEST_P(Norm, NORM_L1_1arg)
     }
 }
 
+OCL_TEST_P(Norm, NORM_L1_1arg_mask)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(const double cpuRes = cv::norm(src1_roi, NORM_L1, mask_roi));
+        OCL_ON(const double gpuRes = cv::norm(usrc1_roi, NORM_L1, umask_roi));
+
+        EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
+    }
+}
+
 OCL_TEST_P(Norm, NORM_L2_1arg)
 {
     for (int j = 0; j < test_loop_times; j++)
@@ -1109,6 +1216,19 @@ OCL_TEST_P(Norm, NORM_L2_1arg)
     }
 }
 
+OCL_TEST_P(Norm, NORM_L2_1arg_mask)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(const double cpuRes = cv::norm(src1_roi, NORM_L2, mask_roi));
+        OCL_ON(const double gpuRes = cv::norm(usrc1_roi, NORM_L2, umask_roi));
+
+        EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
+    }
+}
+
 OCL_TEST_P(Norm, NORM_INF_2args)
 {
     for (int relative = 0; relative < 2; ++relative)
@@ -1127,6 +1247,24 @@ OCL_TEST_P(Norm, NORM_INF_2args)
         }
 }
 
+OCL_TEST_P(Norm, NORM_INF_2args_mask)
+{
+    for (int relative = 0; relative < 2; ++relative)
+        for (int j = 0; j < test_loop_times; j++)
+        {
+            generateTestData();
+
+            int type = NORM_INF;
+            if (relative == 1)
+                type |= NORM_RELATIVE;
+
+            OCL_OFF(const double cpuRes = cv::norm(src1_roi, src2_roi, type, mask_roi));
+            OCL_ON(const double gpuRes = cv::norm(usrc1_roi, usrc2_roi, type, umask_roi));
+
+            EXPECT_NEAR(cpuRes, gpuRes, 0.1);
+        }
+}
+
 OCL_TEST_P(Norm, NORM_L1_2args)
 {
     for (int relative = 0; relative < 2; ++relative)
@@ -1145,6 +1283,24 @@ OCL_TEST_P(Norm, NORM_L1_2args)
         }
 }
 
+OCL_TEST_P(Norm, NORM_L1_2args_mask)
+{
+    for (int relative = 0; relative < 2; ++relative)
+        for (int j = 0; j < test_loop_times; j++)
+        {
+            generateTestData();
+
+            int type = NORM_L1;
+            if (relative == 1)
+                type |= NORM_RELATIVE;
+
+            OCL_OFF(const double cpuRes = cv::norm(src1_roi, src2_roi, type, mask_roi));
+            OCL_ON(const double gpuRes = cv::norm(usrc1_roi, usrc2_roi, type, umask_roi));
+
+            EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
+        }
+}
+
 OCL_TEST_P(Norm, NORM_L2_2args)
 {
     for (int relative = 0; relative < 2; ++relative)
@@ -1163,6 +1319,41 @@ OCL_TEST_P(Norm, NORM_L2_2args)
         }
 }
 
+OCL_TEST_P(Norm, NORM_L2_2args_mask)
+{
+    for (int relative = 0; relative < 2; ++relative)
+        for (int j = 0; j < test_loop_times; j++)
+        {
+            generateTestData();
+
+            int type = NORM_L2;
+            if (relative == 1)
+                type |= NORM_RELATIVE;
+
+            OCL_OFF(const double cpuRes = cv::norm(src1_roi, src2_roi, type, mask_roi));
+            OCL_ON(const double gpuRes = cv::norm(usrc1_roi, usrc2_roi, type, umask_roi));
+
+            EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
+        }
+}
+
+//////////////////////////////// UMat::dot ////////////////////////////////////////////////
+
+typedef ArithmTestBase UMatDot;
+
+OCL_TEST_P(UMatDot, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(const double cpuRes = src1_roi.dot(src2_roi));
+        OCL_ON(const double gpuRes = usrc1_roi.dot(usrc2_roi));
+
+        EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
+    }
+}
+
 //////////////////////////////// Sqrt ////////////////////////////////////////////////
 
 typedef ArithmTestBase Sqrt;
@@ -1180,6 +1371,320 @@ OCL_TEST_P(Sqrt, Mat)
     }
 }
 
+//////////////////////////////// Normalize ////////////////////////////////////////////////
+
+typedef ArithmTestBase Normalize;
+
+OCL_TEST_P(Normalize, Mat)
+{
+    static int modes[] = { CV_MINMAX, CV_L2, CV_L1, CV_C };
+
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        for (int i = 0, size = sizeof(modes) / sizeof(modes[0]); i < size; ++i)
+        {
+            OCL_OFF(cv::normalize(src1_roi, dst1_roi, 10, 110, modes[i], src1_roi.type(), mask_roi));
+            OCL_ON(cv::normalize(usrc1_roi, udst1_roi, 10, 110, modes[i], src1_roi.type(), umask_roi));
+
+            Near(1);
+        }
+    }
+}
+
+//////////////////////////////////////// InRange ///////////////////////////////////////////////
+
+PARAM_TEST_CASE(InRange, MatDepth, Channels, bool /*Scalar or not*/, bool /*Roi*/)
+{
+    int depth;
+    int cn;
+    bool scalars, use_roi;
+    cv::Scalar val1, val2;
+
+    TEST_DECLARE_INPUT_PARAMETER(src1)
+    TEST_DECLARE_INPUT_PARAMETER(src2)
+    TEST_DECLARE_INPUT_PARAMETER(src3)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        scalars = GET_PARAM(2);
+        use_roi = GET_PARAM(3);
+    }
+
+    virtual void generateTestData()
+    {
+        const int type = CV_MAKE_TYPE(depth, cn);
+
+        Size roiSize = randomSize(1, MAX_VALUE);
+        Border src1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src1, src1_roi, roiSize, src1Border, type, -40, 40);
+
+        Border src2Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src2, src2_roi, roiSize, src2Border, type, -40, 40);
+
+        Border src3Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src3, src3_roi, roiSize, src3Border, type, -40, 40);
+
+        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_8UC1, 5, 16);
+
+        val1 = cv::Scalar(rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0),
+                          rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0));
+        val2 = cv::Scalar(rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0),
+                          rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0));
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src1)
+        UMAT_UPLOAD_INPUT_PARAMETER(src2)
+        UMAT_UPLOAD_INPUT_PARAMETER(src3)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+    }
+
+    void Near()
+    {
+        OCL_EXPECT_MATS_NEAR(dst, 0)
+    }
+};
+
+OCL_TEST_P(InRange, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::inRange(src1_roi, src2_roi, src3_roi, dst_roi));
+        OCL_ON(cv::inRange(usrc1_roi, usrc2_roi, usrc3_roi, udst_roi));
+
+        Near();
+    }
+}
+
+OCL_TEST_P(InRange, Scalar)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::inRange(src1_roi, val1, val2, dst_roi));
+        OCL_ON(cv::inRange(usrc1_roi, val1, val2, udst_roi));
+
+        Near();
+    }
+}
+
+//////////////////////////////// ConvertScaleAbs ////////////////////////////////////////////////
+
+typedef ArithmTestBase ConvertScaleAbs;
+
+OCL_TEST_P(ConvertScaleAbs, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::convertScaleAbs(src1_roi, dst1_roi, val[0], val[1]));
+        OCL_ON(cv::convertScaleAbs(usrc1_roi, udst1_roi, val[0], val[1]));
+
+        Near(depth <= CV_32S ? 1 : 1e-6);
+    }
+}
+
+//////////////////////////////// ScaleAdd ////////////////////////////////////////////////
+
+typedef ArithmTestBase ScaleAdd;
+
+OCL_TEST_P(ScaleAdd, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::scaleAdd(src1_roi, val[0], src2_roi, dst1_roi));
+        OCL_ON(cv::scaleAdd(usrc1_roi, val[0], usrc2_roi, udst1_roi));
+
+        Near(depth <= CV_32S ? 1 : 1e-3);
+    }
+}
+
+//////////////////////////////// PatchNans ////////////////////////////////////////////////
+
+PARAM_TEST_CASE(PatchNaNs, Channels, bool)
+{
+    int cn;
+    bool use_roi;
+    double value;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+
+    virtual void SetUp()
+    {
+        cn = GET_PARAM(0);
+        use_roi = GET_PARAM(1);
+    }
+
+    virtual void generateTestData()
+    {
+        const int type = CV_MAKE_TYPE(CV_32F, cn);
+
+        Size roiSize = randomSize(1, 10);
+        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, type, -40, 40);
+
+        // generating NaNs
+        roiSize.width *= cn;
+        for (int y = 0; y < roiSize.height; ++y)
+        {
+            float * const ptr = src_roi.ptr<float>(y);
+            for (int x = 0; x < roiSize.width; ++x)
+                ptr[x] = randomInt(-1, 1) == 0 ? std::numeric_limits<float>::quiet_NaN() : ptr[x];
+        }
+
+        value = randomDouble(-100, 100);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+    }
+
+    void Near()
+    {
+        OCL_EXPECT_MATS_NEAR(src, 0)
+    }
+};
+
+OCL_TEST_P(PatchNaNs, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::patchNaNs(src_roi, value));
+        OCL_ON(cv::patchNaNs(usrc_roi, value));
+
+        Near();
+    }
+}
+
+//////////////////////////////// Psnr ////////////////////////////////////////////////
+
+typedef ArithmTestBase Psnr;
+
+OCL_TEST_P(Psnr, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        double cpuRes = 0, gpuRes = 0;
+
+        OCL_OFF(cpuRes = cv::PSNR(src1_roi, src2_roi));
+        OCL_ON(gpuRes = cv::PSNR(usrc1_roi, usrc2_roi));
+
+        EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
+    }
+}
+
+//////////////////////////////////////// Reduce /////////////////////////////////////////////
+
+PARAM_TEST_CASE(Reduce, std::pair<MatDepth, MatDepth>, Channels, int, bool)
+{
+    int sdepth, ddepth, cn, dim, dtype;
+    bool use_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+
+    virtual void SetUp()
+    {
+        const std::pair<MatDepth, MatDepth> p = GET_PARAM(0);
+        sdepth = p.first;
+        ddepth = p.second;
+        cn = GET_PARAM(1);
+        dim = GET_PARAM(2);
+        use_roi = GET_PARAM(3);
+    }
+
+    virtual void generateTestData()
+    {
+        const int stype = CV_MAKE_TYPE(sdepth, cn);
+        dtype = CV_MAKE_TYPE(ddepth, cn);
+
+        Size roiSize = randomSize(1, MAX_VALUE);
+        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, stype, -40, 40);
+
+        Size dstRoiSize = Size(dim == 0 ? roiSize.width : 1, dim == 0 ? 1 : roiSize.height);
+        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, dstRoiSize, dstBorder, dtype, 5, 16);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+    }
+};
+
+typedef Reduce ReduceSum;
+
+OCL_TEST_P(ReduceSum, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::reduce(src_roi, dst_roi, dim, CV_REDUCE_SUM, dtype));
+        OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_SUM, dtype));
+
+        double eps = ddepth <= CV_32S ? 1 : 1e-4;
+        OCL_EXPECT_MATS_NEAR(dst, eps)
+    }
+}
+
+typedef Reduce ReduceMax;
+
+OCL_TEST_P(ReduceMax, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::reduce(src_roi, dst_roi, dim, CV_REDUCE_MAX, dtype));
+        OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_MAX, dtype));
+
+        OCL_EXPECT_MATS_NEAR(dst, 0)
+    }
+}
+
+typedef Reduce ReduceMin;
+
+OCL_TEST_P(ReduceMin, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::reduce(src_roi, dst_roi, dim, CV_REDUCE_MIN, dtype));
+        OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_MIN, dtype));
+
+        OCL_EXPECT_MATS_NEAR(dst, 0)
+    }
+}
+
+typedef Reduce ReduceAvg;
+
+OCL_TEST_P(ReduceAvg, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::reduce(src_roi, dst_roi, dim, CV_REDUCE_AVG, dtype));
+        OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, CV_REDUCE_AVG, dtype));
+
+        double eps = ddepth <= CV_32S ? 1 : 5e-6;
+        OCL_EXPECT_MATS_NEAR(dst, eps)
+    }
+}
+
 //////////////////////////////////////// Instantiation /////////////////////////////////////////
 
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(::testing::Values(CV_8U, CV_8S), OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool(), Bool()));
@@ -1214,6 +1719,49 @@ OCL_INSTANTIATE_TEST_CASE_P(Arithm, MinMaxIdx, Combine(OCL_ALL_DEPTHS, OCL_ALL_C
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, MinMaxIdx_Mask, Combine(OCL_ALL_DEPTHS, ::testing::Values(Channels(1)), Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, Norm, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, Sqrt, Combine(::testing::Values(CV_32F, CV_64F), OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, Normalize, Combine(OCL_ALL_DEPTHS, Values(Channels(1)), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, InRange, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool(), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, ConvertScaleAbs, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, ScaleAdd, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, PatchNaNs, Combine(OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, Psnr, Combine(::testing::Values((MatDepth)CV_8U), OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, UMatDot, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
+
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, ReduceSum, Combine(testing::Values(std::make_pair<MatDepth, MatDepth>(CV_8U, CV_32S),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_8U, CV_32F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_8U, CV_64F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_16U, CV_32F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_16U, CV_64F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_16S, CV_32F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_16S, CV_64F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_32F, CV_32F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_32F, CV_64F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_64F, CV_64F)),
+                                                       OCL_ALL_CHANNELS, testing::Values(0, 1), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, ReduceAvg, Combine(testing::Values(std::make_pair<MatDepth, MatDepth>(CV_8U, CV_32S),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_8U, CV_32F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_8U, CV_64F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_16U, CV_32F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_16U, CV_64F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_16S, CV_32F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_16S, CV_64F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_32F, CV_32F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_32F, CV_64F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_64F, CV_64F)),
+                                                       OCL_ALL_CHANNELS, testing::Values(0, 1), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, ReduceMax, Combine(testing::Values(std::make_pair<MatDepth, MatDepth>(CV_8U, CV_8U),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_16U, CV_16U),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_16S, CV_16S),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_32F, CV_32F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_64F, CV_64F)),
+                                                       OCL_ALL_CHANNELS, testing::Values(0, 1), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Arithm, ReduceMin, Combine(testing::Values(std::make_pair<MatDepth, MatDepth>(CV_8U, CV_8U),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_16U, CV_16U),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_16S, CV_16S),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_32F, CV_32F),
+                                                                       std::make_pair<MatDepth, MatDepth>(CV_64F, CV_64F)),
+                                                       OCL_ALL_CHANNELS, testing::Values(0, 1), Bool()));
+
 
 } } // namespace cvtest::ocl
 
diff --git a/modules/core/test/ocl/test_channels.cpp b/modules/core/test/ocl/test_channels.cpp
new file mode 100644
index 000000000..f0dc10250
--- /dev/null
+++ b/modules/core/test/ocl/test_channels.cpp
@@ -0,0 +1,458 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+//////////////////////////////////////// Merge ///////////////////////////////////////////////
+
+PARAM_TEST_CASE(Merge, MatDepth, Channels, bool)
+{
+    int depth, cn;
+    bool use_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(src1)
+    TEST_DECLARE_INPUT_PARAMETER(src2)
+    TEST_DECLARE_INPUT_PARAMETER(src3)
+    TEST_DECLARE_INPUT_PARAMETER(src4)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+
+    std::vector<Mat> src_roi;
+    std::vector<UMat> usrc_roi;
+
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
+
+        CV_Assert(cn >= 1 && cn <= 4);
+    }
+
+    void generateTestData()
+    {
+        Size roiSize = randomSize(1, MAX_VALUE);
+
+        {
+            Border src1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(src1, src1_roi, roiSize, src1Border, depth, 2, 11);
+
+            Border src2Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(src2, src2_roi, roiSize, src2Border, depth, -1540, 1740);
+
+            Border src3Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(src3, src3_roi, roiSize, src3Border, depth, -1540, 1740);
+
+            Border src4Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(src4, src4_roi, roiSize, src4Border, depth, -1540, 1740);
+        }
+
+        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_MAKE_TYPE(depth, cn), 5, 16);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src1)
+        UMAT_UPLOAD_INPUT_PARAMETER(src2)
+        UMAT_UPLOAD_INPUT_PARAMETER(src3)
+        UMAT_UPLOAD_INPUT_PARAMETER(src4)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+
+        src_roi.push_back(src1_roi), usrc_roi.push_back(usrc1_roi);
+        if (cn >= 2)
+            src_roi.push_back(src2_roi), usrc_roi.push_back(usrc2_roi);
+        if (cn >= 3)
+            src_roi.push_back(src3_roi), usrc_roi.push_back(usrc3_roi);
+        if (cn >= 4)
+            src_roi.push_back(src4_roi), usrc_roi.push_back(usrc4_roi);
+    }
+
+    void Near(double threshold = 0.)
+    {
+        OCL_EXPECT_MATS_NEAR(dst, threshold);
+    }
+};
+
+OCL_TEST_P(Merge, Accuracy)
+{
+    for(int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::merge(src_roi, dst_roi));
+        OCL_ON(cv::merge(usrc_roi, udst_roi));
+
+        Near();
+    }
+}
+
+//////////////////////////////////////// Split ///////////////////////////////////////////////
+
+PARAM_TEST_CASE(Split, MatType, Channels, bool)
+{
+    int depth, cn;
+    bool use_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst1)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst2)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst3)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst4)
+
+    std::vector<Mat> dst_roi, dst;
+    std::vector<UMat> udst_roi, udst;
+
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
+
+        CV_Assert(cn >= 1 && cn <= 4);
+    }
+
+    void generateTestData()
+    {
+        Size roiSize = randomSize(1, MAX_VALUE);
+        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, CV_MAKE_TYPE(depth, cn), 5, 16);
+
+        {
+            Border dst1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(dst1, dst1_roi, roiSize, dst1Border, depth, 2, 11);
+
+            Border dst2Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(dst2, dst2_roi, roiSize, dst2Border, depth, -1540, 1740);
+
+            Border dst3Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(dst3, dst3_roi, roiSize, dst3Border, depth, -1540, 1740);
+
+            Border dst4Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(dst4, dst4_roi, roiSize, dst4Border, depth, -1540, 1740);
+        }
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst3)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst4)
+
+        dst_roi.push_back(dst1_roi), udst_roi.push_back(udst1_roi),
+                dst.push_back(dst1), udst.push_back(udst1);
+        if (cn >= 2)
+            dst_roi.push_back(dst2_roi), udst_roi.push_back(udst2_roi),
+                    dst.push_back(dst2), udst.push_back(udst2);
+        if (cn >= 3)
+            dst_roi.push_back(dst3_roi), udst_roi.push_back(udst3_roi),
+                    dst.push_back(dst3), udst.push_back(udst3);
+        if (cn >= 4)
+            dst_roi.push_back(dst4_roi), udst_roi.push_back(udst4_roi),
+                    dst.push_back(dst4), udst.push_back(udst4);
+    }
+};
+
+OCL_TEST_P(Split, Accuracy)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::split(src_roi, dst_roi));
+        OCL_ON(cv::split(usrc_roi, udst_roi));
+
+        for (int i = 0; i < cn; ++i)
+        {
+            EXPECT_MAT_NEAR(dst[i], udst[i], 0.0);
+            EXPECT_MAT_NEAR(dst_roi[i], udst_roi[i], 0.0);
+        }
+    }
+}
+
+//////////////////////////////////////// MixChannels ///////////////////////////////////////////////
+
+PARAM_TEST_CASE(MixChannels, MatType, bool)
+{
+    int depth;
+    bool use_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(src1)
+    TEST_DECLARE_INPUT_PARAMETER(src2)
+    TEST_DECLARE_INPUT_PARAMETER(src3)
+    TEST_DECLARE_INPUT_PARAMETER(src4)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst1)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst2)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst3)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst4)
+
+    std::vector<Mat> src_roi, dst_roi, dst;
+    std::vector<UMat> usrc_roi, udst_roi, udst;
+    std::vector<int> fromTo;
+
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        use_roi = GET_PARAM(1);
+    }
+
+    // generate number of channels and create type
+    int type()
+    {
+        int cn = randomInt(1, 5);
+        return CV_MAKE_TYPE(depth, cn);
+    }
+
+    void generateTestData()
+    {
+        src_roi.clear();
+        dst_roi.clear();
+        dst.clear();
+        usrc_roi.clear();
+        udst_roi.clear();
+        udst.clear();
+        fromTo.clear();
+
+        Size roiSize = randomSize(1, MAX_VALUE);
+
+        {
+            Border src1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(src1, src1_roi, roiSize, src1Border, type(), 2, 11);
+
+            Border src2Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(src2, src2_roi, roiSize, src2Border, type(), -1540, 1740);
+
+            Border src3Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(src3, src3_roi, roiSize, src3Border, type(), -1540, 1740);
+
+            Border src4Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(src4, src4_roi, roiSize, src4Border, type(), -1540, 1740);
+        }
+
+        {
+            Border dst1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(dst1, dst1_roi, roiSize, dst1Border, type(), 2, 11);
+
+            Border dst2Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(dst2, dst2_roi, roiSize, dst2Border, type(), -1540, 1740);
+
+            Border dst3Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(dst3, dst3_roi, roiSize, dst3Border, type(), -1540, 1740);
+
+            Border dst4Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            randomSubMat(dst4, dst4_roi, roiSize, dst4Border, type(), -1540, 1740);
+        }
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src1)
+        UMAT_UPLOAD_INPUT_PARAMETER(src2)
+        UMAT_UPLOAD_INPUT_PARAMETER(src3)
+        UMAT_UPLOAD_INPUT_PARAMETER(src4)
+
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst3)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst4)
+
+        int nsrc = randomInt(1, 5), ndst = randomInt(1, 5);
+
+        src_roi.push_back(src1_roi), usrc_roi.push_back(usrc1_roi);
+        if (nsrc >= 2)
+            src_roi.push_back(src2_roi), usrc_roi.push_back(usrc2_roi);
+        if (nsrc >= 3)
+            src_roi.push_back(src3_roi), usrc_roi.push_back(usrc3_roi);
+        if (nsrc >= 4)
+            src_roi.push_back(src4_roi), usrc_roi.push_back(usrc4_roi);
+
+        dst_roi.push_back(dst1_roi), udst_roi.push_back(udst1_roi),
+                dst.push_back(dst1), udst.push_back(udst1);
+        if (ndst >= 2)
+            dst_roi.push_back(dst2_roi), udst_roi.push_back(udst2_roi),
+                    dst.push_back(dst2), udst.push_back(udst2);
+        if (ndst >= 3)
+            dst_roi.push_back(dst3_roi), udst_roi.push_back(udst3_roi),
+                    dst.push_back(dst3), udst.push_back(udst3);
+        if (ndst >= 4)
+            dst_roi.push_back(dst4_roi), udst_roi.push_back(udst4_roi),
+                    dst.push_back(dst4), udst.push_back(udst4);
+
+        int scntotal = 0, dcntotal = 0;
+        for (int i = 0; i < nsrc; ++i)
+            scntotal += src_roi[i].channels();
+        for (int i = 0; i < ndst; ++i)
+            dcntotal += dst_roi[i].channels();
+
+        int npairs = randomInt(1, std::min(scntotal, dcntotal) + 1);
+        fromTo.resize(npairs << 1);
+
+        for (int i = 0; i < npairs; ++i)
+        {
+            fromTo[i<<1] = randomInt(0, scntotal);
+            fromTo[(i<<1)+1] = randomInt(0, dcntotal);
+        }
+    }
+};
+
+OCL_TEST_P(MixChannels, Accuracy)
+{
+    for (int j = 0; j < test_loop_times + 10; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::mixChannels(src_roi, dst_roi, fromTo));
+        OCL_ON(cv::mixChannels(usrc_roi, udst_roi, fromTo));
+
+        for (size_t i = 0, size = dst_roi.size(); i < size; ++i)
+        {
+            EXPECT_MAT_NEAR(dst[i], udst[i], 0.0);
+            EXPECT_MAT_NEAR(dst_roi[i], udst_roi[i], 0.0);
+        }
+    }
+}
+
+//////////////////////////////////////// InsertChannel ///////////////////////////////////////////////
+
+PARAM_TEST_CASE(InsertChannel, MatDepth, Channels, bool)
+{
+    int depth, cn, coi;
+    bool use_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
+    }
+
+    void generateTestData()
+    {
+        Size roiSize = randomSize(1, MAX_VALUE);
+        coi = randomInt(0, cn);
+
+        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, depth, 2, 11);
+
+        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_MAKE_TYPE(depth, cn), 5, 16);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+    }
+};
+
+OCL_TEST_P(InsertChannel, Accuracy)
+{
+    for(int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::insertChannel(src_roi, dst_roi, coi));
+        OCL_ON(cv::insertChannel(usrc_roi, udst_roi, coi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 0);
+    }
+}
+
+//////////////////////////////////////// ExtractChannel ///////////////////////////////////////////////
+
+PARAM_TEST_CASE(ExtractChannel, MatDepth, Channels, bool)
+{
+    int depth, cn, coi;
+    bool use_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
+    }
+
+    void generateTestData()
+    {
+        Size roiSize = randomSize(1, MAX_VALUE);
+        coi = randomInt(0, cn);
+
+        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, CV_MAKE_TYPE(depth, cn), 2, 11);
+
+        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, depth, 5, 16);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+    }
+};
+
+OCL_TEST_P(ExtractChannel, Accuracy)
+{
+    for(int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::extractChannel(src_roi, dst_roi, coi));
+        OCL_ON(cv::extractChannel(usrc_roi, udst_roi, coi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 0);
+    }
+}
+
+//////////////////////////////////////// Instantiation ///////////////////////////////////////////////
+
+OCL_INSTANTIATE_TEST_CASE_P(Channels, Merge, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Channels, Split, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Channels, MixChannels, Combine(OCL_ALL_DEPTHS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Channels, InsertChannel, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Channels, ExtractChannel, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/core/test/ocl/test_matrix_expr.cpp b/modules/core/test/ocl/test_matrix_expr.cpp
new file mode 100644
index 000000000..167026d8c
--- /dev/null
+++ b/modules/core/test/ocl/test_matrix_expr.cpp
@@ -0,0 +1,85 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+//////////////////////////////// UMat Expressions /////////////////////////////////////////////////
+
+PARAM_TEST_CASE(UMatExpr, MatDepth, Channels)
+{
+    int type;
+    Size size;
+
+    virtual void SetUp()
+    {
+        type = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
+    }
+
+    void generateTestData()
+    {
+        size = randomSize(1, MAX_VALUE);
+    }
+};
+
+//////////////////////////////// UMat::eye /////////////////////////////////////////////////
+
+OCL_TEST_P(UMatExpr, Eye)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        Mat m = Mat::eye(size, type);
+        UMat um = UMat::eye(size, type);
+
+        EXPECT_MAT_NEAR(m, um, 0);
+    }
+}
+
+//////////////////////////////// UMat::zeros /////////////////////////////////////////////////
+
+OCL_TEST_P(UMatExpr, Zeros)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        Mat m = Mat::zeros(size, type);
+        UMat um = UMat::zeros(size, type);
+
+        EXPECT_MAT_NEAR(m, um, 0);
+    }
+}
+
+//////////////////////////////// UMat::ones /////////////////////////////////////////////////
+
+OCL_TEST_P(UMatExpr, Ones)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        Mat m = Mat::ones(size, type);
+        UMat um = UMat::ones(size, type);
+
+        EXPECT_MAT_NEAR(m, um, 0);
+    }
+}
+
+//////////////////////////////// Instantiation /////////////////////////////////////////////////
+
+OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, UMatExpr, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS));
+
+} } // namespace cvtest::ocl
+
+#endif
diff --git a/modules/core/test/ocl/test_matrix_operation.cpp b/modules/core/test/ocl/test_matrix_operation.cpp
index aabbb3f6c..77c5dad95 100644
--- a/modules/core/test/ocl/test_matrix_operation.cpp
+++ b/modules/core/test/ocl/test_matrix_operation.cpp
@@ -54,7 +54,7 @@ namespace ocl {
 
 ////////////////////////////////converto/////////////////////////////////////////////////
 
-PARAM_TEST_CASE(MatrixTestBase, MatDepth, MatDepth, Channels, bool)
+PARAM_TEST_CASE(ConvertTo, MatDepth, MatDepth, Channels, bool)
 {
     int src_depth, cn, dstType;
     bool use_roi;
@@ -85,8 +85,6 @@ PARAM_TEST_CASE(MatrixTestBase, MatDepth, MatDepth, Channels, bool)
     }
 };
 
-typedef MatrixTestBase ConvertTo;
-
 OCL_TEST_P(ConvertTo, Accuracy)
 {
     for (int j = 0; j < test_loop_times; j++)
@@ -103,7 +101,51 @@ OCL_TEST_P(ConvertTo, Accuracy)
     }
 }
 
-typedef MatrixTestBase CopyTo;
+//////////////////////////////// CopyTo /////////////////////////////////////////////////
+
+PARAM_TEST_CASE(CopyTo, MatDepth, Channels, bool, bool)
+{
+    int depth, cn;
+    bool use_roi, use_mask;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_INPUT_PARAMETER(mask)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
+        use_mask = GET_PARAM(3);
+    }
+
+    void generateTestData()
+    {
+        const int type = CV_MAKE_TYPE(depth, cn);
+
+        Size roiSize = randomSize(1, MAX_VALUE);
+        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
+
+        if (use_mask)
+        {
+            Border maskBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+            int mask_cn = randomDouble(0.0, 2.0) > 1.0 ? cn : 1;
+            randomSubMat(mask, mask_roi, roiSize, maskBorder, CV_8UC(mask_cn), 0, 2);
+            cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
+        }
+
+        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 16);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        if (use_mask)
+            UMAT_UPLOAD_INPUT_PARAMETER(mask)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+    }
+};
+
 
 OCL_TEST_P(CopyTo, Accuracy)
 {
@@ -111,8 +153,16 @@ OCL_TEST_P(CopyTo, Accuracy)
     {
         generateTestData();
 
-        OCL_OFF(src_roi.copyTo(dst_roi));
-        OCL_ON(usrc_roi.copyTo(udst_roi));
+        if (use_mask)
+        {
+            OCL_OFF(src_roi.copyTo(dst_roi, mask_roi));
+            OCL_ON(usrc_roi.copyTo(udst_roi, umask_roi));
+        }
+        else
+        {
+            OCL_OFF(src_roi.copyTo(dst_roi));
+            OCL_ON(usrc_roi.copyTo(udst_roi));
+        }
 
         OCL_EXPECT_MATS_NEAR(dst, 0);
     }
@@ -122,7 +172,7 @@ OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine(
                             OCL_ALL_DEPTHS, OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
 
 OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine(
-                                OCL_ALL_DEPTHS, Values((MatDepth)0), OCL_ALL_CHANNELS, Bool()));
+                                OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool(), Bool()));
 
 } } // namespace cvtest::ocl
 
diff --git a/modules/core/test/ocl/test_split_merge.cpp b/modules/core/test/ocl/test_split_merge.cpp
deleted file mode 100644
index c1c0f0e30..000000000
--- a/modules/core/test/ocl/test_split_merge.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-#include "opencv2/ts/ocl_test.hpp"
-
-#ifdef HAVE_OPENCL
-
-namespace cvtest {
-namespace ocl {
-
-PARAM_TEST_CASE(MergeTestBase, MatDepth, Channels, bool)
-{
-    int depth, cn;
-    bool use_roi;
-
-    TEST_DECLARE_INPUT_PARAMETER(src1)
-    TEST_DECLARE_INPUT_PARAMETER(src2)
-    TEST_DECLARE_INPUT_PARAMETER(src3)
-    TEST_DECLARE_INPUT_PARAMETER(src4)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst)
-
-    std::vector<Mat> src_roi;
-    std::vector<UMat> usrc_roi;
-
-    virtual void SetUp()
-    {
-        depth = GET_PARAM(0);
-        cn = GET_PARAM(1);
-        use_roi = GET_PARAM(2);
-
-        CV_Assert(cn >= 1 && cn <= 4);
-    }
-
-    void random_roi()
-    {
-        Size roiSize = randomSize(1, MAX_VALUE);
-
-        {
-            Border src1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(src1, src1_roi, roiSize, src1Border, depth, 2, 11);
-
-            Border src2Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(src2, src2_roi, roiSize, src2Border, depth, -1540, 1740);
-
-            Border src3Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(src3, src3_roi, roiSize, src3Border, depth, -1540, 1740);
-
-            Border src4Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(src4, src4_roi, roiSize, src4Border, depth, -1540, 1740);
-        }
-
-        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_MAKE_TYPE(depth, cn), 5, 16);
-
-        UMAT_UPLOAD_INPUT_PARAMETER(src1)
-        UMAT_UPLOAD_INPUT_PARAMETER(src2)
-        UMAT_UPLOAD_INPUT_PARAMETER(src3)
-        UMAT_UPLOAD_INPUT_PARAMETER(src4)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
-
-        src_roi.push_back(src1_roi), usrc_roi.push_back(usrc1_roi);
-        if (cn >= 2)
-            src_roi.push_back(src2_roi), usrc_roi.push_back(usrc2_roi);
-        if (cn >= 3)
-            src_roi.push_back(src3_roi), usrc_roi.push_back(usrc3_roi);
-        if (cn >= 4)
-            src_roi.push_back(src4_roi), usrc_roi.push_back(usrc4_roi);
-    }
-
-    void Near(double threshold = 0.)
-    {
-        OCL_EXPECT_MATS_NEAR(dst, threshold);
-    }
-};
-
-typedef MergeTestBase Merge;
-
-OCL_TEST_P(Merge, Accuracy)
-{
-    for(int j = 0; j < test_loop_times; j++)
-    {
-        random_roi();
-
-        OCL_OFF(cv::merge(src_roi, dst_roi));
-        OCL_ON(cv::merge(usrc_roi, udst_roi));
-
-        Near();
-    }
-}
-
-PARAM_TEST_CASE(SplitTestBase, MatType, Channels, bool)
-{
-    int depth, cn;
-    bool use_roi;
-
-    TEST_DECLARE_INPUT_PARAMETER(src)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst1)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst2)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst3)
-    TEST_DECLARE_OUTPUT_PARAMETER(dst4)
-
-    std::vector<Mat> dst_roi, dst;
-    std::vector<UMat> udst_roi, udst;
-
-    virtual void SetUp()
-    {
-        depth = GET_PARAM(0);
-        cn = GET_PARAM(1);
-        use_roi = GET_PARAM(2);
-
-        CV_Assert(cn >= 1 && cn <= 4);
-    }
-
-    void random_roi()
-    {
-        Size roiSize = randomSize(1, MAX_VALUE);
-        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, CV_MAKE_TYPE(depth, cn), 5, 16);
-
-        {
-            Border dst1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(dst1, dst1_roi, roiSize, dst1Border, depth, 2, 11);
-
-            Border dst2Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(dst2, dst2_roi, roiSize, dst2Border, depth, -1540, 1740);
-
-            Border dst3Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(dst3, dst3_roi, roiSize, dst3Border, depth, -1540, 1740);
-
-            Border dst4Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(dst4, dst4_roi, roiSize, dst4Border, depth, -1540, 1740);
-        }
-
-        UMAT_UPLOAD_INPUT_PARAMETER(src)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst1)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst2)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst3)
-        UMAT_UPLOAD_OUTPUT_PARAMETER(dst4)
-
-        dst_roi.push_back(dst1_roi), udst_roi.push_back(udst1_roi),
-                dst.push_back(dst1), udst.push_back(udst1);
-        if (cn >= 2)
-            dst_roi.push_back(dst2_roi), udst_roi.push_back(udst2_roi),
-                    dst.push_back(dst2), udst.push_back(udst2);
-        if (cn >= 3)
-            dst_roi.push_back(dst3_roi), udst_roi.push_back(udst3_roi),
-                    dst.push_back(dst3), udst.push_back(udst3);
-        if (cn >= 4)
-            dst_roi.push_back(dst4_roi), udst_roi.push_back(udst4_roi),
-                    dst.push_back(dst4), udst.push_back(udst4);
-    }
-};
-
-typedef SplitTestBase Split;
-
-OCL_TEST_P(Split, DISABLED_Accuracy)
-{
-    for (int j = 0; j < test_loop_times; j++)
-    {
-        random_roi();
-
-        OCL_OFF(cv::split(src_roi, dst_roi));
-        OCL_ON(cv::split(usrc_roi, udst_roi));
-
-        for (int i = 0; i < cn; ++i)
-        {
-            EXPECT_MAT_NEAR(dst[i], udst[i], 0.0);
-            EXPECT_MAT_NEAR(dst_roi[i], udst_roi[i], 0.0);
-        }
-    }
-}
-
-OCL_INSTANTIATE_TEST_CASE_P(SplitMerge, Merge, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
-OCL_INSTANTIATE_TEST_CASE_P(SplitMerge, Split, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool()));
-
-} } // namespace cvtest::ocl
-
-#endif // HAVE_OPENCL
diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp
index 7486e134a..626b44cbc 100644
--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@@ -18,7 +18,7 @@ struct BaseElemWiseOp
     BaseElemWiseOp(int _ninputs, int _flags, double _alpha, double _beta,
                    Scalar _gamma=Scalar::all(0), int _context=1)
     : ninputs(_ninputs), flags(_flags), alpha(_alpha), beta(_beta), gamma(_gamma), context(_context) {}
-    BaseElemWiseOp() { flags = 0; alpha = beta = 0; gamma = Scalar::all(0); }
+    BaseElemWiseOp() { flags = 0; alpha = beta = 0; gamma = Scalar::all(0); ninputs = 0; context = 1; }
     virtual ~BaseElemWiseOp() {}
     virtual void op(const vector<Mat>&, Mat&, const Mat&) {}
     virtual void refop(const vector<Mat>&, Mat&, const Mat&) {}
@@ -89,7 +89,6 @@ struct BaseElemWiseOp
     double alpha;
     double beta;
     Scalar gamma;
-    int maxErr;
     int context;
 };
 
@@ -115,7 +114,7 @@ struct BaseAddOp : public BaseElemWiseOp
 
 struct AddOp : public BaseAddOp
 {
-    AddOp() : BaseAddOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK, 1, 1, Scalar::all(0)) {};
+    AddOp() : BaseAddOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
         if( mask.empty() )
@@ -128,7 +127,7 @@ struct AddOp : public BaseAddOp
 
 struct SubOp : public BaseAddOp
 {
-    SubOp() : BaseAddOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK, 1, -1, Scalar::all(0)) {};
+    SubOp() : BaseAddOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK, 1, -1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
         if( mask.empty() )
@@ -141,7 +140,7 @@ struct SubOp : public BaseAddOp
 
 struct AddSOp : public BaseAddOp
 {
-    AddSOp() : BaseAddOp(1, FIX_ALPHA+FIX_BETA+SUPPORT_MASK, 1, 0, Scalar::all(0)) {};
+    AddSOp() : BaseAddOp(1, FIX_ALPHA+FIX_BETA+SUPPORT_MASK, 1, 0, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
         if( mask.empty() )
@@ -154,7 +153,7 @@ struct AddSOp : public BaseAddOp
 
 struct SubRSOp : public BaseAddOp
 {
-    SubRSOp() : BaseAddOp(1, FIX_ALPHA+FIX_BETA+SUPPORT_MASK, -1, 0, Scalar::all(0)) {};
+    SubRSOp() : BaseAddOp(1, FIX_ALPHA+FIX_BETA+SUPPORT_MASK, -1, 0, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
         if( mask.empty() )
@@ -167,7 +166,7 @@ struct SubRSOp : public BaseAddOp
 
 struct ScaleAddOp : public BaseAddOp
 {
-    ScaleAddOp() : BaseAddOp(2, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    ScaleAddOp() : BaseAddOp(2, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         scaleAdd(src[0], alpha, src[1], dst);
@@ -181,7 +180,7 @@ struct ScaleAddOp : public BaseAddOp
 
 struct AddWeightedOp : public BaseAddOp
 {
-    AddWeightedOp() : BaseAddOp(2, REAL_GAMMA, 1, 1, Scalar::all(0)) {};
+    AddWeightedOp() : BaseAddOp(2, REAL_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         addWeighted(src[0], alpha, src[1], beta, gamma[0], dst);
@@ -194,7 +193,7 @@ struct AddWeightedOp : public BaseAddOp
 
 struct MulOp : public BaseElemWiseOp
 {
-    MulOp() : BaseElemWiseOp(2, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    MulOp() : BaseElemWiseOp(2, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void getValueRange(int depth, double& minval, double& maxval)
     {
         minval = depth < CV_32S ? cvtest::getMinVal(depth) : depth == CV_32S ? -1000000 : -1000.;
@@ -218,7 +217,7 @@ struct MulOp : public BaseElemWiseOp
 
 struct DivOp : public BaseElemWiseOp
 {
-    DivOp() : BaseElemWiseOp(2, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    DivOp() : BaseElemWiseOp(2, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::divide(src[0], src[1], dst, alpha);
@@ -235,7 +234,7 @@ struct DivOp : public BaseElemWiseOp
 
 struct RecipOp : public BaseElemWiseOp
 {
-    RecipOp() : BaseElemWiseOp(1, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    RecipOp() : BaseElemWiseOp(1, FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::divide(alpha, src[0], dst);
@@ -252,7 +251,7 @@ struct RecipOp : public BaseElemWiseOp
 
 struct AbsDiffOp : public BaseAddOp
 {
-    AbsDiffOp() : BaseAddOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, -1, Scalar::all(0)) {};
+    AbsDiffOp() : BaseAddOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, -1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         absdiff(src[0], src[1], dst);
@@ -265,7 +264,7 @@ struct AbsDiffOp : public BaseAddOp
 
 struct AbsDiffSOp : public BaseAddOp
 {
-    AbsDiffSOp() : BaseAddOp(1, FIX_ALPHA+FIX_BETA, 1, 0, Scalar::all(0)) {};
+    AbsDiffSOp() : BaseAddOp(1, FIX_ALPHA+FIX_BETA, 1, 0, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         absdiff(src[0], gamma, dst);
@@ -278,7 +277,7 @@ struct AbsDiffSOp : public BaseAddOp
 
 struct LogicOp : public BaseElemWiseOp
 {
-    LogicOp(char _opcode) : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK, 1, 1, Scalar::all(0)), opcode(_opcode) {};
+    LogicOp(char _opcode) : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK, 1, 1, Scalar::all(0)), opcode(_opcode) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
         if( opcode == '&' )
@@ -309,7 +308,7 @@ struct LogicOp : public BaseElemWiseOp
 struct LogicSOp : public BaseElemWiseOp
 {
     LogicSOp(char _opcode)
-    : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+(_opcode != '~' ? SUPPORT_MASK : 0), 1, 1, Scalar::all(0)), opcode(_opcode) {};
+    : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+(_opcode != '~' ? SUPPORT_MASK : 0), 1, 1, Scalar::all(0)), opcode(_opcode) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
         if( opcode == '&' )
@@ -341,7 +340,7 @@ struct LogicSOp : public BaseElemWiseOp
 
 struct MinOp : public BaseElemWiseOp
 {
-    MinOp() : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    MinOp() : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::min(src[0], src[1], dst);
@@ -358,7 +357,7 @@ struct MinOp : public BaseElemWiseOp
 
 struct MaxOp : public BaseElemWiseOp
 {
-    MaxOp() : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    MaxOp() : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::max(src[0], src[1], dst);
@@ -375,7 +374,7 @@ struct MaxOp : public BaseElemWiseOp
 
 struct MinSOp : public BaseElemWiseOp
 {
-    MinSOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) {};
+    MinSOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::min(src[0], gamma[0], dst);
@@ -392,7 +391,7 @@ struct MinSOp : public BaseElemWiseOp
 
 struct MaxSOp : public BaseElemWiseOp
 {
-    MaxSOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) {};
+    MaxSOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::max(src[0], gamma[0], dst);
@@ -409,7 +408,7 @@ struct MaxSOp : public BaseElemWiseOp
 
 struct CmpOp : public BaseElemWiseOp
 {
-    CmpOp() : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    CmpOp() : BaseElemWiseOp(2, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) { cmpop = 0; }
     void generateScalars(int depth, RNG& rng)
     {
         BaseElemWiseOp::generateScalars(depth, rng);
@@ -437,7 +436,7 @@ struct CmpOp : public BaseElemWiseOp
 
 struct CmpSOp : public BaseElemWiseOp
 {
-    CmpSOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) {};
+    CmpSOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) { cmpop = 0; }
     void generateScalars(int depth, RNG& rng)
     {
         BaseElemWiseOp::generateScalars(depth, rng);
@@ -467,7 +466,7 @@ struct CmpSOp : public BaseElemWiseOp
 
 struct CopyOp : public BaseElemWiseOp
 {
-    CopyOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK, 1, 1, Scalar::all(0)) {};
+    CopyOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA+SUPPORT_MASK, 1, 1, Scalar::all(0)) {  }
     void op(const vector<Mat>& src, Mat& dst, const Mat& mask)
     {
         src[0].copyTo(dst, mask);
@@ -484,13 +483,12 @@ struct CopyOp : public BaseElemWiseOp
     {
         return 0;
     }
-    int cmpop;
 };
 
 
 struct SetOp : public BaseElemWiseOp
 {
-    SetOp() : BaseElemWiseOp(0, FIX_ALPHA+FIX_BETA+SUPPORT_MASK, 1, 1, Scalar::all(0)) {};
+    SetOp() : BaseElemWiseOp(0, FIX_ALPHA+FIX_BETA+SUPPORT_MASK, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>&, Mat& dst, const Mat& mask)
     {
         dst.setTo(gamma, mask);
@@ -651,7 +649,7 @@ static void inRangeS(const Mat& src, const Scalar& lb, const Scalar& rb, Mat& ds
 
 struct InRangeSOp : public BaseElemWiseOp
 {
-    InRangeSOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA, 1, 1, Scalar::all(0)) {};
+    InRangeSOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::inRange(src[0], gamma, gamma1, dst);
@@ -681,7 +679,7 @@ struct InRangeSOp : public BaseElemWiseOp
 
 struct InRangeOp : public BaseElemWiseOp
 {
-    InRangeOp() : BaseElemWiseOp(3, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    InRangeOp() : BaseElemWiseOp(3, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         Mat lb, rb;
@@ -707,7 +705,7 @@ struct InRangeOp : public BaseElemWiseOp
 
 struct ConvertScaleOp : public BaseElemWiseOp
 {
-    ConvertScaleOp() : BaseElemWiseOp(1, FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)), ddepth(0) { };
+    ConvertScaleOp() : BaseElemWiseOp(1, FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)), ddepth(0) { }
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         src[0].convertTo(dst, ddepth, alpha, gamma[0]);
@@ -742,7 +740,7 @@ struct ConvertScaleOp : public BaseElemWiseOp
 
 struct ConvertScaleAbsOp : public BaseElemWiseOp
 {
-    ConvertScaleAbsOp() : BaseElemWiseOp(1, FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) {};
+    ConvertScaleAbsOp() : BaseElemWiseOp(1, FIX_BETA+REAL_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>& src, Mat& dst, const Mat&)
     {
         cv::convertScaleAbs(src[0], dst, alpha, gamma[0]);
@@ -810,7 +808,7 @@ static void setIdentity(Mat& dst, const Scalar& s)
 
 struct FlipOp : public BaseElemWiseOp
 {
-    FlipOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    FlipOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) { flipcode = 0; }
     void getRandomSize(RNG& rng, vector<int>& size)
     {
         cvtest::randomSize(rng, 2, 2, cvtest::ARITHM_MAX_SIZE_LOG, size);
@@ -836,7 +834,7 @@ struct FlipOp : public BaseElemWiseOp
 
 struct TransposeOp : public BaseElemWiseOp
 {
-    TransposeOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    TransposeOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void getRandomSize(RNG& rng, vector<int>& size)
     {
         cvtest::randomSize(rng, 2, 2, cvtest::ARITHM_MAX_SIZE_LOG, size);
@@ -857,7 +855,7 @@ struct TransposeOp : public BaseElemWiseOp
 
 struct SetIdentityOp : public BaseElemWiseOp
 {
-    SetIdentityOp() : BaseElemWiseOp(0, FIX_ALPHA+FIX_BETA, 1, 1, Scalar::all(0)) {};
+    SetIdentityOp() : BaseElemWiseOp(0, FIX_ALPHA+FIX_BETA, 1, 1, Scalar::all(0)) {}
     void getRandomSize(RNG& rng, vector<int>& size)
     {
         cvtest::randomSize(rng, 2, 2, cvtest::ARITHM_MAX_SIZE_LOG, size);
@@ -878,7 +876,7 @@ struct SetIdentityOp : public BaseElemWiseOp
 
 struct SetZeroOp : public BaseElemWiseOp
 {
-    SetZeroOp() : BaseElemWiseOp(0, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    SetZeroOp() : BaseElemWiseOp(0, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     void op(const vector<Mat>&, Mat& dst, const Mat&)
     {
         dst = Scalar::all(0);
@@ -954,7 +952,7 @@ static void log(const Mat& src, Mat& dst)
 
 struct ExpOp : public BaseElemWiseOp
 {
-    ExpOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    ExpOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     int getRandomType(RNG& rng)
     {
         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_FLT, 1, ARITHM_MAX_CHANNELS);
@@ -981,7 +979,7 @@ struct ExpOp : public BaseElemWiseOp
 
 struct LogOp : public BaseElemWiseOp
 {
-    LogOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {};
+    LogOp() : BaseElemWiseOp(1, FIX_ALPHA+FIX_BETA+FIX_GAMMA, 1, 1, Scalar::all(0)) {}
     int getRandomType(RNG& rng)
     {
         return cvtest::randomType(rng, _OutputArray::DEPTH_MASK_FLT, 1, ARITHM_MAX_CHANNELS);
@@ -1564,3 +1562,19 @@ TEST(Core_round, CvRound)
     ASSERT_EQ(-2, cvRound(-2.5));
     ASSERT_EQ(-4, cvRound(-3.5));
 }
+
+
+typedef testing::TestWithParam<Size> Mul1;
+
+TEST_P(Mul1, One)
+{
+    Size size = GetParam();
+    cv::Mat src(size, CV_32FC1, cv::Scalar::all(2)), dst,
+            ref_dst(size, CV_32FC1, cv::Scalar::all(6));
+
+    cv::multiply(3, src, dst);
+
+    ASSERT_EQ(0, cv::norm(dst, ref_dst, cv::NORM_INF));
+}
+
+INSTANTIATE_TEST_CASE_P(Arithm, Mul1, testing::Values(Size(2, 2), Size(1, 1)));
diff --git a/modules/core/test/test_ds.cpp b/modules/core/test/test_ds.cpp
index c71deed06..25a5f11b1 100644
--- a/modules/core/test/test_ds.cpp
+++ b/modules/core/test/test_ds.cpp
@@ -1355,7 +1355,7 @@ int  Core_SetTest::test_set_ops( int iters )
                                   (cvset->total == 0 || cvset->total >= prev_total),
                                   "The total number of cvset elements is not correct" );
 
-        // CvSet and simple set do not neccessary have the same "total" (active & free) number,
+        // CvSet and simple set do not necessary have the same "total" (active & free) number,
         // so pass "set->total" to skip that check
         test_seq_block_consistence( struct_idx, (CvSeq*)cvset, cvset->total );
         update_progressbar();
@@ -1777,7 +1777,7 @@ int  Core_GraphTest::test_graph_ops( int iters )
                                   (graph->edges->total == 0 || graph->edges->total >= prev_edge_total),
                                   "The total number of graph vertices is not correct" );
 
-        // CvGraph and simple graph do not neccessary have the same "total" (active & free) number,
+        // CvGraph and simple graph do not necessary have the same "total" (active & free) number,
         // so pass "graph->total" (or "graph->edges->total") to skip that check
         test_seq_block_consistence( struct_idx, (CvSeq*)graph, graph->total );
         test_seq_block_consistence( struct_idx, (CvSeq*)graph->edges, graph->edges->total );
diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp
index ba6656761..23c0aad62 100644
--- a/modules/core/test/test_io.cpp
+++ b/modules/core/test/test_io.cpp
@@ -83,7 +83,7 @@ static bool cvTsCheckSparse(const CvSparseMat* m1, const CvSparseMat* m2, double
 class Core_IOTest : public cvtest::BaseTest
 {
 public:
-    Core_IOTest() {};
+    Core_IOTest() { }
 protected:
     void run(int)
     {
diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp
index 3c8ae8bf9..86aca0160 100644
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@@ -9,7 +9,7 @@ using namespace std;
 class Core_ReduceTest : public cvtest::BaseTest
 {
 public:
-    Core_ReduceTest() {};
+    Core_ReduceTest() {}
 protected:
     void run( int);
     int checkOp( const Mat& src, int dstType, int opType, const Mat& opRes, int dim );
@@ -1139,3 +1139,24 @@ TEST(Core_Mat, reshape_1942)
     );
     ASSERT_EQ(1, cn);
 }
+
+TEST(Core_Mat, copyNx1ToVector)
+{
+    cv::Mat_<uchar> src(5, 1);
+    cv::Mat_<uchar> ref_dst8;
+    cv::Mat_<ushort> ref_dst16;
+    std::vector<uchar> dst8;
+    std::vector<ushort> dst16;
+
+    src << 1, 2, 3, 4, 5;
+
+    src.copyTo(ref_dst8);
+    src.copyTo(dst8);
+
+    ASSERT_PRED_FORMAT2(cvtest::MatComparator(0, 0), ref_dst8, cv::Mat_<uchar>(dst8));
+
+    src.convertTo(ref_dst16, CV_16U);
+    src.convertTo(dst16, CV_16U);
+
+    ASSERT_PRED_FORMAT2(cvtest::MatComparator(0, 0), ref_dst16, cv::Mat_<ushort>(dst16));
+}
diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp
index 377742386..859ebe60e 100644
--- a/modules/core/test/test_math.cpp
+++ b/modules/core/test/test_math.cpp
@@ -617,6 +617,7 @@ Core_GEMMTest::Core_GEMMTest() : Core_MatrixTest( 5, 1, false, false, 2 )
 {
     test_case_count = 100;
     max_log_array_size = 10;
+    tabc_flag = 0;
     alpha = beta = 0;
 }
 
@@ -821,6 +822,8 @@ protected:
 
 Core_TransformTest::Core_TransformTest() : Core_MatrixTest( 3, 1, true, false, 4 )
 {
+    scale = 1;
+    diagMtx = false;
 }
 
 
@@ -1154,7 +1157,7 @@ protected:
 
 
 Core_CovarMatrixTest::Core_CovarMatrixTest() : Core_MatrixTest( 1, 1, true, false, 1 ),
-flags(0), t_flag(0), are_images(false)
+    flags(0), t_flag(0), len(0), count(0), are_images(false)
 {
     test_case_count = 100;
     test_array[INPUT_OUTPUT].push_back(NULL);
@@ -2485,16 +2488,14 @@ TYPED_TEST_P(Core_CheckRange, Negative)
     double min_bound = 4.5;
     double max_bound = 16.0;
 
-    TypeParam data[] = {5, 10, 15, 4, 10 ,2, 8, 12, 14};
+    TypeParam data[] = {5, 10, 15, 4, 10, 2, 8, 12, 14};
     cv::Mat src = cv::Mat(3,3, cv::DataDepth<TypeParam>::value, data);
 
-    cv::Point* bad_pt = new cv::Point(0, 0);
+    cv::Point bad_pt(0, 0);
 
-    ASSERT_FALSE(checkRange(src, true, bad_pt, min_bound, max_bound));
-    ASSERT_EQ(bad_pt->x,0);
-    ASSERT_EQ(bad_pt->y,1);
-
-    delete bad_pt;
+    ASSERT_FALSE(checkRange(src, true, &bad_pt, min_bound, max_bound));
+    ASSERT_EQ(bad_pt.x, 0);
+    ASSERT_EQ(bad_pt.y, 1);
 }
 
 TYPED_TEST_P(Core_CheckRange, Positive)
@@ -2502,16 +2503,14 @@ TYPED_TEST_P(Core_CheckRange, Positive)
     double min_bound = -1;
     double max_bound = 16.0;
 
-    TypeParam data[] = {5, 10, 15, 4, 10 ,2, 8, 12, 14};
+    TypeParam data[] = {5, 10, 15, 4, 10, 2, 8, 12, 14};
     cv::Mat src = cv::Mat(3,3, cv::DataDepth<TypeParam>::value, data);
 
-    cv::Point* bad_pt = new cv::Point(0, 0);
+    cv::Point bad_pt(0, 0);
 
-    ASSERT_TRUE(checkRange(src, true, bad_pt, min_bound, max_bound));
-    ASSERT_EQ(bad_pt->x,0);
-    ASSERT_EQ(bad_pt->y,0);
-
-    delete bad_pt;
+    ASSERT_TRUE(checkRange(src, true, &bad_pt, min_bound, max_bound));
+    ASSERT_EQ(bad_pt.x, 0);
+    ASSERT_EQ(bad_pt.y, 0);
 }
 
 TYPED_TEST_P(Core_CheckRange, Bounds)
@@ -2519,16 +2518,14 @@ TYPED_TEST_P(Core_CheckRange, Bounds)
     double min_bound = 24.5;
     double max_bound = 1.0;
 
-    TypeParam data[] = {5, 10, 15, 4, 10 ,2, 8, 12, 14};
+    TypeParam data[] = {5, 10, 15, 4, 10, 2, 8, 12, 14};
     cv::Mat src = cv::Mat(3,3, cv::DataDepth<TypeParam>::value, data);
 
-    cv::Point* bad_pt = new cv::Point(0, 0);
+    cv::Point bad_pt(0, 0);
 
-    ASSERT_FALSE(checkRange(src, true, bad_pt, min_bound, max_bound));
-    ASSERT_EQ(bad_pt->x,0);
-    ASSERT_EQ(bad_pt->y,0);
-
-    delete bad_pt;
+    ASSERT_FALSE(checkRange(src, true, &bad_pt, min_bound, max_bound));
+    ASSERT_EQ(bad_pt.x, 0);
+    ASSERT_EQ(bad_pt.y, 0);
 }
 
 TYPED_TEST_P(Core_CheckRange, Zero)
diff --git a/modules/core/test/test_misc.cpp b/modules/core/test/test_misc.cpp
index 5af419c93..e40d40de3 100644
--- a/modules/core/test/test_misc.cpp
+++ b/modules/core/test/test_misc.cpp
@@ -25,7 +25,7 @@ TEST(Core_Drawing, _914)
 }
 
 
-TEST(Core_OutputArraySreate, _1997)
+TEST(Core_OutputArrayCreate, _1997)
 {
     struct local {
         static void create(OutputArray arr, Size submatSize, int type)
diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp
index 8d3341e59..8215ea93f 100644
--- a/modules/core/test/test_operations.cpp
+++ b/modules/core/test/test_operations.cpp
@@ -63,7 +63,7 @@ protected:
 
     struct test_excep
     {
-        test_excep(const string& _s=string("")) : s(_s) {};
+        test_excep(const string& _s=string("")) : s(_s) { }
         string s;
     };
 
diff --git a/modules/ocl/test/test_precomp.hpp b/modules/core/test/test_rotatedrect.cpp
similarity index 50%
rename from modules/ocl/test/test_precomp.hpp
rename to modules/core/test/test_rotatedrect.cpp
index af467f5b8..c600ef1dd 100644
--- a/modules/ocl/test/test_precomp.hpp
+++ b/modules/core/test/test_rotatedrect.cpp
@@ -39,39 +39,69 @@
 //
 //M*/
 
-#ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wmissing-declarations"
-#  if defined __clang__ || defined __APPLE__
-#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
-#    pragma GCC diagnostic ignored "-Wextra"
-#  endif
-#endif
+#include "test_precomp.hpp"
 
-#ifndef __OPENCV_TEST_PRECOMP_HPP__
-#define __OPENCV_TEST_PRECOMP_HPP__
+using namespace cv;
+using namespace std;
 
-#include <cmath>
-#include <cstdio>
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <string>
-#include <limits>
-#include <algorithm>
-#include <iterator>
-#include <string>
-#include <cstdarg>
-#include "opencv2/ts.hpp"
-#include "opencv2/highgui.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/features2d.hpp"
-#include "opencv2/video.hpp"
-#include "opencv2/ocl.hpp"
+class Core_RotatedRectConstructorTest : public cvtest::BaseTest
+{
+public:
+    Core_RotatedRectConstructorTest();
+protected:
+    int prepare_test_case( int );
+    void run_func();
+    int validate_test_results( int );
+    float MAX_COORD_VAL;
+    Point2f a, b, c;
+    RotatedRect rec;
+};
 
-#include "utility.hpp"
+Core_RotatedRectConstructorTest::Core_RotatedRectConstructorTest()
+{
+    test_case_count = 100;
+    MAX_COORD_VAL = 1000.0f;
+}
 
-#include "opencv2/core/private.hpp"
+int Core_RotatedRectConstructorTest::prepare_test_case( int test_case_idx )
+{
+    cvtest::BaseTest::prepare_test_case( test_case_idx );
+    RNG& rng = ts->get_rng();
+    a = Point2f( rng.uniform(-MAX_COORD_VAL, MAX_COORD_VAL), rng.uniform(-MAX_COORD_VAL, MAX_COORD_VAL) );
+    do
+    {
+        b = Point2f( rng.uniform(-MAX_COORD_VAL, MAX_COORD_VAL), rng.uniform(-MAX_COORD_VAL, MAX_COORD_VAL) );
+    }
+    while( norm(a - b) <= FLT_EPSILON );
+    Vec2f along(a - b);
+    Vec2f perp = Vec2f(-along[1], along[0]);
+    double d = (double) rng.uniform(1.0f, 5.0f);
+    if( cvtest::randInt(rng) % 2 == 0 ) d = -d;
+    c = Point2f( (float) ((double) b.x + d * perp[0]), (float) ((double) b.y + d * perp[1]) );
+    return 1;
+}
 
-using namespace cvtest;
+void Core_RotatedRectConstructorTest::run_func()
+{
+    rec = RotatedRect(a, b, c);
+}
 
-#endif
+int Core_RotatedRectConstructorTest::validate_test_results( int )
+{
+    Point2f vertices[4];
+    rec.points(vertices);
+    int count_match = 0;
+    for( int i = 0; i < 4; i++ )
+    {
+        if( norm(vertices[i] - a) <= 0.001 ) count_match++;
+        else if( norm(vertices[i] - b) <= 0.001 ) count_match++;
+        else if( norm(vertices[i] - c) <= 0.001 ) count_match++;
+    }
+    if( count_match == 3 )
+        return cvtest::TS::OK;
+    ts->printf( cvtest::TS::LOG, "RotatedRect end points don't match those supplied in constructor");
+    ts->set_failed_test_info( cvtest::TS::FAIL_INVALID_OUTPUT );
+    return cvtest::TS::OK;
+}
+
+TEST(Core_RotatedRect, three_point_constructor) { Core_RotatedRectConstructorTest test; test.safe_run(); }
diff --git a/modules/core/test/test_umat.cpp b/modules/core/test/test_umat.cpp
index 1dfe1d79a..fd344a9b0 100644
--- a/modules/core/test/test_umat.cpp
+++ b/modules/core/test/test_umat.cpp
@@ -40,13 +40,509 @@
 //M*/
 
 #include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
 
-#include <string>
-#include <iostream>
-#include "opencv2/core/ocl.hpp"
-
+using namespace cvtest;
+using namespace testing;
 using namespace cv;
-using namespace std;
+
+namespace cvtest {
+namespace ocl {
+
+#define UMAT_TEST_SIZES testing::Values(cv::Size(1, 1), cv::Size(1,128), cv::Size(128, 1), \
+    cv::Size(128, 128), cv::Size(640, 480), cv::Size(751, 373), cv::Size(1200, 1200))
+
+/////////////////////////////// Basic Tests ////////////////////////////////
+
+PARAM_TEST_CASE(UMatBasicTests, int, int, Size, bool)
+{
+    Mat a;
+    UMat ua;
+    int type;
+    int depth;
+    int cn;
+    Size size;
+    bool useRoi;
+    Size roi_size;
+    Rect roi;
+
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        size = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+        type = CV_MAKE_TYPE(depth, cn);
+        a = randomMat(size, type, -100, 100);
+        a.copyTo(ua);
+        int roi_shift_x = randomInt(0, size.width-1);
+        int roi_shift_y = randomInt(0, size.height-1);
+        roi_size = Size(size.width - roi_shift_x, size.height - roi_shift_y);
+        roi = Rect(roi_shift_x, roi_shift_y, roi_size.width, roi_size.height);
+    }
+};
+
+TEST_P(UMatBasicTests, createUMat)
+{
+    if(useRoi)
+    {
+        ua = UMat(ua, roi);
+    }
+    int dims = randomInt(2,6);
+    int _sz[CV_MAX_DIM];
+    for( int i = 0; i<dims; i++)
+    {
+        _sz[i] = randomInt(1,50);
+    }
+    int *sz = _sz;
+    int new_depth = randomInt(CV_8S, CV_64F);
+    int new_cn = randomInt(1,4);
+    ua.create(dims, sz, CV_MAKE_TYPE(new_depth, new_cn));
+
+    for(int i = 0; i<dims; i++)
+    {
+        ASSERT_EQ(ua.size[i], sz[i]);
+    }
+    ASSERT_EQ(ua.dims, dims);
+    ASSERT_EQ(ua.type(), CV_MAKE_TYPE(new_depth, new_cn) );
+    Size new_size = randomSize(1, 1000);
+    ua.create(new_size, CV_MAKE_TYPE(new_depth, new_cn) );
+    ASSERT_EQ( ua.size(), new_size);
+    ASSERT_EQ(ua.type(), CV_MAKE_TYPE(new_depth, new_cn) );
+    ASSERT_EQ( ua.dims, 2);
+}
+
+TEST_P(UMatBasicTests, swap)
+{
+    Mat b = randomMat(size, type, -100, 100);
+    UMat ub;
+    b.copyTo(ub);
+    if(useRoi)
+    {
+        ua = UMat(ua,roi);
+        ub = UMat(ub,roi);
+    }
+    UMat uc = ua, ud = ub;
+    swap(ua,ub);
+    EXPECT_MAT_NEAR(ub,uc, 0);
+    EXPECT_MAT_NEAR(ud, ua, 0);
+}
+
+TEST_P(UMatBasicTests, base)
+{
+    if(useRoi)
+    {
+        ua = UMat(ua,roi);
+    }
+    UMat ub = ua.clone();
+    EXPECT_MAT_NEAR(ub,ua,0);
+
+    ASSERT_EQ(ua.channels(), cn);
+    ASSERT_EQ(ua.depth(), depth);
+    ASSERT_EQ(ua.type(), type);
+    ASSERT_EQ(ua.elemSize(), a.elemSize());
+    ASSERT_EQ(ua.elemSize1(), a.elemSize1());
+    ASSERT_EQ(ub.empty(), ub.cols*ub.rows == 0);
+    ub.release();
+    ASSERT_TRUE( ub.empty() );
+    if(useRoi && a.size() != ua.size())
+    {
+        ASSERT_EQ(ua.isSubmatrix(), true);
+    }
+    else
+    {
+        ASSERT_EQ(ua.isSubmatrix(), false);
+    }
+
+    int dims = randomInt(2,6);
+    int sz[CV_MAX_DIM];
+    size_t total = 1;
+    for(int i = 0; i<dims; i++)
+    {
+        sz[i] = randomInt(1,45);
+        total *= (size_t)sz[i];
+    }
+    int new_type = CV_MAKE_TYPE(randomInt(CV_8S,CV_64F),randomInt(1,4));
+    ub = UMat(dims, sz, new_type);
+    ASSERT_EQ(ub.total(), total);
+}
+
+TEST_P(UMatBasicTests, DISABLED_copyTo)
+{
+    UMat roi_ua;
+    Mat roi_a;
+    int i;
+    if(useRoi)
+    {
+        roi_ua = UMat(ua, roi);
+        roi_a = Mat(a, roi);
+        roi_a.copyTo(roi_ua);
+        EXPECT_MAT_NEAR(roi_a, roi_ua, 0);
+        roi_ua.copyTo(roi_a);
+        EXPECT_MAT_NEAR(roi_ua, roi_a, 0);
+        roi_ua.copyTo(ua);
+        EXPECT_MAT_NEAR(roi_ua, ua, 0);
+        ua.copyTo(a);
+        EXPECT_MAT_NEAR(ua, a, 0);
+    }
+    {
+        UMat ub;
+        ua.copyTo(ub);
+        EXPECT_MAT_NEAR(ua, ub, 0);
+    }
+    {
+        UMat ub;
+        i = randomInt(0, ua.cols-1);
+        a.col(i).copyTo(ub);
+        EXPECT_MAT_NEAR(a.col(i), ub, 0);
+    }
+    {
+        UMat ub;
+        ua.col(i).copyTo(ub);
+        EXPECT_MAT_NEAR(ua.col(i), ub, 0);
+    }
+    {
+        Mat b;
+        ua.col(i).copyTo(b);
+        EXPECT_MAT_NEAR(ua.col(i), b, 0);
+    }
+    {
+        UMat ub;
+        i = randomInt(0, a.rows-1);
+        ua.row(i).copyTo(ub);
+        EXPECT_MAT_NEAR(ua.row(i), ub, 0);
+    }
+    {
+        UMat ub;
+        a.row(i).copyTo(ub);
+        EXPECT_MAT_NEAR(a.row(i), ub, 0);
+    }
+    {
+        Mat b;
+        ua.row(i).copyTo(b);
+        EXPECT_MAT_NEAR(ua.row(i), b, 0);
+    }
+}
+
+TEST_P(UMatBasicTests, DISABLED_GetUMat)
+{
+    if(useRoi)
+    {
+        a = Mat(a, roi);
+        ua = UMat(ua,roi);
+    }
+    {
+        UMat ub;
+        ub = a.getUMat(ACCESS_RW);
+        EXPECT_MAT_NEAR(ub, ua, 0);
+    }
+    {
+        Mat b;
+        b = a.getUMat(ACCESS_RW).getMat(ACCESS_RW);
+        EXPECT_MAT_NEAR(b, a, 0);
+    }
+    {
+        Mat b;
+        b = ua.getMat(ACCESS_RW);
+        EXPECT_MAT_NEAR(b, a, 0);
+    }
+    {
+        UMat ub;
+        ub = ua.getMat(ACCESS_RW).getUMat(ACCESS_RW);
+        EXPECT_MAT_NEAR(ub, ua, 0);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(UMat, UMatBasicTests, Combine(testing::Values(CV_8U), testing::Values(1, 2),
+    testing::Values(cv::Size(1, 1), cv::Size(1, 128), cv::Size(128, 1), cv::Size(128, 128), cv::Size(640, 480)), Bool()));
+
+//////////////////////////////////////////////////////////////// Reshape ////////////////////////////////////////////////////////////////////////
+
+PARAM_TEST_CASE(UMatTestReshape,  int, int, Size, bool)
+{
+    Mat a;
+    UMat ua, ub;
+    int type;
+    int depth;
+    int cn;
+    Size size;
+    bool useRoi;
+    Size roi_size;
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        size = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+        type = CV_MAKE_TYPE(depth, cn);
+    }
+};
+
+TEST_P(UMatTestReshape, DISABLED_reshape)
+{
+    a = randomMat(size,type, -100, 100);
+    a.copyTo(ua);
+    if(useRoi)
+    {
+        int roi_shift_x = randomInt(0, size.width-1);
+        int roi_shift_y = randomInt(0, size.height-1);
+        roi_size = Size(size.width - roi_shift_x, size.height - roi_shift_y);
+        Rect roi(roi_shift_x, roi_shift_y, roi_size.width, roi_size.height);
+        ua = UMat(ua, roi).clone();
+        a = Mat(a, roi).clone();
+    }
+
+    int nChannels = randomInt(1,4);
+
+    if ((ua.cols*ua.channels()*ua.rows)%nChannels != 0)
+    {
+        EXPECT_ANY_THROW(ua.reshape(nChannels));
+    }
+    else
+    {
+        ub = ua.reshape(nChannels);
+        ASSERT_EQ(ub.channels(),nChannels);
+        ASSERT_EQ(ub.channels()*ub.cols*ub.rows, ua.channels()*ua.cols*ua.rows);
+
+        EXPECT_MAT_NEAR(ua.reshape(nChannels), a.reshape(nChannels), 0);
+
+        int new_rows = randomInt(1, INT_MAX);
+        if ( ((int)ua.total()*ua.channels())%(new_rows*nChannels) != 0)
+        {
+            EXPECT_ANY_THROW (ua.reshape(nChannels, new_rows) );
+        }
+        else
+        {
+            EXPECT_NO_THROW ( ub = ua.reshape(nChannels, new_rows) );
+            ASSERT_EQ(ub.channels(),nChannels);
+            ASSERT_EQ(ub.rows, new_rows);
+            ASSERT_EQ(ub.channels()*ub.cols*ub.rows, ua.channels()*ua.cols*ua.rows);
+
+            EXPECT_MAT_NEAR(ua.reshape(nChannels,new_rows), a.reshape(nChannels,new_rows), 0);
+        }
+
+        new_rows = (int)ua.total()*ua.channels()/(nChannels*randomInt(1, size.width*size.height));
+        if (new_rows == 0) new_rows = 1;
+        int new_cols = (int)ua.total()*ua.channels()/(new_rows*nChannels);
+        int sz[] = {new_rows, new_cols};
+        if( ((int)ua.total()*ua.channels()) % (new_rows*new_cols) != 0 )
+        {
+            EXPECT_ANY_THROW( ua.reshape(nChannels, ua.dims, sz) );
+        }
+        else
+        {
+            EXPECT_NO_THROW ( ub = ua.reshape(nChannels, ua.dims, sz) );
+            ASSERT_EQ(ub.channels(),nChannels);
+            ASSERT_EQ(ub.rows, new_rows);
+            ASSERT_EQ(ub.cols, new_cols);
+            ASSERT_EQ(ub.channels()*ub.cols*ub.rows, ua.channels()*ua.cols*ua.rows);
+
+            EXPECT_MAT_NEAR(ua.reshape(nChannels, ua.dims, sz), a.reshape(nChannels, a.dims, sz), 0);
+        }
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(UMat, UMatTestReshape, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, UMAT_TEST_SIZES, Bool() ));
+
+////////////////////////////////////////////////////////////////// ROI testing ///////////////////////////////////////////////////////////////
+
+PARAM_TEST_CASE(UMatTestRoi, int, int, Size)
+{
+    Mat a, roi_a;
+    UMat ua, roi_ua;
+    int type;
+    int depth;
+    int cn;
+    Size size;
+    Size roi_size;
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        size = GET_PARAM(2);
+        type = CV_MAKE_TYPE(depth, cn);
+    }
+};
+
+TEST_P(UMatTestRoi, createRoi)
+{
+    int roi_shift_x = randomInt(0, size.width-1);
+    int roi_shift_y = randomInt(0, size.height-1);
+    roi_size = Size(size.width - roi_shift_x, size.height - roi_shift_y);
+    a = randomMat(size, type, -100, 100);
+    Rect roi(roi_shift_x, roi_shift_y, roi_size.width, roi_size.height);
+    roi_a = Mat(a, roi);
+    a.copyTo(ua);
+    roi_ua = UMat(ua, roi);
+
+    EXPECT_MAT_NEAR(roi_a, roi_ua, 0);
+}
+
+TEST_P(UMatTestRoi, locateRoi)
+{
+    int roi_shift_x = randomInt(0, size.width-1);
+    int roi_shift_y = randomInt(0, size.height-1);
+    roi_size = Size(size.width - roi_shift_x, size.height - roi_shift_y);
+    a = randomMat(size, type, -100, 100);
+    Rect roi(roi_shift_x, roi_shift_y, roi_size.width, roi_size.height);
+    roi_a = Mat(a, roi);
+    a.copyTo(ua);
+    roi_ua = UMat(ua,roi);
+    Size sz, usz;
+    Point p, up;
+    roi_a.locateROI(sz, p);
+    roi_ua.locateROI(usz, up);
+    ASSERT_EQ(sz, usz);
+    ASSERT_EQ(p, up);
+}
+
+TEST_P(UMatTestRoi, adjustRoi)
+{
+    int roi_shift_x = randomInt(0, size.width-1);
+    int roi_shift_y = randomInt(0, size.height-1);
+    roi_size = Size(size.width - roi_shift_x, size.height - roi_shift_y);
+    a = randomMat(size, type, -100, 100);
+    Rect roi(roi_shift_x, roi_shift_y, roi_size.width, roi_size.height);
+    a.copyTo(ua);
+    roi_ua = UMat( ua, roi);
+    int adjLeft = randomInt(-(roi_ua.cols/2), (size.width-1)/2);
+    int adjRight = randomInt(-(roi_ua.cols/2), (size.width-1)/2);
+    int adjTop = randomInt(-(roi_ua.rows/2), (size.height-1)/2);
+    int adjBot = randomInt(-(roi_ua.rows/2), (size.height-1)/2);
+    roi_ua.adjustROI(adjTop, adjBot, adjLeft, adjRight);
+    roi_shift_x = std::max(0, roi.x-adjLeft);
+    roi_shift_y = std::max(0, roi.y-adjTop);
+    Rect new_roi( roi_shift_x, roi_shift_y, std::min(roi.width+adjRight+adjLeft, size.width-roi_shift_x), std::min(roi.height+adjBot+adjTop, size.height-roi_shift_y) );
+    UMat test_roi = UMat(ua, new_roi);
+    EXPECT_MAT_NEAR(roi_ua, test_roi, 0);
+}
+
+INSTANTIATE_TEST_CASE_P(UMat, UMatTestRoi, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, UMAT_TEST_SIZES ));
+
+/////////////////////////////////////////////////////////////// Size ////////////////////////////////////////////////////////////////////
+
+PARAM_TEST_CASE(UMatTestSizeOperations, int, int, Size, bool)
+{
+    Mat a, b, roi_a, roi_b;
+    UMat ua, ub, roi_ua, roi_ub;
+    int type;
+    int depth;
+    int cn;
+    Size size;
+    Size roi_size;
+    bool useRoi;
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        size = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+        type = CV_MAKE_TYPE(depth, cn);
+    }
+};
+
+TEST_P(UMatTestSizeOperations, copySize)
+{
+    Size s = randomSize(1,300);
+    a = randomMat(size, type, -100, 100);
+    b = randomMat(s, type, -100, 100);
+    a.copyTo(ua);
+    b.copyTo(ub);
+    if(useRoi)
+    {
+        int roi_shift_x = randomInt(0, size.width-1);
+        int roi_shift_y = randomInt(0, size.height-1);
+        roi_size = Size(size.width - roi_shift_x, size.height - roi_shift_y);
+        Rect roi(roi_shift_x, roi_shift_y, roi_size.width, roi_size.height);
+        ua = UMat(ua,roi);
+
+        roi_shift_x = randomInt(0, s.width-1);
+        roi_shift_y = randomInt(0, s.height-1);
+        roi_size = Size(s.width - roi_shift_x, s.height - roi_shift_y);
+        roi = Rect(roi_shift_x, roi_shift_y, roi_size.width, roi_size.height);
+        ub = UMat(ub, roi);
+    }
+    ua.copySize(ub);
+    ASSERT_EQ(ua.size, ub.size);
+}
+
+INSTANTIATE_TEST_CASE_P(UMat, UMatTestSizeOperations, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, UMAT_TEST_SIZES, Bool() ));
+
+///////////////////////////////////////////////////////////////// UMat operations ////////////////////////////////////////////////////////////////////////////
+
+PARAM_TEST_CASE(UMatTestUMatOperations, int, int, Size, bool)
+{
+    Mat a, b;
+    UMat ua, ub;
+    int type;
+    int depth;
+    int cn;
+    Size size;
+    Size roi_size;
+    bool useRoi;
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
+        size = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+        type = CV_MAKE_TYPE(depth, cn);
+    }
+};
+
+TEST_P(UMatTestUMatOperations, diag)
+{
+    a = randomMat(size, type, -100, 100);
+    a.copyTo(ua);
+    Mat new_diag;
+    if(useRoi)
+    {
+        int roi_shift_x = randomInt(0, size.width-1);
+        int roi_shift_y = randomInt(0, size.height-1);
+        roi_size = Size(size.width - roi_shift_x, size.height - roi_shift_y);
+        Rect roi(roi_shift_x, roi_shift_y, roi_size.width, roi_size.height);
+        ua = UMat(ua,roi);
+        a = Mat(a, roi);
+    }
+    int n = randomInt(0, ua.cols-1);
+    ub = ua.diag(n);
+    b = a.diag(n);
+    EXPECT_MAT_NEAR(b, ub, 0);
+    new_diag = randomMat(Size(ua.rows, 1), type, -100, 100);
+    new_diag.copyTo(ub);
+    ua = cv::UMat::diag(ub);
+    EXPECT_MAT_NEAR(ua.diag(), new_diag.t(), 0);
+}
+
+INSTANTIATE_TEST_CASE_P(UMat, UMatTestUMatOperations, Combine(OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, UMAT_TEST_SIZES, Bool()));
+
+///////////////////////////////////////////////////////////////// OpenCL ////////////////////////////////////////////////////////////////////////////
+
+TEST(UMat, BufferPoolGrowing)
+{
+#ifdef _DEBUG
+    const int ITERATIONS = 100;
+#else
+    const int ITERATIONS = 200;
+#endif
+    const Size sz(1920, 1080);
+    BufferPoolController* c = cv::ocl::getOpenCLAllocator()->getBufferPoolController();
+    if (c)
+    {
+        size_t oldMaxReservedSize = c->getMaxReservedSize();
+        c->freeAllReservedBuffers();
+        c->setMaxReservedSize(sz.area() * 10);
+        for (int i = 0; i < ITERATIONS; i++)
+        {
+            UMat um(Size(sz.width + i, sz.height + i), CV_8UC1);
+            UMat um2(Size(sz.width + 2 * i, sz.height + 2 * i), CV_8UC1);
+        }
+        c->setMaxReservedSize(oldMaxReservedSize);
+        c->freeAllReservedBuffers();
+    }
+    else
+        std::cout << "Skipped, no OpenCL" << std::endl;
+}
 
 class CV_UMatTest :
         public cvtest::BaseTest
@@ -59,7 +555,7 @@ protected:
 
     struct test_excep
     {
-        test_excep(const string& _s=string("")) : s(_s) {};
+        test_excep(const string& _s=string("")) : s(_s) { }
         string s;
     };
 
@@ -187,8 +683,8 @@ bool CV_UMatTest::TestUMat()
 void CV_UMatTest::run( int /* start_from */)
 {
     printf("Use OpenCL: %s\nHave OpenCL: %s\n",
-           ocl::useOpenCL() ? "TRUE" : "FALSE",
-           ocl::haveOpenCL() ? "TRUE" : "FALSE" );
+           cv::ocl::useOpenCL() ? "TRUE" : "FALSE",
+           cv::ocl::haveOpenCL() ? "TRUE" : "FALSE" );
 
     if (!TestUMat())
         return;
@@ -248,3 +744,46 @@ TEST(UMat, Sync)
 
     EXPECT_EQ(0, cv::norm(um.getMat(ACCESS_READ), cv::Mat(um.size(), um.type(), 19), NORM_INF));
 }
+
+TEST(UMat, setOpenCL)
+{
+    // save the current state
+    bool useOCL = cv::ocl::useOpenCL();
+
+    Mat m = (Mat_<uchar>(3,3)<<0,1,2,3,4,5,6,7,8);
+
+    cv::ocl::setUseOpenCL(true);
+    UMat um1;
+    m.copyTo(um1);
+
+    cv::ocl::setUseOpenCL(false);
+    UMat um2;
+    m.copyTo(um2);
+
+    cv::ocl::setUseOpenCL(true);
+    countNonZero(um1);
+    countNonZero(um2);
+
+    um1.copyTo(um2);
+    EXPECT_MAT_NEAR(um1, um2, 0);
+    EXPECT_MAT_NEAR(um1, m, 0);
+    um2.copyTo(um1);
+    EXPECT_MAT_NEAR(um1, m, 0);
+    EXPECT_MAT_NEAR(um1, um2, 0);
+
+    cv::ocl::setUseOpenCL(false);
+    countNonZero(um1);
+    countNonZero(um2);
+
+    um1.copyTo(um2);
+    EXPECT_MAT_NEAR(um1, um2, 0);
+    EXPECT_MAT_NEAR(um1, m, 0);
+    um2.copyTo(um1);
+    EXPECT_MAT_NEAR(um1, um2, 0);
+    EXPECT_MAT_NEAR(um1, m, 0);
+
+    // reset state to the previous one
+    cv::ocl::setUseOpenCL(useOCL);
+}
+
+} } // namespace cvtest::ocl
diff --git a/modules/cuda/src/cuda/ccomponetns.cu b/modules/cuda/src/cuda/ccomponetns.cu
index 681ca8fe4..441413c34 100644
--- a/modules/cuda/src/cuda/ccomponetns.cu
+++ b/modules/cuda/src/cuda/ccomponetns.cu
@@ -133,7 +133,7 @@ namespace cv { namespace cuda { namespace device
         template<typename T> struct InInterval<T, 1>
         {
             typedef typename VecTraits<T>::elem_type E;
-            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) {};
+            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) { }
             T lo, hi;
 
             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
@@ -148,7 +148,7 @@ namespace cv { namespace cuda { namespace device
         {
             typedef typename VecTraits<T>::elem_type E;
             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
-            : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)){};
+            : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)) { }
             T lo, hi;
 
             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
@@ -164,7 +164,7 @@ namespace cv { namespace cuda { namespace device
         {
             typedef typename VecTraits<T>::elem_type E;
             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
-            : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)){};
+            : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)) { }
             T lo, hi;
 
             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
diff --git a/modules/cuda/src/precomp.hpp b/modules/cuda/src/precomp.hpp
index 93f987e4c..60c71b52b 100644
--- a/modules/cuda/src/precomp.hpp
+++ b/modules/cuda/src/precomp.hpp
@@ -50,6 +50,7 @@
 #include "opencv2/objdetect.hpp"
 
 #include "opencv2/core/private.cuda.hpp"
+#include "opencv2/core/utility.hpp"
 
 #include "opencv2/opencv_modules.hpp"
 
diff --git a/modules/cuda/test/test_gpumat.cpp b/modules/cuda/test/test_gpumat.cpp
index 9a1325951..dcd368c08 100644
--- a/modules/cuda/test/test_gpumat.cpp
+++ b/modules/cuda/test/test_gpumat.cpp
@@ -281,7 +281,7 @@ CUDA_TEST_P(ConvertTo, WithOutScaling)
         cv::Mat dst_gold;
         src.convertTo(dst_gold, depth2);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth2 < CV_32F ? 1.0 : 1e-4);
     }
 }
 
diff --git a/modules/cuda/test/test_labeling.cpp b/modules/cuda/test/test_labeling.cpp
index fd6bfd631..8bc620aef 100644
--- a/modules/cuda/test/test_labeling.cpp
+++ b/modules/cuda/test/test_labeling.cpp
@@ -62,7 +62,7 @@ namespace
 
         struct InInterval
         {
-            InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {};
+            InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {}
             const int lo, hi;
 
             bool operator() (const unsigned char a, const unsigned char b) const
diff --git a/modules/cuda/test/test_objdetect.cpp b/modules/cuda/test/test_objdetect.cpp
index 658508f39..8c7b5ec91 100644
--- a/modules/cuda/test/test_objdetect.cpp
+++ b/modules/cuda/test/test_objdetect.cpp
@@ -177,7 +177,7 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
 };
 
 // desabled while resize does not fixed
-CUDA_TEST_P(HOG, Detect)
+CUDA_TEST_P(HOG, DISABLED_Detect)
 {
     cv::Mat img_rgb = readImage("hog/road.png");
     ASSERT_FALSE(img_rgb.empty());
diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
index 4af48b028..e493fd759 100644
--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@@ -338,11 +338,11 @@ CV_EXPORTS void gemm(InputArray src1, InputArray src2, double alpha,
                      InputArray src3, double beta, OutputArray dst, int flags = 0, Stream& stream = Stream::Null());
 
 //! performs per-element multiplication of two full (not packed) Fourier spectrums
-//! supports 32FC2 matrixes only (interleaved format)
+//! supports 32FC2 matrices only (interleaved format)
 CV_EXPORTS void mulSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, bool conjB=false, Stream& stream = Stream::Null());
 
 //! performs per-element multiplication of two full (not packed) Fourier spectrums
-//! supports 32FC2 matrixes only (interleaved format)
+//! supports 32FC2 matrices only (interleaved format)
 CV_EXPORTS void mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());
 
 //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
diff --git a/modules/cudaarithm/perf/perf_arithm.cpp b/modules/cudaarithm/perf/perf_arithm.cpp
index 900415501..42dd7724b 100644
--- a/modules/cudaarithm/perf/perf_arithm.cpp
+++ b/modules/cudaarithm/perf/perf_arithm.cpp
@@ -49,6 +49,8 @@ using namespace perf;
 //////////////////////////////////////////////////////////////////////
 // GEMM
 
+#ifdef HAVE_CUBLAS
+
 CV_FLAGS(GemmFlags, 0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T)
 #define ALL_GEMM_FLAGS Values(GemmFlags(0), GemmFlags(cv::GEMM_1_T), GemmFlags(cv::GEMM_2_T), GemmFlags(cv::GEMM_3_T), \
                               GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_3_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T | cv::GEMM_3_T))
@@ -98,6 +100,8 @@ PERF_TEST_P(Sz_Type_Flags, GEMM,
     }
 }
 
+#endif
+
 //////////////////////////////////////////////////////////////////////
 // MulSpectrums
 
diff --git a/modules/cudaarithm/test/test_element_operations.cpp b/modules/cudaarithm/test/test_element_operations.cpp
index 8069d28ca..4a43d9d30 100644
--- a/modules/cudaarithm/test/test_element_operations.cpp
+++ b/modules/cudaarithm/test/test_element_operations.cpp
@@ -2514,7 +2514,7 @@ CUDA_TEST_P(AddWeighted, Accuracy)
         cv::Mat dst_gold;
         cv::addWeighted(src1, alpha, src2, beta, gamma, dst_gold, dst_depth);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 1.0 : 1e-3);
+        EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 2.0 : 1e-3);
     }
 }
 
diff --git a/modules/cudaarithm/test/test_reductions.cpp b/modules/cudaarithm/test/test_reductions.cpp
index 68974bcef..5fd7e2dec 100644
--- a/modules/cudaarithm/test/test_reductions.cpp
+++ b/modules/cudaarithm/test/test_reductions.cpp
@@ -734,7 +734,7 @@ CUDA_TEST_P(Normalize, WithOutMask)
     cv::Mat dst_gold;
     cv::normalize(src, dst_gold, alpha, beta, norm_type, type);
 
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-6);
+    EXPECT_MAT_NEAR(dst_gold, dst, type < CV_32F ? 1.0 : 1e-4);
 }
 
 CUDA_TEST_P(Normalize, WithMask)
diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
index be141b1af..d451b93b1 100644
--- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
+++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@@ -224,7 +224,7 @@ CV_EXPORTS Ptr<HoughLinesDetector> createHoughLinesDetector(float rho, float the
 //////////////////////////////////////
 // HoughLinesP
 
-//! finds line segments in the black-n-white image using probabalistic Hough transform
+//! finds line segments in the black-n-white image using probabilistic Hough transform
 class CV_EXPORTS HoughSegmentDetector : public Algorithm
 {
 public:
diff --git a/modules/cudaimgproc/perf/perf_color.cpp b/modules/cudaimgproc/perf/perf_color.cpp
index 2ff0f1ff6..099e0f9eb 100644
--- a/modules/cudaimgproc/perf/perf_color.cpp
+++ b/modules/cudaimgproc/perf/perf_color.cpp
@@ -243,7 +243,14 @@ PERF_TEST_P(Sz_Type_Op, AlphaComp,
 
         TEST_CYCLE() cv::cuda::alphaComp(d_img1, d_img2, dst, alpha_op);
 
-        CUDA_SANITY_CHECK(dst, 1e-3, ERROR_RELATIVE);
+        if (CV_MAT_DEPTH(type) < CV_32F)
+        {
+            CUDA_SANITY_CHECK(dst, 1);
+        }
+        else
+        {
+            CUDA_SANITY_CHECK(dst, 1e-3, ERROR_RELATIVE);
+        }
     }
     else
     {
diff --git a/modules/cudaimgproc/perf/perf_match_template.cpp b/modules/cudaimgproc/perf/perf_match_template.cpp
index c026ec27b..321250c38 100644
--- a/modules/cudaimgproc/perf/perf_match_template.cpp
+++ b/modules/cudaimgproc/perf/perf_match_template.cpp
@@ -90,7 +90,7 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, MatchTemplate8U,
 
         CPU_SANITY_CHECK(dst);
     }
-};
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 // MatchTemplate32F
diff --git a/modules/cudaimgproc/src/canny.cpp b/modules/cudaimgproc/src/canny.cpp
index b22094d60..eed4a284e 100644
--- a/modules/cudaimgproc/src/canny.cpp
+++ b/modules/cudaimgproc/src/canny.cpp
@@ -58,9 +58,9 @@ namespace canny
 
     void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
 
-    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1);
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1);
 
-    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2);
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2);
 
     void getEdges(PtrStepSzi map, PtrStepSzb dst);
 }
@@ -194,6 +194,8 @@ namespace
 
     void CannyImpl::createBuf(Size image_size)
     {
+        CV_Assert(image_size.width < std::numeric_limits<short>::max() && image_size.height < std::numeric_limits<short>::max());
+
         ensureSizeIsEnough(image_size, CV_32SC1, dx_);
         ensureSizeIsEnough(image_size, CV_32SC1, dy_);
 
@@ -209,8 +211,8 @@ namespace
         ensureSizeIsEnough(image_size, CV_32FC1, mag_);
         ensureSizeIsEnough(image_size, CV_32SC1, map_);
 
-        ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1_);
-        ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2_);
+        ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st1_);
+        ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st2_);
     }
 
     void CannyImpl::CannyCaller(GpuMat& edges)
@@ -218,9 +220,9 @@ namespace
         map_.setTo(Scalar::all(0));
         canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_));
 
-        canny::edgesHysteresisLocal(map_, st1_.ptr<ushort2>());
+        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>());
 
-        canny::edgesHysteresisGlobal(map_, st1_.ptr<ushort2>(), st2_.ptr<ushort2>());
+        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>());
 
         canny::getEdges(map_, edges);
     }
diff --git a/modules/cudaimgproc/src/cuda/canny.cu b/modules/cudaimgproc/src/cuda/canny.cu
index 9b691e404..3d770e179 100644
--- a/modules/cudaimgproc/src/cuda/canny.cu
+++ b/modules/cudaimgproc/src/cuda/canny.cu
@@ -42,8 +42,6 @@
 
 #if !defined CUDA_DISABLER
 
-#include <utility>
-#include <algorithm>
 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/emulation.hpp"
 #include "opencv2/core/cuda/transform.hpp"
@@ -239,30 +237,35 @@ namespace canny
 {
     __device__ int counter = 0;
 
-    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
+    __device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols)
+    {
+        return (y >= 0) && (y < rows) && (x >= 0) && (x < cols);
+    }
+
+    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st)
     {
         __shared__ volatile int smem[18][18];
 
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
 
-        smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
+        smem[threadIdx.y + 1][threadIdx.x + 1] = checkIdx(y, x, map.rows, map.cols) ? map(y, x) : 0;
         if (threadIdx.y == 0)
-            smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
+            smem[0][threadIdx.x + 1] = checkIdx(y - 1, x, map.rows, map.cols) ? map(y - 1, x) : 0;
         if (threadIdx.y == blockDim.y - 1)
-            smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
+            smem[blockDim.y + 1][threadIdx.x + 1] = checkIdx(y + 1, x, map.rows, map.cols) ? map(y + 1, x) : 0;
         if (threadIdx.x == 0)
-            smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
+            smem[threadIdx.y + 1][0] = checkIdx(y, x - 1, map.rows, map.cols) ? map(y, x - 1) : 0;
         if (threadIdx.x == blockDim.x - 1)
-            smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
+            smem[threadIdx.y + 1][blockDim.x + 1] = checkIdx(y, x + 1, map.rows, map.cols) ? map(y, x + 1) : 0;
         if (threadIdx.x == 0 && threadIdx.y == 0)
-            smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
+            smem[0][0] = checkIdx(y - 1, x - 1, map.rows, map.cols) ? map(y - 1, x - 1) : 0;
         if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
-            smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
+            smem[0][blockDim.x + 1] = checkIdx(y - 1, x + 1, map.rows, map.cols) ? map(y - 1, x + 1) : 0;
         if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
-            smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
+            smem[blockDim.y + 1][0] = checkIdx(y + 1, x - 1, map.rows, map.cols) ? map(y + 1, x - 1) : 0;
         if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
-            smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
+            smem[blockDim.y + 1][blockDim.x + 1] = checkIdx(y + 1, x + 1, map.rows, map.cols) ? map(y + 1, x + 1) : 0;
 
         __syncthreads();
 
@@ -290,8 +293,12 @@ namespace canny
                 n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
             }
 
+            __syncthreads();
+
             if (n > 0)
                 smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
+
+            __syncthreads();
         }
 
         const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
@@ -317,11 +324,11 @@ namespace canny
         if (n > 0)
         {
             const int ind =  ::atomicAdd(&counter, 1);
-            st[ind] = make_ushort2(x, y);
+            st[ind] = make_short2(x, y);
         }
     }
 
-    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1)
     {
         void* counter_ptr;
         cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
@@ -345,13 +352,13 @@ namespace canny
     __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
     __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
 
-    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
+    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, const int count)
     {
         const int stack_size = 512;
 
         __shared__ int s_counter;
         __shared__ int s_ind;
-        __shared__ ushort2 s_st[stack_size];
+        __shared__ short2 s_st[stack_size];
 
         if (threadIdx.x == 0)
             s_counter = 0;
@@ -363,14 +370,14 @@ namespace canny
         if (ind >= count)
             return;
 
-        ushort2 pos = st1[ind];
+        short2 pos = st1[ind];
 
         if (threadIdx.x < 8)
         {
             pos.x += c_dx[threadIdx.x];
             pos.y += c_dy[threadIdx.x];
 
-            if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
+            if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
             {
                 map(pos.y, pos.x) = 2;
 
@@ -402,7 +409,7 @@ namespace canny
                 pos.x += c_dx[threadIdx.x & 7];
                 pos.y += c_dy[threadIdx.x & 7];
 
-                if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
+                if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
                 {
                     map(pos.y, pos.x) = 2;
 
@@ -419,8 +426,10 @@ namespace canny
         {
             if (threadIdx.x == 0)
             {
-                ind = ::atomicAdd(&counter, s_counter);
-                s_ind = ind - s_counter;
+                s_ind = ::atomicAdd(&counter, s_counter);
+
+                if (s_ind + s_counter > map.cols * map.rows)
+                    s_counter = 0;
             }
 
             __syncthreads();
@@ -432,7 +441,7 @@ namespace canny
         }
     }
 
-    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2)
     {
         void* counter_ptr;
         cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
@@ -454,7 +463,12 @@ namespace canny
 
             cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
 
-            std::swap(st1, st2);
+            count = min(count, map.cols * map.rows);
+
+            //std::swap(st1, st2);
+            short2* tmp = st1;
+            st1 = st2;
+            st2 = tmp;
         }
     }
 }
diff --git a/modules/cudaimgproc/src/hough_circles.cpp b/modules/cudaimgproc/src/hough_circles.cpp
index 0cf94a63d..3f9b9334c 100644
--- a/modules/cudaimgproc/src/hough_circles.cpp
+++ b/modules/cudaimgproc/src/hough_circles.cpp
@@ -133,6 +133,7 @@ namespace
         GpuMat dx_, dy_;
         GpuMat edges_;
         GpuMat accum_;
+        Mat tt; //CPU copy of accum_
         GpuMat list_;
         GpuMat result_;
         Ptr<cuda::Filter> filterDx_;
@@ -140,6 +141,8 @@ namespace
         Ptr<cuda::CannyEdgeDetector> canny_;
     };
 
+    bool centersCompare(Vec3f a, Vec3f b) {return (a[2] > b[2]);}
+
     HoughCirclesDetectorImpl::HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold,
                                                        int minRadius, int maxRadius, int maxCircles) :
         dp_(dp), minDist_(minDist), cannyThreshold_(cannyThreshold), votesThreshold_(votesThreshold),
@@ -193,6 +196,8 @@ namespace
 
         circlesAccumCenters_gpu(srcPoints, pointsCount, dx_, dy_, accum_, minRadius_, maxRadius_, idp);
 
+        accum_.download(tt);
+
         int centersCount = buildCentersList_gpu(accum_, centers, votesThreshold_);
         if (centersCount == 0)
         {
@@ -219,9 +224,21 @@ namespace
 
             const float minDist2 = minDist_ * minDist_;
 
+            std::vector<Vec3f> sortBuf;
+            for(int i=0; i<centersCount; i++){
+                Vec3f temp;
+                temp[0] = oldBuf[i].x;
+                temp[1] = oldBuf[i].y;
+                temp[2] = tt.at<int>(temp[1]+1, temp[0]+1);
+                sortBuf.push_back(temp);
+            }
+            std::sort(sortBuf.begin(), sortBuf.end(), centersCompare);
+
             for (int i = 0; i < centersCount; ++i)
             {
-                ushort2 p = oldBuf[i];
+                ushort2 p;
+                p.x = sortBuf[i][0];
+                p.y = sortBuf[i][1];
 
                 bool good = true;
 
diff --git a/modules/cudaimgproc/src/precomp.hpp b/modules/cudaimgproc/src/precomp.hpp
index 964c28c15..3bbb2a8f0 100644
--- a/modules/cudaimgproc/src/precomp.hpp
+++ b/modules/cudaimgproc/src/precomp.hpp
@@ -60,5 +60,6 @@
 #endif
 
 #include <limits>
+#include <algorithm>
 
 #endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/cudaimgproc/test/test_color.cpp b/modules/cudaimgproc/test/test_color.cpp
index 918872502..449444277 100644
--- a/modules/cudaimgproc/test/test_color.cpp
+++ b/modules/cudaimgproc/test/test_color.cpp
@@ -715,7 +715,7 @@ CUDA_TEST_P(CvtColor, BGR2YCrCb)
     cv::Mat dst_gold;
     cv::cvtColor(src, dst_gold, cv::COLOR_BGR2YCrCb);
 
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
 }
 
 CUDA_TEST_P(CvtColor, RGB2YCrCb)
@@ -728,7 +728,7 @@ CUDA_TEST_P(CvtColor, RGB2YCrCb)
     cv::Mat dst_gold;
     cv::cvtColor(src, dst_gold, cv::COLOR_RGB2YCrCb);
 
-    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
+    EXPECT_MAT_NEAR(dst_gold, dst, depth == CV_32F ? 1e-2 : 1);
 }
 
 CUDA_TEST_P(CvtColor, BGR2YCrCb4)
@@ -749,7 +749,7 @@ CUDA_TEST_P(CvtColor, BGR2YCrCb4)
     cv::split(h_dst, channels);
     cv::merge(channels, 3, h_dst);
 
-    EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
+    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
 }
 
 CUDA_TEST_P(CvtColor, RGBA2YCrCb4)
@@ -771,7 +771,7 @@ CUDA_TEST_P(CvtColor, RGBA2YCrCb4)
     cv::split(h_dst, channels);
     cv::merge(channels, 3, h_dst);
 
-    EXPECT_MAT_NEAR(dst_gold, h_dst, 1e-5);
+    EXPECT_MAT_NEAR(dst_gold, h_dst, depth == CV_32F ? 1e-2 : 1);
 }
 
 CUDA_TEST_P(CvtColor, YCrCb2BGR)
diff --git a/modules/cudalegacy/src/cuda/NCVPixelOperations.hpp b/modules/cudalegacy/src/cuda/NCVPixelOperations.hpp
index e094f6bc0..2d06cda85 100644
--- a/modules/cudalegacy/src/cuda/NCVPixelOperations.hpp
+++ b/modules/cudalegacy/src/cuda/NCVPixelOperations.hpp
@@ -104,7 +104,7 @@ template<> struct TConvBase2Vec<Ncv64f, 3> {typedef double3 TVec;};
 template<> struct TConvBase2Vec<Ncv64f, 4> {typedef double4 TVec;};
 
 //TODO: consider using CUDA intrinsics to avoid branching
-template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a);};
+template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a);}
 template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a, 0, USHRT_MAX);}
 template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a, 0, UINT_MAX);}
 template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv32f &out) {out = (Ncv32f)a;}
diff --git a/modules/cudalegacy/test/TestHaarCascadeLoader.cpp b/modules/cudalegacy/test/TestHaarCascadeLoader.cpp
index b1e840a54..8ca44dd13 100644
--- a/modules/cudalegacy/test/TestHaarCascadeLoader.cpp
+++ b/modules/cudalegacy/test/TestHaarCascadeLoader.cpp
@@ -95,7 +95,7 @@ bool TestHaarCascadeLoader::process()
     NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
     NCV_SKIP_COND_BEGIN
 
-    const std::string testNvbinName = "test.nvbin";
+    const std::string testNvbinName = cv::tempfile("test.nvbin");
     ncvStat = ncvHaarLoadFromFile_host(this->cascadeName, haar, h_HaarStages, h_HaarNodes, h_HaarFeatures);
     ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
 
diff --git a/modules/cudaoptflow/perf/perf_optflow.cpp b/modules/cudaoptflow/perf/perf_optflow.cpp
index 7bf383c15..6c312ad0b 100644
--- a/modules/cudaoptflow/perf/perf_optflow.cpp
+++ b/modules/cudaoptflow/perf/perf_optflow.cpp
@@ -444,7 +444,7 @@ PERF_TEST_P(ImagePair, OpticalFlowBM,
     }
 }
 
-PERF_TEST_P(ImagePair, FastOpticalFlowBM,
+PERF_TEST_P(ImagePair, DISABLED_FastOpticalFlowBM,
             Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
 {
     declare.time(400);
diff --git a/modules/cudaoptflow/test/test_optflow.cpp b/modules/cudaoptflow/test/test_optflow.cpp
index e80116a75..110fed033 100644
--- a/modules/cudaoptflow/test/test_optflow.cpp
+++ b/modules/cudaoptflow/test/test_optflow.cpp
@@ -405,13 +405,15 @@ CUDA_TEST_P(OpticalFlowBM, Accuracy)
 
     cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(frame0.empty());
+    cv::resize(frame0, frame0, cv::Size(), 0.5, 0.5);
 
     cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(frame1.empty());
+    cv::resize(frame1, frame1, cv::Size(), 0.5, 0.5);
 
-    cv::Size block_size(16, 16);
+    cv::Size block_size(8, 8);
     cv::Size shift_size(1, 1);
-    cv::Size max_range(16, 16);
+    cv::Size max_range(8, 8);
 
     cv::cuda::GpuMat d_velx, d_vely, buf;
     cv::cuda::calcOpticalFlowBM(loadMat(frame0), loadMat(frame1),
diff --git a/modules/features2d/doc/common_interfaces_of_descriptor_extractors.rst b/modules/features2d/doc/common_interfaces_of_descriptor_extractors.rst
index bf84ed0fb..4563f65c2 100644
--- a/modules/features2d/doc/common_interfaces_of_descriptor_extractors.rst
+++ b/modules/features2d/doc/common_interfaces_of_descriptor_extractors.rst
@@ -25,10 +25,10 @@ Abstract base class for computing descriptors for image keypoints. ::
     public:
         virtual ~DescriptorExtractor();
 
-        void compute( const Mat& image, vector<KeyPoint>& keypoints,
-                      Mat& descriptors ) const;
-        void compute( const vector<Mat>& images, vector<vector<KeyPoint> >& keypoints,
-                      vector<Mat>& descriptors ) const;
+        void compute( InputArray image, vector<KeyPoint>& keypoints,
+                      OutputArray descriptors ) const;
+        void compute( InputArrayOfArrays images, vector<vector<KeyPoint> >& keypoints,
+                      OutputArrayOfArrays descriptors ) const;
 
         virtual void read( const FileNode& );
         virtual void write( FileStorage& ) const;
@@ -57,9 +57,9 @@ DescriptorExtractor::compute
 --------------------------------
 Computes the descriptors for a set of keypoints detected in an image (first variant) or image set (second variant).
 
-.. ocv:function:: void DescriptorExtractor::compute( const Mat& image, vector<KeyPoint>& keypoints, Mat& descriptors ) const
+.. ocv:function:: void DescriptorExtractor::compute( InputArray image, vector<KeyPoint>& keypoints, OutputArray descriptors ) const
 
-.. ocv:function:: void DescriptorExtractor::compute( const vector<Mat>& images, vector<vector<KeyPoint> >& keypoints, vector<Mat>& descriptors ) const
+.. ocv:function:: void DescriptorExtractor::compute( InputArrayOfArrays  images, vector<vector<KeyPoint> >& keypoints, OutputArrayOfArrays descriptors ) const
 
 .. ocv:pyfunction:: cv2.DescriptorExtractor_create.compute(image, keypoints[, descriptors]) -> keypoints, descriptors
 
@@ -119,35 +119,3 @@ them into a single color descriptor. ::
     protected:
         ...
     };
-
-
-
-BriefDescriptorExtractor
-------------------------
-.. ocv:class:: BriefDescriptorExtractor : public DescriptorExtractor
-
-Class for computing BRIEF descriptors described in a paper of Calonder M., Lepetit V.,
-Strecha C., Fua P. *BRIEF: Binary Robust Independent Elementary Features* ,
-11th European Conference on Computer Vision (ECCV), Heraklion, Crete. LNCS Springer, September 2010. ::
-
-    class BriefDescriptorExtractor : public DescriptorExtractor
-    {
-    public:
-        static const int PATCH_SIZE = 48;
-        static const int KERNEL_SIZE = 9;
-
-        // bytes is a length of descriptor in bytes. It can be equal 16, 32 or 64 bytes.
-        BriefDescriptorExtractor( int bytes = 32 );
-
-        virtual void read( const FileNode& );
-        virtual void write( FileStorage& ) const;
-        virtual int descriptorSize() const;
-        virtual int descriptorType() const;
-        virtual int defaultNorm() const;
-    protected:
-        ...
-    };
-
-.. note::
-
-   * A complete BRIEF extractor sample can be found at opencv_source_code/samples/cpp/brief_match_test.cpp
diff --git a/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst b/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
index 2c2cf28f8..295cc8381 100644
--- a/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
+++ b/modules/features2d/doc/common_interfaces_of_descriptor_matchers.rst
@@ -28,7 +28,7 @@ with an image set. ::
     public:
         virtual ~DescriptorMatcher();
 
-        virtual void add( const vector<Mat>& descriptors );
+        virtual void add( InputArrayOfArrays descriptors );
 
         const vector<Mat>& getTrainDescriptors() const;
         virtual void clear();
@@ -40,24 +40,24 @@ with an image set. ::
         /*
          * Group of methods to match descriptors from an image pair.
          */
-        void match( const Mat& queryDescriptors, const Mat& trainDescriptors,
-                    vector<DMatch>& matches, const Mat& mask=Mat() ) const;
-        void knnMatch( const Mat& queryDescriptors, const Mat& trainDescriptors,
+        void match( InputArray queryDescriptors, InputArray trainDescriptors,
+                    vector<DMatch>& matches, InputArray mask=noArray() ) const;
+        void knnMatch( InputArray queryDescriptors, InputArray trainDescriptors,
                        vector<vector<DMatch> >& matches, int k,
-                       const Mat& mask=Mat(), bool compactResult=false ) const;
-        void radiusMatch( const Mat& queryDescriptors, const Mat& trainDescriptors,
+                       InputArray mask=noArray(), bool compactResult=false ) const;
+        void radiusMatch( InputArray queryDescriptors, InputArray trainDescriptors,
                           vector<vector<DMatch> >& matches, float maxDistance,
-                          const Mat& mask=Mat(), bool compactResult=false ) const;
+                          InputArray mask=noArray(), bool compactResult=false ) const;
         /*
          * Group of methods to match descriptors from one image to an image set.
          */
-        void match( const Mat& queryDescriptors, vector<DMatch>& matches,
-                    const vector<Mat>& masks=vector<Mat>() );
-        void knnMatch( const Mat& queryDescriptors, vector<vector<DMatch> >& matches,
-                       int k, const vector<Mat>& masks=vector<Mat>(),
+        void match( InputArray queryDescriptors, vector<DMatch>& matches,
+                    InputArrayOfArrays masks=noArray() );
+        void knnMatch( InputArray queryDescriptors, vector<vector<DMatch> >& matches,
+                       int k, InputArrayOfArrays masks=noArray(),
                        bool compactResult=false );
-        void radiusMatch( const Mat& queryDescriptors, vector<vector<DMatch> >& matches,
-                          float maxDistance, const vector<Mat>& masks=vector<Mat>(),
+        void radiusMatch( InputArray queryDescriptors, vector<vector<DMatch> >& matches,
+                          float maxDistance, InputArrayOfArrays masks=noArray(),
                           bool compactResult=false );
 
         virtual void read( const FileNode& );
@@ -69,15 +69,16 @@ with an image set. ::
 
     protected:
         vector<Mat> trainDescCollection;
+        vector<UMat> utrainDescCollection;
         ...
     };
 
 
 DescriptorMatcher::add
 --------------------------
-Adds descriptors to train a descriptor collection. If the collection ``trainDescCollectionis`` is not empty, the new descriptors are added to existing train descriptors.
+Adds descriptors to train a CPU(``trainDescCollectionis``) or GPU(``utrainDescCollectionis``) descriptor collection. If the collection is not empty, the new descriptors are added to existing train descriptors.
 
-.. ocv:function:: void DescriptorMatcher::add( const vector<Mat>& descriptors )
+.. ocv:function:: void DescriptorMatcher::add( InputArrayOfArrays descriptors )
 
     :param descriptors: Descriptors to add. Each  ``descriptors[i]``  is a set of descriptors from the same train image.
 
@@ -94,7 +95,7 @@ Returns a constant link to the train descriptor collection ``trainDescCollection
 
 DescriptorMatcher::clear
 ----------------------------
-Clears the train descriptor collection.
+Clears the train descriptor collections.
 
 .. ocv:function:: void DescriptorMatcher::clear()
 
@@ -102,7 +103,7 @@ Clears the train descriptor collection.
 
 DescriptorMatcher::empty
 ----------------------------
-Returns true if there are no train descriptors in the collection.
+Returns true if there are no train descriptors in the both collections.
 
 .. ocv:function:: bool DescriptorMatcher::empty() const
 
@@ -130,9 +131,9 @@ DescriptorMatcher::match
 ----------------------------
 Finds the best match for each descriptor from a query set.
 
-.. ocv:function:: void DescriptorMatcher::match( const Mat& queryDescriptors, const Mat& trainDescriptors, vector<DMatch>& matches, const Mat& mask=Mat() ) const
+.. ocv:function:: void DescriptorMatcher::match( InputArray queryDescriptors, InputArray trainDescriptors, vector<DMatch>& matches, InputArray mask=noArray() ) const
 
-.. ocv:function:: void DescriptorMatcher::match( const Mat& queryDescriptors, vector<DMatch>& matches, const vector<Mat>& masks=vector<Mat>() )
+.. ocv:function:: void DescriptorMatcher::match(InputArray queryDescriptors, vector<DMatch>& matches, InputArrayOfArrays masks=noArray() )
 
     :param queryDescriptors: Query set of descriptors.
 
@@ -152,9 +153,9 @@ DescriptorMatcher::knnMatch
 -------------------------------
 Finds the k best matches for each descriptor from a query set.
 
-.. ocv:function:: void DescriptorMatcher::knnMatch( const Mat& queryDescriptors,       const Mat& trainDescriptors,       vector<vector<DMatch> >& matches,       int k, const Mat& mask=Mat(),       bool compactResult=false ) const
+.. ocv:function:: void DescriptorMatcher::knnMatch(InputArray queryDescriptors,   InputArray trainDescriptors,       vector<vector<DMatch> >& matches,       int k, InputArray mask=noArray(),       bool compactResult=false ) const
 
-.. ocv:function:: void DescriptorMatcher::knnMatch( const Mat& queryDescriptors,           vector<vector<DMatch> >& matches, int k,      const vector<Mat>& masks=vector<Mat>(),       bool compactResult=false )
+.. ocv:function:: void DescriptorMatcher::knnMatch( InputArray queryDescriptors,           vector<vector<DMatch> >& matches, int k,      InputArrayOfArrays masks=noArray(),       bool compactResult=false )
 
     :param queryDescriptors: Query set of descriptors.
 
@@ -178,9 +179,9 @@ DescriptorMatcher::radiusMatch
 ----------------------------------
 For each query descriptor, finds the training descriptors not farther than the specified distance.
 
-.. ocv:function:: void DescriptorMatcher::radiusMatch( const Mat& queryDescriptors,           const Mat& trainDescriptors,           vector<vector<DMatch> >& matches,           float maxDistance, const Mat& mask=Mat(),           bool compactResult=false ) const
+.. ocv:function:: void DescriptorMatcher::radiusMatch( InputArray queryDescriptors,           InputArray trainDescriptors,           vector<vector<DMatch> >& matches,           float maxDistance, InputArray mask=noArray(),           bool compactResult=false ) const
 
-.. ocv:function:: void DescriptorMatcher::radiusMatch( const Mat& queryDescriptors,           vector<vector<DMatch> >& matches,           float maxDistance,      const vector<Mat>& masks=vector<Mat>(),       bool compactResult=false )
+.. ocv:function:: void DescriptorMatcher::radiusMatch( InputArray queryDescriptors,           vector<vector<DMatch> >& matches,           float maxDistance,      InputArrayOfArrays masks=noArray(),       bool compactResult=false )
 
     :param queryDescriptors: Query set of descriptors.
 
@@ -264,7 +265,7 @@ Flann-based descriptor matcher. This matcher trains :ocv:class:`flann::Index_` o
           const Ptr<flann::IndexParams>& indexParams=new flann::KDTreeIndexParams(),
           const Ptr<flann::SearchParams>& searchParams=new flann::SearchParams() );
 
-        virtual void add( const vector<Mat>& descriptors );
+        virtual void add( InputArrayOfArrays descriptors );
         virtual void clear();
 
         virtual void train();
diff --git a/modules/features2d/doc/common_interfaces_of_feature_detectors.rst b/modules/features2d/doc/common_interfaces_of_feature_detectors.rst
index 434585d1e..62a99073b 100644
--- a/modules/features2d/doc/common_interfaces_of_feature_detectors.rst
+++ b/modules/features2d/doc/common_interfaces_of_feature_detectors.rst
@@ -23,12 +23,12 @@ Abstract base class for 2D image feature detectors. ::
     public:
         virtual ~FeatureDetector();
 
-        void detect( const Mat& image, vector<KeyPoint>& keypoints,
-                     const Mat& mask=Mat() ) const;
+        void detect( InputArray image, vector<KeyPoint>& keypoints,
+                     InputArray mask=noArray() ) const;
 
-        void detect( const vector<Mat>& images,
+        void detect( InputArrayOfArrays images,
                      vector<vector<KeyPoint> >& keypoints,
-                     const vector<Mat>& masks=vector<Mat>() ) const;
+                     InputArrayOfArrays masks=noArray() ) const;
 
         virtual void read(const FileNode&);
         virtual void write(FileStorage&) const;
@@ -43,9 +43,9 @@ FeatureDetector::detect
 ---------------------------
 Detects keypoints in an image (first variant) or image set (second variant).
 
-.. ocv:function:: void FeatureDetector::detect( const Mat& image, vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const
+.. ocv:function:: void FeatureDetector::detect( InputArray image, vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const
 
-.. ocv:function:: void FeatureDetector::detect( const vector<Mat>& images, vector<vector<KeyPoint> >& keypoints, const vector<Mat>& masks=vector<Mat>() ) const
+.. ocv:function:: void FeatureDetector::detect( InputArrayOfArrays images, vector<vector<KeyPoint> >& keypoints, InputArrayOfArrays masks=noArray() ) const
 
 .. ocv:pyfunction:: cv2.FeatureDetector_create.detect(image[, mask]) -> keypoints
 
diff --git a/modules/features2d/doc/common_interfaces_of_generic_descriptor_matchers.rst b/modules/features2d/doc/common_interfaces_of_generic_descriptor_matchers.rst
index 5a7f952bc..5806b8228 100644
--- a/modules/features2d/doc/common_interfaces_of_generic_descriptor_matchers.rst
+++ b/modules/features2d/doc/common_interfaces_of_generic_descriptor_matchers.rst
@@ -29,7 +29,7 @@ Abstract interface for extracting and matching a keypoint descriptor. There are
         GenericDescriptorMatcher();
         virtual ~GenericDescriptorMatcher();
 
-        virtual void add( const vector<Mat>& images,
+        virtual void add( InputArrayOfArrays images,
                           vector<vector<KeyPoint> >& keypoints );
 
         const vector<Mat>& getTrainImages() const;
@@ -40,38 +40,38 @@ Abstract interface for extracting and matching a keypoint descriptor. There are
 
         virtual bool isMaskSupported() = 0;
 
-        void classify( const Mat& queryImage,
+        void classify( InputArray queryImage,
                        vector<KeyPoint>& queryKeypoints,
-                       const Mat& trainImage,
+                       InputArray trainImage,
                        vector<KeyPoint>& trainKeypoints ) const;
-        void classify( const Mat& queryImage,
+        void classify( InputArray queryImage,
                        vector<KeyPoint>& queryKeypoints );
 
         /*
          * Group of methods to match keypoints from an image pair.
          */
-        void match( const Mat& queryImage, vector<KeyPoint>& queryKeypoints,
-                    const Mat& trainImage, vector<KeyPoint>& trainKeypoints,
-                    vector<DMatch>& matches, const Mat& mask=Mat() ) const;
-        void knnMatch( const Mat& queryImage, vector<KeyPoint>& queryKeypoints,
-                       const Mat& trainImage, vector<KeyPoint>& trainKeypoints,
+        void match( InputArray queryImage, vector<KeyPoint>& queryKeypoints,
+                    InputArray trainImage, vector<KeyPoint>& trainKeypoints,
+                    vector<DMatch>& matches, InputArray mask=noArray() ) const;
+        void knnMatch( InputArray queryImage, vector<KeyPoint>& queryKeypoints,
+                       InputArray trainImage, vector<KeyPoint>& trainKeypoints,
                        vector<vector<DMatch> >& matches, int k,
-                       const Mat& mask=Mat(), bool compactResult=false ) const;
-        void radiusMatch( const Mat& queryImage, vector<KeyPoint>& queryKeypoints,
-                          const Mat& trainImage, vector<KeyPoint>& trainKeypoints,
+                       InputArray mask=noArray(), bool compactResult=false ) const;
+        void radiusMatch( InputArray queryImage, vector<KeyPoint>& queryKeypoints,
+                          InputArray trainImage, vector<KeyPoint>& trainKeypoints,
                           vector<vector<DMatch> >& matches, float maxDistance,
-                          const Mat& mask=Mat(), bool compactResult=false ) const;
+                          InputArray mask=noArray(), bool compactResult=false ) const;
         /*
          * Group of methods to match keypoints from one image to an image set.
          */
-        void match( const Mat& queryImage, vector<KeyPoint>& queryKeypoints,
-                    vector<DMatch>& matches, const vector<Mat>& masks=vector<Mat>() );
-        void knnMatch( const Mat& queryImage, vector<KeyPoint>& queryKeypoints,
+        void match( InputArray queryImage, vector<KeyPoint>& queryKeypoints,
+                    vector<DMatch>& matches, InputArrayOfArrays masks=noArray() );
+        void knnMatch( InputArray queryImage, vector<KeyPoint>& queryKeypoints,
                        vector<vector<DMatch> >& matches, int k,
-                       const vector<Mat>& masks=vector<Mat>(), bool compactResult=false );
-        void radiusMatch( const Mat& queryImage, vector<KeyPoint>& queryKeypoints,
+                       InputArrayOfArrays masks=noArray(), bool compactResult=false );
+        void radiusMatch( InputArray queryImage, vector<KeyPoint>& queryKeypoints,
                           vector<vector<DMatch> >& matches, float maxDistance,
-                          const vector<Mat>& masks=vector<Mat>(), bool compactResult=false );
+                          InputArrayOfArrays masks=noArray(), bool compactResult=false );
 
         virtual void read( const FileNode& );
         virtual void write( FileStorage& ) const;
@@ -89,7 +89,7 @@ GenericDescriptorMatcher::add
 ---------------------------------
 Adds images and their keypoints to the training collection stored in the class instance.
 
-.. ocv:function:: void GenericDescriptorMatcher::add( const vector<Mat>& images,                        vector<vector<KeyPoint> >& keypoints )
+.. ocv:function:: void GenericDescriptorMatcher::add( InputArrayOfArrays images,                        vector<vector<KeyPoint> >& keypoints )
 
     :param images: Image collection.
 
@@ -142,9 +142,9 @@ GenericDescriptorMatcher::classify
 --------------------------------------
 Classifies keypoints from a query set.
 
-.. ocv:function:: void GenericDescriptorMatcher::classify(  const Mat& queryImage,           vector<KeyPoint>& queryKeypoints,           const Mat& trainImage,           vector<KeyPoint>& trainKeypoints ) const
+.. ocv:function:: void GenericDescriptorMatcher::classify(  InputArray queryImage,           vector<KeyPoint>& queryKeypoints,           InputArray trainImage,           vector<KeyPoint>& trainKeypoints ) const
 
-.. ocv:function:: void GenericDescriptorMatcher::classify( const Mat& queryImage,           vector<KeyPoint>& queryKeypoints )
+.. ocv:function:: void GenericDescriptorMatcher::classify( InputArray queryImage,           vector<KeyPoint>& queryKeypoints )
 
     :param queryImage: Query image.
 
@@ -170,9 +170,9 @@ GenericDescriptorMatcher::match
 -----------------------------------
 Finds the best match in the training set for each keypoint from the query set.
 
-.. ocv:function:: void GenericDescriptorMatcher::match( const Mat& queryImage, vector<KeyPoint>& queryKeypoints, const Mat& trainImage, vector<KeyPoint>& trainKeypoints, vector<DMatch>& matches, const Mat& mask=Mat() ) const
+.. ocv:function:: void GenericDescriptorMatcher::match(InputArray queryImage, vector<KeyPoint>& queryKeypoints, InputArray trainImage, vector<KeyPoint>& trainKeypoints, vector<DMatch>& matches, InputArray mask=noArray() ) const
 
-.. ocv:function:: void GenericDescriptorMatcher::match( const Mat& queryImage, vector<KeyPoint>& queryKeypoints, vector<DMatch>& matches, const vector<Mat>& masks=vector<Mat>() )
+.. ocv:function:: void GenericDescriptorMatcher::match( InputArray queryImage, vector<KeyPoint>& queryKeypoints, vector<DMatch>& matches, InputArrayOfArrays masks=noArray() )
 
     :param queryImage: Query image.
 
@@ -196,9 +196,9 @@ GenericDescriptorMatcher::knnMatch
 --------------------------------------
 Finds the ``k`` best matches for each query keypoint.
 
-.. ocv:function:: void GenericDescriptorMatcher::knnMatch(           const Mat& queryImage, vector<KeyPoint>& queryKeypoints,      const Mat& trainImage, vector<KeyPoint>& trainKeypoints,      vector<vector<DMatch> >& matches, int k,       const Mat& mask=Mat(), bool compactResult=false ) const
+.. ocv:function:: void GenericDescriptorMatcher::knnMatch(           InputArray queryImage, vector<KeyPoint>& queryKeypoints,      InputArray trainImage, vector<KeyPoint>& trainKeypoints,      vector<vector<DMatch> >& matches, int k,       InputArray mask=noArray(), bool compactResult=false ) const
 
-.. ocv:function:: void GenericDescriptorMatcher::knnMatch(           const Mat& queryImage, vector<KeyPoint>& queryKeypoints,      vector<vector<DMatch> >& matches, int k,       const vector<Mat>& masks=vector<Mat>(),       bool compactResult=false )
+.. ocv:function:: void GenericDescriptorMatcher::knnMatch(           InputArray queryImage, vector<KeyPoint>& queryKeypoints,      vector<vector<DMatch> >& matches, int k,       InputArrayOfArrays masks=noArray(),       bool compactResult=false )
 
 The methods are extended variants of ``GenericDescriptorMatch::match``. The parameters are similar, and the semantics is similar to ``DescriptorMatcher::knnMatch``. But this class does not require explicitly computed keypoint descriptors.
 
@@ -208,9 +208,9 @@ GenericDescriptorMatcher::radiusMatch
 -----------------------------------------
 For each query keypoint, finds the training keypoints not farther than the specified distance.
 
-.. ocv:function:: void GenericDescriptorMatcher::radiusMatch(           const Mat& queryImage, vector<KeyPoint>& queryKeypoints,      const Mat& trainImage, vector<KeyPoint>& trainKeypoints,      vector<vector<DMatch> >& matches, float maxDistance,       const Mat& mask=Mat(), bool compactResult=false ) const
+.. ocv:function:: void GenericDescriptorMatcher::radiusMatch(           InputArray queryImage, vector<KeyPoint>& queryKeypoints,      InputArray trainImage, vector<KeyPoint>& trainKeypoints,      vector<vector<DMatch> >& matches, float maxDistance,       InputArray mask=noArray(), bool compactResult=false ) const
 
-.. ocv:function:: void GenericDescriptorMatcher::radiusMatch(           const Mat& queryImage, vector<KeyPoint>& queryKeypoints,      vector<vector<DMatch> >& matches, float maxDistance,       const vector<Mat>& masks=vector<Mat>(),       bool compactResult=false )
+.. ocv:function:: void GenericDescriptorMatcher::radiusMatch(           InputArray queryImage, vector<KeyPoint>& queryKeypoints,      vector<vector<DMatch> >& matches, float maxDistance,       InputArrayOfArrays masks=noArray(),       bool compactResult=false )
 
 The methods are similar to ``DescriptorMatcher::radius``. But this class does not require explicitly computed keypoint descriptors.
 
@@ -254,7 +254,7 @@ Class used for matching descriptors that can be described as vectors in a finite
         VectorDescriptorMatcher( const Ptr<DescriptorExtractor>& extractor, const Ptr<DescriptorMatcher>& matcher );
         virtual ~VectorDescriptorMatcher();
 
-        virtual void add( const vector<Mat>& imgCollection,
+        virtual void add( InputArrayOfArrays imgCollection,
                           vector<vector<KeyPoint> >& pointCollection );
         virtual void clear();
         virtual void train();
diff --git a/modules/features2d/doc/drawing_function_of_keypoints_and_matches.rst b/modules/features2d/doc/drawing_function_of_keypoints_and_matches.rst
index 68c68fc6c..250b9a674 100644
--- a/modules/features2d/doc/drawing_function_of_keypoints_and_matches.rst
+++ b/modules/features2d/doc/drawing_function_of_keypoints_and_matches.rst
@@ -7,9 +7,9 @@ drawMatches
 ---------------
 Draws the found matches of keypoints from two images.
 
-.. ocv:function:: void drawMatches( const Mat& img1, const vector<KeyPoint>& keypoints1, const Mat& img2, const vector<KeyPoint>& keypoints2, const vector<DMatch>& matches1to2, Mat& outImg, const Scalar& matchColor=Scalar::all(-1), const Scalar& singlePointColor=Scalar::all(-1), const vector<char>& matchesMask=vector<char>(), int flags=DrawMatchesFlags::DEFAULT )
+.. ocv:function:: void drawMatches( InputArray img1, const vector<KeyPoint>& keypoints1, InputArray img2, const vector<KeyPoint>& keypoints2, const vector<DMatch>& matches1to2, InputOutputArray outImg, const Scalar& matchColor=Scalar::all(-1), const Scalar& singlePointColor=Scalar::all(-1), const vector<char>& matchesMask=vector<char>(), int flags=DrawMatchesFlags::DEFAULT )
 
-.. ocv:function:: void drawMatches( const Mat& img1, const vector<KeyPoint>& keypoints1, const Mat& img2, const vector<KeyPoint>& keypoints2, const vector<vector<DMatch> >& matches1to2, Mat& outImg, const Scalar& matchColor=Scalar::all(-1), const Scalar& singlePointColor=Scalar::all(-1), const vector<vector<char> >& matchesMask=vector<vector<char> >(), int flags=DrawMatchesFlags::DEFAULT )
+.. ocv:function:: void drawMatches( InputArray img1, const vector<KeyPoint>& keypoints1, InputArray img2, const vector<KeyPoint>& keypoints2, const vector<vector<DMatch> >& matches1to2, InputOutputArray outImg, const Scalar& matchColor=Scalar::all(-1), const Scalar& singlePointColor=Scalar::all(-1), const vector<vector<char> >& matchesMask=vector<vector<char> >(), int flags=DrawMatchesFlags::DEFAULT )
 
 .. ocv:pyfunction:: cv2.drawMatches(img1, keypoints1, img2, keypoints2, matches1to2[, outImg[, matchColor[, singlePointColor[, matchesMask[, flags]]]]]) -> outImg
 
@@ -69,7 +69,7 @@ drawKeypoints
 -----------------
 Draws keypoints.
 
-.. ocv:function:: void drawKeypoints( const Mat& image, const vector<KeyPoint>& keypoints, Mat& outImage, const Scalar& color=Scalar::all(-1), int flags=DrawMatchesFlags::DEFAULT )
+.. ocv:function:: void drawKeypoints( InputArray image, const vector<KeyPoint>& keypoints, InputOutputArray outImage, const Scalar& color=Scalar::all(-1), int flags=DrawMatchesFlags::DEFAULT )
 
 .. ocv:pyfunction:: cv2.drawKeypoints(image, keypoints[, outImage[, color[, flags]]]) -> outImage
 
diff --git a/modules/features2d/doc/feature_detection_and_description.rst b/modules/features2d/doc/feature_detection_and_description.rst
index a6fe7c8fa..02263472b 100644
--- a/modules/features2d/doc/feature_detection_and_description.rst
+++ b/modules/features2d/doc/feature_detection_and_description.rst
@@ -37,6 +37,37 @@ Detects corners using the FAST algorithm by [Rosten06]_.
 .. [Rosten06] E. Rosten. Machine Learning for High-speed Corner Detection, 2006.
 
 
+BriefDescriptorExtractor
+------------------------
+.. ocv:class:: BriefDescriptorExtractor : public DescriptorExtractor
+
+Class for computing BRIEF descriptors described in a paper of Calonder M., Lepetit V.,
+Strecha C., Fua P. *BRIEF: Binary Robust Independent Elementary Features* ,
+11th European Conference on Computer Vision (ECCV), Heraklion, Crete. LNCS Springer, September 2010. ::
+
+    class BriefDescriptorExtractor : public DescriptorExtractor
+    {
+    public:
+        static const int PATCH_SIZE = 48;
+        static const int KERNEL_SIZE = 9;
+
+        // bytes is a length of descriptor in bytes. It can be equal 16, 32 or 64 bytes.
+        BriefDescriptorExtractor( int bytes = 32 );
+
+        virtual void read( const FileNode& );
+        virtual void write( FileStorage& ) const;
+        virtual int descriptorSize() const;
+        virtual int descriptorType() const;
+        virtual int defaultNorm() const;
+    protected:
+        ...
+    };
+
+.. note::
+
+   * A complete BRIEF extractor sample can be found at opencv_source_code/samples/cpp/brief_match_test.cpp
+
+
 MSER
 ----
 .. ocv:class:: MSER : public FeatureDetector
diff --git a/modules/features2d/doc/object_categorization.rst b/modules/features2d/doc/object_categorization.rst
index 826919519..f387866c5 100644
--- a/modules/features2d/doc/object_categorization.rst
+++ b/modules/features2d/doc/object_categorization.rst
@@ -124,14 +124,17 @@ The class declaration is the following: ::
         public:
             BOWImgDescriptorExtractor( const Ptr<DescriptorExtractor>& dextractor,
                                        const Ptr<DescriptorMatcher>& dmatcher );
+            BOWImgDescriptorExtractor( const Ptr<DescriptorMatcher>& dmatcher );
             virtual ~BOWImgDescriptorExtractor(){}
 
             void setVocabulary( const Mat& vocabulary );
             const Mat& getVocabulary() const;
-            void compute( const Mat& image, vector<KeyPoint>& keypoints,
-                          Mat& imgDescriptor,
+            void compute( InputArray image, vector<KeyPoint>& keypoints,
+                          OutputArray imgDescriptor,
                           vector<vector<int> >* pointIdxsOfClusters=0,
                           Mat* descriptors=0 );
+            void compute( InputArray descriptors, OutputArray imgDescriptor,
+                          std::vector<std::vector<int> >* pointIdxsOfClusters=0 );
             int descriptorSize() const;
             int descriptorType() const;
 
@@ -147,6 +150,7 @@ BOWImgDescriptorExtractor::BOWImgDescriptorExtractor
 The constructor.
 
 .. ocv:function:: BOWImgDescriptorExtractor::BOWImgDescriptorExtractor(           const Ptr<DescriptorExtractor>& dextractor,          const Ptr<DescriptorMatcher>& dmatcher )
+.. ocv:function:: BOWImgDescriptorExtractor::BOWImgDescriptorExtractor(           const Ptr<DescriptorMatcher>& dmatcher )
 
     :param dextractor: Descriptor extractor that is used to compute descriptors for an input image and its keypoints.
 
@@ -176,12 +180,15 @@ BOWImgDescriptorExtractor::compute
 --------------------------------------
 Computes an image descriptor using the set visual vocabulary.
 
-.. ocv:function:: void BOWImgDescriptorExtractor::compute( const Mat& image, vector<KeyPoint>& keypoints, Mat& imgDescriptor, vector<vector<int> >* pointIdxsOfClusters=0, Mat* descriptors=0 )
+.. ocv:function:: void BOWImgDescriptorExtractor::compute( InputArray image, vector<KeyPoint>& keypoints, OutputArray imgDescriptor, vector<vector<int> >* pointIdxsOfClusters=0, Mat* descriptors=0 )
+.. ocv:function:: void BOWImgDescriptorExtractor::compute( InputArray keypointDescriptors, OutputArray imgDescriptor, std::vector<std::vector<int> >* pointIdxsOfClusters=0 )
 
     :param image: Image, for which the descriptor is computed.
 
     :param keypoints: Keypoints detected in the input image.
 
+    :param keypointDescriptors: Computed descriptors to match with vocabulary.
+
     :param imgDescriptor: Computed output image descriptor.
 
     :param pointIdxsOfClusters: Indices of keypoints that belong to the cluster. This means that ``pointIdxsOfClusters[i]``  are keypoint indices that belong to the  ``i`` -th cluster (word of vocabulary) returned if it is non-zero.
diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index 3655ab813..1589d59a8 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -108,7 +108,7 @@ public:
      * mask         Mask specifying where to look for keypoints (optional). Must be a char
      *              matrix with non-zero values in the region of interest.
      */
-    CV_WRAP void detect( const Mat& image, CV_OUT std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    CV_WRAP void detect( InputArray image, CV_OUT std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     /*
      * Detect keypoints in an image set.
@@ -116,7 +116,7 @@ public:
      * keypoints    Collection of keypoints detected in an input images. keypoints[i] is a set of keypoints detected in an images[i].
      * masks        Masks for image set. masks[i] is a mask for images[i].
      */
-    void detect( const std::vector<Mat>& images, std::vector<std::vector<KeyPoint> >& keypoints, const std::vector<Mat>& masks=std::vector<Mat>() ) const;
+    void detect( InputArrayOfArrays images, std::vector<std::vector<KeyPoint> >& keypoints, InputArrayOfArrays masks=noArray() ) const;
 
     // Return true if detector object is empty
     CV_WRAP virtual bool empty() const;
@@ -125,14 +125,14 @@ public:
     CV_WRAP static Ptr<FeatureDetector> create( const String& detectorType );
 
 protected:
-    virtual void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const = 0;
+    virtual void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const = 0;
 
     /*
      * Remove keypoints that are not in the mask.
      * Helper function, useful when wrapping a library call for keypoint detection that
      * does not support a mask argument.
      */
-    static void removeInvalidPoints( const Mat& mask, std::vector<KeyPoint>& keypoints );
+    static void removeInvalidPoints( const Mat & mask, std::vector<KeyPoint>& keypoints );
 };
 
 
@@ -156,7 +156,7 @@ public:
      * keypoints    The input keypoints. Keypoints for which a descriptor cannot be computed are removed.
      * descriptors  Copmputed descriptors. Row i is the descriptor for keypoint i.
      */
-    CV_WRAP void compute( const Mat& image, CV_OUT CV_IN_OUT std::vector<KeyPoint>& keypoints, CV_OUT Mat& descriptors ) const;
+    CV_WRAP void compute( InputArray image, CV_OUT CV_IN_OUT std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
 
     /*
      * Compute the descriptors for a keypoints collection detected in image collection.
@@ -165,7 +165,7 @@ public:
      *              Keypoints for which a descriptor cannot be computed are removed.
      * descriptors  Descriptor collection. descriptors[i] are descriptors computed for set keypoints[i].
      */
-    void compute( const std::vector<Mat>& images, std::vector<std::vector<KeyPoint> >& keypoints, std::vector<Mat>& descriptors ) const;
+    void compute( InputArrayOfArrays images, std::vector<std::vector<KeyPoint> >& keypoints, OutputArrayOfArrays descriptors ) const;
 
     CV_WRAP virtual int descriptorSize() const = 0;
     CV_WRAP virtual int descriptorType() const = 0;
@@ -176,7 +176,7 @@ public:
     CV_WRAP static Ptr<DescriptorExtractor> create( const String& descriptorExtractorType );
 
 protected:
-    virtual void computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const = 0;
+    virtual void computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const = 0;
 
     /*
      * Remove keypoints within borderPixels of an image edge.
@@ -207,7 +207,7 @@ public:
                                      OutputArray descriptors,
                                      bool useProvidedKeypoints=false ) const = 0;
 
-    CV_WRAP void compute( const Mat& image, CV_OUT CV_IN_OUT std::vector<KeyPoint>& keypoints, CV_OUT Mat& descriptors ) const;
+    CV_WRAP void compute( InputArray image, CV_OUT CV_IN_OUT std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
 
     // Create feature detector and descriptor extractor by name.
     CV_WRAP static Ptr<Feature2D> create( const String& name );
@@ -235,7 +235,7 @@ public:
 
     // Compute the BRISK features and descriptors on an image
     void operator()( InputArray image, InputArray mask, std::vector<KeyPoint>& keypoints,
-                      OutputArray descriptors, bool useProvidedKeypoints=false ) const;
+                     OutputArray descriptors, bool useProvidedKeypoints=false ) const;
 
     AlgorithmInfo* info() const;
 
@@ -252,8 +252,8 @@ public:
 
 protected:
 
-    void computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const;
-    void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    void computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
+    void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     void computeKeypointsNoOrientation(InputArray image, InputArray mask, std::vector<KeyPoint>& keypoints) const;
     void computeDescriptorsAndOrOrientation(InputArray image, InputArray mask, std::vector<KeyPoint>& keypoints,
@@ -337,8 +337,8 @@ public:
 
 protected:
 
-    void computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const;
-    void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    void computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
+    void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     CV_PROP_RW int nfeatures;
     CV_PROP_RW double scaleFactor;
@@ -403,9 +403,9 @@ public:
     };
 
 protected:
-    virtual void computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const;
+    virtual void computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
     void buildPattern();
-    uchar meanIntensity( const Mat& image, const Mat& integral, const float kp_x, const float kp_y,
+    uchar meanIntensity( InputArray image, InputArray integral, const float kp_x, const float kp_y,
                          const unsigned int scale, const unsigned int rot, const unsigned int point ) const;
 
     bool orientationNormalized; //true if the orientation is normalized, false otherwise
@@ -465,12 +465,12 @@ public:
           double _min_margin=0.003, int _edge_blur_size=5 );
 
     //! the operator that extracts the MSERs from the image or the specific part of it
-    CV_WRAP_AS(detect) void operator()( const Mat& image, CV_OUT std::vector<std::vector<Point> >& msers,
-                                        const Mat& mask=Mat() ) const;
+    CV_WRAP_AS(detect) void operator()( InputArray image, CV_OUT std::vector<std::vector<Point> >& msers,
+                                        InputArray mask=noArray() ) const;
     AlgorithmInfo* info() const;
 
 protected:
-    void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     int delta;
     int minArea;
@@ -506,7 +506,7 @@ public:
     AlgorithmInfo* info() const;
 
 protected:
-    void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     int maxSize;
     int responseThreshold;
@@ -535,7 +535,7 @@ public:
     AlgorithmInfo* info() const;
 
 protected:
-    virtual void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    virtual void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     int threshold;
     bool nonmaxSuppression;
@@ -551,7 +551,7 @@ public:
     AlgorithmInfo* info() const;
 
 protected:
-    virtual void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    virtual void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     int nfeatures;
     double qualityLevel;
@@ -608,8 +608,8 @@ protected:
       double confidence;
   };
 
-  virtual void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
-  virtual void findBlobs(const Mat &image, const Mat &binaryImage, std::vector<Center> &centers) const;
+  virtual void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
+  virtual void findBlobs(InputArray image, InputArray binaryImage, std::vector<Center> &centers) const;
 
   Params params;
   AlgorithmInfo* info() const;
@@ -627,7 +627,7 @@ public:
     AlgorithmInfo* info() const;
 
 protected:
-    virtual void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    virtual void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     double initFeatureScale;
     int featureScaleLevels;
@@ -664,7 +664,7 @@ public:
     AlgorithmInfo* info() const;
 
 protected:
-    virtual void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    virtual void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     Ptr<FeatureDetector> detector;
     int maxTotalKeypoints;
@@ -686,7 +686,7 @@ public:
     virtual bool empty() const;
 
 protected:
-    virtual void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    virtual void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     Ptr<FeatureDetector> detector;
     int maxLevel;
@@ -747,7 +747,7 @@ public:
     virtual bool empty() const;
 
 protected:
-    virtual void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    virtual void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
 private:
     DynamicAdaptedFeatureDetector& operator=(const DynamicAdaptedFeatureDetector&);
@@ -776,7 +776,7 @@ public:
     virtual Ptr<AdjusterAdapter> clone() const;
 
 protected:
-    virtual void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    virtual void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     int thresh_;
     bool nonmax_;
@@ -799,7 +799,7 @@ public:
     virtual Ptr<AdjusterAdapter> clone() const;
 
 protected:
-    virtual void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    virtual void detectImpl(InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     double thresh_, init_thresh_, min_thresh_, max_thresh_;
 };
@@ -816,7 +816,7 @@ public:
     virtual Ptr<AdjusterAdapter> clone() const;
 
 protected:
-    virtual void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask=Mat() ) const;
+    virtual void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask=noArray() ) const;
 
     double thresh_, init_thresh_, min_thresh_, max_thresh_;
 };
@@ -829,7 +829,7 @@ CV_EXPORTS Mat windowedMatchingMask( const std::vector<KeyPoint>& keypoints1, co
 /*
  * OpponentColorDescriptorExtractor
  *
- * Adapts a descriptor extractor to compute descripors in Opponent Color Space
+ * Adapts a descriptor extractor to compute descriptors in Opponent Color Space
  * (refer to van de Sande et al., CGIV 2008 "Color Descriptors for Object Category Recognition").
  * Input RGB image is transformed in Opponent Color Space. Then unadapted descriptor extractor
  * (set in constructor) computes descriptors on each of the three channel and concatenate
@@ -850,7 +850,7 @@ public:
     virtual bool empty() const;
 
 protected:
-    virtual void computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const;
+    virtual void computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
 
     Ptr<DescriptorExtractor> descriptorExtractor;
 };
@@ -879,9 +879,9 @@ public:
     AlgorithmInfo* info() const;
 
 protected:
-    virtual void computeImpl(const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors) const;
+    virtual void computeImpl(InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors) const;
 
-    typedef void(*PixelTestFn)(const Mat&, const std::vector<KeyPoint>&, Mat&);
+    typedef void(*PixelTestFn)(InputArray, const std::vector<KeyPoint>&, OutputArray);
 
     int bytes_;
     PixelTestFn test_fn_;
@@ -998,7 +998,7 @@ public:
      * Add descriptors to train descriptor collection.
      * descriptors      Descriptors to add. Each descriptors[i] is a descriptors set from one image.
      */
-    CV_WRAP virtual void add( const std::vector<Mat>& descriptors );
+    CV_WRAP virtual void add( InputArrayOfArrays descriptors );
     /*
      * Get train descriptors collection.
      */
@@ -1034,30 +1034,30 @@ public:
      * Method train() is run in this methods.
      */
     // Find one best match for each query descriptor (if mask is empty).
-    CV_WRAP void match( const Mat& queryDescriptors, const Mat& trainDescriptors,
-                CV_OUT std::vector<DMatch>& matches, const Mat& mask=Mat() ) const;
+    CV_WRAP void match( InputArray queryDescriptors, InputArray trainDescriptors,
+                CV_OUT std::vector<DMatch>& matches, InputArray mask=noArray() ) const;
     // Find k best matches for each query descriptor (in increasing order of distances).
     // compactResult is used when mask is not empty. If compactResult is false matches
     // vector will have the same size as queryDescriptors rows. If compactResult is true
     // matches vector will not contain matches for fully masked out query descriptors.
-    CV_WRAP void knnMatch( const Mat& queryDescriptors, const Mat& trainDescriptors,
+    CV_WRAP void knnMatch( InputArray queryDescriptors, InputArray trainDescriptors,
                    CV_OUT std::vector<std::vector<DMatch> >& matches, int k,
-                   const Mat& mask=Mat(), bool compactResult=false ) const;
+                   InputArray mask=noArray(), bool compactResult=false ) const;
     // Find best matches for each query descriptor which have distance less than
     // maxDistance (in increasing order of distances).
-    void radiusMatch( const Mat& queryDescriptors, const Mat& trainDescriptors,
+    void radiusMatch( InputArray queryDescriptors, InputArray trainDescriptors,
                       std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                      const Mat& mask=Mat(), bool compactResult=false ) const;
+                      InputArray mask=noArray(), bool compactResult=false ) const;
     /*
      * Group of methods to match descriptors from one image to image set.
      * See description of similar methods for matching image pair above.
      */
-    CV_WRAP void match( const Mat& queryDescriptors, CV_OUT std::vector<DMatch>& matches,
-                const std::vector<Mat>& masks=std::vector<Mat>() );
-    CV_WRAP void knnMatch( const Mat& queryDescriptors, CV_OUT std::vector<std::vector<DMatch> >& matches, int k,
-           const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
-    void radiusMatch( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                   const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
+    CV_WRAP void match( InputArray queryDescriptors, CV_OUT std::vector<DMatch>& matches,
+                        InputArrayOfArrays masks=noArray() );
+    CV_WRAP void knnMatch( InputArray queryDescriptors, CV_OUT std::vector<std::vector<DMatch> >& matches, int k,
+                           InputArrayOfArrays masks=noArray(), bool compactResult=false );
+    void radiusMatch( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+                      InputArrayOfArrays masks=noArray(), bool compactResult=false );
 
     // Reads matcher object from a file node
     virtual void read( const FileNode& );
@@ -1101,19 +1101,20 @@ protected:
     // In fact the matching is implemented only by the following two methods. These methods suppose
     // that the class object has been trained already. Public match methods call these methods
     // after calling train().
-    virtual void knnMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
-           const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false ) = 0;
-    virtual void radiusMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
-           const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false ) = 0;
+    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) = 0;
+    virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) = 0;
 
-    static bool isPossibleMatch( const Mat& mask, int queryIdx, int trainIdx );
-    static bool isMaskedOut( const std::vector<Mat>& masks, int queryIdx );
+    static bool isPossibleMatch( InputArray mask, int queryIdx, int trainIdx );
+    static bool isMaskedOut( InputArrayOfArrays masks, int queryIdx );
 
     static Mat clone_op( Mat m ) { return m.clone(); }
-    void checkMasks( const std::vector<Mat>& masks, int queryDescriptorsCount ) const;
+    void checkMasks( InputArrayOfArrays masks, int queryDescriptorsCount ) const;
 
     // Collection of descriptors from train images.
     std::vector<Mat> trainDescCollection;
+    std::vector<UMat> utrainDescCollection;
 };
 
 /*
@@ -1137,10 +1138,10 @@ public:
 
     AlgorithmInfo* info() const;
 protected:
-    virtual void knnMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
-           const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
-    virtual void radiusMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
-           const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
+    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false );
+    virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false );
 
     int normType;
     bool crossCheck;
@@ -1156,7 +1157,7 @@ public:
     CV_WRAP FlannBasedMatcher( const Ptr<flann::IndexParams>& indexParams=makePtr<flann::KDTreeIndexParams>(),
                        const Ptr<flann::SearchParams>& searchParams=makePtr<flann::SearchParams>() );
 
-    virtual void add( const std::vector<Mat>& descriptors );
+    virtual void add( InputArrayOfArrays descriptors );
     virtual void clear();
 
     // Reads matcher object from a file node
@@ -1175,10 +1176,10 @@ protected:
                                    const Mat& indices, const Mat& distances,
                                    std::vector<std::vector<DMatch> >& matches );
 
-    virtual void knnMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
-                   const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
-    virtual void radiusMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                   const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
+    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false );
+    virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false );
 
     Ptr<flann::IndexParams> indexParams;
     Ptr<flann::SearchParams> searchParams;
@@ -1213,7 +1214,7 @@ public:
      * If inheritor class need perform such prefiltering the method add() must be overloaded.
      * In the other class methods programmer has access to the train keypoints by a constant link.
      */
-    virtual void add( const std::vector<Mat>& images,
+    virtual void add( InputArrayOfArrays images,
                       std::vector<std::vector<KeyPoint> >& keypoints );
 
     const std::vector<Mat>& getTrainImages() const;
@@ -1242,10 +1243,10 @@ public:
      * trainKeypoints   Keypoints from the train image
      */
     // Classify keypoints from query image under one train image.
-    void classify( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
-                           const Mat& trainImage, std::vector<KeyPoint>& trainKeypoints ) const;
+    void classify( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
+                           InputArray trainImage, std::vector<KeyPoint>& trainKeypoints ) const;
     // Classify keypoints from query image under train image collection.
-    void classify( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints );
+    void classify( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints );
 
     /*
      * Group of methods to match keypoints from image pair.
@@ -1253,34 +1254,34 @@ public:
      * train() method is called here.
      */
     // Find one best match for each query descriptor (if mask is empty).
-    void match( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
-                const Mat& trainImage, std::vector<KeyPoint>& trainKeypoints,
-                std::vector<DMatch>& matches, const Mat& mask=Mat() ) const;
+    void match( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
+                InputArray trainImage, std::vector<KeyPoint>& trainKeypoints,
+                std::vector<DMatch>& matches, InputArray mask=noArray() ) const;
     // Find k best matches for each query keypoint (in increasing order of distances).
     // compactResult is used when mask is not empty. If compactResult is false matches
     // vector will have the same size as queryDescriptors rows.
     // If compactResult is true matches vector will not contain matches for fully masked out query descriptors.
-    void knnMatch( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
-                   const Mat& trainImage, std::vector<KeyPoint>& trainKeypoints,
+    void knnMatch( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
+                   InputArray trainImage, std::vector<KeyPoint>& trainKeypoints,
                    std::vector<std::vector<DMatch> >& matches, int k,
-                   const Mat& mask=Mat(), bool compactResult=false ) const;
+                   InputArray mask=noArray(), bool compactResult=false ) const;
     // Find best matches for each query descriptor which have distance less than maxDistance (in increasing order of distances).
-    void radiusMatch( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
-                      const Mat& trainImage, std::vector<KeyPoint>& trainKeypoints,
+    void radiusMatch( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
+                      InputArray trainImage, std::vector<KeyPoint>& trainKeypoints,
                       std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                      const Mat& mask=Mat(), bool compactResult=false ) const;
+                      InputArray mask=noArray(), bool compactResult=false ) const;
     /*
      * Group of methods to match keypoints from one image to image set.
      * See description of similar methods for matching image pair above.
      */
-    void match( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
-                std::vector<DMatch>& matches, const std::vector<Mat>& masks=std::vector<Mat>() );
-    void knnMatch( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+    void match( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
+                std::vector<DMatch>& matches, InputArrayOfArrays masks=noArray() );
+    void knnMatch( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                    std::vector<std::vector<DMatch> >& matches, int k,
-                   const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
-    void radiusMatch( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+                   InputArrayOfArrays masks=noArray(), bool compactResult=false );
+    void radiusMatch(InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                       std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                      const std::vector<Mat>& masks=std::vector<Mat>(), bool compactResult=false );
+                      InputArrayOfArrays masks=noArray(), bool compactResult=false );
 
     // Reads matcher object from a file node
     virtual void read( const FileNode& fn );
@@ -1302,12 +1303,12 @@ protected:
     // In fact the matching is implemented only by the following two methods. These methods suppose
     // that the class object has been trained already. Public match methods call these methods
     // after calling train().
-    virtual void knnMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+    virtual void knnMatchImpl( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                                std::vector<std::vector<DMatch> >& matches, int k,
-                               const std::vector<Mat>& masks, bool compactResult ) = 0;
-    virtual void radiusMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+                               InputArrayOfArrays masks, bool compactResult ) = 0;
+    virtual void radiusMatchImpl( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                                   std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                                  const std::vector<Mat>& masks, bool compactResult ) = 0;
+                                  InputArrayOfArrays masks, bool compactResult ) = 0;
     /*
      * A storage for sets of keypoints together with corresponding images and class IDs
      */
@@ -1364,7 +1365,7 @@ public:
     VectorDescriptorMatcher( const Ptr<DescriptorExtractor>& extractor, const Ptr<DescriptorMatcher>& matcher );
     virtual ~VectorDescriptorMatcher();
 
-    virtual void add( const std::vector<Mat>& imgCollection,
+    virtual void add( InputArrayOfArrays imgCollection,
                       std::vector<std::vector<KeyPoint> >& pointCollection );
 
     virtual void clear();
@@ -1380,12 +1381,12 @@ public:
     virtual Ptr<GenericDescriptorMatcher> clone( bool emptyTrainData=false ) const;
 
 protected:
-    virtual void knnMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+    virtual void knnMatchImpl( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                                std::vector<std::vector<DMatch> >& matches, int k,
-                               const std::vector<Mat>& masks, bool compactResult );
-    virtual void radiusMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+                               InputArrayOfArrays masks, bool compactResult );
+    virtual void radiusMatchImpl( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                                   std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                                  const std::vector<Mat>& masks, bool compactResult );
+                                  InputArrayOfArrays masks, bool compactResult );
 
     Ptr<DescriptorExtractor> extractor;
     Ptr<DescriptorMatcher> matcher;
@@ -1410,19 +1411,19 @@ struct CV_EXPORTS DrawMatchesFlags
 };
 
 // Draw keypoints.
-CV_EXPORTS_W void drawKeypoints( const Mat& image, const std::vector<KeyPoint>& keypoints, CV_OUT Mat& outImage,
+CV_EXPORTS_W void drawKeypoints( InputArray image, const std::vector<KeyPoint>& keypoints, InputOutputArray outImage,
                                const Scalar& color=Scalar::all(-1), int flags=DrawMatchesFlags::DEFAULT );
 
 // Draws matches of keypints from two images on output image.
-CV_EXPORTS_W void drawMatches( const Mat& img1, const std::vector<KeyPoint>& keypoints1,
-                             const Mat& img2, const std::vector<KeyPoint>& keypoints2,
-                             const std::vector<DMatch>& matches1to2, CV_OUT Mat& outImg,
+CV_EXPORTS_W void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                             InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                             const std::vector<DMatch>& matches1to2, InputOutputArray outImg,
                              const Scalar& matchColor=Scalar::all(-1), const Scalar& singlePointColor=Scalar::all(-1),
                              const std::vector<char>& matchesMask=std::vector<char>(), int flags=DrawMatchesFlags::DEFAULT );
 
-CV_EXPORTS_AS(drawMatchesKnn) void drawMatches( const Mat& img1, const std::vector<KeyPoint>& keypoints1,
-                             const Mat& img2, const std::vector<KeyPoint>& keypoints2,
-                             const std::vector<std::vector<DMatch> >& matches1to2, CV_OUT Mat& outImg,
+CV_EXPORTS_AS(drawMatchesKnn) void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                             InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                             const std::vector<std::vector<DMatch> >& matches1to2, InputOutputArray outImg,
                              const Scalar& matchColor=Scalar::all(-1), const Scalar& singlePointColor=Scalar::all(-1),
                              const std::vector<std::vector<char> >& matchesMask=std::vector<std::vector<char> >(), int flags=DrawMatchesFlags::DEFAULT );
 
@@ -1512,12 +1513,15 @@ class CV_EXPORTS BOWImgDescriptorExtractor
 public:
     BOWImgDescriptorExtractor( const Ptr<DescriptorExtractor>& dextractor,
                                const Ptr<DescriptorMatcher>& dmatcher );
+    BOWImgDescriptorExtractor( const Ptr<DescriptorMatcher>& dmatcher );
     virtual ~BOWImgDescriptorExtractor();
 
     void setVocabulary( const Mat& vocabulary );
     const Mat& getVocabulary() const;
-    void compute( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& imgDescriptor,
+    void compute( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray imgDescriptor,
                   std::vector<std::vector<int> >* pointIdxsOfClusters=0, Mat* descriptors=0 );
+    void compute( InputArray keypointDescriptors, OutputArray imgDescriptor,
+                  std::vector<std::vector<int> >* pointIdxsOfClusters=0 );
     // compute() is not constant because DescriptorMatcher::match is not constant
 
     int descriptorSize() const;
diff --git a/modules/ocl/perf/perf_match_template.cpp b/modules/features2d/perf/opencl/perf_brute_force_matcher.cpp
similarity index 52%
rename from modules/ocl/perf/perf_match_template.cpp
rename to modules/features2d/perf/opencl/perf_brute_force_matcher.cpp
index 9c9829cd9..f7bd24cf5 100644
--- a/modules/ocl/perf/perf_match_template.cpp
+++ b/modules/features2d/perf/opencl/perf_brute_force_matcher.cpp
@@ -44,78 +44,86 @@
 //
 //M*/
 #include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
 
-using namespace perf;
-using std::tr1::tuple;
-using std::tr1::get;
+#ifdef HAVE_OPENCL
 
-/////////// matchTemplate ////////////////////////
+namespace cvtest {
+namespace ocl {
 
-typedef Size_MatType CV_TM_CCORRFixture;
+//////////////////// BruteForceMatch /////////////////
 
-PERF_TEST_P(CV_TM_CCORRFixture, matchTemplate,
-            ::testing::Combine(::testing::Values(OCL_SIZE_1000, OCL_SIZE_2000),
-                               OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+typedef Size_MatType BruteForceMatcherFixture;
+
+OCL_PERF_TEST_P(BruteForceMatcherFixture, Match, ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_PERF_ENUM((MatType)CV_32FC1) ) )
 {
     const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params), templSize(5, 5);
+    const Size srcSize = get<0>(params);
     const int type = get<1>(params);
 
-    Mat src(srcSize, type), templ(templSize, type);
-    const Size dstSize(src.cols - templ.cols + 1, src.rows - templ.rows + 1);
-    Mat dst(dstSize, CV_32F);
-    randu(src, 0.0f, 1.0f);
-    randu(templ, 0.0f, 1.0f);
-    declare.time(srcSize == OCL_SIZE_2000 ? 20 : 6).in(src, templ).out(dst);
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
 
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclTempl(templ), oclDst(dstSize, CV_32F);
+    vector<DMatch> matches;
+    UMat uquery(srcSize, type), utrain(srcSize, type);
 
-        OCL_TEST_CYCLE() cv::ocl::matchTemplate(oclSrc, oclTempl, oclDst, TM_CCORR);
+    declare.in(uquery, utrain, WARMUP_RNG);
 
-        oclDst.download(dst);
+    BFMatcher matcher(NORM_L2);
 
-        SANITY_CHECK(dst, 1e-4);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::matchTemplate(src, templ, dst, TM_CCORR);
+    OCL_TEST_CYCLE()
+        matcher.match(uquery, utrain, matches);
 
-        SANITY_CHECK(dst, 1e-4);
-    }
-    else
-        OCL_PERF_ELSE
+    SANITY_CHECK_MATCHES(matches, 1e-3);
 }
 
-typedef TestBaseWithParam<Size> CV_TM_CCORR_NORMEDFixture;
-
-PERF_TEST_P(CV_TM_CCORR_NORMEDFixture, matchTemplate, OCL_TYPICAL_MAT_SIZES)
+OCL_PERF_TEST_P(BruteForceMatcherFixture, KnnMatch, ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_PERF_ENUM((MatType)CV_32FC1) ) )
 {
-    const Size srcSize = GetParam(), templSize(5, 5);
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
 
-    Mat src(srcSize, CV_8UC1), templ(templSize, CV_8UC1), dst;
-    const Size dstSize(src.cols - templ.cols + 1, src.rows - templ.rows + 1);
-    dst.create(dstSize, CV_8UC1);
-    declare.in(src, templ, WARMUP_RNG).out(dst)
-            .time(srcSize == OCL_SIZE_2000 ? 10 : srcSize == OCL_SIZE_4000 ? 23 : 2);
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
 
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclTempl(templ), oclDst(dstSize, CV_8UC1);
+    vector< vector<DMatch> > matches;
+    UMat uquery(srcSize, type), utrain(srcSize, type);
 
-        OCL_TEST_CYCLE() cv::ocl::matchTemplate(oclSrc, oclTempl, oclDst, TM_CCORR_NORMED);
+    declare.in(uquery, utrain, WARMUP_RNG);
 
-        oclDst.download(dst);
+    BFMatcher matcher(NORM_L2);
 
-        SANITY_CHECK(dst, 3e-2);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::matchTemplate(src, templ, dst, TM_CCORR_NORMED);
+    OCL_TEST_CYCLE()
+        matcher.knnMatch(uquery, utrain, matches, 2);
+
+    vector<DMatch> & matches0 = matches[0], & matches1 = matches[1];
+    SANITY_CHECK_MATCHES(matches0, 1e-3);
+    SANITY_CHECK_MATCHES(matches1, 1e-3);
 
-        SANITY_CHECK(dst, 3e-2);
-    }
-    else
-        OCL_PERF_ELSE
 }
+
+OCL_PERF_TEST_P(BruteForceMatcherFixture, RadiusMatch, ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_PERF_ENUM((MatType)CV_32FC1) ) )
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    vector< vector<DMatch> > matches;
+    UMat uquery(srcSize, type), utrain(srcSize, type);
+
+    declare.in(uquery, utrain, WARMUP_RNG);
+
+    BFMatcher matcher(NORM_L2);
+
+    OCL_TEST_CYCLE()
+        matcher.radiusMatch(uquery, utrain, matches, 2.0f);
+
+    vector<DMatch> & matches0 = matches[0], & matches1 = matches[1];
+    SANITY_CHECK_MATCHES(matches0, 1e-3);
+    SANITY_CHECK_MATCHES(matches1, 1e-3);
+}
+
+}//ocl
+}//cvtest
+
+#endif //HAVE_OPENCL
diff --git a/modules/features2d/src/bagofwords.cpp b/modules/features2d/src/bagofwords.cpp
index a3cfb60a9..525e478d4 100644
--- a/modules/features2d/src/bagofwords.cpp
+++ b/modules/features2d/src/bagofwords.cpp
@@ -44,7 +44,7 @@
 namespace cv
 {
 
-BOWTrainer::BOWTrainer()
+BOWTrainer::BOWTrainer() : size(0)
 {}
 
 BOWTrainer::~BOWTrainer()
@@ -121,6 +121,10 @@ BOWImgDescriptorExtractor::BOWImgDescriptorExtractor( const Ptr<DescriptorExtrac
     dextractor(_dextractor), dmatcher(_dmatcher)
 {}
 
+BOWImgDescriptorExtractor::BOWImgDescriptorExtractor( const Ptr<DescriptorMatcher>& _dmatcher ) :
+    dmatcher(_dmatcher)
+{}
+
 BOWImgDescriptorExtractor::~BOWImgDescriptorExtractor()
 {}
 
@@ -136,50 +140,23 @@ const Mat& BOWImgDescriptorExtractor::getVocabulary() const
     return vocabulary;
 }
 
-void BOWImgDescriptorExtractor::compute( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& imgDescriptor,
-                                         std::vector<std::vector<int> >* pointIdxsOfClusters, Mat* _descriptors )
+void BOWImgDescriptorExtractor::compute( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray imgDescriptor,
+                                         std::vector<std::vector<int> >* pointIdxsOfClusters, Mat* descriptors )
 {
     imgDescriptor.release();
 
     if( keypoints.empty() )
         return;
 
-    int clusterCount = descriptorSize(); // = vocabulary.rows
-
     // Compute descriptors for the image.
-    Mat descriptors;
-    dextractor->compute( image, keypoints, descriptors );
+    Mat _descriptors;
+    dextractor->compute( image, keypoints, _descriptors );
 
-    // Match keypoint descriptors to cluster center (to vocabulary)
-    std::vector<DMatch> matches;
-    dmatcher->match( descriptors, matches );
-
-    // Compute image descriptor
-    if( pointIdxsOfClusters )
-    {
-        pointIdxsOfClusters->clear();
-        pointIdxsOfClusters->resize(clusterCount);
-    }
-
-    imgDescriptor = Mat( 1, clusterCount, descriptorType(), Scalar::all(0.0) );
-    float *dptr = (float*)imgDescriptor.data;
-    for( size_t i = 0; i < matches.size(); i++ )
-    {
-        int queryIdx = matches[i].queryIdx;
-        int trainIdx = matches[i].trainIdx; // cluster index
-        CV_Assert( queryIdx == (int)i );
-
-        dptr[trainIdx] = dptr[trainIdx] + 1.f;
-        if( pointIdxsOfClusters )
-            (*pointIdxsOfClusters)[trainIdx].push_back( queryIdx );
-    }
-
-    // Normalize image descriptor.
-    imgDescriptor /= descriptors.rows;
+    compute( _descriptors, imgDescriptor, pointIdxsOfClusters );
 
     // Add the descriptors of image keypoints
-    if (_descriptors) {
-        *_descriptors = descriptors.clone();
+    if (descriptors) {
+        *descriptors = _descriptors.clone();
     }
 }
 
@@ -193,4 +170,42 @@ int BOWImgDescriptorExtractor::descriptorType() const
     return CV_32FC1;
 }
 
+void BOWImgDescriptorExtractor::compute( InputArray keypointDescriptors, OutputArray _imgDescriptor, std::vector<std::vector<int> >* pointIdxsOfClusters )
+{
+    CV_Assert( !vocabulary.empty() );
+
+    int clusterCount = descriptorSize(); // = vocabulary.rows
+
+    // Match keypoint descriptors to cluster center (to vocabulary)
+    std::vector<DMatch> matches;
+    dmatcher->match( keypointDescriptors, matches );
+
+    // Compute image descriptor
+    if( pointIdxsOfClusters )
+    {
+        pointIdxsOfClusters->clear();
+        pointIdxsOfClusters->resize(clusterCount);
+    }
+
+    _imgDescriptor.create(1, clusterCount, descriptorType());
+    _imgDescriptor.setTo(Scalar::all(0));
+
+    Mat imgDescriptor = _imgDescriptor.getMat();
+
+    float *dptr = (float*)imgDescriptor.data;
+    for( size_t i = 0; i < matches.size(); i++ )
+    {
+        int queryIdx = matches[i].queryIdx;
+        int trainIdx = matches[i].trainIdx; // cluster index
+        CV_Assert( queryIdx == (int)i );
+
+        dptr[trainIdx] = dptr[trainIdx] + 1.f;
+        if( pointIdxsOfClusters )
+            (*pointIdxsOfClusters)[trainIdx].push_back( queryIdx );
+    }
+
+    // Normalize image descriptor.
+    imgDescriptor /= keypointDescriptors.size().height;
+}
+
 }
diff --git a/modules/features2d/src/blobdetector.cpp b/modules/features2d/src/blobdetector.cpp
index 92d50ee53..69e058555 100644
--- a/modules/features2d/src/blobdetector.cpp
+++ b/modules/features2d/src/blobdetector.cpp
@@ -163,8 +163,9 @@ void SimpleBlobDetector::write( cv::FileStorage& fs ) const
     params.write(fs);
 }
 
-void SimpleBlobDetector::findBlobs(const cv::Mat &image, const cv::Mat &binaryImage, std::vector<Center> &centers) const
+void SimpleBlobDetector::findBlobs(InputArray _image, InputArray _binaryImage, std::vector<Center> &centers) const
 {
+    Mat image = _image.getMat(), binaryImage = _binaryImage.getMat();
     (void)image;
     centers.clear();
 
@@ -276,7 +277,7 @@ void SimpleBlobDetector::findBlobs(const cv::Mat &image, const cv::Mat &binaryIm
 #endif
 }
 
-void SimpleBlobDetector::detectImpl(const cv::Mat& image, std::vector<cv::KeyPoint>& keypoints, const cv::Mat&) const
+void SimpleBlobDetector::detectImpl(InputArray image, std::vector<cv::KeyPoint>& keypoints, InputArray) const
 {
     //TODO: support mask
     keypoints.clear();
@@ -284,7 +285,7 @@ void SimpleBlobDetector::detectImpl(const cv::Mat& image, std::vector<cv::KeyPoi
     if (image.channels() == 3)
         cvtColor(image, grayscaleImage, COLOR_BGR2GRAY);
     else
-        grayscaleImage = image;
+        grayscaleImage = image.getMat();
 
     std::vector < std::vector<Center> > centers;
     for (double thresh = params.minThreshold; thresh < params.maxThreshold; thresh += params.thresholdStep)
@@ -292,20 +293,11 @@ void SimpleBlobDetector::detectImpl(const cv::Mat& image, std::vector<cv::KeyPoi
         Mat binarizedImage;
         threshold(grayscaleImage, binarizedImage, thresh, 255, THRESH_BINARY);
 
-#ifdef DEBUG_BLOB_DETECTOR
-        //    Mat keypointsImage;
-        //    cvtColor( binarizedImage, keypointsImage, CV_GRAY2RGB );
-#endif
-
         std::vector < Center > curCenters;
         findBlobs(grayscaleImage, binarizedImage, curCenters);
         std::vector < std::vector<Center> > newCenters;
         for (size_t i = 0; i < curCenters.size(); i++)
         {
-#ifdef DEBUG_BLOB_DETECTOR
-            //      circle(keypointsImage, curCenters[i].location, curCenters[i].radius, Scalar(0,0,255),-1);
-#endif
-
             bool isNew = true;
             for (size_t j = 0; j < centers.size(); j++)
             {
@@ -327,17 +319,9 @@ void SimpleBlobDetector::detectImpl(const cv::Mat& image, std::vector<cv::KeyPoi
                 }
             }
             if (isNew)
-            {
                 newCenters.push_back(std::vector<Center> (1, curCenters[i]));
-                //centers.push_back(std::vector<Center> (1, curCenters[i]));
-            }
         }
         std::copy(newCenters.begin(), newCenters.end(), std::back_inserter(centers));
-
-#ifdef DEBUG_BLOB_DETECTOR
-        //    imshow("binarized", keypointsImage );
-        //waitKey();
-#endif
     }
 
     for (size_t i = 0; i < centers.size(); i++)
@@ -352,19 +336,7 @@ void SimpleBlobDetector::detectImpl(const cv::Mat& image, std::vector<cv::KeyPoi
             normalizer += centers[i][j].confidence;
         }
         sumPoint *= (1. / normalizer);
-        KeyPoint kpt(sumPoint, (float)(centers[i][centers[i].size() / 2].radius));
+        KeyPoint kpt(sumPoint, (float)(centers[i][centers[i].size() / 2].radius) * 2.0f);
         keypoints.push_back(kpt);
     }
-
-#ifdef DEBUG_BLOB_DETECTOR
-    namedWindow("keypoints", CV_WINDOW_NORMAL);
-    Mat outImg = image.clone();
-    for(size_t i=0; i<keypoints.size(); i++)
-    {
-        circle(outImg, keypoints[i].pt, keypoints[i].size, Scalar(255, 0, 255), -1);
-    }
-    //drawKeypoints(image, keypoints, outImg);
-    imshow("keypoints", outImg);
-    waitKey();
-#endif
 }
diff --git a/modules/features2d/src/brief.cpp b/modules/features2d/src/brief.cpp
index 252191988..0226ffb1e 100644
--- a/modules/features2d/src/brief.cpp
+++ b/modules/features2d/src/brief.cpp
@@ -61,8 +61,9 @@ inline int smoothedSum(const Mat& sum, const KeyPoint& pt, int y, int x)
            + sum.at<int>(img_y - HALF_KERNEL, img_x - HALF_KERNEL);
 }
 
-static void pixelTests16(const Mat& sum, const std::vector<KeyPoint>& keypoints, Mat& descriptors)
+static void pixelTests16(InputArray _sum, const std::vector<KeyPoint>& keypoints, OutputArray _descriptors)
 {
+    Mat sum = _sum.getMat(), descriptors = _descriptors.getMat();
     for (int i = 0; i < (int)keypoints.size(); ++i)
     {
         uchar* desc = descriptors.ptr(i);
@@ -71,8 +72,9 @@ static void pixelTests16(const Mat& sum, const std::vector<KeyPoint>& keypoints,
     }
 }
 
-static void pixelTests32(const Mat& sum, const std::vector<KeyPoint>& keypoints, Mat& descriptors)
+static void pixelTests32(InputArray _sum, const std::vector<KeyPoint>& keypoints, OutputArray _descriptors)
 {
+    Mat sum = _sum.getMat(), descriptors = _descriptors.getMat();
     for (int i = 0; i < (int)keypoints.size(); ++i)
     {
         uchar* desc = descriptors.ptr(i);
@@ -82,8 +84,9 @@ static void pixelTests32(const Mat& sum, const std::vector<KeyPoint>& keypoints,
     }
 }
 
-static void pixelTests64(const Mat& sum, const std::vector<KeyPoint>& keypoints, Mat& descriptors)
+static void pixelTests64(InputArray _sum, const std::vector<KeyPoint>& keypoints, OutputArray _descriptors)
 {
+    Mat sum = _sum.getMat(), descriptors = _descriptors.getMat();
     for (int i = 0; i < (int)keypoints.size(); ++i)
     {
         uchar* desc = descriptors.ptr(i);
@@ -155,12 +158,12 @@ void BriefDescriptorExtractor::write( FileStorage& fs) const
     fs << "descriptorSize" << bytes_;
 }
 
-void BriefDescriptorExtractor::computeImpl(const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors) const
+void BriefDescriptorExtractor::computeImpl(InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors) const
 {
     // Construct integral image for fast smoothing (box filter)
     Mat sum;
 
-    Mat grayImage = image;
+    Mat grayImage = image.getMat();
     if( image.type() != CV_8U ) cvtColor( image, grayImage, COLOR_BGR2GRAY );
 
     ///TODO allow the user to pass in a precomputed integral image
@@ -173,7 +176,8 @@ void BriefDescriptorExtractor::computeImpl(const Mat& image, std::vector<KeyPoin
     //Remove keypoints very close to the border
     KeyPointsFilter::runByImageBorder(keypoints, image.size(), PATCH_SIZE/2 + KERNEL_SIZE/2);
 
-    descriptors = Mat::zeros((int)keypoints.size(), bytes_, CV_8U);
+    descriptors.create((int)keypoints.size(), bytes_, CV_8U);
+    descriptors.setTo(Scalar::all(0));
     test_fn_(sum, keypoints, descriptors);
 }
 
diff --git a/modules/features2d/src/brisk.cpp b/modules/features2d/src/brisk.cpp
index a3c5d7e3f..f4690eb17 100644
--- a/modules/features2d/src/brisk.cpp
+++ b/modules/features2d/src/brisk.cpp
@@ -224,6 +224,8 @@ BRISK::BRISK(std::vector<float> &radiusList, std::vector<int> &numberList, float
                                                    std::vector<int> indexChange)
 {
   generateKernel(radiusList, numberList, dMax, dMin, indexChange);
+  threshold = 20;
+  octaves = 3;
 }
 
 void
@@ -751,13 +753,13 @@ BRISK::computeKeypointsNoOrientation(InputArray _image, InputArray _mask, std::v
 
 
 void
-BRISK::detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask) const
+BRISK::detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask) const
 {
-    (*this)(image, mask, keypoints);
+    (*this)(image.getMat(), mask.getMat(), keypoints);
 }
 
 void
-BRISK::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors) const
+    BRISK::computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors) const
 {
     (*this)(image, Mat(), keypoints, descriptors, true);
 }
@@ -2229,7 +2231,7 @@ BriskLayer::halfsample(const cv::Mat& srcimg, cv::Mat& dstimg)
   CV_Assert(srcimg.cols / 2 == dstimg.cols);
   CV_Assert(srcimg.rows / 2 == dstimg.rows);
 
-  // handle non-SSE case
+  // handle non-SSE case
   resize(srcimg, dstimg, dstimg.size(), 0, 0, INTER_AREA);
 }
 
diff --git a/modules/features2d/src/descriptors.cpp b/modules/features2d/src/descriptors.cpp
index 9e0ac5c55..c45319062 100644
--- a/modules/features2d/src/descriptors.cpp
+++ b/modules/features2d/src/descriptors.cpp
@@ -54,7 +54,7 @@ namespace cv
 DescriptorExtractor::~DescriptorExtractor()
 {}
 
-void DescriptorExtractor::compute( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const
+void DescriptorExtractor::compute( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const
 {
     if( image.empty() || keypoints.empty() )
     {
@@ -68,8 +68,11 @@ void DescriptorExtractor::compute( const Mat& image, std::vector<KeyPoint>& keyp
     computeImpl( image, keypoints, descriptors );
 }
 
-void DescriptorExtractor::compute( const std::vector<Mat>& imageCollection, std::vector<std::vector<KeyPoint> >& pointCollection, std::vector<Mat>& descCollection ) const
+void DescriptorExtractor::compute( InputArrayOfArrays _imageCollection, std::vector<std::vector<KeyPoint> >& pointCollection, OutputArrayOfArrays _descCollection ) const
 {
+    std::vector<Mat> imageCollection, descCollection;
+    _imageCollection.getMatVector(imageCollection);
+    _descCollection.getMatVector(descCollection);
     CV_Assert( imageCollection.size() == pointCollection.size() );
     descCollection.resize( imageCollection.size() );
     for( size_t i = 0; i < imageCollection.size(); i++ )
@@ -106,7 +109,7 @@ Ptr<DescriptorExtractor> DescriptorExtractor::create(const String& descriptorExt
 }
 
 
-CV_WRAP void Feature2D::compute( const Mat& image, CV_OUT CV_IN_OUT std::vector<KeyPoint>& keypoints, CV_OUT Mat& descriptors ) const
+CV_WRAP void Feature2D::compute( InputArray image, CV_OUT CV_IN_OUT std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const
 {
    DescriptorExtractor::compute(image, keypoints, descriptors);
 }
@@ -157,8 +160,9 @@ struct KP_LessThan
     const std::vector<KeyPoint>* kp;
 };
 
-void OpponentColorDescriptorExtractor::computeImpl( const Mat& bgrImage, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const
+void OpponentColorDescriptorExtractor::computeImpl( InputArray _bgrImage, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const
 {
+    Mat bgrImage = _bgrImage.getMat();
     std::vector<Mat> opponentChannels;
     convertBGRImageToOpponentColorSpace( bgrImage, opponentChannels );
 
diff --git a/modules/features2d/src/detectors.cpp b/modules/features2d/src/detectors.cpp
index 63a882dd3..d3c1f3f20 100644
--- a/modules/features2d/src/detectors.cpp
+++ b/modules/features2d/src/detectors.cpp
@@ -51,7 +51,7 @@ namespace cv
 FeatureDetector::~FeatureDetector()
 {}
 
-void FeatureDetector::detect( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask ) const
+void FeatureDetector::detect( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask ) const
 {
     keypoints.clear();
 
@@ -63,11 +63,29 @@ void FeatureDetector::detect( const Mat& image, std::vector<KeyPoint>& keypoints
     detectImpl( image, keypoints, mask );
 }
 
-void FeatureDetector::detect(const std::vector<Mat>& imageCollection, std::vector<std::vector<KeyPoint> >& pointCollection, const std::vector<Mat>& masks ) const
+void FeatureDetector::detect(InputArrayOfArrays _imageCollection, std::vector<std::vector<KeyPoint> >& pointCollection,
+                             InputArrayOfArrays _masks ) const
 {
+    if (_imageCollection.isUMatVector())
+    {
+        std::vector<UMat> uimageCollection, umasks;
+        _imageCollection.getUMatVector(uimageCollection);
+        _masks.getUMatVector(umasks);
+
+        pointCollection.resize( uimageCollection.size() );
+        for( size_t i = 0; i < uimageCollection.size(); i++ )
+            detect( uimageCollection[i], pointCollection[i], umasks.empty() ? noArray() : umasks[i] );
+
+        return;
+    }
+
+    std::vector<Mat> imageCollection, masks;
+    _imageCollection.getMatVector(imageCollection);
+    _masks.getMatVector(masks);
+
     pointCollection.resize( imageCollection.size() );
     for( size_t i = 0; i < imageCollection.size(); i++ )
-        detect( imageCollection[i], pointCollection[i], masks.empty() ? Mat() : masks[i] );
+        detect( imageCollection[i], pointCollection[i], masks.empty() ? noArray() : masks[i] );
 }
 
 /*void FeatureDetector::read( const FileNode& )
@@ -125,21 +143,37 @@ GFTTDetector::GFTTDetector( int _nfeatures, double _qualityLevel,
 {
 }
 
-void GFTTDetector::detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask) const
+void GFTTDetector::detectImpl( InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask) const
 {
-    Mat grayImage = image;
-    if( image.type() != CV_8U ) cvtColor( image, grayImage, COLOR_BGR2GRAY );
-
     std::vector<Point2f> corners;
-    goodFeaturesToTrack( grayImage, corners, nfeatures, qualityLevel, minDistance, mask,
-                         blockSize, useHarrisDetector, k );
+
+    if (_image.isUMat())
+    {
+        UMat ugrayImage;
+        if( _image.type() != CV_8U )
+            cvtColor( _image, ugrayImage, COLOR_BGR2GRAY );
+        else
+            ugrayImage = _image.getUMat();
+
+        goodFeaturesToTrack( ugrayImage, corners, nfeatures, qualityLevel, minDistance, _mask,
+                             blockSize, useHarrisDetector, k );
+    }
+    else
+    {
+        Mat image = _image.getMat(), grayImage = image;
+        if( image.type() != CV_8U )
+            cvtColor( image, grayImage, COLOR_BGR2GRAY );
+
+        goodFeaturesToTrack( grayImage, corners, nfeatures, qualityLevel, minDistance, _mask,
+                             blockSize, useHarrisDetector, k );
+    }
+
     keypoints.resize(corners.size());
     std::vector<Point2f>::const_iterator corner_it = corners.begin();
     std::vector<KeyPoint>::iterator keypoint_it = keypoints.begin();
     for( ; corner_it != corners.end(); ++corner_it, ++keypoint_it )
-    {
         *keypoint_it = KeyPoint( *corner_it, (float)blockSize );
-    }
+
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -157,8 +191,10 @@ DenseFeatureDetector::DenseFeatureDetector( float _initFeatureScale, int _featur
 {}
 
 
-void DenseFeatureDetector::detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask ) const
+void DenseFeatureDetector::detectImpl( InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask ) const
 {
+    Mat image = _image.getMat(), mask = _mask.getMat();
+
     float curScale = static_cast<float>(initFeatureScale);
     int curStep = initXyStep;
     int curBound = initImgBound;
@@ -271,9 +307,9 @@ public:
 };
 } // namepace
 
-void GridAdaptedFeatureDetector::detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask ) const
+void GridAdaptedFeatureDetector::detectImpl( InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask ) const
 {
-    if (image.empty() || maxTotalKeypoints < gridRows * gridCols)
+    if (_image.empty() || maxTotalKeypoints < gridRows * gridCols)
     {
         keypoints.clear();
         return;
@@ -281,6 +317,8 @@ void GridAdaptedFeatureDetector::detectImpl( const Mat& image, std::vector<KeyPo
     keypoints.reserve(maxTotalKeypoints);
     int maxPerCell = maxTotalKeypoints / (gridRows * gridCols);
 
+    Mat image = _image.getMat(), mask = _mask.getMat();
+
     cv::Mutex kptLock;
     cv::parallel_for_(cv::Range(0, gridRows * gridCols),
         GridAdaptedFeatureDetectorInvoker(detector, image, mask, keypoints, maxPerCell, gridRows, gridCols, &kptLock));
@@ -298,8 +336,9 @@ bool PyramidAdaptedFeatureDetector::empty() const
     return !detector || detector->empty();
 }
 
-void PyramidAdaptedFeatureDetector::detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask ) const
+void PyramidAdaptedFeatureDetector::detectImpl( InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask ) const
 {
+    Mat image = _image.getMat(), mask = _mask.getMat();
     Mat src = image;
     Mat src_mask = mask;
 
diff --git a/modules/features2d/src/draw.cpp b/modules/features2d/src/draw.cpp
index 61c2f817a..6673e4631 100644
--- a/modules/features2d/src/draw.cpp
+++ b/modules/features2d/src/draw.cpp
@@ -50,7 +50,7 @@ namespace cv
 /*
  * Functions to draw keypoints and matches.
  */
-static inline void _drawKeypoint( Mat& img, const KeyPoint& p, const Scalar& color, int flags )
+static inline void _drawKeypoint( InputOutputArray img, const KeyPoint& p, const Scalar& color, int flags )
 {
     CV_Assert( !img.empty() );
     Point center( cvRound(p.pt.x * draw_multiplier), cvRound(p.pt.y * draw_multiplier) );
@@ -88,7 +88,7 @@ static inline void _drawKeypoint( Mat& img, const KeyPoint& p, const Scalar& col
     }
 }
 
-void drawKeypoints( const Mat& image, const std::vector<KeyPoint>& keypoints, Mat& outImage,
+void drawKeypoints( InputArray image, const std::vector<KeyPoint>& keypoints, InputOutputArray outImage,
                     const Scalar& _color, int flags )
 {
     if( !(flags & DrawMatchesFlags::DRAW_OVER_OUTIMG) )
@@ -120,25 +120,29 @@ void drawKeypoints( const Mat& image, const std::vector<KeyPoint>& keypoints, Ma
     }
 }
 
-static void _prepareImgAndDrawKeypoints( const Mat& img1, const std::vector<KeyPoint>& keypoints1,
-                                         const Mat& img2, const std::vector<KeyPoint>& keypoints2,
-                                         Mat& outImg, Mat& outImg1, Mat& outImg2,
+static void _prepareImgAndDrawKeypoints( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                                         InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                                         InputOutputArray _outImg, Mat& outImg1, Mat& outImg2,
                                          const Scalar& singlePointColor, int flags )
 {
-    Size size( img1.cols + img2.cols, MAX(img1.rows, img2.rows) );
+    Mat outImg;
+    Size img1size = img1.size(), img2size = img2.size();
+    Size size( img1size.width + img2size.width, MAX(img1size.height, img2size.height) );
     if( flags & DrawMatchesFlags::DRAW_OVER_OUTIMG )
     {
+        outImg = _outImg.getMat();
         if( size.width > outImg.cols || size.height > outImg.rows )
             CV_Error( Error::StsBadSize, "outImg has size less than need to draw img1 and img2 together" );
-        outImg1 = outImg( Rect(0, 0, img1.cols, img1.rows) );
-        outImg2 = outImg( Rect(img1.cols, 0, img2.cols, img2.rows) );
+        outImg1 = outImg( Rect(0, 0, img1size.width, img1size.height) );
+        outImg2 = outImg( Rect(img1size.width, 0, img2size.width, img2size.height) );
     }
     else
     {
-        outImg.create( size, CV_MAKETYPE(img1.depth(), 3) );
+        _outImg.create( size, CV_MAKETYPE(img1.depth(), 3) );
+        outImg = _outImg.getMat();
         outImg = Scalar::all(0);
-        outImg1 = outImg( Rect(0, 0, img1.cols, img1.rows) );
-        outImg2 = outImg( Rect(img1.cols, 0, img2.cols, img2.rows) );
+        outImg1 = outImg( Rect(0, 0, img1size.width, img1size.height) );
+        outImg2 = outImg( Rect(img1size.width, 0, img2size.width, img2size.height) );
 
         if( img1.type() == CV_8U )
             cvtColor( img1, outImg1, COLOR_GRAY2BGR );
@@ -154,15 +158,15 @@ static void _prepareImgAndDrawKeypoints( const Mat& img1, const std::vector<KeyP
     // draw keypoints
     if( !(flags & DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS) )
     {
-        Mat _outImg1 = outImg( Rect(0, 0, img1.cols, img1.rows) );
+        Mat _outImg1 = outImg( Rect(0, 0, img1size.width, img1size.height) );
         drawKeypoints( _outImg1, keypoints1, _outImg1, singlePointColor, flags + DrawMatchesFlags::DRAW_OVER_OUTIMG );
 
-        Mat _outImg2 = outImg( Rect(img1.cols, 0, img2.cols, img2.rows) );
+        Mat _outImg2 = outImg( Rect(img1size.width, 0, img2size.width, img2size.height) );
         drawKeypoints( _outImg2, keypoints2, _outImg2, singlePointColor, flags + DrawMatchesFlags::DRAW_OVER_OUTIMG );
     }
 }
 
-static inline void _drawMatch( Mat& outImg, Mat& outImg1, Mat& outImg2 ,
+static inline void _drawMatch( InputOutputArray outImg, InputOutputArray outImg1, InputOutputArray outImg2 ,
                           const KeyPoint& kp1, const KeyPoint& kp2, const Scalar& matchColor, int flags )
 {
     RNG& rng = theRNG();
@@ -174,7 +178,7 @@ static inline void _drawMatch( Mat& outImg, Mat& outImg1, Mat& outImg2 ,
 
     Point2f pt1 = kp1.pt,
             pt2 = kp2.pt,
-            dpt2 = Point2f( std::min(pt2.x+outImg1.cols, float(outImg.cols-1)), pt2.y );
+            dpt2 = Point2f( std::min(pt2.x+outImg1.size().width, float(outImg.size().width-1)), pt2.y );
 
     line( outImg,
           Point(cvRound(pt1.x*draw_multiplier), cvRound(pt1.y*draw_multiplier)),
@@ -182,9 +186,9 @@ static inline void _drawMatch( Mat& outImg, Mat& outImg1, Mat& outImg2 ,
           color, 1, LINE_AA, draw_shift_bits );
 }
 
-void drawMatches( const Mat& img1, const std::vector<KeyPoint>& keypoints1,
-                  const Mat& img2, const std::vector<KeyPoint>& keypoints2,
-                  const std::vector<DMatch>& matches1to2, Mat& outImg,
+void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                  InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                  const std::vector<DMatch>& matches1to2, InputOutputArray outImg,
                   const Scalar& matchColor, const Scalar& singlePointColor,
                   const std::vector<char>& matchesMask, int flags )
 {
@@ -211,9 +215,9 @@ void drawMatches( const Mat& img1, const std::vector<KeyPoint>& keypoints1,
     }
 }
 
-void drawMatches( const Mat& img1, const std::vector<KeyPoint>& keypoints1,
-                  const Mat& img2, const std::vector<KeyPoint>& keypoints2,
-                  const std::vector<std::vector<DMatch> >& matches1to2, Mat& outImg,
+void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                  InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                  const std::vector<std::vector<DMatch> >& matches1to2, InputOutputArray outImg,
                   const Scalar& matchColor, const Scalar& singlePointColor,
                   const std::vector<std::vector<char> >& matchesMask, int flags )
 {
diff --git a/modules/features2d/src/dynamic.cpp b/modules/features2d/src/dynamic.cpp
index 6bd6ab4de..560005fba 100644
--- a/modules/features2d/src/dynamic.cpp
+++ b/modules/features2d/src/dynamic.cpp
@@ -54,8 +54,10 @@ bool DynamicAdaptedFeatureDetector::empty() const
     return !adjuster_ || adjuster_->empty();
 }
 
-void DynamicAdaptedFeatureDetector::detectImpl(const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask) const
+void DynamicAdaptedFeatureDetector::detectImpl(InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask) const
 {
+    Mat image = _image.getMat(), mask = _mask.getMat();
+
     //for oscillation testing
     bool down = false;
     bool up = false;
@@ -98,7 +100,7 @@ FastAdjuster::FastAdjuster( int init_thresh, bool nonmax, int min_thresh, int ma
     min_thresh_(min_thresh), max_thresh_(max_thresh)
 {}
 
-void FastAdjuster::detectImpl(const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask) const
+void FastAdjuster::detectImpl(InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask) const
 {
     FastFeatureDetector(thresh_, nonmax_).detect(image, keypoints, mask);
 }
@@ -133,7 +135,7 @@ StarAdjuster::StarAdjuster(double initial_thresh, double min_thresh, double max_
     min_thresh_(min_thresh), max_thresh_(max_thresh)
 {}
 
-void StarAdjuster::detectImpl(const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask) const
+void StarAdjuster::detectImpl(InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask) const
 {
     StarFeatureDetector detector_tmp(16, cvRound(thresh_), 10, 8, 3);
     detector_tmp.detect(image, keypoints, mask);
@@ -167,7 +169,7 @@ SurfAdjuster::SurfAdjuster( double initial_thresh, double min_thresh, double max
     min_thresh_(min_thresh), max_thresh_(max_thresh)
 {}
 
-void SurfAdjuster::detectImpl(const Mat& image, std::vector<KeyPoint>& keypoints, const cv::Mat& mask) const
+void SurfAdjuster::detectImpl(InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask) const
 {
     Ptr<FeatureDetector> surf = FeatureDetector::create("SURF");
     surf->set("hessianThreshold", thresh_);
diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp
index 9e2181c14..b3335a067 100644
--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@@ -283,10 +283,11 @@ FastFeatureDetector::FastFeatureDetector( int _threshold, bool _nonmaxSuppressio
 : threshold(_threshold), nonmaxSuppression(_nonmaxSuppression), type((short)_type)
 {}
 
-void FastFeatureDetector::detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask ) const
+void FastFeatureDetector::detectImpl( InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask ) const
 {
-    Mat grayImage = image;
-    if( image.type() != CV_8U ) cvtColor( image, grayImage, COLOR_BGR2GRAY );
+    Mat image = _image.getMat(), mask = _mask.getMat(), grayImage = image;
+    if( image.type() != CV_8U )
+        cvtColor( image, grayImage, COLOR_BGR2GRAY );
     FAST( grayImage, keypoints, threshold, nonmaxSuppression, type );
     KeyPointsFilter::runByPixelsMask( keypoints, mask );
 }
diff --git a/modules/features2d/src/features2d_init.cpp b/modules/features2d/src/features2d_init.cpp
index 959c1ac9d..889c5b64c 100644
--- a/modules/features2d/src/features2d_init.cpp
+++ b/modules/features2d/src/features2d_init.cpp
@@ -58,19 +58,19 @@ Ptr<Feature2D> Feature2D::create( const String& feature2DType )
 
 CV_INIT_ALGORITHM(BRISK, "Feature2D.BRISK",
                    obj.info()->addParam(obj, "thres", obj.threshold);
-                   obj.info()->addParam(obj, "octaves", obj.octaves));
+                   obj.info()->addParam(obj, "octaves", obj.octaves))
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 CV_INIT_ALGORITHM(BriefDescriptorExtractor, "Feature2D.BRIEF",
-                  obj.info()->addParam(obj, "bytes", obj.bytes_));
+                  obj.info()->addParam(obj, "bytes", obj.bytes_))
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 CV_INIT_ALGORITHM(FastFeatureDetector, "Feature2D.FAST",
                   obj.info()->addParam(obj, "threshold", obj.threshold);
                   obj.info()->addParam(obj, "nonmaxSuppression", obj.nonmaxSuppression);
-                  obj.info()->addParam(obj, "type", obj.type));
+                  obj.info()->addParam(obj, "type", obj.type))
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -79,7 +79,7 @@ CV_INIT_ALGORITHM(StarDetector, "Feature2D.STAR",
                   obj.info()->addParam(obj, "responseThreshold", obj.responseThreshold);
                   obj.info()->addParam(obj, "lineThresholdProjected", obj.lineThresholdProjected);
                   obj.info()->addParam(obj, "lineThresholdBinarized", obj.lineThresholdBinarized);
-                  obj.info()->addParam(obj, "suppressNonmaxSize", obj.suppressNonmaxSize));
+                  obj.info()->addParam(obj, "suppressNonmaxSize", obj.suppressNonmaxSize))
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -92,7 +92,7 @@ CV_INIT_ALGORITHM(MSER, "Feature2D.MSER",
                   obj.info()->addParam(obj, "maxEvolution", obj.maxEvolution);
                   obj.info()->addParam(obj, "areaThreshold", obj.areaThreshold);
                   obj.info()->addParam(obj, "minMargin", obj.minMargin);
-                  obj.info()->addParam(obj, "edgeBlurSize", obj.edgeBlurSize));
+                  obj.info()->addParam(obj, "edgeBlurSize", obj.edgeBlurSize))
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -104,7 +104,7 @@ CV_INIT_ALGORITHM(ORB, "Feature2D.ORB",
                   obj.info()->addParam(obj, "edgeThreshold", obj.edgeThreshold);
                   obj.info()->addParam(obj, "patchSize", obj.patchSize);
                   obj.info()->addParam(obj, "WTA_K", obj.WTA_K);
-                  obj.info()->addParam(obj, "scoreType", obj.scoreType));
+                  obj.info()->addParam(obj, "scoreType", obj.scoreType))
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -112,7 +112,7 @@ CV_INIT_ALGORITHM(FREAK, "Feature2D.FREAK",
                   obj.info()->addParam(obj, "orientationNormalized", obj.orientationNormalized);
                   obj.info()->addParam(obj, "scaleNormalized", obj.scaleNormalized);
                   obj.info()->addParam(obj, "patternScale", obj.patternScale);
-                  obj.info()->addParam(obj, "nbOctave", obj.nOctaves));
+                  obj.info()->addParam(obj, "nbOctave", obj.nOctaves))
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -121,7 +121,7 @@ CV_INIT_ALGORITHM(GFTTDetector, "Feature2D.GFTT",
                   obj.info()->addParam(obj, "qualityLevel", obj.qualityLevel);
                   obj.info()->addParam(obj, "minDistance", obj.minDistance);
                   obj.info()->addParam(obj, "useHarrisDetector", obj.useHarrisDetector);
-                  obj.info()->addParam(obj, "k", obj.k));
+                  obj.info()->addParam(obj, "k", obj.k))
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -141,7 +141,7 @@ CV_INIT_ALGORITHM(SimpleBlobDetector, "Feature2D.SimpleBlob",
                   obj.info()->addParam(obj, "maxInertiaRatio",  obj.params.maxInertiaRatio);
                   obj.info()->addParam(obj, "filterByConvexity", obj.params.filterByConvexity);
                   obj.info()->addParam(obj, "maxConvexity",     obj.params.maxConvexity);
-                  );
+                  )
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -162,7 +162,7 @@ CV_INIT_ALGORITHM(HarrisDetector, "Feature2D.HARRIS",
                   obj.info()->addParam(obj, "qualityLevel", obj.qualityLevel);
                   obj.info()->addParam(obj, "minDistance", obj.minDistance);
                   obj.info()->addParam(obj, "useHarrisDetector", obj.useHarrisDetector);
-                  obj.info()->addParam(obj, "k", obj.k));
+                  obj.info()->addParam(obj, "k", obj.k))
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -173,21 +173,21 @@ CV_INIT_ALGORITHM(DenseFeatureDetector, "Feature2D.Dense",
                   obj.info()->addParam(obj, "initXyStep", obj.initXyStep);
                   obj.info()->addParam(obj, "initImgBound", obj.initImgBound);
                   obj.info()->addParam(obj, "varyXyStepWithScale", obj.varyXyStepWithScale);
-                  obj.info()->addParam(obj, "varyImgBoundWithScale", obj.varyImgBoundWithScale));
+                  obj.info()->addParam(obj, "varyImgBoundWithScale", obj.varyImgBoundWithScale))
 
 CV_INIT_ALGORITHM(GridAdaptedFeatureDetector, "Feature2D.Grid",
                   obj.info()->addParam<FeatureDetector>(obj, "detector", obj.detector, false, 0, 0); // Extra params added to avoid VS2013 fatal error in opencv2/core.hpp (decl. of addParam)
                   obj.info()->addParam(obj, "maxTotalKeypoints", obj.maxTotalKeypoints);
                   obj.info()->addParam(obj, "gridRows", obj.gridRows);
-                  obj.info()->addParam(obj, "gridCols", obj.gridCols));
+                  obj.info()->addParam(obj, "gridCols", obj.gridCols))
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 CV_INIT_ALGORITHM(BFMatcher, "DescriptorMatcher.BFMatcher",
                   obj.info()->addParam(obj, "normType", obj.normType);
-                  obj.info()->addParam(obj, "crossCheck", obj.crossCheck));
+                  obj.info()->addParam(obj, "crossCheck", obj.crossCheck))
 
-CV_INIT_ALGORITHM(FlannBasedMatcher, "DescriptorMatcher.FlannBasedMatcher",);
+CV_INIT_ALGORITHM(FlannBasedMatcher, "DescriptorMatcher.FlannBasedMatcher",)
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/modules/features2d/src/freak.cpp b/modules/features2d/src/freak.cpp
index 35ff97f58..8759efa2e 100644
--- a/modules/features2d/src/freak.cpp
+++ b/modules/features2d/src/freak.cpp
@@ -229,9 +229,9 @@ void FREAK::buildPattern()
     }
 }
 
-void FREAK::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const
+void FREAK::computeImpl( InputArray _image, std::vector<KeyPoint>& keypoints, OutputArray _descriptors ) const
 {
-
+    Mat image = _image.getMat();
     if( image.empty() )
         return;
     if( keypoints.empty() )
@@ -297,7 +297,9 @@ void FREAK::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat
     if( !extAll )
     {
         // extract the best comparisons only
-        descriptors = cv::Mat::zeros((int)keypoints.size(), FREAK_NB_PAIRS/8, CV_8U);
+        _descriptors.create((int)keypoints.size(), FREAK_NB_PAIRS/8, CV_8U);
+        _descriptors.setTo(Scalar::all(0));
+        Mat descriptors = _descriptors.getMat();
 #if CV_SSE2
         __m128i* ptr= (__m128i*) (descriptors.data+(keypoints.size()-1)*descriptors.step[0]);
 #else
@@ -415,7 +417,9 @@ void FREAK::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat
     }
     else // extract all possible comparisons for selection
     {
-        descriptors = cv::Mat::zeros((int)keypoints.size(), 128, CV_8U);
+        _descriptors.create((int)keypoints.size(), 128, CV_8U);
+        _descriptors.setTo(Scalar::all(0));
+        Mat descriptors = _descriptors.getMat();
         std::bitset<1024>* ptr = (std::bitset<1024>*) (descriptors.data+(keypoints.size()-1)*descriptors.step[0]);
 
         for( size_t k = keypoints.size(); k--; )
@@ -474,13 +478,14 @@ void FREAK::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat
 }
 
 // simply take average on a square patch, not even gaussian approx
-uchar FREAK::meanIntensity( const cv::Mat& image, const cv::Mat& integral,
+uchar FREAK::meanIntensity( InputArray _image, InputArray _integral,
                             const float kp_x,
                             const float kp_y,
                             const unsigned int scale,
                             const unsigned int rot,
                             const unsigned int point) const
 {
+    Mat image = _image.getMat(), integral = _integral.getMat();
     // get point position in image
     const PatternPoint& FreakPoint = patternLookup[scale*FREAK_NB_ORIENTATION*FREAK_NB_POINTS + rot*FREAK_NB_POINTS + point];
     const float xf = FreakPoint.x+kp_x;
diff --git a/modules/features2d/src/matchers.cpp b/modules/features2d/src/matchers.cpp
index 087c6a78b..6bc925f8a 100644
--- a/modules/features2d/src/matchers.cpp
+++ b/modules/features2d/src/matchers.cpp
@@ -41,6 +41,7 @@
 
 #include "precomp.hpp"
 #include <limits>
+#include "opencl_kernels.hpp"
 
 #if defined(HAVE_EIGEN) && EIGEN_WORLD_VERSION == 2
 #include <Eigen/Array>
@@ -68,6 +69,533 @@ Mat windowedMatchingMask( const std::vector<KeyPoint>& keypoints1, const std::ve
     return mask;
 }
 
+//////////////////////////////////////////////////////////////////ocl functions for BFMatcher ///////////////////////////////////////////////////////////////
+
+static void ensureSizeIsEnough(int rows, int cols, int type, UMat &m)
+{
+    if (m.type() == type && m.rows >= rows && m.cols >= cols)
+        m = m(Rect(0, 0, cols, rows));
+    else
+        m.create(rows, cols, type);
+}
+
+
+template < int BLOCK_SIZE, int MAX_DESC_LEN >
+static bool ocl_matchUnrolledCached(InputArray _query, InputArray _train,
+                     const UMat &trainIdx, const UMat &distance, int distType)
+{
+    int depth = _query.depth();
+    cv::String opts;
+    opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
+        ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN );
+    ocl::Kernel k("BruteForceMatch_UnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+    if(k.empty())
+        return false;
+
+    size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+    if(globalSize[0] != 0)
+    {
+        UMat query = _query.getUMat(), train = _train.getUMat();
+
+        int idx = 0;
+        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+        idx = k.set(idx, (void *)NULL, smemSize);
+        idx = k.set(idx, query.rows);
+        idx = k.set(idx, query.cols);
+        idx = k.set(idx, train.rows);
+        idx = k.set(idx, train.cols);
+        idx = k.set(idx, (int)query.step);
+
+        return k.run(2, globalSize, localSize, false);
+    }
+    return true;
+}
+
+template < int BLOCK_SIZE >
+static bool ocl_match(InputArray _query, InputArray _train,
+                     const UMat &trainIdx, const UMat &distance, int distType)
+{
+    int depth = _query.depth();
+    cv::String opts;
+    opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
+        ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
+    ocl::Kernel k("BruteForceMatch_Match", ocl::features2d::brute_force_match_oclsrc, opts);
+    if(k.empty())
+        return false;
+
+    size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+    if(globalSize[0] != 0)
+    {
+        UMat query = _query.getUMat(), train = _train.getUMat();
+
+        int idx = 0;
+        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+        idx = k.set(idx, (void *)NULL, smemSize);
+        idx = k.set(idx, query.rows);
+        idx = k.set(idx, query.cols);
+        idx = k.set(idx, train.rows);
+        idx = k.set(idx, train.cols);
+        idx = k.set(idx, (int)query.step);
+
+        return k.run(2, globalSize, localSize, false);
+    }
+    return true;
+}
+
+static bool ocl_matchDispatcher(InputArray query, InputArray train,
+                     const UMat &trainIdx, const UMat &distance, int distType)
+{
+    int query_cols = query.size().width;
+    bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
+    if (query_cols <= 64)
+    {
+        if(!ocl_matchUnrolledCached<16, 64>(query, train, trainIdx, distance, distType)) return false;
+    }
+    else if (query_cols <= 128 && !is_cpu)
+    {
+        if(!ocl_matchUnrolledCached<16, 128>(query, train, trainIdx,  distance, distType)) return false;
+    }
+    else
+    {
+        if(!ocl_match<16>(query, train, trainIdx, distance, distType)) return false;
+    }
+    return true;
+}
+
+static bool ocl_matchSingle(InputArray query, InputArray train,
+        UMat &trainIdx, UMat &distance, int dstType)
+{
+    if (query.empty() || train.empty())
+        return false;
+
+    int query_rows = query.size().height;
+
+    ensureSizeIsEnough(1, query_rows, CV_32S, trainIdx);
+    ensureSizeIsEnough(1, query_rows, CV_32F, distance);
+
+    return ocl_matchDispatcher(query, train, trainIdx, distance, dstType);
+}
+
+static bool ocl_matchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector<DMatch> > &matches)
+{
+    if (trainIdx.empty() || distance.empty())
+        return false;
+
+    if( (trainIdx.type() != CV_32SC1) || (distance.type() != CV_32FC1 || distance.cols != trainIdx.cols) )
+        return false;
+
+    const int nQuery = trainIdx.cols;
+
+    matches.clear();
+    matches.reserve(nQuery);
+
+    const int *trainIdx_ptr = trainIdx.ptr<int>();
+    const float *distance_ptr =  distance.ptr<float>();
+    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
+    {
+        int trainIndex = *trainIdx_ptr;
+
+        if (trainIndex == -1)
+            continue;
+
+        float dst = *distance_ptr;
+
+        DMatch m(queryIdx, trainIndex, 0, dst);
+
+        std::vector<DMatch> temp;
+        temp.push_back(m);
+        matches.push_back(temp);
+    }
+    return true;
+}
+
+static bool ocl_matchDownload(const UMat &trainIdx, const UMat &distance, std::vector< std::vector<DMatch> > &matches)
+{
+    if (trainIdx.empty() || distance.empty())
+        return false;
+
+    Mat trainIdxCPU = trainIdx.getMat(ACCESS_READ);
+    Mat distanceCPU = distance.getMat(ACCESS_READ);
+
+    return ocl_matchConvert(trainIdxCPU, distanceCPU, matches);
+}
+
+template < int BLOCK_SIZE, int MAX_DESC_LEN >
+static bool ocl_knn_matchUnrolledCached(InputArray _query, InputArray _train,
+                             const UMat &trainIdx, const UMat &distance, int distType)
+{
+    int depth = _query.depth();
+    cv::String opts;
+    opts = cv::format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
+        ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN );
+    ocl::Kernel k("BruteForceMatch_knnUnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+    if(k.empty())
+        return false;
+
+    size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+    if(globalSize[0] != 0)
+    {
+        UMat query = _query.getUMat(), train = _train.getUMat();
+
+        int idx = 0;
+        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+        idx = k.set(idx, (void *)NULL, smemSize);
+        idx = k.set(idx, query.rows);
+        idx = k.set(idx, query.cols);
+        idx = k.set(idx, train.rows);
+        idx = k.set(idx, train.cols);
+        idx = k.set(idx, (int)query.step);
+
+        return k.run(2, globalSize, localSize, false);
+    }
+    return true;
+}
+
+template < int BLOCK_SIZE >
+static bool ocl_knn_match(InputArray _query, InputArray _train,
+               const UMat &trainIdx, const UMat &distance, int distType)
+{
+    int depth = _query.depth();
+    cv::String opts;
+    opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
+        ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
+    ocl::Kernel k("BruteForceMatch_knnMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+    if(k.empty())
+        return false;
+
+    size_t globalSize[] = {(_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
+    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+    if(globalSize[0] != 0)
+    {
+        UMat query = _query.getUMat(), train = _train.getUMat();
+
+        int idx = 0;
+        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+        idx = k.set(idx, (void*)NULL, smemSize);
+        idx = k.set(idx, query.rows);
+        idx = k.set(idx, query.cols);
+        idx = k.set(idx, train.rows);
+        idx = k.set(idx, train.cols);
+        idx = k.set(idx, (int)query.step);
+
+        return k.run(2, globalSize, localSize, false);
+    }
+    return true;
+}
+
+static bool ocl_match2Dispatcher(InputArray query, InputArray train, const UMat &trainIdx, const UMat &distance, int distType)
+{
+    bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
+    if (query.size().width <= 64)
+    {
+        if(!ocl_knn_matchUnrolledCached<16, 64>(query, train, trainIdx, distance, distType))
+            return false;
+    }
+    else if (query.size().width <= 128 && !is_cpu)
+    {
+        if(!ocl_knn_matchUnrolledCached<16, 128>(query, train, trainIdx, distance, distType))
+            return false;
+    }
+    else
+    {
+        if(!ocl_knn_match<16>(query, train, trainIdx, distance, distType))
+            return false;
+    }
+    return true;
+}
+
+static bool ocl_kmatchDispatcher(InputArray query, InputArray train, const UMat &trainIdx,
+                                 const UMat &distance, int distType)
+{
+        return ocl_match2Dispatcher(query, train, trainIdx, distance, distType);
+}
+
+static bool ocl_knnMatchSingle(InputArray query, InputArray train, UMat &trainIdx,
+                               UMat &distance, int dstType)
+{
+    if (query.empty() || train.empty())
+        return false;
+
+    const int nQuery = query.size().height;
+
+    ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
+    ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
+
+    trainIdx.setTo(Scalar::all(-1));
+
+    return ocl_kmatchDispatcher(query, train, trainIdx, distance, dstType);
+}
+
+static bool ocl_knnMatchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector<DMatch> > &matches, bool compactResult)
+{
+    if (trainIdx.empty() || distance.empty())
+        return false;
+
+    if(trainIdx.type() != CV_32SC2 && trainIdx.type() != CV_32SC1) return false;
+    if(distance.type() != CV_32FC2 && distance.type() != CV_32FC1)return false;
+    if(distance.size() != trainIdx.size()) return false;
+    if(!trainIdx.isContinuous() || !distance.isContinuous()) return false;
+
+    const int nQuery = trainIdx.type() == CV_32SC2 ? trainIdx.cols : trainIdx.rows;
+    const int k = trainIdx.type() == CV_32SC2 ? 2 : trainIdx.cols;
+
+    matches.clear();
+    matches.reserve(nQuery);
+
+    const int *trainIdx_ptr = trainIdx.ptr<int>();
+    const float *distance_ptr = distance.ptr<float>();
+
+    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
+    {
+        matches.push_back(std::vector<DMatch>());
+        std::vector<DMatch> &curMatches = matches.back();
+        curMatches.reserve(k);
+
+        for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
+        {
+            int trainIndex = *trainIdx_ptr;
+
+            if (trainIndex != -1)
+            {
+                float dst = *distance_ptr;
+
+                DMatch m(queryIdx, trainIndex, 0, dst);
+
+                curMatches.push_back(m);
+            }
+        }
+
+        if (compactResult && curMatches.empty())
+            matches.pop_back();
+    }
+    return true;
+}
+
+static bool ocl_knnMatchDownload(const UMat &trainIdx, const UMat &distance, std::vector< std::vector<DMatch> > &matches, bool compactResult)
+{
+    if (trainIdx.empty() || distance.empty())
+        return false;
+
+    Mat trainIdxCPU = trainIdx.getMat(ACCESS_READ);
+    Mat distanceCPU = distance.getMat(ACCESS_READ);
+
+    if (ocl_knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult) )
+        return true;
+    return false;
+}
+
+template < int BLOCK_SIZE, int MAX_DESC_LEN >
+static bool ocl_matchUnrolledCached(InputArray _query, InputArray _train, float maxDistance,
+                  const UMat &trainIdx, const UMat &distance, const UMat &nMatches, int distType)
+{
+    int depth = _query.depth();
+    cv::String opts;
+    opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
+        ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE, (int)MAX_DESC_LEN);
+    ocl::Kernel k("BruteForceMatch_RadiusUnrollMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+    if(k.empty())
+        return false;
+
+    size_t globalSize[] = {(_train.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
+    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+    if(globalSize[0] != 0)
+    {
+        UMat query = _query.getUMat(), train = _train.getUMat();
+
+        int idx = 0;
+        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+        idx = k.set(idx, maxDistance);
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
+        idx = k.set(idx, (void*)NULL, smemSize);
+        idx = k.set(idx, query.rows);
+        idx = k.set(idx, query.cols);
+        idx = k.set(idx, train.rows);
+        idx = k.set(idx, train.cols);
+        idx = k.set(idx, trainIdx.cols);
+        idx = k.set(idx, (int)query.step);
+        idx = k.set(idx, (int)trainIdx.step);
+
+        return k.run(2, globalSize, localSize, false);
+    }
+    return true;
+}
+
+//radius_match
+template < int BLOCK_SIZE >
+static bool ocl_radius_match(InputArray _query, InputArray _train, float maxDistance,
+                  const UMat &trainIdx, const UMat &distance, const UMat &nMatches, int distType)
+{
+    int depth = _query.depth();
+    cv::String opts;
+    opts = format("-D T=%s %s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", ocl::typeToStr(depth), depth == CV_32F ? "-D T_FLOAT" : "", distType, (int)BLOCK_SIZE);
+    ocl::Kernel k("BruteForceMatch_RadiusMatch", ocl::features2d::brute_force_match_oclsrc, opts);
+    if(k.empty())
+        return false;
+
+    size_t globalSize[] = {(_train.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (_query.size().height + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
+    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
+    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+    if(globalSize[0] != 0)
+    {
+        UMat query = _query.getUMat(), train = _train.getUMat();
+
+        int idx = 0;
+        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(query));
+        idx = k.set(idx, ocl::KernelArg::PtrReadOnly(train));
+        idx = k.set(idx, maxDistance);
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(trainIdx));
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(distance));
+        idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(nMatches));
+        idx = k.set(idx, (void*)NULL, smemSize);
+        idx = k.set(idx, query.rows);
+        idx = k.set(idx, query.cols);
+        idx = k.set(idx, train.rows);
+        idx = k.set(idx, train.cols);
+        idx = k.set(idx, trainIdx.cols);
+        idx = k.set(idx, (int)query.step);
+        idx = k.set(idx, (int)trainIdx.step);
+
+        return k.run(2, globalSize, localSize, false);
+    }
+    return true;
+}
+
+static bool ocl_rmatchDispatcher(InputArray query, InputArray train,
+        UMat &trainIdx,   UMat &distance, UMat &nMatches, float maxDistance, int distType)
+{
+    bool is_cpu = ocl::Device::getDefault().type() == ocl::Device::TYPE_CPU;
+    int query_cols = query.size().width;
+    if (query_cols <= 64)
+    {
+        if(!ocl_matchUnrolledCached<16, 64>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
+    }
+    else if (query_cols <= 128 && !is_cpu)
+    {
+        if(!ocl_matchUnrolledCached<16, 128>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
+    }
+    else
+    {
+        if(!ocl_radius_match<16>(query, train, maxDistance, trainIdx, distance, nMatches, distType)) return false;
+    }
+    return true;
+}
+
+
+static bool ocl_radiusMatchSingle(InputArray query, InputArray train,
+        UMat &trainIdx,   UMat &distance, UMat &nMatches, float maxDistance, int distType)
+{
+    if (query.empty() || train.empty())
+        return false;
+
+    const int nQuery = query.size().height;
+    const int nTrain = train.size().height;
+
+    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
+
+    if (trainIdx.empty())
+    {
+        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
+        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
+    }
+
+    nMatches.setTo(Scalar::all(0));
+
+    return ocl_rmatchDispatcher(query, train, trainIdx, distance, nMatches, maxDistance, distType);
+}
+
+static bool ocl_radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &_nMatches,
+        std::vector< std::vector<DMatch> > &matches, bool compactResult)
+{
+    if (trainIdx.empty() || distance.empty() || _nMatches.empty())
+        return false;
+
+    if( (trainIdx.type() != CV_32SC1) ||
+        (distance.type() != CV_32FC1 || distance.size() != trainIdx.size()) ||
+        (_nMatches.type() != CV_32SC1 || _nMatches.cols != trainIdx.rows) )
+        return false;
+
+    const int nQuery = trainIdx.rows;
+
+    matches.clear();
+    matches.reserve(nQuery);
+
+    const int *nMatches_ptr = _nMatches.ptr<int>();
+
+    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
+    {
+        const int *trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
+        const float *distance_ptr = distance.ptr<float>(queryIdx);
+
+        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
+
+        if (nMatches == 0)
+        {
+            if (!compactResult)
+                matches.push_back(std::vector<DMatch>());
+            continue;
+        }
+
+        matches.push_back(std::vector<DMatch>(nMatches));
+        std::vector<DMatch> &curMatches = matches.back();
+
+        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)
+        {
+            int trainIndex = *trainIdx_ptr;
+
+            float dst = *distance_ptr;
+
+            DMatch m(queryIdx, trainIndex, 0, dst);
+
+            curMatches[i] = m;
+        }
+
+        std::sort(curMatches.begin(), curMatches.end());
+    }
+    return true;
+}
+
+static bool ocl_radiusMatchDownload(const UMat &trainIdx, const UMat &distance, const UMat &nMatches,
+        std::vector< std::vector<DMatch> > &matches, bool compactResult)
+{
+    if (trainIdx.empty() || distance.empty() || nMatches.empty())
+        return false;
+
+    Mat trainIdxCPU = trainIdx.getMat(ACCESS_READ);
+    Mat distanceCPU = distance.getMat(ACCESS_READ);
+    Mat nMatchesCPU = nMatches.getMat(ACCESS_READ);
+
+    return ocl_radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
+}
+
 /****************************************************************************************\
 *                                      DescriptorMatcher                                 *
 \****************************************************************************************/
@@ -190,9 +718,32 @@ static void convertMatches( const std::vector<std::vector<DMatch> >& knnMatches,
 DescriptorMatcher::~DescriptorMatcher()
 {}
 
-void DescriptorMatcher::add( const std::vector<Mat>& descriptors )
+void DescriptorMatcher::add( InputArrayOfArrays _descriptors )
 {
-    trainDescCollection.insert( trainDescCollection.end(), descriptors.begin(), descriptors.end() );
+    if(_descriptors.isUMatVector())
+    {
+        std::vector<UMat> descriptors;
+        _descriptors.getUMatVector(descriptors);
+        utrainDescCollection.insert( utrainDescCollection.end(), descriptors.begin(), descriptors.end() );
+    }
+    else if(_descriptors.isUMat())
+    {
+        std::vector<UMat> descriptors = std::vector<UMat>(1, _descriptors.getUMat());
+        utrainDescCollection.insert( utrainDescCollection.end(), descriptors.begin(), descriptors.end() );
+    }
+    else if(_descriptors.isMatVector())
+    {
+        std::vector<Mat> descriptors;
+        _descriptors.getMatVector(descriptors);
+        trainDescCollection.insert( trainDescCollection.end(), descriptors.begin(), descriptors.end() );
+    }
+    else if(_descriptors.isMat())
+    {
+        std::vector<Mat> descriptors = std::vector<Mat>(1, _descriptors.getMat());
+        trainDescCollection.insert( trainDescCollection.end(), descriptors.begin(), descriptors.end() );
+    }
+    else
+        CV_Assert( _descriptors.isUMat() || _descriptors.isUMatVector() || _descriptors.isMat() || _descriptors.isMatVector() );
 }
 
 const std::vector<Mat>& DescriptorMatcher::getTrainDescriptors() const
@@ -202,83 +753,90 @@ const std::vector<Mat>& DescriptorMatcher::getTrainDescriptors() const
 
 void DescriptorMatcher::clear()
 {
+    utrainDescCollection.clear();
     trainDescCollection.clear();
 }
 
 bool DescriptorMatcher::empty() const
 {
-    return trainDescCollection.empty();
+    return trainDescCollection.empty() && utrainDescCollection.empty();
 }
 
 void DescriptorMatcher::train()
 {}
 
-void DescriptorMatcher::match( const Mat& queryDescriptors, const Mat& trainDescriptors, std::vector<DMatch>& matches, const Mat& mask ) const
+void DescriptorMatcher::match( InputArray queryDescriptors, InputArray trainDescriptors,
+                              std::vector<DMatch>& matches, InputArray mask ) const
 {
     Ptr<DescriptorMatcher> tempMatcher = clone(true);
-    tempMatcher->add( std::vector<Mat>(1, trainDescriptors) );
-    tempMatcher->match( queryDescriptors, matches, std::vector<Mat>(1, mask) );
+    tempMatcher->add(trainDescriptors);
+    tempMatcher->match( queryDescriptors, matches, std::vector<Mat>(1, mask.getMat()) );
 }
 
-void DescriptorMatcher::knnMatch( const Mat& queryDescriptors, const Mat& trainDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
-                                  const Mat& mask, bool compactResult ) const
+void DescriptorMatcher::knnMatch( InputArray queryDescriptors, InputArray trainDescriptors,
+                                  std::vector<std::vector<DMatch> >& matches, int knn,
+                                  InputArray mask, bool compactResult ) const
 {
     Ptr<DescriptorMatcher> tempMatcher = clone(true);
-    tempMatcher->add( std::vector<Mat>(1, trainDescriptors) );
-    tempMatcher->knnMatch( queryDescriptors, matches, knn, std::vector<Mat>(1, mask), compactResult );
+    tempMatcher->add(trainDescriptors);
+    tempMatcher->knnMatch( queryDescriptors, matches, knn, std::vector<Mat>(1, mask.getMat()), compactResult );
 }
 
-void DescriptorMatcher::radiusMatch( const Mat& queryDescriptors, const Mat& trainDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                                     const Mat& mask, bool compactResult ) const
+void DescriptorMatcher::radiusMatch( InputArray queryDescriptors, InputArray trainDescriptors,
+                                     std::vector<std::vector<DMatch> >& matches, float maxDistance, InputArray mask,
+                                     bool compactResult ) const
 {
     Ptr<DescriptorMatcher> tempMatcher = clone(true);
-    tempMatcher->add( std::vector<Mat>(1, trainDescriptors) );
-    tempMatcher->radiusMatch( queryDescriptors, matches, maxDistance, std::vector<Mat>(1, mask), compactResult );
+    tempMatcher->add(trainDescriptors);
+    tempMatcher->radiusMatch( queryDescriptors, matches, maxDistance, std::vector<Mat>(1, mask.getMat()), compactResult );
 }
 
-void DescriptorMatcher::match( const Mat& queryDescriptors, std::vector<DMatch>& matches, const std::vector<Mat>& masks )
+void DescriptorMatcher::match( InputArray queryDescriptors, std::vector<DMatch>& matches, InputArrayOfArrays masks )
 {
     std::vector<std::vector<DMatch> > knnMatches;
     knnMatch( queryDescriptors, knnMatches, 1, masks, true /*compactResult*/ );
     convertMatches( knnMatches, matches );
 }
 
-void DescriptorMatcher::checkMasks( const std::vector<Mat>& masks, int queryDescriptorsCount ) const
+void DescriptorMatcher::checkMasks( InputArrayOfArrays _masks, int queryDescriptorsCount ) const
 {
+    std::vector<Mat> masks;
+    _masks.getMatVector(masks);
+
     if( isMaskSupported() && !masks.empty() )
     {
         // Check masks
-        size_t imageCount = trainDescCollection.size();
+        size_t imageCount = std::max(trainDescCollection.size(), utrainDescCollection.size() );
         CV_Assert( masks.size() == imageCount );
         for( size_t i = 0; i < imageCount; i++ )
         {
-            if( !masks[i].empty() && !trainDescCollection[i].empty() )
+            if( !masks[i].empty() && (!trainDescCollection[i].empty() || !utrainDescCollection[i].empty() ) )
             {
+                int rows = trainDescCollection[i].empty() ? utrainDescCollection[i].rows : trainDescCollection[i].rows;
                     CV_Assert( masks[i].rows == queryDescriptorsCount &&
-                                   masks[i].cols == trainDescCollection[i].rows &&
-                                       masks[i].type() == CV_8UC1 );
+                        (masks[i].cols == rows || masks[i].cols == rows) &&
+                        masks[i].type() == CV_8UC1 );
             }
         }
     }
 }
 
-void DescriptorMatcher::knnMatch( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
-                                  const std::vector<Mat>& masks, bool compactResult )
+void DescriptorMatcher::knnMatch( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
+                                  InputArrayOfArrays masks, bool compactResult )
 {
-    matches.clear();
     if( empty() || queryDescriptors.empty() )
         return;
 
     CV_Assert( knn > 0 );
 
-    checkMasks( masks, queryDescriptors.rows );
+    checkMasks( masks, queryDescriptors.size().height );
 
     train();
     knnMatchImpl( queryDescriptors, matches, knn, masks, compactResult );
 }
 
-void DescriptorMatcher::radiusMatch( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                                     const std::vector<Mat>& masks, bool compactResult )
+void DescriptorMatcher::radiusMatch( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+                                     InputArrayOfArrays masks, bool compactResult )
 {
     matches.clear();
     if( empty() || queryDescriptors.empty() )
@@ -286,7 +844,7 @@ void DescriptorMatcher::radiusMatch( const Mat& queryDescriptors, std::vector<st
 
     CV_Assert( maxDistance > std::numeric_limits<float>::epsilon() );
 
-    checkMasks( masks, queryDescriptors.rows );
+    checkMasks( masks, queryDescriptors.size().height );
 
     train();
     radiusMatchImpl( queryDescriptors, matches, maxDistance, masks, compactResult );
@@ -298,13 +856,17 @@ void DescriptorMatcher::read( const FileNode& )
 void DescriptorMatcher::write( FileStorage& ) const
 {}
 
-bool DescriptorMatcher::isPossibleMatch( const Mat& mask, int queryIdx, int trainIdx )
+bool DescriptorMatcher::isPossibleMatch( InputArray _mask, int queryIdx, int trainIdx )
 {
+    Mat mask = _mask.getMat();
     return mask.empty() || mask.at<uchar>(queryIdx, trainIdx);
 }
 
-bool DescriptorMatcher::isMaskedOut( const std::vector<Mat>& masks, int queryIdx )
+bool DescriptorMatcher::isMaskedOut( InputArrayOfArrays _masks, int queryIdx )
 {
+    std::vector<Mat> masks;
+    _masks.getMatVector(masks);
+
     size_t outCount = 0;
     for( size_t i = 0; i < masks.size(); i++ )
     {
@@ -316,7 +878,7 @@ bool DescriptorMatcher::isMaskedOut( const std::vector<Mat>& masks, int queryIdx
 }
 
 
-///////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////// BruteForceMatcher /////////////////////////////////////////////////
 
 BFMatcher::BFMatcher( int _normType, bool _crossCheck )
 {
@@ -336,19 +898,104 @@ Ptr<DescriptorMatcher> BFMatcher::clone( bool emptyTrainData ) const
     return matcher;
 }
 
-
-void BFMatcher::knnMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
-                              const std::vector<Mat>& masks, bool compactResult )
+static bool ocl_match(InputArray query, InputArray _train, std::vector< std::vector<DMatch> > &matches, int dstType)
 {
+    UMat trainIdx, distance;
+    if (!ocl_matchSingle(query, _train, trainIdx, distance, dstType))
+        return false;
+    if (!ocl_matchDownload(trainIdx, distance, matches))
+        return false;
+    return true;
+}
+
+static bool ocl_knnMatch(InputArray query, InputArray _train, std::vector< std::vector<DMatch> > &matches, int k, int dstType, bool compactResult)
+{
+    UMat trainIdx, distance;
+    if (k != 2)
+        return false;
+    if (!ocl_knnMatchSingle(query, _train, trainIdx, distance, dstType))
+        return false;
+    if (!ocl_knnMatchDownload(trainIdx, distance, matches, compactResult) )
+        return false;
+    return true;
+}
+
+void BFMatcher::knnMatchImpl( InputArray _queryDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
+                             InputArrayOfArrays _masks, bool compactResult )
+{
+    int trainDescType = trainDescCollection.empty() ? utrainDescCollection[0].type() : trainDescCollection[0].type();
+    CV_Assert( _queryDescriptors.type() == trainDescType );
+
     const int IMGIDX_SHIFT = 18;
     const int IMGIDX_ONE = (1 << IMGIDX_SHIFT);
 
-    if( queryDescriptors.empty() || trainDescCollection.empty() )
+    if( _queryDescriptors.empty() || (trainDescCollection.empty() && utrainDescCollection.empty()))
     {
         matches.clear();
         return;
     }
-    CV_Assert( queryDescriptors.type() == trainDescCollection[0].type() );
+
+    std::vector<Mat> masks;
+    _masks.getMatVector(masks);
+
+    if(!trainDescCollection.empty() && !utrainDescCollection.empty())
+    {
+        for(int i = 0; i < (int)utrainDescCollection.size(); i++)
+        {
+            Mat tempMat;
+            utrainDescCollection[i].copyTo(tempMat);
+            trainDescCollection.push_back(tempMat);
+        }
+        utrainDescCollection.clear();
+    }
+
+    int trainDescVectorSize = trainDescCollection.empty() ? (int)utrainDescCollection.size() : (int)trainDescCollection.size();
+    Size trainDescSize = trainDescCollection.empty() ? utrainDescCollection[0].size() : trainDescCollection[0].size();
+    int trainDescOffset = trainDescCollection.empty() ? (int)utrainDescCollection[0].offset : 0;
+
+    if ( ocl::useOpenCL() && _queryDescriptors.isUMat() && _queryDescriptors.dims()<=2 && trainDescVectorSize == 1 &&
+        _queryDescriptors.type() == CV_32FC1 && _queryDescriptors.offset() == 0 && trainDescOffset == 0 &&
+        trainDescSize.width == _queryDescriptors.size().width && masks.size() == 1 && masks[0].total() == 0 )
+    {
+        if(knn == 1)
+        {
+            if(trainDescCollection.empty())
+            {
+                if(ocl_match(_queryDescriptors, utrainDescCollection[0], matches, normType))
+                    return;
+            }
+            else
+            {
+                if(ocl_match(_queryDescriptors, trainDescCollection[0], matches, normType))
+                    return;
+            }
+        }
+        else
+        {
+            if(trainDescCollection.empty())
+            {
+                if(ocl_knnMatch(_queryDescriptors, utrainDescCollection[0], matches, knn, normType, compactResult) )
+                    return;
+            }
+            else
+            {
+                if(ocl_knnMatch(_queryDescriptors, trainDescCollection[0], matches, knn, normType, compactResult) )
+                    return;
+            }
+        }
+    }
+
+    Mat queryDescriptors = _queryDescriptors.getMat();
+    if(trainDescCollection.empty() && !utrainDescCollection.empty())
+    {
+        for(int i = 0; i < (int)utrainDescCollection.size(); i++)
+        {
+            Mat tempMat;
+            utrainDescCollection[i].copyTo(tempMat);
+            trainDescCollection.push_back(tempMat);
+        }
+        utrainDescCollection.clear();
+    }
 
     matches.reserve(queryDescriptors.rows);
 
@@ -397,16 +1044,74 @@ void BFMatcher::knnMatchImpl( const Mat& queryDescriptors, std::vector<std::vect
     }
 }
 
-
-void BFMatcher::radiusMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches,
-                                 float maxDistance, const std::vector<Mat>& masks, bool compactResult )
+static bool ocl_radiusMatch(InputArray query, InputArray _train, std::vector< std::vector<DMatch> > &matches,
+        float maxDistance, int dstType, bool compactResult)
 {
-    if( queryDescriptors.empty() || trainDescCollection.empty() )
+    UMat trainIdx, distance, nMatches;
+    if (!ocl_radiusMatchSingle(query, _train, trainIdx, distance, nMatches, maxDistance, dstType))
+        return false;
+    if (!ocl_radiusMatchDownload(trainIdx, distance, nMatches, matches, compactResult))
+        return false;
+    return true;
+}
+
+void BFMatcher::radiusMatchImpl( InputArray _queryDescriptors, std::vector<std::vector<DMatch> >& matches,
+                                float maxDistance, InputArrayOfArrays _masks, bool compactResult )
+{
+    int trainDescType = trainDescCollection.empty() ? utrainDescCollection[0].type() : trainDescCollection[0].type();
+    CV_Assert( _queryDescriptors.type() == trainDescType );
+
+    if( _queryDescriptors.empty() || (trainDescCollection.empty() && utrainDescCollection.empty()))
     {
         matches.clear();
         return;
     }
-    CV_Assert( queryDescriptors.type() == trainDescCollection[0].type() );
+
+    std::vector<Mat> masks;
+    _masks.getMatVector(masks);
+
+    if(!trainDescCollection.empty() && !utrainDescCollection.empty())
+    {
+        for(int i = 0; i < (int)utrainDescCollection.size(); i++)
+        {
+            Mat tempMat;
+            utrainDescCollection[i].copyTo(tempMat);
+            trainDescCollection.push_back(tempMat);
+        }
+        utrainDescCollection.clear();
+    }
+
+    int trainDescVectorSize = trainDescCollection.empty() ? (int)utrainDescCollection.size() : (int)trainDescCollection.size();
+    Size trainDescSize = trainDescCollection.empty() ? utrainDescCollection[0].size() : trainDescCollection[0].size();
+    int trainDescOffset = trainDescCollection.empty() ? (int)utrainDescCollection[0].offset : 0;
+
+    if ( ocl::useOpenCL() && _queryDescriptors.isUMat() && _queryDescriptors.dims()<=2 && trainDescVectorSize == 1 &&
+        _queryDescriptors.type() == CV_32FC1 && _queryDescriptors.offset() == 0 && trainDescOffset == 0 &&
+        trainDescSize.width == _queryDescriptors.size().width && masks.size() == 1 && masks[0].total() == 0 )
+    {
+        if (trainDescCollection.empty())
+        {
+            if(ocl_radiusMatch(_queryDescriptors, utrainDescCollection[0], matches, maxDistance, normType, compactResult) )
+                return;
+        }
+        else
+        {
+            if (ocl_radiusMatch(_queryDescriptors, trainDescCollection[0], matches, maxDistance, normType, compactResult) )
+                return;
+        }
+    }
+
+    Mat queryDescriptors = _queryDescriptors.getMat();
+    if(trainDescCollection.empty() && !utrainDescCollection.empty())
+    {
+        for(int i = 0; i < (int)utrainDescCollection.size(); i++)
+        {
+            Mat tempMat;
+            utrainDescCollection[i].copyTo(tempMat);
+            trainDescCollection.push_back(tempMat);
+        }
+        utrainDescCollection.clear();
+    }
 
     matches.resize(queryDescriptors.rows);
     Mat dist, distf;
@@ -501,9 +1206,12 @@ FlannBasedMatcher::FlannBasedMatcher( const Ptr<flann::IndexParams>& _indexParam
     CV_Assert( _searchParams );
 }
 
-void FlannBasedMatcher::add( const std::vector<Mat>& descriptors )
+void FlannBasedMatcher::add( InputArrayOfArrays _descriptors )
 {
-    DescriptorMatcher::add( descriptors );
+    DescriptorMatcher::add( _descriptors );
+    std::vector<Mat> descriptors;
+    _descriptors.getMatVector(descriptors);
+
     for( size_t i = 0; i < descriptors.size(); i++ )
     {
         addedDescCount += descriptors[i].rows;
@@ -763,9 +1471,10 @@ void FlannBasedMatcher::convertToDMatches( const DescriptorCollection& collectio
     }
 }
 
-void FlannBasedMatcher::knnMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
-                                      const std::vector<Mat>& /*masks*/, bool /*compactResult*/ )
+void FlannBasedMatcher::knnMatchImpl( InputArray _queryDescriptors, std::vector<std::vector<DMatch> >& matches, int knn,
+                                     InputArrayOfArrays /*masks*/, bool /*compactResult*/ )
 {
+    Mat queryDescriptors = _queryDescriptors.getMat();
     Mat indices( queryDescriptors.rows, knn, CV_32SC1 );
     Mat dists( queryDescriptors.rows, knn, CV_32FC1);
     flannIndex->knnSearch( queryDescriptors, indices, dists, knn, *searchParams );
@@ -773,9 +1482,10 @@ void FlannBasedMatcher::knnMatchImpl( const Mat& queryDescriptors, std::vector<s
     convertToDMatches( mergedDescriptors, indices, dists, matches );
 }
 
-void FlannBasedMatcher::radiusMatchImpl( const Mat& queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                                         const std::vector<Mat>& /*masks*/, bool /*compactResult*/ )
+void FlannBasedMatcher::radiusMatchImpl( InputArray _queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+                                         InputArrayOfArrays /*masks*/, bool /*compactResult*/ )
 {
+    Mat queryDescriptors = _queryDescriptors.getMat();
     const int count = mergedDescriptors.size(); // TODO do count as param?
     Mat indices( queryDescriptors.rows, count, CV_32SC1, Scalar::all(-1) );
     Mat dists( queryDescriptors.rows, count, CV_32FC1, Scalar::all(-1) );
@@ -917,9 +1627,11 @@ GenericDescriptorMatcher::GenericDescriptorMatcher()
 GenericDescriptorMatcher::~GenericDescriptorMatcher()
 {}
 
-void GenericDescriptorMatcher::add( const std::vector<Mat>& images,
+void GenericDescriptorMatcher::add( InputArrayOfArrays _images,
                                     std::vector<std::vector<KeyPoint> >& keypoints )
 {
+    std::vector<Mat> images;
+    _images.getMatVector(images);
     CV_Assert( !images.empty() );
     CV_Assert( images.size() == keypoints.size() );
 
@@ -951,8 +1663,8 @@ void GenericDescriptorMatcher::clear()
 void GenericDescriptorMatcher::train()
 {}
 
-void GenericDescriptorMatcher::classify( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
-                                         const Mat& trainImage, std::vector<KeyPoint>& trainKeypoints ) const
+void GenericDescriptorMatcher::classify( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
+                                         InputArray trainImage, std::vector<KeyPoint>& trainKeypoints ) const
 {
     std::vector<DMatch> matches;
     match( queryImage, queryKeypoints, trainImage, trainKeypoints, matches );
@@ -962,7 +1674,7 @@ void GenericDescriptorMatcher::classify( const Mat& queryImage, std::vector<KeyP
         queryKeypoints[matches[i].queryIdx].class_id = trainKeypoints[matches[i].trainIdx].class_id;
 }
 
-void GenericDescriptorMatcher::classify( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints )
+void GenericDescriptorMatcher::classify( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints )
 {
     std::vector<DMatch> matches;
     match( queryImage, queryKeypoints, matches );
@@ -972,51 +1684,54 @@ void GenericDescriptorMatcher::classify( const Mat& queryImage, std::vector<KeyP
         queryKeypoints[matches[i].queryIdx].class_id = trainPointCollection.getKeyPoint( matches[i].trainIdx, matches[i].trainIdx ).class_id;
 }
 
-void GenericDescriptorMatcher::match( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
-                                      const Mat& trainImage, std::vector<KeyPoint>& trainKeypoints,
-                                      std::vector<DMatch>& matches, const Mat& mask ) const
+void GenericDescriptorMatcher::match( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
+                                      InputArray _trainImage, std::vector<KeyPoint>& trainKeypoints,
+                                      std::vector<DMatch>& matches, InputArray mask ) const
 {
+    Mat trainImage = _trainImage.getMat();
     Ptr<GenericDescriptorMatcher> tempMatcher = clone( true );
     std::vector<std::vector<KeyPoint> > vecTrainPoints(1, trainKeypoints);
     tempMatcher->add( std::vector<Mat>(1, trainImage), vecTrainPoints );
-    tempMatcher->match( queryImage, queryKeypoints, matches, std::vector<Mat>(1, mask) );
+    tempMatcher->match( queryImage, queryKeypoints, matches, std::vector<Mat>(1, mask.getMat()) );
     vecTrainPoints[0].swap( trainKeypoints );
 }
 
-void GenericDescriptorMatcher::knnMatch( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
-                                         const Mat& trainImage, std::vector<KeyPoint>& trainKeypoints,
-                                         std::vector<std::vector<DMatch> >& matches, int knn, const Mat& mask, bool compactResult ) const
+void GenericDescriptorMatcher::knnMatch( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
+                                         InputArray _trainImage, std::vector<KeyPoint>& trainKeypoints,
+                                         std::vector<std::vector<DMatch> >& matches, int knn, InputArray mask, bool compactResult ) const
 {
+    Mat trainImage = _trainImage.getMat();
     Ptr<GenericDescriptorMatcher> tempMatcher = clone( true );
     std::vector<std::vector<KeyPoint> > vecTrainPoints(1, trainKeypoints);
     tempMatcher->add( std::vector<Mat>(1, trainImage), vecTrainPoints );
-    tempMatcher->knnMatch( queryImage, queryKeypoints, matches, knn, std::vector<Mat>(1, mask), compactResult );
+    tempMatcher->knnMatch( queryImage, queryKeypoints, matches, knn, std::vector<Mat>(1, mask.getMat()), compactResult );
     vecTrainPoints[0].swap( trainKeypoints );
 }
 
-void GenericDescriptorMatcher::radiusMatch( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
-                                            const Mat& trainImage, std::vector<KeyPoint>& trainKeypoints,
+void GenericDescriptorMatcher::radiusMatch( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
+                                            InputArray _trainImage, std::vector<KeyPoint>& trainKeypoints,
                                             std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                                            const Mat& mask, bool compactResult ) const
+                                            InputArray mask, bool compactResult ) const
 {
+    Mat trainImage = _trainImage.getMat();
     Ptr<GenericDescriptorMatcher> tempMatcher = clone( true );
     std::vector<std::vector<KeyPoint> > vecTrainPoints(1, trainKeypoints);
     tempMatcher->add( std::vector<Mat>(1, trainImage), vecTrainPoints );
-    tempMatcher->radiusMatch( queryImage, queryKeypoints, matches, maxDistance, std::vector<Mat>(1, mask), compactResult );
+    tempMatcher->radiusMatch( queryImage, queryKeypoints, matches, maxDistance, std::vector<Mat>(1, mask.getMat()), compactResult );
     vecTrainPoints[0].swap( trainKeypoints );
 }
 
-void GenericDescriptorMatcher::match( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
-                                      std::vector<DMatch>& matches, const std::vector<Mat>& masks )
+void GenericDescriptorMatcher::match( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
+                                      std::vector<DMatch>& matches, InputArrayOfArrays masks )
 {
     std::vector<std::vector<DMatch> > knnMatches;
     knnMatch( queryImage, queryKeypoints, knnMatches, 1, masks, false );
     convertMatches( knnMatches, matches );
 }
 
-void GenericDescriptorMatcher::knnMatch( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+void GenericDescriptorMatcher::knnMatch( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                                          std::vector<std::vector<DMatch> >& matches, int knn,
-                                         const std::vector<Mat>& masks, bool compactResult )
+                                         InputArrayOfArrays masks, bool compactResult )
 {
     matches.clear();
 
@@ -1030,9 +1745,9 @@ void GenericDescriptorMatcher::knnMatch( const Mat& queryImage, std::vector<KeyP
     knnMatchImpl( queryImage, queryKeypoints, matches, knn, masks, compactResult );
 }
 
-void GenericDescriptorMatcher::radiusMatch( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+void GenericDescriptorMatcher::radiusMatch( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                                             std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                                            const std::vector<Mat>& masks, bool compactResult )
+                                            InputArrayOfArrays masks, bool compactResult )
 {
     matches.clear();
 
@@ -1092,10 +1807,11 @@ VectorDescriptorMatcher::VectorDescriptorMatcher( const Ptr<DescriptorExtractor>
 VectorDescriptorMatcher::~VectorDescriptorMatcher()
 {}
 
-void VectorDescriptorMatcher::add( const std::vector<Mat>& imgCollection,
+void VectorDescriptorMatcher::add( InputArrayOfArrays _imgCollection,
                                    std::vector<std::vector<KeyPoint> >& pointCollection )
 {
-    std::vector<Mat> descriptors;
+    std::vector<Mat> imgCollection, descriptors;
+    _imgCollection.getMatVector(imgCollection);
     extractor->compute( imgCollection, pointCollection, descriptors );
 
     matcher->add( descriptors );
@@ -1120,18 +1836,18 @@ bool VectorDescriptorMatcher::isMaskSupported()
     return matcher->isMaskSupported();
 }
 
-void VectorDescriptorMatcher::knnMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+void VectorDescriptorMatcher::knnMatchImpl( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                                             std::vector<std::vector<DMatch> >& matches, int knn,
-                                            const std::vector<Mat>& masks, bool compactResult )
+                                            InputArrayOfArrays masks, bool compactResult )
 {
     Mat queryDescriptors;
     extractor->compute( queryImage, queryKeypoints, queryDescriptors );
     matcher->knnMatch( queryDescriptors, matches, knn, masks, compactResult );
 }
 
-void VectorDescriptorMatcher::radiusMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+void VectorDescriptorMatcher::radiusMatchImpl( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                                                std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                                               const std::vector<Mat>& masks, bool compactResult )
+                                               InputArrayOfArrays masks, bool compactResult )
 {
     Mat queryDescriptors;
     extractor->compute( queryImage, queryKeypoints, queryDescriptors );
diff --git a/modules/features2d/src/mser.cpp b/modules/features2d/src/mser.cpp
index a68a5a5b7..40c32d0a9 100644
--- a/modules/features2d/src/mser.cpp
+++ b/modules/features2d/src/mser.cpp
@@ -1266,11 +1266,11 @@ MSER::MSER( int _delta, int _min_area, int _max_area,
 {
 }
 
-void MSER::operator()( const Mat& image, std::vector<std::vector<Point> >& dstcontours, const Mat& mask ) const
+void MSER::operator()( InputArray image, std::vector<std::vector<Point> >& dstcontours, InputArray mask ) const
 {
-    CvMat _image = image, _mask, *pmask = 0;
-    if( mask.data )
-        pmask = &(_mask = mask);
+    CvMat _image = image.getMat(), _mask, *pmask = 0;
+    if( !mask.empty() )
+        pmask = &(_mask = mask.getMat());
     MemStorage storage(cvCreateMemStorage(0));
     Seq<CvSeq*> contours;
     extractMSER( &_image, pmask, &contours.seq, storage,
@@ -1284,8 +1284,9 @@ void MSER::operator()( const Mat& image, std::vector<std::vector<Point> >& dstco
 }
 
 
-void MserFeatureDetector::detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask ) const
+void MserFeatureDetector::detectImpl( InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask ) const
 {
+    Mat image = _image.getMat(), mask = _mask.getMat();
     std::vector<std::vector<Point> > msers;
 
     (*this)(image, msers, mask);
diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/features2d/src/opencl/brute_force_match.cl
similarity index 99%
rename from modules/ocl/src/opencl/brute_force_match.cl
rename to modules/features2d/src/opencl/brute_force_match.cl
index 544737053..e2757e172 100644
--- a/modules/ocl/src/opencl/brute_force_match.cl
+++ b/modules/features2d/src/opencl/brute_force_match.cl
@@ -60,11 +60,11 @@
 #endif
 
 #ifndef DIST_TYPE
-#define DIST_TYPE 0
+#define DIST_TYPE 2
 #endif
 
 // dirty fix for non-template support
-#if   (DIST_TYPE == 0) // L1Dist
+#if   (DIST_TYPE == 2) // L1Dist
 #   ifdef T_FLOAT
 #       define DIST(x, y) fabs((x) - (y))
         typedef float value_type;
@@ -75,12 +75,12 @@
         typedef int result_type;
 #   endif
 #define DIST_RES(x) (x)
-#elif (DIST_TYPE == 1) // L2Dist
+#elif (DIST_TYPE == 4) // L2Dist
 #define DIST(x, y) (((x) - (y)) * ((x) - (y)))
 typedef float value_type;
 typedef float result_type;
 #define DIST_RES(x) sqrt(x)
-#elif (DIST_TYPE == 2) // Hamming
+#elif (DIST_TYPE == 6) // Hamming
 //http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
 inline int bit1Count(int v)
 {
diff --git a/modules/features2d/src/orb.cpp b/modules/features2d/src/orb.cpp
index be06c6984..b72a6dbce 100644
--- a/modules/features2d/src/orb.cpp
+++ b/modules/features2d/src/orb.cpp
@@ -943,12 +943,12 @@ void ORB::operator()( InputArray _image, InputArray _mask, std::vector<KeyPoint>
     }
 }
 
-void ORB::detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask) const
+void ORB::detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask) const
 {
-    (*this)(image, mask, keypoints, noArray(), false);
+    (*this)(image.getMat(), mask.getMat(), keypoints, noArray(), false);
 }
 
-void ORB::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors) const
+void ORB::computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors) const
 {
     (*this)(image, Mat(), keypoints, descriptors, true);
 }
diff --git a/modules/features2d/src/precomp.hpp b/modules/features2d/src/precomp.hpp
index 3c9073ec5..2f77d9270 100644
--- a/modules/features2d/src/precomp.hpp
+++ b/modules/features2d/src/precomp.hpp
@@ -48,6 +48,7 @@
 
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/private.hpp"
+#include "opencv2/core/ocl.hpp"
 
 #include <algorithm>
 
diff --git a/modules/features2d/src/stardetector.cpp b/modules/features2d/src/stardetector.cpp
index 02b999b62..1e00ee604 100644
--- a/modules/features2d/src/stardetector.cpp
+++ b/modules/features2d/src/stardetector.cpp
@@ -426,9 +426,9 @@ StarDetector::StarDetector(int _maxSize, int _responseThreshold,
 {}
 
 
-void StarDetector::detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask ) const
+void StarDetector::detectImpl( InputArray _image, std::vector<KeyPoint>& keypoints, InputArray _mask ) const
 {
-    Mat grayImage = image;
+    Mat image = _image.getMat(), mask = _mask.getMat(), grayImage = image;
     if( image.type() != CV_8U ) cvtColor( image, grayImage, COLOR_BGR2GRAY );
 
     (*this)(grayImage, keypoints);
diff --git a/modules/features2d/test/ocl/test_brute_force_matcher.cpp b/modules/features2d/test/ocl/test_brute_force_matcher.cpp
new file mode 100644
index 000000000..0e1df784f
--- /dev/null
+++ b/modules/features2d/test/ocl/test_brute_force_matcher.cpp
@@ -0,0 +1,213 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Zero Lin, Zero.Lin@amd.com
+//    Zhang Ying, zhangying913@gmail.com
+//    Yao Wang, bitwangyaoyao@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "cvconfig.h"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+PARAM_TEST_CASE(BruteForceMatcher, int, int)
+{
+    int distType;
+    int dim;
+
+    int queryDescCount;
+    int countFactor;
+
+    Mat query, train;
+    UMat uquery, utrain;
+
+    virtual void SetUp()
+    {
+        distType = GET_PARAM(0);
+        dim = GET_PARAM(1);
+
+        queryDescCount = 300; // must be even number because we split train data in some cases in two
+        countFactor = 4; // do not change it
+
+        cv::Mat queryBuf, trainBuf;
+
+        // Generate query descriptors randomly.
+        // Descriptor vector elements are integer values.
+        queryBuf.create(queryDescCount, dim, CV_32SC1);
+        rng.fill(queryBuf, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(3));
+        queryBuf.convertTo(queryBuf, CV_32FC1);
+
+        // Generate train decriptors as follows:
+        // copy each query descriptor to train set countFactor times
+        // and perturb some one element of the copied descriptors in
+        // in ascending order. General boundaries of the perturbation
+        // are (0.f, 1.f).
+        trainBuf.create(queryDescCount * countFactor, dim, CV_32FC1);
+        float step = 1.f / countFactor;
+        for (int qIdx = 0; qIdx < queryDescCount; qIdx++)
+        {
+            cv::Mat queryDescriptor = queryBuf.row(qIdx);
+            for (int c = 0; c < countFactor; c++)
+            {
+                int tIdx = qIdx * countFactor + c;
+                cv::Mat trainDescriptor = trainBuf.row(tIdx);
+                queryDescriptor.copyTo(trainDescriptor);
+                int elem = rng(dim);
+                float diff = rng.uniform(step * c, step * (c + 1));
+                trainDescriptor.at<float>(0, elem) += diff;
+            }
+        }
+
+        queryBuf.convertTo(query, CV_32F);
+        trainBuf.convertTo(train, CV_32F);
+        query.copyTo(uquery);
+        train.copyTo(utrain);
+    }
+};
+
+#ifdef ANDROID
+OCL_TEST_P(BruteForceMatcher, DISABLED_Match_Single)
+#else
+OCL_TEST_P(BruteForceMatcher, Match_Single)
+#endif
+{
+    BFMatcher matcher(distType);
+
+    std::vector<cv::DMatch> matches;
+    matcher.match(uquery, utrain,  matches);
+
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
+
+    int badCount = 0;
+    for (size_t i = 0; i < matches.size(); i++)
+    {
+        cv::DMatch match = matches[i];
+        if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0))
+            badCount++;
+    }
+
+    ASSERT_EQ(0, badCount);
+}
+
+#ifdef ANDROID
+OCL_TEST_P(BruteForceMatcher, DISABLED_KnnMatch_2_Single)
+#else
+OCL_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
+#endif
+{
+    const int knn = 2;
+
+    BFMatcher matcher(distType);
+
+    std::vector< std::vector<cv::DMatch> > matches;
+    matcher.knnMatch(uquery, utrain, matches, knn);
+
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
+
+    int badCount = 0;
+    for (size_t i = 0; i < matches.size(); i++)
+    {
+        if ((int)matches[i].size() != knn)
+            badCount++;
+        else
+        {
+            int localBadCount = 0;
+            for (int k = 0; k < knn; k++)
+            {
+                cv::DMatch match = matches[i][k];
+                if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k) || (match.imgIdx != 0))
+                    localBadCount++;
+            }
+            badCount += localBadCount > 0 ? 1 : 0;
+        }
+    }
+
+    ASSERT_EQ(0, badCount);
+}
+
+#ifdef ANDROID
+OCL_TEST_P(BruteForceMatcher, DISABLED_RadiusMatch_Single)
+#else
+OCL_TEST_P(BruteForceMatcher, RadiusMatch_Single)
+#endif
+{
+    float radius = 1.f / countFactor;
+
+    BFMatcher matcher(distType);
+
+    std::vector< std::vector<cv::DMatch> > matches;
+    matcher.radiusMatch(uquery, utrain, matches, radius);
+
+    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
+
+    int badCount = 0;
+    for (size_t i = 0; i < matches.size(); i++)
+    {
+        if ((int)matches[i].size() != 1)
+        {
+            badCount++;
+        }
+        else
+        {
+            cv::DMatch match = matches[i][0];
+            if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0))
+                badCount++;
+        }
+    }
+
+    ASSERT_EQ(0, badCount);
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(Matcher, BruteForceMatcher, Combine( Values((int)NORM_L1, (int)NORM_L2),
+                                                                Values(57, 64, 83, 128, 179, 256, 304) ) );
+
+}//ocl
+}//cvtest
+
+#endif //HAVE_OPENCL
diff --git a/modules/flann/include/opencv2/flann/autotuned_index.h b/modules/flann/include/opencv2/flann/autotuned_index.h
index 8d531753e..b0beac499 100644
--- a/modules/flann/include/opencv2/flann/autotuned_index.h
+++ b/modules/flann/include/opencv2/flann/autotuned_index.h
@@ -270,7 +270,7 @@ private:
     //    struct KMeansSimpleDownhillFunctor {
     //
     //        Autotune& autotuner;
-    //        KMeansSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {};
+    //        KMeansSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {}
     //
     //        float operator()(int* params) {
     //
@@ -295,7 +295,7 @@ private:
     //    struct KDTreeSimpleDownhillFunctor {
     //
     //        Autotune& autotuner;
-    //        KDTreeSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {};
+    //        KDTreeSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {}
     //
     //        float operator()(int* params) {
     //            float maxFloat = numeric_limits<float>::max();
diff --git a/modules/flann/include/opencv2/flann/dist.h b/modules/flann/include/opencv2/flann/dist.h
index dd4cc421d..7eac9df83 100644
--- a/modules/flann/include/opencv2/flann/dist.h
+++ b/modules/flann/include/opencv2/flann/dist.h
@@ -706,7 +706,7 @@ struct KL_Divergence
         Iterator1 last = a + size;
 
         while (a < last) {
-            if (* a != 0) {
+            if (* b != 0) {
                 ResultType ratio = (ResultType)(*a / *b);
                 if (ratio>0) {
                     result += *a * log(ratio);
diff --git a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
index b8b16941f..710382053 100644
--- a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
+++ b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
@@ -297,6 +297,11 @@ public:
         trees_ = get_param(params,"trees",4);
         root = new NodePtr[trees_];
         indices = new int*[trees_];
+
+        for (int i=0; i<trees_; ++i) {
+            root[i] = NULL;
+            indices[i] = NULL;
+        }
     }
 
     HierarchicalClusteringIndex(const HierarchicalClusteringIndex&);
@@ -309,11 +314,34 @@ public:
      */
     virtual ~HierarchicalClusteringIndex()
     {
+        free_elements();
+
+        if (root!=NULL) {
+            delete[] root;
+        }
+
         if (indices!=NULL) {
             delete[] indices;
         }
     }
 
+
+    /**
+     * Release the inner elements of indices[]
+     */
+    void free_elements()
+    {
+        if (indices!=NULL) {
+            for(int i=0; i<trees_; ++i) {
+                if (indices[i]!=NULL) {
+                    delete[] indices[i];
+                    indices[i] = NULL;
+                }
+            }
+        }
+    }
+
+
     /**
      *  Returns size of index.
      */
@@ -348,6 +376,9 @@ public:
         if (branching_<2) {
             throw FLANNException("Branching factor must be at least 2");
         }
+
+        free_elements();
+
         for (int i=0; i<trees_; ++i) {
             indices[i] = new int[size_];
             for (size_t j=0; j<size_; ++j) {
@@ -382,11 +413,22 @@ public:
 
     void loadIndex(FILE* stream)
     {
+        free_elements();
+
+        if (root!=NULL) {
+            delete[] root;
+        }
+
+        if (indices!=NULL) {
+            delete[] indices;
+        }
+
         load_value(stream, branching_);
         load_value(stream, trees_);
         load_value(stream, centers_init_);
         load_value(stream, leaf_size_);
         load_value(stream, memoryCounter);
+
         indices = new int*[trees_];
         root = new NodePtr[trees_];
         for (int i=0; i<trees_; ++i) {
diff --git a/modules/flann/include/opencv2/flann/timer.h b/modules/flann/include/opencv2/flann/timer.h
index 58354e75d..f771a3417 100644
--- a/modules/flann/include/opencv2/flann/timer.h
+++ b/modules/flann/include/opencv2/flann/timer.h
@@ -33,6 +33,7 @@
 
 #include <time.h>
 #include "opencv2/core.hpp"
+#include "opencv2/core/utility.hpp"
 
 namespace cvflann
 {
diff --git a/modules/flann/test/test_lshtable_badarg.cpp b/modules/flann/test/test_lshtable_badarg.cpp
index 3b776668d..4c9ebf9fe 100644
--- a/modules/flann/test/test_lshtable_badarg.cpp
+++ b/modules/flann/test/test_lshtable_badarg.cpp
@@ -47,7 +47,7 @@ class CV_LshTableBadArgTest : public cvtest::BadArgTest
 {
 protected:
     void run(int);
-    void run_func(void) {};
+    void run_func(void) { }
 
     struct Caller
     {
diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt
index 51ab0c3ef..a54ae4621 100644
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -222,6 +222,12 @@ elseif(HAVE_QTKIT)
   list(APPEND HIGHGUI_LIBRARIES "-framework QTKit" "-framework QuartzCore" "-framework AppKit")
 endif()
 
+if(HAVE_INTELPERC)
+  list(APPEND highgui_srcs src/cap_intelperc.cpp)
+  ocv_include_directories(${INTELPERC_INCLUDE_DIR})
+  list(APPEND HIGHGUI_LIBRARIES ${INTELPERC_LIBRARIES})
+endif(HAVE_INTELPERC)
+
 if(IOS)
   add_definitions(-DHAVE_IOS=1)
   list(APPEND highgui_srcs src/ios_conversions.mm src/cap_ios_abstract_camera.mm src/cap_ios_photo_camera.mm src/cap_ios_video_camera.mm)
@@ -309,7 +315,7 @@ if(WIN32 AND WITH_FFMPEG)
                        COMMENT "Copying ${ffmpeg_path} to the output directory")
   endif()
 
-  install(FILES "${ffmpeg_path}" DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT main RENAME "${ffmpeg_bare_name_ver}")
+  install(FILES "${ffmpeg_path}" DESTINATION ${OPENCV_BIN_INSTALL_PATH} COMPONENT libs RENAME "${ffmpeg_bare_name_ver}")
 endif()
 
 ocv_add_accuracy_tests()
diff --git a/modules/highgui/doc/user_interface.rst b/modules/highgui/doc/user_interface.rst
index 8b655a1c8..0d0ccde94 100644
--- a/modules/highgui/doc/user_interface.rst
+++ b/modules/highgui/doc/user_interface.rst
@@ -208,6 +208,26 @@ Sets mouse handler for the specified window
     :param userdata: The optional parameter passed to the callback.
 
 
+getMouseWheelDelta
+------------------
+Gets the mouse-wheel motion delta, when handling mouse-wheel events EVENT_MOUSEWHEEL and EVENT_MOUSEHWHEEL.
+
+.. ocv:function:: int getMouseWheelDelta(int flags)
+
+    :param flags: The mouse callback flags parameter.
+
+For regular mice with a scroll-wheel, delta will be a multiple of 120. The value 120 corresponds to a one notch rotation of the wheel or the threshold for action to be taken and one such action should occur for each delta.
+Some high-precision mice with higher-resolution freely-rotating wheels may generate smaller values.
+
+For EVENT_MOUSEWHEEL positive and negative values mean forward and backward scrolling, respectively.
+For EVENT_MOUSEHWHEEL, where available, positive and negative values mean right and left scrolling, respectively.
+
+With the C API, the macro CV_GET_WHEEL_DELTA(flags) can be used alternatively.
+
+.. note::
+
+    Mouse-wheel events are currently supported only on Windows.
+
 setTrackbarPos
 ------------------
 Sets the trackbar position.
diff --git a/modules/highgui/include/opencv2/highgui.hpp b/modules/highgui/include/opencv2/highgui.hpp
index cebf8fe22..275d12e97 100644
--- a/modules/highgui/include/opencv2/highgui.hpp
+++ b/modules/highgui/include/opencv2/highgui.hpp
@@ -76,7 +76,9 @@ enum { EVENT_MOUSEMOVE      = 0,
        EVENT_MBUTTONUP      = 6,
        EVENT_LBUTTONDBLCLK  = 7,
        EVENT_RBUTTONDBLCLK  = 8,
-       EVENT_MBUTTONDBLCLK  = 9
+       EVENT_MBUTTONDBLCLK  = 9,
+       EVENT_MOUSEWHEEL     = 10,
+       EVENT_MOUSEHWHEEL    = 11
      };
 
 enum { EVENT_FLAG_LBUTTON   = 1,
@@ -137,6 +139,8 @@ CV_EXPORTS_W double getWindowProperty(const String& winname, int prop_id);
 //! assigns callback for mouse events
 CV_EXPORTS void setMouseCallback(const String& winname, MouseCallback onMouse, void* userdata = 0);
 
+CV_EXPORTS int getMouseWheelDelta(int flags);
+
 CV_EXPORTS int createTrackbar(const String& trackbarname, const String& winname,
                               int* value, int count,
                               TrackbarCallback onChange = 0,
@@ -271,7 +275,8 @@ enum { CAP_ANY          = 0,     // autodetect
        CAP_XIAPI        = 1100,  // XIMEA Camera API
        CAP_AVFOUNDATION = 1200,  // AVFoundation framework for iOS (OS X Lion will have the same API)
        CAP_GIGANETIX    = 1300,  // Smartek Giganetix GigEVisionSDK
-       CAP_MSMF         = 1400   // Microsoft Media Foundation (via videoInput)
+       CAP_MSMF         = 1400,  // Microsoft Media Foundation (via videoInput)
+       CAP_INTELPERC    = 1500   // Intel Perceptual Computing SDK
      };
 
 // generic properties (based on DC1394 properties)
@@ -384,9 +389,17 @@ enum { CAP_PROP_GSTREAMER_QUEUE_LENGTH = 200 // default is 1
 
 
 // PVAPI
-enum { CAP_PROP_PVAPI_MULTICASTIP = 300 // ip for anable multicast master mode. 0 for disable multicast
+enum { CAP_PROP_PVAPI_MULTICASTIP               = 300, // ip for anable multicast master mode. 0 for disable multicast
+       CAP_PROP_PVAPI_FRAMESTARTTRIGGERMODE     = 301  // FrameStartTriggerMode: Determines how a frame is initiated
      };
 
+// PVAPI: FrameStartTriggerMode
+enum { CAP_PVAPI_FSTRIGMODE_FREERUN     = 0,    // Freerun
+       CAP_PVAPI_FSTRIGMODE_SYNCIN1     = 1,    // SyncIn1
+       CAP_PVAPI_FSTRIGMODE_SYNCIN2     = 2,    // SyncIn2
+       CAP_PVAPI_FSTRIGMODE_FIXEDRATE   = 3,    // FixedRate
+       CAP_PVAPI_FSTRIGMODE_SOFTWARE    = 4     // Software
+     };
 
 // Properties of cameras available through XIMEA SDK interface
 enum { CAP_PROP_XI_DOWNSAMPLING  = 400, // Change image resolution by binning or skipping.
@@ -496,6 +509,26 @@ enum { CAP_PROP_GIGA_FRAME_OFFSET_X   = 10001,
        CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006
      };
 
+enum { CAP_PROP_INTELPERC_PROFILE_COUNT               = 11001,
+       CAP_PROP_INTELPERC_PROFILE_IDX                 = 11002,
+       CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE  = 11003,
+       CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE      = 11004,
+       CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD  = 11005,
+       CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ     = 11006,
+       CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT     = 11007
+     };
+
+// Intel PerC streams
+enum { CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29,
+       CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28,
+       CAP_INTELPERC_GENERATORS_MASK = CAP_INTELPERC_DEPTH_GENERATOR + CAP_INTELPERC_IMAGE_GENERATOR
+     };
+
+enum { CAP_INTELPERC_DEPTH_MAP              = 0, // Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth.
+       CAP_INTELPERC_UVDEPTH_MAP            = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates.
+       CAP_INTELPERC_IR_MAP                 = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam.
+       CAP_INTELPERC_IMAGE                  = 3
+     };
 
 class CV_EXPORTS_W VideoCapture
 {
diff --git a/modules/highgui/include/opencv2/highgui/highgui_c.h b/modules/highgui/include/opencv2/highgui/highgui_c.h
index 2ebea0d30..ed8e2df0a 100644
--- a/modules/highgui/include/opencv2/highgui/highgui_c.h
+++ b/modules/highgui/include/opencv2/highgui/highgui_c.h
@@ -170,7 +170,9 @@ enum
     CV_EVENT_MBUTTONUP      =6,
     CV_EVENT_LBUTTONDBLCLK  =7,
     CV_EVENT_RBUTTONDBLCLK  =8,
-    CV_EVENT_MBUTTONDBLCLK  =9
+    CV_EVENT_MBUTTONDBLCLK  =9,
+    CV_EVENT_MOUSEWHEEL     =10,
+    CV_EVENT_MOUSEHWHEEL    =11
 };
 
 enum
@@ -183,6 +185,9 @@ enum
     CV_EVENT_FLAG_ALTKEY    =32
 };
 
+
+#define CV_GET_WHEEL_DELTA(flags) ((short)((flags >> 16) & 0xffff)) // upper 16 bits
+
 typedef void (CV_CDECL *CvMouseCallback )(int event, int x, int y, int flags, void* param);
 
 /* assign callback for mouse events */
@@ -313,7 +318,9 @@ enum
 
     CV_CAP_AVFOUNDATION = 1200,  // AVFoundation framework for iOS (OS X Lion will have the same API)
 
-    CV_CAP_GIGANETIX = 1300  // Smartek Giganetix GigEVisionSDK
+    CV_CAP_GIGANETIX = 1300,  // Smartek Giganetix GigEVisionSDK
+
+    CV_CAP_INTELPERC = 1500 // Intel Perceptual Computing SDK
 };
 
 /* start capturing frames from camera: index = camera_index + domain_offset (CV_CAP_*) */
@@ -418,8 +425,11 @@ enum
     CV_CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION_ON = CV_CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION,
 
     // Properties of cameras available through GStreamer interface
-    CV_CAP_GSTREAMER_QUEUE_LENGTH   = 200, // default is 1
-    CV_CAP_PROP_PVAPI_MULTICASTIP   = 300, // ip for anable multicast master mode. 0 for disable multicast
+    CV_CAP_GSTREAMER_QUEUE_LENGTH           = 200, // default is 1
+
+    // PVAPI
+    CV_CAP_PROP_PVAPI_MULTICASTIP           = 300, // ip for anable multicast master mode. 0 for disable multicast
+    CV_CAP_PROP_PVAPI_FRAMESTARTTRIGGERMODE = 301, // FrameStartTriggerMode: Determines how a frame is initiated
 
     // Properties of cameras available through XIMEA SDK interface
     CV_CAP_PROP_XI_DOWNSAMPLING  = 400,      // Change image resolution by binning or skipping.
@@ -459,16 +469,29 @@ enum
     CV_CAP_PROP_IOS_DEVICE_EXPOSURE = 9002,
     CV_CAP_PROP_IOS_DEVICE_FLASH = 9003,
     CV_CAP_PROP_IOS_DEVICE_WHITEBALANCE = 9004,
-    CV_CAP_PROP_IOS_DEVICE_TORCH = 9005
+    CV_CAP_PROP_IOS_DEVICE_TORCH = 9005,
 
     // Properties of cameras available through Smartek Giganetix Ethernet Vision interface
     /* --- Vladimir Litvinenko (litvinenko.vladimir@gmail.com) --- */
-    ,CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001,
+    CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001,
     CV_CAP_PROP_GIGA_FRAME_OFFSET_Y = 10002,
     CV_CAP_PROP_GIGA_FRAME_WIDTH_MAX = 10003,
     CV_CAP_PROP_GIGA_FRAME_HEIGH_MAX = 10004,
     CV_CAP_PROP_GIGA_FRAME_SENS_WIDTH = 10005,
-    CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006
+    CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006,
+
+    CV_CAP_PROP_INTELPERC_PROFILE_COUNT               = 11001,
+    CV_CAP_PROP_INTELPERC_PROFILE_IDX                 = 11002,
+    CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE  = 11003,
+    CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE      = 11004,
+    CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD  = 11005,
+    CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ     = 11006,
+    CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT     = 11007,
+
+    // Intel PerC streams
+    CV_CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29,
+    CV_CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28,
+    CV_CAP_INTELPERC_GENERATORS_MASK = CV_CAP_INTELPERC_DEPTH_GENERATOR + CV_CAP_INTELPERC_IMAGE_GENERATOR
 };
 
 enum
@@ -549,6 +572,14 @@ enum
     CV_CAP_ANDROID_ANTIBANDING_OFF
 };
 
+enum
+{
+    CV_CAP_INTELPERC_DEPTH_MAP              = 0, // Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth.
+    CV_CAP_INTELPERC_UVDEPTH_MAP            = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates.
+    CV_CAP_INTELPERC_IR_MAP                 = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam.
+    CV_CAP_INTELPERC_IMAGE                  = 3
+};
+
 /* retrieve or set capture properties */
 CVAPI(double) cvGetCaptureProperty( CvCapture* capture, int property_id );
 CVAPI(int)    cvSetCaptureProperty( CvCapture* capture, int property_id, double value );
diff --git a/modules/highgui/src/bitstrm.hpp b/modules/highgui/src/bitstrm.hpp
index df72e7373..465c0a847 100644
--- a/modules/highgui/src/bitstrm.hpp
+++ b/modules/highgui/src/bitstrm.hpp
@@ -53,7 +53,7 @@ enum
     RBS_THROW_EOS=-123,  // <end of stream> exception code
     RBS_THROW_FORB=-124,  // <forrbidden huffman code> exception code
     RBS_HUFF_FORB=2047,  // forrbidden huffman code "value"
-    RBS_BAD_HEADER=-125, // invalid header
+    RBS_BAD_HEADER=-125 // invalid header
 };
 
 typedef unsigned long ulong;
diff --git a/modules/highgui/src/cap.cpp b/modules/highgui/src/cap.cpp
index 0f4e6afb8..105f92e8c 100644
--- a/modules/highgui/src/cap.cpp
+++ b/modules/highgui/src/cap.cpp
@@ -155,6 +155,9 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
 #endif
 #ifdef HAVE_GIGE_API
         CV_CAP_GIGANETIX,
+#endif
+#ifdef HAVE_INTELPERC
+        CV_CAP_INTELPERC,
 #endif
         -1
     };
@@ -193,6 +196,7 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
     defined(HAVE_AVFOUNDATION) || \
     defined(HAVE_ANDROID_NATIVE_CAMERA) || \
     defined(HAVE_GIGE_API) || \
+    defined(HAVE_INTELPERC)    || \
     (0)
         // local variable to memorize the captured device
         CvCapture *capture;
@@ -342,6 +346,13 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
         break; // CV_CAP_GIGANETIX
 #endif
 
+#ifdef HAVE_INTELPERC
+        case CV_CAP_INTELPERC:
+            capture = cvCreateCameraCapture_IntelPerC(index);
+            if (capture)
+                return capture;
+        break; // CV_CAP_INTEL_PERC
+#endif
         }
     }
 
diff --git a/modules/highgui/src/cap_ffmpeg_impl.hpp b/modules/highgui/src/cap_ffmpeg_impl.hpp
index d0eabb155..4a3b029b8 100644
--- a/modules/highgui/src/cap_ffmpeg_impl.hpp
+++ b/modules/highgui/src/cap_ffmpeg_impl.hpp
@@ -2096,7 +2096,7 @@ enum
     VideoCodec_YV12   = (('Y'<<24)|('V'<<16)|('1'<<8)|('2')),   // Y,V,U (4:2:0)
     VideoCodec_NV12   = (('N'<<24)|('V'<<16)|('1'<<8)|('2')),   // Y,UV  (4:2:0)
     VideoCodec_YUYV   = (('Y'<<24)|('U'<<16)|('Y'<<8)|('V')),   // YUYV/YUY2 (4:2:2)
-    VideoCodec_UYVY   = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y')),   // UYVY (4:2:2)
+    VideoCodec_UYVY   = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y'))    // UYVY (4:2:2)
 };
 
 enum
@@ -2104,7 +2104,7 @@ enum
     VideoChromaFormat_Monochrome = 0,
     VideoChromaFormat_YUV420,
     VideoChromaFormat_YUV422,
-    VideoChromaFormat_YUV444,
+    VideoChromaFormat_YUV444
 };
 
 struct InputMediaStream_FFMPEG
diff --git a/modules/highgui/src/cap_intelperc.cpp b/modules/highgui/src/cap_intelperc.cpp
new file mode 100644
index 000000000..368f4fd2c
--- /dev/null
+++ b/modules/highgui/src/cap_intelperc.cpp
@@ -0,0 +1,714 @@
+#include "precomp.hpp"
+
+#ifdef HAVE_INTELPERC
+
+#include "pxcsession.h"
+#include "pxcsmartptr.h"
+#include "pxccapture.h"
+
+class CvIntelPerCStreamBase
+{
+protected:
+    struct FrameInternal
+    {
+        IplImage* retrieveFrame()
+        {
+            if (m_mat.empty())
+                return NULL;
+            m_iplHeader = IplImage(m_mat);
+            return &m_iplHeader;
+        }
+        cv::Mat m_mat;
+    private:
+        IplImage m_iplHeader;
+    };
+public:
+    CvIntelPerCStreamBase()
+        : m_profileIdx(-1)
+        , m_frameIdx(0)
+        , m_timeStampStartNS(0)
+    {
+    }
+    virtual ~CvIntelPerCStreamBase()
+    {
+    }
+
+    bool isValid()
+    {
+        return (m_device.IsValid() && m_stream.IsValid());
+    }
+    bool grabFrame()
+    {
+        if (!m_stream.IsValid())
+            return false;
+        if (-1 == m_profileIdx)
+        {
+            if (!setProperty(CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0))
+                return false;
+        }
+        PXCSmartPtr<PXCImage> pxcImage; PXCSmartSP sp;
+        if (PXC_STATUS_NO_ERROR > m_stream->ReadStreamAsync(&pxcImage, &sp))
+            return false;
+        if (PXC_STATUS_NO_ERROR > sp->Synchronize())
+            return false;
+        if (0 == m_timeStampStartNS)
+            m_timeStampStartNS = pxcImage->QueryTimeStamp();
+        m_timeStamp = (double)((pxcImage->QueryTimeStamp() - m_timeStampStartNS) / 10000);
+        m_frameIdx++;
+        return prepareIplImage(pxcImage);
+    }
+    int getProfileIDX() const
+    {
+        return m_profileIdx;
+    }
+public:
+    virtual bool initStream(PXCSession *session)            = 0;
+    virtual double getProperty(int propIdx)
+    {
+        double ret = 0.0;
+        switch (propIdx)
+        {
+        case CV_CAP_PROP_INTELPERC_PROFILE_COUNT:
+            ret = (double)m_profiles.size();
+            break;
+        case CV_CAP_PROP_FRAME_WIDTH :
+            if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size()))
+                ret = (double)m_profiles[m_profileIdx].imageInfo.width;
+            break;
+        case CV_CAP_PROP_FRAME_HEIGHT :
+            if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size()))
+                ret = (double)m_profiles[m_profileIdx].imageInfo.height;
+            break;
+        case CV_CAP_PROP_FPS :
+            if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size()))
+            {
+                ret = ((double)m_profiles[m_profileIdx].frameRateMin.numerator / (double)m_profiles[m_profileIdx].frameRateMin.denominator
+                        + (double)m_profiles[m_profileIdx].frameRateMax.numerator / (double)m_profiles[m_profileIdx].frameRateMax.denominator) / 2.0;
+            }
+            break;
+        case CV_CAP_PROP_POS_FRAMES:
+            ret  = (double)m_frameIdx;
+            break;
+        case CV_CAP_PROP_POS_MSEC:
+            ret  = m_timeStamp;
+            break;
+        };
+        return ret;
+    }
+    virtual bool setProperty(int propIdx, double propVal)
+    {
+        bool isSet = false;
+        switch (propIdx)
+        {
+        case CV_CAP_PROP_INTELPERC_PROFILE_IDX:
+            {
+                int propValInt = (int)propVal;
+                if ((0 <= propValInt) && (propValInt < m_profiles.size()))
+                {
+                    if (m_profileIdx != propValInt)
+                    {
+                        m_profileIdx = propValInt;
+                        if (m_stream.IsValid())
+                            m_stream->SetProfile(&m_profiles[m_profileIdx]);
+                        m_frameIdx = 0;
+                        m_timeStampStartNS = 0;
+                    }
+                    isSet = true;
+                }
+            }
+            break;
+        };
+        return isSet;
+    }
+protected:
+    PXCSmartPtr<PXCCapture::Device> m_device;
+    bool initDevice(PXCSession *session)
+    {
+        if (NULL == session)
+            return false;
+
+        pxcStatus sts = PXC_STATUS_NO_ERROR;
+        PXCSession::ImplDesc templat;
+        memset(&templat,0,sizeof(templat));
+        templat.group   = PXCSession::IMPL_GROUP_SENSOR;
+        templat.subgroup= PXCSession::IMPL_SUBGROUP_VIDEO_CAPTURE;
+
+        for (int modidx = 0; PXC_STATUS_NO_ERROR <= sts; modidx++)
+        {
+            PXCSession::ImplDesc desc;
+            sts = session->QueryImpl(&templat, modidx, &desc);
+            if (PXC_STATUS_NO_ERROR > sts)
+                break;
+
+            PXCSmartPtr<PXCCapture> capture;
+            sts = session->CreateImpl<PXCCapture>(&desc, &capture);
+            if (!capture.IsValid())
+                continue;
+
+            /* enumerate devices */
+            for (int devidx = 0; PXC_STATUS_NO_ERROR <= sts; devidx++)
+            {
+                PXCSmartPtr<PXCCapture::Device> device;
+                sts = capture->CreateDevice(devidx, &device);
+                if (PXC_STATUS_NO_ERROR <= sts)
+                {
+                    m_device = device.ReleasePtr();
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    PXCSmartPtr<PXCCapture::VideoStream> m_stream;
+    void initStreamImpl(PXCImage::ImageType type)
+    {
+        if (!m_device.IsValid())
+            return;
+
+        pxcStatus sts = PXC_STATUS_NO_ERROR;
+        /* enumerate streams */
+        for (int streamidx = 0; PXC_STATUS_NO_ERROR <= sts; streamidx++)
+        {
+            PXCCapture::Device::StreamInfo sinfo;
+            sts = m_device->QueryStream(streamidx, &sinfo);
+            if (PXC_STATUS_NO_ERROR > sts)
+                break;
+            if (PXCCapture::VideoStream::CUID != sinfo.cuid)
+                continue;
+            if (type != sinfo.imageType)
+                continue;
+
+            sts = m_device->CreateStream<PXCCapture::VideoStream>(streamidx, &m_stream);
+            if (PXC_STATUS_NO_ERROR == sts)
+                break;
+            m_stream.ReleaseRef();
+        }
+    }
+protected:
+    std::vector<PXCCapture::VideoStream::ProfileInfo> m_profiles;
+    int m_profileIdx;
+    int m_frameIdx;
+    pxcU64 m_timeStampStartNS;
+    double m_timeStamp;
+
+    virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& /*pinfo*/)
+    {
+        return true;
+    }
+    void enumProfiles()
+    {
+        m_profiles.clear();
+        if (!m_stream.IsValid())
+            return;
+        pxcStatus sts = PXC_STATUS_NO_ERROR;
+        for (int profidx = 0; PXC_STATUS_NO_ERROR <= sts; profidx++)
+        {
+            PXCCapture::VideoStream::ProfileInfo pinfo;
+            sts = m_stream->QueryProfile(profidx, &pinfo);
+            if (PXC_STATUS_NO_ERROR > sts)
+                break;
+            if (validProfile(pinfo))
+                m_profiles.push_back(pinfo);
+        }
+    }
+    virtual bool prepareIplImage(PXCImage *pxcImage) = 0;
+};
+
+class CvIntelPerCStreamImage
+    : public CvIntelPerCStreamBase
+{
+public:
+    CvIntelPerCStreamImage()
+    {
+    }
+    virtual ~CvIntelPerCStreamImage()
+    {
+    }
+
+    virtual bool initStream(PXCSession *session)
+    {
+        if (!initDevice(session))
+            return false;
+        initStreamImpl(PXCImage::IMAGE_TYPE_COLOR);
+        if (!m_stream.IsValid())
+            return false;
+        enumProfiles();
+        return true;
+    }
+    virtual double getProperty(int propIdx)
+    {
+        switch (propIdx)
+        {
+        case CV_CAP_PROP_BRIGHTNESS:
+            {
+                if (!m_device.IsValid())
+                    return 0.0;
+                float fret = 0.0f;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_BRIGHTNESS, &fret))
+                    return (double)fret;
+                return 0.0;
+            }
+            break;
+        case CV_CAP_PROP_CONTRAST:
+            {
+                if (!m_device.IsValid())
+                    return 0.0;
+                float fret = 0.0f;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_CONTRAST, &fret))
+                    return (double)fret;
+                return 0.0;
+            }
+            break;
+        case CV_CAP_PROP_SATURATION:
+            {
+                if (!m_device.IsValid())
+                    return 0.0;
+                float fret = 0.0f;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_SATURATION, &fret))
+                    return (double)fret;
+                return 0.0;
+            }
+            break;
+        case CV_CAP_PROP_HUE:
+            {
+                if (!m_device.IsValid())
+                    return 0.0;
+                float fret = 0.0f;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_HUE, &fret))
+                    return (double)fret;
+                return 0.0;
+            }
+            break;
+        case CV_CAP_PROP_GAMMA:
+            {
+                if (!m_device.IsValid())
+                    return 0.0;
+                float fret = 0.0f;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_GAMMA, &fret))
+                    return (double)fret;
+                return 0.0;
+            }
+            break;
+        case CV_CAP_PROP_SHARPNESS:
+            {
+                if (!m_device.IsValid())
+                    return 0.0;
+                float fret = 0.0f;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_SHARPNESS, &fret))
+                    return (double)fret;
+                return 0.0;
+            }
+            break;
+        case CV_CAP_PROP_GAIN:
+            {
+                if (!m_device.IsValid())
+                    return 0.0;
+                float fret = 0.0f;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_GAIN, &fret))
+                    return (double)fret;
+                return 0.0;
+            }
+            break;
+        case CV_CAP_PROP_BACKLIGHT:
+            {
+                if (!m_device.IsValid())
+                    return 0.0;
+                float fret = 0.0f;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_BACK_LIGHT_COMPENSATION, &fret))
+                    return (double)fret;
+                return 0.0;
+            }
+            break;
+        case CV_CAP_PROP_EXPOSURE:
+            {
+                if (!m_device.IsValid())
+                    return 0.0;
+                float fret = 0.0f;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_EXPOSURE, &fret))
+                    return (double)fret;
+                return 0.0;
+            }
+            break;
+        //Add image stream specific properties
+        }
+        return CvIntelPerCStreamBase::getProperty(propIdx);
+    }
+    virtual bool setProperty(int propIdx, double propVal)
+    {
+        switch (propIdx)
+        {
+        case CV_CAP_PROP_BRIGHTNESS:
+            {
+                if (!m_device.IsValid())
+                    return false;
+                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_BRIGHTNESS, (float)propVal));
+            }
+            break;
+        case CV_CAP_PROP_CONTRAST:
+            {
+                if (!m_device.IsValid())
+                    return false;
+                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_CONTRAST, (float)propVal));
+            }
+            break;
+        case CV_CAP_PROP_SATURATION:
+            {
+                if (!m_device.IsValid())
+                    return false;
+                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_SATURATION, (float)propVal));
+            }
+            break;
+        case CV_CAP_PROP_HUE:
+            {
+                if (!m_device.IsValid())
+                    return false;
+                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_HUE, (float)propVal));
+            }
+            break;
+        case CV_CAP_PROP_GAMMA:
+            {
+                if (!m_device.IsValid())
+                    return false;
+                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_GAMMA, (float)propVal));
+            }
+            break;
+        case CV_CAP_PROP_SHARPNESS:
+            {
+                if (!m_device.IsValid())
+                    return false;
+                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_SHARPNESS, (float)propVal));
+            }
+            break;
+        case CV_CAP_PROP_GAIN:
+            {
+                if (!m_device.IsValid())
+                    return false;
+                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_GAIN, (float)propVal));
+            }
+            break;
+        case CV_CAP_PROP_BACKLIGHT:
+            {
+                if (!m_device.IsValid())
+                    return false;
+                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_BACK_LIGHT_COMPENSATION, (float)propVal));
+            }
+            break;
+        case CV_CAP_PROP_EXPOSURE:
+            {
+                if (!m_device.IsValid())
+                    return false;
+                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_EXPOSURE, (float)propVal));
+            }
+            break;
+        //Add image stream specific properties
+        }
+        return CvIntelPerCStreamBase::setProperty(propIdx, propVal);
+    }
+public:
+    IplImage* retrieveFrame()
+    {
+        return m_frame.retrieveFrame();
+    }
+protected:
+    FrameInternal m_frame;
+    bool prepareIplImage(PXCImage *pxcImage)
+    {
+        if (NULL == pxcImage)
+            return false;
+        PXCImage::ImageInfo info;
+        pxcImage->QueryInfo(&info);
+
+        PXCImage::ImageData data;
+        pxcImage->AcquireAccess(PXCImage::ACCESS_READ, PXCImage::COLOR_FORMAT_RGB24, &data);
+
+        if (PXCImage::SURFACE_TYPE_SYSTEM_MEMORY != data.type)
+            return false;
+
+        cv::Mat temp(info.height, info.width, CV_8UC3, data.planes[0], data.pitches[0]);
+        temp.copyTo(m_frame.m_mat);
+
+        pxcImage->ReleaseAccess(&data);
+        return true;
+    }
+};
+
+class CvIntelPerCStreamDepth
+    : public CvIntelPerCStreamBase
+{
+public:
+    CvIntelPerCStreamDepth()
+    {
+    }
+    virtual ~CvIntelPerCStreamDepth()
+    {
+    }
+
+    virtual bool initStream(PXCSession *session)
+    {
+        if (!initDevice(session))
+            return false;
+        initStreamImpl(PXCImage::IMAGE_TYPE_DEPTH);
+        if (!m_stream.IsValid())
+            return false;
+        enumProfiles();
+        return true;
+    }
+    virtual double getProperty(int propIdx)
+    {
+        switch (propIdx)
+        {
+        case CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE:
+            {
+                if (!m_device.IsValid())
+                    return 0.0;
+                float fret = 0.0f;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_LOW_CONFIDENCE_VALUE, &fret))
+                    return (double)fret;
+                return 0.0;
+            }
+            break;
+        case CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE:
+            {
+                if (!m_device.IsValid())
+                    return 0.0;
+                float fret = 0.0f;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_SATURATION_VALUE, &fret))
+                    return (double)fret;
+                return 0.0;
+            }
+            break;
+        case CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD:
+            {
+                if (!m_device.IsValid())
+                    return 0.0;
+                float fret = 0.0f;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_CONFIDENCE_THRESHOLD, &fret))
+                    return (double)fret;
+                return 0.0;
+            }
+            break;
+        case CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ:
+            {
+                if (!m_device.IsValid())
+                    return 0.0f;
+                PXCPointF32 ptf;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryPropertyAsPoint(PXCCapture::Device::PROPERTY_DEPTH_FOCAL_LENGTH, &ptf))
+                    return (double)ptf.x;
+                return 0.0;
+            }
+            break;
+        case CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT:
+            {
+                if (!m_device.IsValid())
+                    return 0.0f;
+                PXCPointF32 ptf;
+                if (PXC_STATUS_NO_ERROR == m_device->QueryPropertyAsPoint(PXCCapture::Device::PROPERTY_DEPTH_FOCAL_LENGTH, &ptf))
+                    return (double)ptf.y;
+                return 0.0;
+            }
+            break;
+            //Add depth stream sepcific properties
+        }
+        return CvIntelPerCStreamBase::getProperty(propIdx);
+    }
+    virtual bool setProperty(int propIdx, double propVal)
+    {
+        switch (propIdx)
+        {
+        case CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE:
+            {
+                if (!m_device.IsValid())
+                    return false;
+                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_LOW_CONFIDENCE_VALUE, (float)propVal));
+            }
+            break;
+        case CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE:
+            {
+                if (!m_device.IsValid())
+                    return false;
+                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_SATURATION_VALUE, (float)propVal));
+            }
+            break;
+        case CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD:
+            {
+                if (!m_device.IsValid())
+                    return false;
+                return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_CONFIDENCE_THRESHOLD, (float)propVal));
+            }
+            break;
+        //Add depth stream sepcific properties
+        }
+        return CvIntelPerCStreamBase::setProperty(propIdx, propVal);
+    }
+public:
+    IplImage* retrieveDepthFrame()
+    {
+        return m_frameDepth.retrieveFrame();
+    }
+    IplImage* retrieveIRFrame()
+    {
+        return m_frameIR.retrieveFrame();
+    }
+    IplImage* retrieveUVFrame()
+    {
+        return m_frameUV.retrieveFrame();
+    }
+protected:
+    virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& pinfo)
+    {
+        return (PXCImage::COLOR_FORMAT_DEPTH == pinfo.imageInfo.format);
+    }
+protected:
+    FrameInternal m_frameDepth;
+    FrameInternal m_frameIR;
+    FrameInternal m_frameUV;
+
+    bool prepareIplImage(PXCImage *pxcImage)
+    {
+        if (NULL == pxcImage)
+            return false;
+        PXCImage::ImageInfo info;
+        pxcImage->QueryInfo(&info);
+
+        PXCImage::ImageData data;
+        pxcImage->AcquireAccess(PXCImage::ACCESS_READ, &data);
+
+        if (PXCImage::SURFACE_TYPE_SYSTEM_MEMORY != data.type)
+            return false;
+
+        if (PXCImage::COLOR_FORMAT_DEPTH != data.format)
+            return false;
+
+        {
+            cv::Mat temp(info.height, info.width, CV_16SC1, data.planes[0], data.pitches[0]);
+            temp.copyTo(m_frameDepth.m_mat);
+        }
+        {
+            cv::Mat temp(info.height, info.width, CV_16SC1, data.planes[1], data.pitches[1]);
+            temp.copyTo(m_frameIR.m_mat);
+        }
+        {
+            cv::Mat temp(info.height, info.width, CV_32FC2, data.planes[2], data.pitches[2]);
+            temp.copyTo(m_frameUV.m_mat);
+        }
+
+        pxcImage->ReleaseAccess(&data);
+        return true;
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+class CvCapture_IntelPerC : public CvCapture
+{
+public:
+    CvCapture_IntelPerC(int /*index*/)
+        : m_contextOpened(false)
+    {
+        pxcStatus sts = PXCSession_Create(&m_session);
+        if (PXC_STATUS_NO_ERROR > sts)
+            return;
+        m_contextOpened = m_imageStream.initStream(m_session);
+        m_contextOpened &= m_depthStream.initStream(m_session);
+    }
+    virtual ~CvCapture_IntelPerC(){}
+
+    virtual double getProperty(int propIdx)
+    {
+        double propValue = 0;
+        int purePropIdx = propIdx & ~CV_CAP_INTELPERC_GENERATORS_MASK;
+        if (CV_CAP_INTELPERC_IMAGE_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK))
+        {
+            propValue = m_imageStream.getProperty(purePropIdx);
+        }
+        else if (CV_CAP_INTELPERC_DEPTH_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK))
+        {
+            propValue = m_depthStream.getProperty(purePropIdx);
+        }
+        else
+        {
+            propValue = m_depthStream.getProperty(purePropIdx);
+        }
+        return propValue;
+    }
+    virtual bool setProperty(int propIdx, double propVal)
+    {
+        bool isSet = false;
+        int purePropIdx = propIdx & ~CV_CAP_INTELPERC_GENERATORS_MASK;
+        if (CV_CAP_INTELPERC_IMAGE_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK))
+        {
+            isSet = m_imageStream.setProperty(purePropIdx, propVal);
+        }
+        else if (CV_CAP_INTELPERC_DEPTH_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK))
+        {
+            isSet = m_depthStream.setProperty(purePropIdx, propVal);
+        }
+        else
+        {
+            isSet = m_depthStream.setProperty(purePropIdx, propVal);
+        }
+        return isSet;
+    }
+
+    bool grabFrame()
+    {
+        if (!isOpened())
+            return false;
+
+        bool isGrabbed = false;
+        if (m_depthStream.isValid())
+            isGrabbed = m_depthStream.grabFrame();
+        if ((m_imageStream.isValid()) && (-1 != m_imageStream.getProfileIDX()))
+            isGrabbed &= m_imageStream.grabFrame();
+
+        return isGrabbed;
+    }
+
+    virtual IplImage* retrieveFrame(int outputType)
+    {
+        IplImage* image = 0;
+        switch (outputType)
+        {
+        case CV_CAP_INTELPERC_DEPTH_MAP:
+            image = m_depthStream.retrieveDepthFrame();
+            break;
+        case CV_CAP_INTELPERC_UVDEPTH_MAP:
+            image = m_depthStream.retrieveUVFrame();
+            break;
+        case CV_CAP_INTELPERC_IR_MAP:
+            image = m_depthStream.retrieveIRFrame();
+            break;
+        case CV_CAP_INTELPERC_IMAGE:
+            image = m_imageStream.retrieveFrame();
+            break;
+        }
+        CV_Assert(NULL != image);
+        return image;
+    }
+
+    bool isOpened() const
+    {
+        return m_contextOpened;
+    }
+protected:
+    bool m_contextOpened;
+
+    PXCSmartPtr<PXCSession> m_session;
+    CvIntelPerCStreamImage m_imageStream;
+    CvIntelPerCStreamDepth m_depthStream;
+};
+
+
+CvCapture* cvCreateCameraCapture_IntelPerC(int index)
+{
+    CvCapture_IntelPerC* capture = new CvCapture_IntelPerC(index);
+
+    if( capture->isOpened() )
+        return capture;
+
+    delete capture;
+    return 0;
+}
+
+
+#endif //HAVE_INTELPERC
diff --git a/modules/highgui/src/cap_ios_abstract_camera.mm b/modules/highgui/src/cap_ios_abstract_camera.mm
index a77e200a8..b40b3648d 100644
--- a/modules/highgui/src/cap_ios_abstract_camera.mm
+++ b/modules/highgui/src/cap_ios_abstract_camera.mm
@@ -278,8 +278,20 @@
 {
     self.captureVideoPreviewLayer = [[AVCaptureVideoPreviewLayer alloc] initWithSession:self.captureSession];
 
-    if ([self.captureVideoPreviewLayer isOrientationSupported]) {
-        [self.captureVideoPreviewLayer setOrientation:self.defaultAVCaptureVideoOrientation];
+    if ([self.captureVideoPreviewLayer respondsToSelector:@selector(connection)])
+    {
+        if ([self.captureVideoPreviewLayer.connection isVideoOrientationSupported])
+        {
+            [self.captureVideoPreviewLayer.connection setVideoOrientation:self.defaultAVCaptureVideoOrientation];
+        }
+    }
+    else
+    {
+        // Deprecated in 6.0; here for backward compatibility
+        if ([self.captureVideoPreviewLayer isOrientationSupported])
+        {
+            [self.captureVideoPreviewLayer setOrientation:self.defaultAVCaptureVideoOrientation];
+        }
     }
 
     if (parentView != nil) {
@@ -290,9 +302,6 @@
     NSLog(@"[Camera] created AVCaptureVideoPreviewLayer");
 }
 
-
-
-
 - (void)setDesiredCameraPosition:(AVCaptureDevicePosition)desiredPosition;
 {
     for (AVCaptureDevice *device in [AVCaptureDevice devicesWithMediaType:AVMediaTypeVideo]) {
diff --git a/modules/highgui/src/cap_ios_video_camera.mm b/modules/highgui/src/cap_ios_video_camera.mm
index 99f8a75ae..20973c313 100644
--- a/modules/highgui/src/cap_ios_video_camera.mm
+++ b/modules/highgui/src/cap_ios_video_camera.mm
@@ -34,7 +34,7 @@
 #import <AssetsLibrary/AssetsLibrary.h>
 
 
-static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
+static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;}
 
 #pragma mark - Private Interface
 
diff --git a/modules/highgui/src/cap_pvapi.cpp b/modules/highgui/src/cap_pvapi.cpp
index 6ed3aea84..4b27ab160 100644
--- a/modules/highgui/src/cap_pvapi.cpp
+++ b/modules/highgui/src/cap_pvapi.cpp
@@ -254,6 +254,11 @@ double CvCaptureCAM_PvAPI::getProperty( int property_id )
     case CV_CAP_PROP_FRAME_HEIGHT:
         PvAttrUint32Get(Camera.Handle, "Height", &nTemp);
         return (double)nTemp;
+    case CV_CAP_PROP_MONOCROME:
+        if (monocrome)
+          return 1;
+        else
+          return 0;
     case CV_CAP_PROP_EXPOSURE:
         PvAttrUint32Get(Camera.Handle,"ExposureValue",&nTemp);
         return (double)nTemp;
@@ -280,6 +285,21 @@ double CvCaptureCAM_PvAPI::getProperty( int property_id )
     case CV_CAP_PROP_GAIN:
         PvAttrUint32Get(Camera.Handle, "GainValue", &nTemp);
         return (double)nTemp;
+    case CV_CAP_PROP_PVAPI_FRAMESTARTTRIGGERMODE:
+        char triggerMode[256];
+        PvAttrEnumGet(Camera.Handle, "FrameStartTriggerMode", triggerMode, 256, NULL);
+        if (strcmp(triggerMode, "Freerun")==0)
+            return 0.0;
+        else if (strcmp(triggerMode, "SyncIn1")==0)
+            return 1.0;
+        else if (strcmp(triggerMode, "SyncIn2")==0)
+            return 2.0;
+        else if (strcmp(triggerMode, "FixedRate")==0)
+            return 3.0;
+        else if (strcmp(triggerMode, "Software")==0)
+            return 4.0;
+        else
+            return -1.0;
     }
     return -1.0;
 }
@@ -368,6 +388,24 @@ bool CvCaptureCAM_PvAPI::setProperty( int property_id, double value )
             return false;
         }
         break;
+    case CV_CAP_PROP_PVAPI_FRAMESTARTTRIGGERMODE:
+        tPvErr error;
+        if (value==0)
+            error = PvAttrEnumSet(Camera.Handle, "FrameStartTriggerMode", "Freerun");
+        else if (value==1)
+            error = PvAttrEnumSet(Camera.Handle, "FrameStartTriggerMode", "SyncIn1");
+        else if (value==2)
+            error = PvAttrEnumSet(Camera.Handle, "FrameStartTriggerMode", "SyncIn2");
+        else if (value==3)
+            error = PvAttrEnumSet(Camera.Handle, "FrameStartTriggerMode", "FixedRate");
+        else if (value==4)
+            error = PvAttrEnumSet(Camera.Handle, "FrameStartTriggerMode", "Software");
+        else
+            error = ePvErrOutOfRange;
+        if(error==ePvErrSuccess)
+            break;
+        else
+            return false;
     default:
         return false;
     }
diff --git a/modules/highgui/src/grfmt_base.hpp b/modules/highgui/src/grfmt_base.hpp
index f7fde90cf..8a534daa8 100644
--- a/modules/highgui/src/grfmt_base.hpp
+++ b/modules/highgui/src/grfmt_base.hpp
@@ -59,11 +59,11 @@ class BaseImageDecoder
 {
 public:
     BaseImageDecoder();
-    virtual ~BaseImageDecoder() {};
+    virtual ~BaseImageDecoder() {}
 
-    int width() const { return m_width; };
-    int height() const { return m_height; };
-    virtual int type() const { return m_type; };
+    int width() const { return m_width; }
+    int height() const { return m_height; }
+    virtual int type() const { return m_type; }
 
     virtual bool setSource( const String& filename );
     virtual bool setSource( const Mat& buf );
@@ -90,7 +90,7 @@ class BaseImageEncoder
 {
 public:
     BaseImageEncoder();
-    virtual ~BaseImageEncoder() {};
+    virtual ~BaseImageEncoder() {}
     virtual bool isFormatSupported( int depth ) const;
 
     virtual bool setDestination( const String& filename );
diff --git a/modules/highgui/src/grfmt_tiff.cpp b/modules/highgui/src/grfmt_tiff.cpp
index efabf76fd..9013c39d1 100644
--- a/modules/highgui/src/grfmt_tiff.cpp
+++ b/modules/highgui/src/grfmt_tiff.cpp
@@ -118,18 +118,21 @@ bool TiffDecoder::readHeader()
     bool result = false;
 
     close();
-    TIFF* tif = TIFFOpen( m_filename.c_str(), "rb" );
+    // TIFFOpen() mode flags are different to fopen().  A 'b' in mode "rb" has no effect when reading.
+    // http://www.remotesensing.org/libtiff/man/TIFFOpen.3tiff.html
+    TIFF* tif = TIFFOpen( m_filename.c_str(), "r" );
 
     if( tif )
     {
-        int wdth = 0, hght = 0, photometric = 0;
+        uint32 wdth = 0, hght = 0;
+        uint16 photometric = 0;
         m_tif = tif;
 
         if( TIFFGetField( tif, TIFFTAG_IMAGEWIDTH, &wdth ) &&
             TIFFGetField( tif, TIFFTAG_IMAGELENGTH, &hght ) &&
             TIFFGetField( tif, TIFFTAG_PHOTOMETRIC, &photometric ))
         {
-            int bpp=8, ncn = photometric > 1 ? 3 : 1;
+            uint16 bpp=8, ncn = photometric > 1 ? 3 : 1;
             TIFFGetField( tif, TIFFTAG_BITSPERSAMPLE, &bpp );
             TIFFGetField( tif, TIFFTAG_SAMPLESPERPIXEL, &ncn );
 
@@ -195,12 +198,12 @@ bool  TiffDecoder::readData( Mat& img )
     if( m_tif && m_width && m_height )
     {
         TIFF* tif = (TIFF*)m_tif;
-        int tile_width0 = m_width, tile_height0 = 0;
+        uint32 tile_width0 = m_width, tile_height0 = 0;
         int x, y, i;
         int is_tiled = TIFFIsTiled(tif);
-        int photometric;
+        uint16 photometric;
         TIFFGetField( tif, TIFFTAG_PHOTOMETRIC, &photometric );
-        int bpp = 8, ncn = photometric > 1 ? 3 : 1;
+        uint16 bpp = 8, ncn = photometric > 1 ? 3 : 1;
         TIFFGetField( tif, TIFFTAG_BITSPERSAMPLE, &bpp );
         TIFFGetField( tif, TIFFTAG_SAMPLESPERPIXEL, &ncn );
         const int bitsPerByte = 8;
@@ -256,11 +259,15 @@ bool  TiffDecoder::readData( Mat& img )
                     {
                         case 8:
                         {
+                            uchar * bstart = buffer;
                             if( !is_tiled )
                                 ok = TIFFReadRGBAStrip( tif, y, (uint32*)buffer );
                             else
+                            {
                                 ok = TIFFReadRGBATile( tif, x, y, (uint32*)buffer );
-
+                                //Tiles fill the buffer from the bottom up
+                                bstart += (tile_height0 - tile_height) * tile_width0 * 4;
+                            }
                             if( !ok )
                             {
                                 close();
@@ -272,19 +279,19 @@ bool  TiffDecoder::readData( Mat& img )
                                 {
                                     if (wanted_channels == 4)
                                     {
-                                        icvCvt_BGRA2RGBA_8u_C4R( buffer + i*tile_width*4, 0,
+                                        icvCvt_BGRA2RGBA_8u_C4R( bstart + i*tile_width0*4, 0,
                                                              data + x*4 + img.step*(tile_height - i - 1), 0,
                                                              cvSize(tile_width,1) );
                                     }
                                     else
                                     {
-                                        icvCvt_BGRA2BGR_8u_C4C3R( buffer + i*tile_width*4, 0,
+                                        icvCvt_BGRA2BGR_8u_C4C3R( bstart + i*tile_width0*4, 0,
                                                              data + x*3 + img.step*(tile_height - i - 1), 0,
                                                              cvSize(tile_width,1), 2 );
                                     }
                                 }
                                 else
-                                    icvCvt_BGRA2Gray_8u_C4C1R( buffer + i*tile_width*4, 0,
+                                    icvCvt_BGRA2Gray_8u_C4C1R( bstart + i*tile_width0*4, 0,
                                                               data + x + img.step*(tile_height - i - 1), 0,
                                                               cvSize(tile_width,1), 2 );
                             break;
@@ -309,19 +316,19 @@ bool  TiffDecoder::readData( Mat& img )
                                 {
                                     if( ncn == 1 )
                                     {
-                                        icvCvt_Gray2BGR_16u_C1C3R(buffer16 + i*tile_width*ncn, 0,
+                                        icvCvt_Gray2BGR_16u_C1C3R(buffer16 + i*tile_width0*ncn, 0,
                                                                   (ushort*)(data + img.step*i) + x*3, 0,
                                                                   cvSize(tile_width,1) );
                                     }
                                     else if( ncn == 3 )
                                     {
-                                        icvCvt_RGB2BGR_16u_C3R(buffer16 + i*tile_width*ncn, 0,
+                                        icvCvt_RGB2BGR_16u_C3R(buffer16 + i*tile_width0*ncn, 0,
                                                                (ushort*)(data + img.step*i) + x*3, 0,
                                                                cvSize(tile_width,1) );
                                     }
                                     else
                                     {
-                                        icvCvt_BGRA2BGR_16u_C4C3R(buffer16 + i*tile_width*ncn, 0,
+                                        icvCvt_BGRA2BGR_16u_C4C3R(buffer16 + i*tile_width0*ncn, 0,
                                                                (ushort*)(data + img.step*i) + x*3, 0,
                                                                cvSize(tile_width,1), 2 );
                                     }
@@ -331,12 +338,12 @@ bool  TiffDecoder::readData( Mat& img )
                                     if( ncn == 1 )
                                     {
                                         memcpy((ushort*)(data + img.step*i)+x,
-                                               buffer16 + i*tile_width*ncn,
+                                               buffer16 + i*tile_width0*ncn,
                                                tile_width*sizeof(buffer16[0]));
                                     }
                                     else
                                     {
-                                        icvCvt_BGRA2Gray_16u_CnC1R(buffer16 + i*tile_width*ncn, 0,
+                                        icvCvt_BGRA2Gray_16u_CnC1R(buffer16 + i*tile_width0*ncn, 0,
                                                                (ushort*)(data + img.step*i) + x, 0,
                                                                cvSize(tile_width,1), ncn, 2 );
                                     }
@@ -364,13 +371,13 @@ bool  TiffDecoder::readData( Mat& img )
                                 if(dst_bpp == 32)
                                 {
                                     memcpy((float*)(data + img.step*i)+x,
-                                           buffer32 + i*tile_width*ncn,
+                                           buffer32 + i*tile_width0*ncn,
                                            tile_width*sizeof(buffer32[0]));
                                 }
                                 else
                                 {
                                     memcpy((double*)(data + img.step*i)+x,
-                                         buffer64 + i*tile_width*ncn,
+                                         buffer64 + i*tile_width0*ncn,
                                          tile_width*sizeof(buffer64[0]));
                                 }
                             }
diff --git a/modules/highgui/src/precomp.hpp b/modules/highgui/src/precomp.hpp
index d225cb314..fbd064060 100644
--- a/modules/highgui/src/precomp.hpp
+++ b/modules/highgui/src/precomp.hpp
@@ -58,6 +58,14 @@
 #include <assert.h>
 
 #if defined WIN32 || defined WINCE
+    #if !defined _WIN32_WINNT
+        #ifdef HAVE_MSMF
+            #define _WIN32_WINNT 0x0600 // Windows Vista
+        #else
+            #define _WIN32_WINNT 0x0500 // Windows 2000
+        #endif
+    #endif
+
     #include <windows.h>
     #undef small
     #undef min
@@ -128,6 +136,7 @@ CvCapture* cvCreateFileCapture_OpenNI( const char* filename );
 CvCapture* cvCreateCameraCapture_Android( int index );
 CvCapture* cvCreateCameraCapture_XIMEA( int index );
 CvCapture* cvCreateCameraCapture_AVFoundation(int index);
+CvCapture* cvCreateCameraCapture_IntelPerC(int index);
 
 
 CVAPI(int) cvHaveImageReader(const char* filename);
diff --git a/modules/highgui/src/rgbe.cpp b/modules/highgui/src/rgbe.cpp
index a28100a60..c35197daa 100644
--- a/modules/highgui/src/rgbe.cpp
+++ b/modules/highgui/src/rgbe.cpp
@@ -79,7 +79,7 @@ enum rgbe_error_codes {
   rgbe_read_error,
   rgbe_write_error,
   rgbe_format_error,
-  rgbe_memory_error,
+  rgbe_memory_error
 };
 
 /* default error routine.  change this to change error handling */
diff --git a/modules/highgui/src/window.cpp b/modules/highgui/src/window.cpp
index 428ef51ef..03ff988d7 100644
--- a/modules/highgui/src/window.cpp
+++ b/modules/highgui/src/window.cpp
@@ -216,6 +216,11 @@ void cv::setMouseCallback( const String& windowName, MouseCallback onMouse, void
     cvSetMouseCallback(windowName.c_str(), onMouse, param);
 }
 
+int cv::getMouseWheelDelta( int flags )
+{
+    return CV_GET_WHEEL_DELTA(flags);
+}
+
 int cv::startWindowThread()
 {
     return cvStartWindowThread();
diff --git a/modules/highgui/src/window_w32.cpp b/modules/highgui/src/window_w32.cpp
index 90dbb771c..0c397fd14 100644
--- a/modules/highgui/src/window_w32.cpp
+++ b/modules/highgui/src/window_w32.cpp
@@ -40,31 +40,15 @@
 //M*/
 
 #include "precomp.hpp"
+#include <windowsx.h> // required for GET_X_LPARAM() and GET_Y_LPARAM() macros
 
 #if defined WIN32 || defined _WIN32
 
-#define COMPILE_MULTIMON_STUBS // Required for multi-monitor support
-#ifndef _MULTIMON_USE_SECURE_CRT
-#  define _MULTIMON_USE_SECURE_CRT 0 // some MinGW platforms have no strncpy_s
-#endif
-
-#if defined SM_CMONITORS && !defined MONITOR_DEFAULTTONEAREST
-#  define MONITOR_DEFAULTTONULL       0x00000000
-#  define MONITOR_DEFAULTTOPRIMARY    0x00000001
-#  define MONITOR_DEFAULTTONEAREST    0x00000002
-#  define MONITORINFOF_PRIMARY        0x00000001
-#endif
-#ifndef __inout
-#  define __inout
-#endif
-
 #ifdef __GNUC__
 #  pragma GCC diagnostic ignored "-Wmissing-declarations"
 #endif
-#include <MultiMon.h>
 
 #include <commctrl.h>
-#include <winuser.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
@@ -106,6 +90,10 @@ static const char* trackbar_text =
 
 #endif
 
+#ifndef WM_MOUSEHWHEEL
+    #define WM_MOUSEHWHEEL 0x020E
+#endif
+
 static void FillBitmapInfo( BITMAPINFO* bmi, int width, int height, int bpp, int origin )
 {
     assert( bmi && width >= 0 && height >= 0 && (bpp == 8 || bpp == 24 || bpp == 32));
@@ -1378,6 +1366,39 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
             SetFocus(window->hwnd);
         break;
 
+    case WM_MOUSEWHEEL:
+    case WM_MOUSEHWHEEL:
+       if( window->on_mouse )
+       {
+          int flags = (wParam & MK_LBUTTON      ? CV_EVENT_FLAG_LBUTTON  : 0)|
+                      (wParam & MK_RBUTTON      ? CV_EVENT_FLAG_RBUTTON  : 0)|
+                      (wParam & MK_MBUTTON      ? CV_EVENT_FLAG_MBUTTON  : 0)|
+                      (wParam & MK_CONTROL      ? CV_EVENT_FLAG_CTRLKEY  : 0)|
+                      (wParam & MK_SHIFT        ? CV_EVENT_FLAG_SHIFTKEY : 0)|
+                      (GetKeyState(VK_MENU) < 0 ? CV_EVENT_FLAG_ALTKEY   : 0);
+          int event = (uMsg == WM_MOUSEWHEEL    ? CV_EVENT_MOUSEWHEEL    : CV_EVENT_MOUSEHWHEEL);
+
+          // Set the wheel delta of mouse wheel to be in the upper word of 'event'
+          int delta = GET_WHEEL_DELTA_WPARAM(wParam);
+          flags |= (delta << 16);
+
+          POINT pt;
+          pt.x = GET_X_LPARAM( lParam );
+          pt.y = GET_Y_LPARAM( lParam );
+          ::ScreenToClient(hwnd, &pt); // Convert screen coordinates to client coordinates.
+
+          RECT rect;
+          GetClientRect( window->hwnd, &rect );
+
+          SIZE size = {0,0};
+          icvGetBitmapData( window, &size, 0, 0 );
+
+          window->on_mouse( event, pt.x*size.cx/MAX(rect.right - rect.left,1),
+                                   pt.y*size.cy/MAX(rect.bottom - rect.top,1), flags,
+                                   window->on_mouse_param );
+       }
+       break;
+
     case WM_ERASEBKGND:
         {
             RECT cr, tr, wrc;
@@ -1475,8 +1496,8 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
             if( uMsg == WM_LBUTTONUP || uMsg == WM_RBUTTONUP || uMsg == WM_MBUTTONUP )
                 ReleaseCapture();
 
-            pt.x = LOWORD( lParam );
-            pt.y = HIWORD( lParam );
+            pt.x = GET_X_LPARAM( lParam );
+            pt.y = GET_Y_LPARAM( lParam );
 
             GetClientRect( window->hwnd, &rect );
             icvGetBitmapData( window, &size, 0, 0 );
diff --git a/modules/highgui/test/test_ffmpeg.cpp b/modules/highgui/test/test_ffmpeg.cpp
index 727971e39..5dcd67be6 100644
--- a/modules/highgui/test/test_ffmpeg.cpp
+++ b/modules/highgui/test/test_ffmpeg.cpp
@@ -88,7 +88,7 @@ public:
             stringstream s;
             s << tag;
 
-            const string filename = "output_"+s.str()+".avi";
+            const string filename = tempfile((s.str()+".avi").c_str());
 
             try
             {
diff --git a/modules/highgui/test/test_grfmt.cpp b/modules/highgui/test/test_grfmt.cpp
index d670fa3da..11533e3ca 100644
--- a/modules/highgui/test/test_grfmt.cpp
+++ b/modules/highgui/test/test_grfmt.cpp
@@ -396,7 +396,13 @@ TEST(Highgui_Jpeg, encode_empty)
 #define int64 int64_hack_
 #include "tiff.h"
 
+#ifdef ANDROID
+// Test disabled as it uses a lot of memory.
+// It is killed with SIGKILL by out of memory killer.
+TEST(Highgui_Tiff, DISABLED_decode_tile16384x16384)
+#else
 TEST(Highgui_Tiff, decode_tile16384x16384)
+#endif
 {
     // see issue #2161
     cv::Mat big(16384, 16384, CV_8UC1, cv::Scalar::all(0));
@@ -412,8 +418,8 @@ TEST(Highgui_Tiff, decode_tile16384x16384)
 
     try
     {
-        cv::imread(file3);
-        EXPECT_NO_THROW(cv::imread(file4));
+        cv::imread(file3, IMREAD_UNCHANGED);
+        EXPECT_NO_THROW(cv::imread(file4, IMREAD_UNCHANGED));
     }
     catch(const std::bad_alloc&)
     {
@@ -423,6 +429,95 @@ TEST(Highgui_Tiff, decode_tile16384x16384)
     remove(file3.c_str());
     remove(file4.c_str());
 }
+
+TEST(Highgui_Tiff, write_read_16bit_big_little_endian)
+{
+    // see issue #2601 "16-bit Grayscale TIFF Load Failures Due to Buffer Underflow and Endianness"
+
+    // Setup data for two minimal 16-bit grayscale TIFF files in both endian formats
+    uchar tiff_sample_data[2][86] = { {
+        // Little endian
+        0x49, 0x49, 0x2a, 0x00, 0x0c, 0x00, 0x00, 0x00, 0xad, 0xde, 0xef, 0xbe, 0x06, 0x00, 0x00, 0x01,
+        0x03, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x01, 0x03, 0x00, 0x01, 0x00,
+        0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x01, 0x03, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00,
+        0x00, 0x00, 0x06, 0x01, 0x03, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x01,
+        0x04, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x17, 0x01, 0x04, 0x00, 0x01, 0x00,
+        0x00, 0x00, 0x04, 0x00, 0x00, 0x00 }, {
+        // Big endian
+        0x4d, 0x4d, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x0c, 0xde, 0xad, 0xbe, 0xef, 0x00, 0x06, 0x01, 0x00,
+        0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x02, 0x00, 0x00, 0x01, 0x01, 0x00, 0x03, 0x00, 0x00,
+        0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0x01, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x10,
+        0x00, 0x00, 0x01, 0x06, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0x01, 0x11,
+        0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x01, 0x17, 0x00, 0x04, 0x00, 0x00,
+        0x00, 0x01, 0x00, 0x00, 0x00, 0x04 }
+        };
+
+    // Test imread() for both a little endian TIFF and big endian TIFF
+    for (int i = 0; i < 2; i++)
+    {
+        string filename = cv::tempfile(".tiff");
+
+        // Write sample TIFF file
+        FILE* fp = fopen(filename.c_str(), "wb");
+        ASSERT_TRUE(fp != NULL);
+        ASSERT_EQ((size_t)1, fwrite(tiff_sample_data, 86, 1, fp));
+        fclose(fp);
+
+        Mat img = imread(filename, IMREAD_UNCHANGED);
+
+        EXPECT_EQ(1, img.rows);
+        EXPECT_EQ(2, img.cols);
+        EXPECT_EQ(CV_16U, img.type());
+        EXPECT_EQ(sizeof(ushort), img.elemSize());
+        EXPECT_EQ(1, img.channels());
+        EXPECT_EQ(0xDEAD, img.at<ushort>(0,0));
+        EXPECT_EQ(0xBEEF, img.at<ushort>(0,1));
+
+        remove(filename.c_str());
+    }
+}
+
+class CV_GrfmtReadTifTiledWithNotFullTiles: public cvtest::BaseTest
+{
+public:
+    void run(int)
+    {
+        try
+        {
+            /* see issue #3472 - dealing with tiled images where the tile size is
+             * not a multiple of image size.
+             * The tiled images were created with 'convert' from ImageMagick,
+             * using the command 'convert <input> -define tiff:tile-geometry=128x128 -depth [8|16] <output>
+             * Note that the conversion to 16 bits expands the range from 0-255 to 0-255*255,
+             * so the test converts back but rounding errors cause small differences.
+             */
+            cv::Mat img = imread(string(ts->get_data_path()) + "readwrite/non_tiled.tif",-1);
+            if (img.empty()) ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
+            ASSERT_TRUE(img.channels() == 3);
+            cv::Mat tiled8 = imread(string(ts->get_data_path()) + "readwrite/tiled_8.tif", -1);
+            if (tiled8.empty()) ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
+            ASSERT_PRED_FORMAT2(cvtest::MatComparator(0, 0), img, tiled8);
+
+            cv::Mat tiled16 = imread(string(ts->get_data_path()) + "readwrite/tiled_16.tif", -1);
+            if (tiled16.empty()) ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
+            ASSERT_TRUE(tiled16.elemSize() == 6);
+            tiled16.convertTo(tiled8, CV_8UC3, 1./256.);
+            ASSERT_PRED_FORMAT2(cvtest::MatComparator(2, 0), img, tiled8);
+            // What about 32, 64 bit?
+        }
+        catch(...)
+        {
+            ts->set_failed_test_info(cvtest::TS::FAIL_EXCEPTION);
+        }
+        ts->set_failed_test_info(cvtest::TS::OK);
+    }
+};
+
+TEST(Highgui_Tiff, decode_tile_remainder)
+{
+    CV_GrfmtReadTifTiledWithNotFullTiles test; test.safe_run();
+}
+
 #endif
 
 #ifdef HAVE_WEBP
diff --git a/modules/highgui/test/test_positioning.cpp b/modules/highgui/test/test_positioning.cpp
index edc8dcf42..993a76cb4 100644
--- a/modules/highgui/test/test_positioning.cpp
+++ b/modules/highgui/test/test_positioning.cpp
@@ -67,7 +67,7 @@ private:
 class CV_VideoProgressivePositioningTest: public CV_VideoPositioningTest
 {
 public:
-    CV_VideoProgressivePositioningTest() : CV_VideoPositioningTest() {};
+    CV_VideoProgressivePositioningTest() : CV_VideoPositioningTest() { }
     ~CV_VideoProgressivePositioningTest();
     void run(int);
 };
@@ -75,7 +75,7 @@ public:
 class CV_VideoRandomPositioningTest: public CV_VideoPositioningTest
 {
 public:
-    CV_VideoRandomPositioningTest(): CV_VideoPositioningTest() {};
+    CV_VideoRandomPositioningTest(): CV_VideoPositioningTest() { }
     ~CV_VideoRandomPositioningTest();
     void run(int);
 };
diff --git a/modules/highgui/test/test_precomp.hpp b/modules/highgui/test/test_precomp.hpp
index 8468e4618..826d16574 100644
--- a/modules/highgui/test/test_precomp.hpp
+++ b/modules/highgui/test/test_precomp.hpp
@@ -35,6 +35,7 @@
     defined(HAVE_XIMEA)        || \
     defined(HAVE_AVFOUNDATION) || \
     defined(HAVE_GIGE_API)     || \
+    defined(HAVE_INTELPERC)    || \
     (0)
     //defined(HAVE_ANDROID_NATIVE_CAMERA) ||   - enable after #1193
 #  define BUILD_WITH_CAMERA_SUPPORT 1
diff --git a/modules/highgui/test/test_video_io.cpp b/modules/highgui/test/test_video_io.cpp
index 87bb1a7ae..cacfde0b3 100644
--- a/modules/highgui/test/test_video_io.cpp
+++ b/modules/highgui/test/test_video_io.cpp
@@ -332,9 +332,7 @@ void CV_HighGuiTest::VideoTest(const string& dir, const cvtest::VideoFormat& fmt
         }
     }
 
-    printf("Before saved release for %s\n", tmp_name.c_str());
     cvReleaseCapture( &saved );
-    printf("After release\n");
 
     ts->printf(ts->LOG, "end test function : ImagesVideo \n");
 }
diff --git a/modules/imgproc/doc/feature_detection.rst b/modules/imgproc/doc/feature_detection.rst
index 87e14d98f..023028823 100644
--- a/modules/imgproc/doc/feature_detection.rst
+++ b/modules/imgproc/doc/feature_detection.rst
@@ -34,7 +34,7 @@ http://en.wikipedia.org/wiki/Canny_edge_detector
 
    * An example on using the canny edge detector can be found at opencv_source_code/samples/cpp/edge.cpp
 
-   * (Python) An example on using the canny edge detector can be found at opencv_source_code/samples/cpp/edge.py
+   * (Python) An example on using the canny edge detector can be found at opencv_source_code/samples/python/edge.py
 
 cornerEigenValsAndVecs
 ----------------------
@@ -358,11 +358,11 @@ HoughLines
 ----------
 Finds lines in a binary image using the standard Hough transform.
 
-.. ocv:function:: void HoughLines( InputArray image, OutputArray lines, double rho, double theta, int threshold, double srn=0, double stn=0 )
+.. ocv:function:: void HoughLines( InputArray image, OutputArray lines, double rho, double theta, int threshold, double srn=0, double stn=0, double min_theta=0, double max_theta=CV_PI )
 
-.. ocv:pyfunction:: cv2.HoughLines(image, rho, theta, threshold[, lines[, srn[, stn]]]) -> lines
+.. ocv:pyfunction:: cv2.HoughLines(image, rho, theta, threshold[, lines[, srn[, stn[, min_theta[, max_theta]]]]]) -> lines
 
-.. ocv:cfunction:: CvSeq* cvHoughLines2( CvArr* image, void* line_storage, int method, double rho, double theta, int threshold, double param1=0, double param2=0 )
+.. ocv:cfunction:: CvSeq* cvHoughLines2( CvArr* image, void* line_storage, int method, double rho, double theta, int threshold, double param1=0, double param2=0, double min_theta=0, double max_theta=CV_PI )
 
     :param image: 8-bit, single-channel binary source image. The image may be modified by the function.
 
@@ -378,6 +378,10 @@ Finds lines in a binary image using the standard Hough transform.
 
     :param stn: For the multi-scale Hough transform, it is a divisor for the distance resolution  ``theta``.
 
+    :param min_theta: For standard and multi-scale Hough transform, minimum angle to check for lines. Must fall between 0 and max_theta.
+
+    :param max_theta: For standard and multi-scale Hough transform, maximum angle to check for lines. Must fall between min_theta and CV_PI.
+
     :param method: One of the following Hough transform variants:
 
             * **CV_HOUGH_STANDARD** classical or standard Hough transform. Every line is represented by two floating-point numbers  :math:`(\rho, \theta)` , where  :math:`\rho`  is a distance between (0,0) point and the line, and  :math:`\theta`  is the angle between x-axis and the normal to the line. Thus, the matrix must be (the created sequence will be) of  ``CV_32FC2``  type
diff --git a/modules/imgproc/doc/filtering.rst b/modules/imgproc/doc/filtering.rst
index 166e83e9f..a055b129f 100755
--- a/modules/imgproc/doc/filtering.rst
+++ b/modules/imgproc/doc/filtering.rst
@@ -412,29 +412,6 @@ http://www.dai.ed.ac.uk/CVonline/LOCAL\_COPIES/MANDUCHI1/Bilateral\_Filtering.ht
 This filter does not work inplace.
 
 
-adaptiveBilateralFilter
------------------------
-Applies the adaptive bilateral filter to an image.
-
-.. ocv:function:: void adaptiveBilateralFilter( InputArray src, OutputArray dst, Size ksize, double sigmaSpace, double maxSigmaColor = 20.0, Point anchor=Point(-1, -1), int borderType=BORDER_DEFAULT )
-
-.. ocv:pyfunction:: cv2.adaptiveBilateralFilter(src, ksize, sigmaSpace[, dst[, anchor[, borderType]]]) -> dst
-
-    :param src: The source image
-
-    :param dst: The destination image; will have the same size and the same type as src
-
-    :param ksize: The kernel size. This is the neighborhood where the local variance will be calculated, and where pixels will contribute (in a weighted manner).
-
-    :param sigmaSpace: Filter sigma in the coordinate space. Larger value of the parameter means that farther pixels will influence each other (as long as their colors are close enough; see sigmaColor). Then d>0, it specifies the neighborhood size regardless of sigmaSpace, otherwise d is proportional to sigmaSpace.
-
-    :param maxSigmaColor: Maximum allowed sigma color (will clamp the value calculated in the ksize neighborhood. Larger value of the parameter means that more dissimilar pixels will influence each other (as long as their colors are close enough; see sigmaColor). Then d>0, it specifies the neighborhood size regardless of sigmaSpace, otherwise d is proportional to sigmaSpace.
-
-    :param borderType: Pixel extrapolation method.
-
-A main part of our strategy will be to load each raw pixel once, and reuse it to calculate all pixels in the output (filtered) image that need this pixel value. The math of the filter is that of the usual bilateral filter, except that the sigma color is calculated in the neighborhood, and clamped by the optional input value.
-
-
 blur
 ----
 Blurs an image using the normalized box filter.
diff --git a/modules/imgproc/doc/miscellaneous_transformations.rst b/modules/imgproc/doc/miscellaneous_transformations.rst
index df63c929b..32690cf8f 100644
--- a/modules/imgproc/doc/miscellaneous_transformations.rst
+++ b/modules/imgproc/doc/miscellaneous_transformations.rst
@@ -110,6 +110,8 @@ But in case of a non-linear transformation, an input RGB image should be normali
 
 If you use ``cvtColor`` with 8-bit images, the conversion will have some information lost. For many applications, this will not be noticeable but it is recommended to use 32-bit images in applications that need the full range of colors or that convert an image before an operation and then convert back.
 
+If conversion adds the alpha channel, its value will set to the maximum of corresponding channel range: 255 for ``CV_8U``, 65535 for ``CV_16U``, 1 for ``CV_32F``.
+
 The function can do the following transformations:
 
 *
@@ -124,7 +126,7 @@ The function can do the following transformations:
 
     .. math::
 
-        \text{Gray to RGB[A]:} \quad R  \leftarrow Y, G  \leftarrow Y, B  \leftarrow Y, A  \leftarrow 0
+        \text{Gray to RGB[A]:} \quad R  \leftarrow Y, G  \leftarrow Y, B  \leftarrow Y, A  \leftarrow \max (ChannelRange)
 
     The conversion from a RGB image to gray is done with:
 
@@ -634,7 +636,7 @@ The functions calculate one or more integral images for the source image as foll
 
     \texttt{tilted} (X,Y) =  \sum _{y<Y,abs(x-X+1) \leq Y-y-1}  \texttt{image} (x,y)
 
-Using these integral images, you can calculate sa um, mean, and standard deviation over a specific up-right or rotated rectangular region of the image in a constant time, for example:
+Using these integral images, you can calculate sum, mean, and standard deviation over a specific up-right or rotated rectangular region of the image in a constant time, for example:
 
 .. math::
 
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 5a9450bf2..ffa41655b 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -203,7 +203,8 @@ enum { HISTCMP_CORREL        = 0,
        HISTCMP_CHISQR        = 1,
        HISTCMP_INTERSECT     = 2,
        HISTCMP_BHATTACHARYYA = 3,
-       HISTCMP_HELLINGER     = HISTCMP_BHATTACHARYYA
+       HISTCMP_HELLINGER     = HISTCMP_BHATTACHARYYA,
+       HISTCMP_CHISQR_ALT    = 4
      };
 
 //! the color conversion code
@@ -952,7 +953,7 @@ public:
  */
     CV_WRAP virtual int compareSegments(const Size& size, InputArray lines1, InputArray lines2, InputOutputArray _image = noArray()) = 0;
 
-    virtual ~LineSegmentDetector() {};
+    virtual ~LineSegmentDetector() { }
 };
 
 //! Returns a pointer to a LineSegmentDetector class.
@@ -1065,11 +1066,6 @@ CV_EXPORTS_W void bilateralFilter( InputArray src, OutputArray dst, int d,
                                    double sigmaColor, double sigmaSpace,
                                    int borderType = BORDER_DEFAULT );
 
-//! smooths the image using adaptive bilateral filter
-CV_EXPORTS_W void adaptiveBilateralFilter( InputArray src, OutputArray dst, Size ksize,
-                                           double sigmaSpace, double maxSigmaColor = 20.0, Point anchor=Point(-1, -1),
-                                           int borderType=BORDER_DEFAULT );
-
 //! smooths the image using the box filter. Each pixel is processed in O(1) time
 CV_EXPORTS_W void boxFilter( InputArray src, OutputArray dst, int ddepth,
                              Size ksize, Point anchor = Point(-1,-1),
@@ -1151,7 +1147,8 @@ CV_EXPORTS_W void goodFeaturesToTrack( InputArray image, OutputArray corners,
 //! finds lines in the black-n-white image using the standard or pyramid Hough transform
 CV_EXPORTS_W void HoughLines( InputArray image, OutputArray lines,
                               double rho, double theta, int threshold,
-                              double srn = 0, double stn = 0 );
+                              double srn = 0, double stn = 0,
+                              double min_theta = 0, double max_theta = CV_PI );
 
 //! finds line segments in the black-n-white image using probabilistic Hough transform
 CV_EXPORTS_W void HoughLinesP( InputArray image, OutputArray lines,
diff --git a/modules/imgproc/include/opencv2/imgproc/imgproc_c.h b/modules/imgproc/include/opencv2/imgproc/imgproc_c.h
index 4e2dc7142..168a5cfd2 100644
--- a/modules/imgproc/include/opencv2/imgproc/imgproc_c.h
+++ b/modules/imgproc/include/opencv2/imgproc/imgproc_c.h
@@ -364,7 +364,7 @@ CV_INLINE double cvContourPerimeter( const void* contour )
 }
 
 
-/* Calculates contour boundning rectangle (update=1) or
+/* Calculates contour bounding rectangle (update=1) or
    just retrieves pre-calculated rectangle (update=0) */
 CVAPI(CvRect)  cvBoundingRect( CvArr* points, int update CV_DEFAULT(0) );
 
@@ -601,7 +601,8 @@ CVAPI(void)  cvGoodFeaturesToTrack( const CvArr* image, CvArr* eig_image,
    param1 ~ srn, param2 ~ stn - for multi-scale */
 CVAPI(CvSeq*)  cvHoughLines2( CvArr* image, void* line_storage, int method,
                               double rho, double theta, int threshold,
-                              double param1 CV_DEFAULT(0), double param2 CV_DEFAULT(0));
+                              double param1 CV_DEFAULT(0), double param2 CV_DEFAULT(0),
+                              double min_theta CV_DEFAULT(0), double max_theta CV_DEFAULT(CV_PI));
 
 /* Finds circles in the image */
 CVAPI(CvSeq*) cvHoughCircles( CvArr* image, void* circle_storage,
diff --git a/modules/imgproc/perf/opencl/perf_accumulate.cpp b/modules/imgproc/perf/opencl/perf_accumulate.cpp
new file mode 100644
index 000000000..5b7ac4c75
--- /dev/null
+++ b/modules/imgproc/perf/opencl/perf_accumulate.cpp
@@ -0,0 +1,140 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Nathan, liujun@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+/////////////////////////////////// Accumulate ///////////////////////////////////
+
+typedef Size_MatType AccumulateFixture;
+
+OCL_PERF_TEST_P(AccumulateFixture, Accumulate,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int srcType = get<1>(params), cn = CV_MAT_CN(srcType), dstType = CV_32FC(cn);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, dstType);
+
+    UMat src(srcSize, srcType), dst(srcSize, dstType);
+    declare.in(src, dst, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::accumulate(src, dst);
+
+    SANITY_CHECK_NOTHING();
+}
+
+/////////////////////////////////// AccumulateSquare ///////////////////////////////////
+
+typedef Size_MatType AccumulateSquareFixture;
+
+OCL_PERF_TEST_P(AccumulateSquareFixture, AccumulateSquare,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int srcType = get<1>(params), cn = CV_MAT_CN(srcType), dstType = CV_32FC(cn);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, dstType);
+
+    UMat src(srcSize, srcType), dst(srcSize, dstType);
+    declare.in(src, dst, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() cv::accumulateSquare(src, dst);
+
+    SANITY_CHECK_NOTHING();
+}
+
+/////////////////////////////////// AccumulateProduct ///////////////////////////////////
+
+typedef Size_MatType AccumulateProductFixture;
+
+OCL_PERF_TEST_P(AccumulateProductFixture, AccumulateProduct,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int srcType = get<1>(params), cn = CV_MAT_CN(srcType), dstType = CV_32FC(cn);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, dstType);
+
+    UMat src1(srcSize, srcType), src2(srcSize, srcType), dst(srcSize, dstType);
+    declare.in(src1, src2, dst, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() cv::accumulateProduct(src1, src2, dst);
+
+    SANITY_CHECK_NOTHING();
+}
+
+/////////////////////////////////// AccumulateWeighted ///////////////////////////////////
+
+typedef Size_MatType AccumulateWeightedFixture;
+
+OCL_PERF_TEST_P(AccumulateWeightedFixture, AccumulateWeighted,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int srcType = get<1>(params), cn = CV_MAT_CN(srcType), dstType = CV_32FC(cn);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, dstType);
+
+    UMat src(srcSize, srcType), dst(srcSize, dstType);
+    declare.in(src, dst, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() cv::accumulateWeighted(src, dst, 2.0);
+
+    SANITY_CHECK_NOTHING();
+}
+
+} } // namespace cvtest::ocl
+
+#endif
diff --git a/modules/ocl/perf/perf_norm.cpp b/modules/imgproc/perf/opencl/perf_blend.cpp
similarity index 71%
rename from modules/ocl/perf/perf_norm.cpp
rename to modules/imgproc/perf/opencl/perf_blend.cpp
index ff49eb4ed..f595069bd 100644
--- a/modules/ocl/perf/perf_norm.cpp
+++ b/modules/imgproc/perf/opencl/perf_blend.cpp
@@ -43,44 +43,40 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
 
-using namespace perf;
-using std::tr1::tuple;
-using std::tr1::get;
+#ifdef HAVE_OPENCL
 
-///////////// norm////////////////////////
+namespace cvtest {
+namespace ocl {
 
-typedef tuple<Size, MatType> normParams;
-typedef TestBaseWithParam<normParams> normFixture;
+///////////// BlendLinear ////////////////////////
 
-PERF_TEST_P(normFixture, norm, testing::Combine(
-                OCL_TYPICAL_MAT_SIZES,
-                OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
+typedef Size_MatType BlendLinearFixture;
+
+OCL_PERF_TEST_P(BlendLinearFixture, BlendLinear, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
 {
-    const normParams params = GetParam();
+    Size_MatType_t params = GetParam();
     const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-    double value = 0.0;
-    const double eps = CV_MAT_DEPTH(type) == CV_8U ? DBL_EPSILON : 1e-3;
+    const int srcType = get<1>(params);
+    const double eps = CV_MAT_DEPTH(srcType) <= CV_32S ? 1.0 : 0.2;
 
-    Mat src1(srcSize, type), src2(srcSize, type);
-    declare.in(src1, src2, WARMUP_RNG);
+    checkDeviceMaxMemoryAllocSize(srcSize, srcType);
 
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2);
+    UMat src1(srcSize, srcType), src2(srcSize, srcType), dst(srcSize, srcType);
+    UMat weights1(srcSize, CV_32FC1), weights2(srcSize, CV_32FC1);
 
-        OCL_TEST_CYCLE() value = cv::ocl::norm(oclSrc1, oclSrc2, NORM_INF);
+    declare.in(src1, src2, WARMUP_RNG).in(weights1, weights2, WARMUP_READ).out(dst);
+    randu(weights1, 0, 1);
+    randu(weights2, 0, 1);
 
-        SANITY_CHECK(value, eps);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() value = cv::norm(src1, src2, NORM_INF);
+    OCL_TEST_CYCLE() cv::blendLinear(src1, src2, weights1, weights2, dst);
 
-        SANITY_CHECK(value);
-    }
-    else
-        OCL_PERF_ELSE
+    SANITY_CHECK(dst, eps);
 }
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/perf_color.cpp b/modules/imgproc/perf/opencl/perf_color.cpp
similarity index 82%
rename from modules/ocl/perf/perf_color.cpp
rename to modules/imgproc/perf/opencl/perf_color.cpp
index 1145f1f2e..21742fece 100644
--- a/modules/ocl/perf/perf_color.cpp
+++ b/modules/imgproc/perf/opencl/perf_color.cpp
@@ -43,24 +43,29 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#include "perf_precomp.hpp"
 
-using namespace perf;
-using std::tr1::tuple;
-using std::tr1::get;
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
 using std::tr1::make_tuple;
 
 ///////////// cvtColor////////////////////////
 
 CV_ENUM(ConversionTypes, COLOR_RGB2GRAY, COLOR_RGB2BGR, COLOR_RGB2YUV, COLOR_YUV2RGB, COLOR_RGB2YCrCb,
         COLOR_YCrCb2RGB, COLOR_RGB2XYZ, COLOR_XYZ2RGB, COLOR_RGB2HSV, COLOR_HSV2RGB, COLOR_RGB2HLS,
-        COLOR_HLS2RGB, COLOR_BGR5652BGR, COLOR_BGR2BGR565, COLOR_RGBA2mRGBA, COLOR_mRGBA2RGBA, COLOR_YUV2RGB_NV12)
+        COLOR_HLS2RGB, COLOR_BGR5652BGR, COLOR_BGR2BGR565, COLOR_RGBA2mRGBA, COLOR_mRGBA2RGBA, COLOR_YUV2RGB_NV12,
+        COLOR_RGB2Lab, COLOR_Lab2BGR)
 
-typedef tuple<Size, tuple<ConversionTypes, int, int> > cvtColorParams;
-typedef TestBaseWithParam<cvtColorParams> cvtColorFixture;
+typedef tuple<Size, tuple<ConversionTypes, int, int> > CvtColorParams;
+typedef TestBaseWithParam<CvtColorParams> CvtColorFixture;
 
-PERF_TEST_P(cvtColorFixture, cvtColor, testing::Combine(
-                testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 4008)),
+OCL_PERF_TEST_P(CvtColorFixture, CvtColor, testing::Combine(
+                OCL_TEST_SIZES,
                 testing::Values(
                     make_tuple(ConversionTypes(COLOR_RGB2GRAY), 3, 1),
                     make_tuple(ConversionTypes(COLOR_RGB2BGR), 3, 3),
@@ -78,33 +83,25 @@ PERF_TEST_P(cvtColorFixture, cvtColor, testing::Combine(
                     make_tuple(ConversionTypes(COLOR_BGR2BGR565), 3, 2),
                     make_tuple(ConversionTypes(COLOR_RGBA2mRGBA), 4, 4),
                     make_tuple(ConversionTypes(COLOR_mRGBA2RGBA), 4, 4),
-                    make_tuple(ConversionTypes(COLOR_YUV2RGB_NV12), 1, 3)
+                    make_tuple(ConversionTypes(COLOR_YUV2RGB_NV12), 1, 3),
+                    make_tuple(ConversionTypes(COLOR_RGB2Lab), 3, 3),
+                    make_tuple(ConversionTypes(COLOR_Lab2BGR), 3, 4)
                     )))
 {
-    cvtColorParams params = GetParam();
+    CvtColorParams params = GetParam();
     const Size srcSize = get<0>(params);
     const tuple<int, int, int> conversionParams = get<1>(params);
     const int code = get<0>(conversionParams), scn = get<1>(conversionParams),
             dcn = get<2>(conversionParams);
 
-    Mat src(srcSize, CV_8UC(scn)), dst(srcSize, CV_8UC(scn));
+    UMat src(srcSize, CV_8UC(scn)), dst(srcSize, CV_8UC(scn));
     declare.in(src, WARMUP_RNG).out(dst);
 
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(src.size(), dst.type());
+    OCL_TEST_CYCLE() cv::cvtColor(src, dst, code, dcn);
 
-        OCL_TEST_CYCLE() ocl::cvtColor(oclSrc, oclDst, code, dcn);
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 1);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::cvtColor(src, dst, code, dcn);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
+    SANITY_CHECK(dst, 1);
 }
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/imgproc/perf/opencl/perf_filters.cpp b/modules/imgproc/perf/opencl/perf_filters.cpp
new file mode 100644
index 000000000..57b928c28
--- /dev/null
+++ b/modules/imgproc/perf/opencl/perf_filters.cpp
@@ -0,0 +1,327 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+typedef tuple<Size, MatType, int> FilterParams;
+typedef TestBaseWithParam<FilterParams> FilterFixture;
+
+///////////// Blur ////////////////////////
+
+typedef FilterFixture BlurFixture;
+
+OCL_PERF_TEST_P(BlurFixture, Blur,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, OCL_PERF_ENUM(3, 5)))
+{
+    const FilterParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params), bordertype = BORDER_CONSTANT;
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::blur(src, dst, Size(ksize, ksize), Point(-1, -1), bordertype);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// SqrBoxFilter ////////////////////////
+
+typedef tuple<Size, MatType, Size> SqrBoxFilterParams;
+typedef TestBaseWithParam<SqrBoxFilterParams> SqrBoxFilterFixture;
+
+OCL_PERF_TEST_P(SqrBoxFilterFixture, SqrBoxFilter,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
+                                   OCL_PERF_ENUM(Size(3, 3), Size(20, 3), Size(3, 20), Size(20, 20))))
+{
+    const SqrBoxFilterParams params = GetParam();
+    const Size srcSize = get<0>(params), ksize = get<2>(params);
+    const int type = get<1>(params), depth = CV_MAT_DEPTH(type),
+            ddepth = depth == CV_8U ? CV_32S : CV_32F;
+    const double eps = ddepth == CV_32S ? 0 : 5e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_MAKE_TYPE(ddepth, CV_MAT_CN(type)));
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::sqrBoxFilter(src, dst, ddepth, ksize, Point(-1, -1), false);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// Laplacian////////////////////////
+
+typedef FilterFixture LaplacianFixture;
+
+OCL_PERF_TEST_P(LaplacianFixture, Laplacian,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, OCL_PERF_ENUM(3, 5)))
+{
+    const FilterParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize, 1);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// Erode ////////////////////
+
+typedef FilterFixture ErodeFixture;
+
+OCL_PERF_TEST_P(ErodeFixture, Erode,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, OCL_PERF_ENUM(3, 5)))
+{
+    const FilterParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params);
+    const Mat ker = getStructuringElement(MORPH_RECT, Size(ksize, ksize));
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst).in(ker);
+
+    OCL_TEST_CYCLE() cv::erode(src, dst, ker);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Dilate ////////////////////
+
+typedef FilterFixture DilateFixture;
+
+OCL_PERF_TEST_P(DilateFixture, Dilate,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, OCL_PERF_ENUM(3, 5)))
+{
+    const FilterParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params);
+    const Mat ker = getStructuringElement(MORPH_RECT, Size(ksize, ksize));
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst).in(ker);
+
+    OCL_TEST_CYCLE() cv::dilate(src, dst, ker);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// MorphologyEx ////////////////////////
+
+CV_ENUM(MorphOp, MORPH_OPEN, MORPH_CLOSE, MORPH_GRADIENT, MORPH_TOPHAT, MORPH_BLACKHAT)
+
+typedef tuple<Size, MatType, MorphOp, int> MorphologyExParams;
+typedef TestBaseWithParam<MorphologyExParams> MorphologyExFixture;
+
+OCL_PERF_TEST_P(MorphologyExFixture, MorphologyEx,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, MorphOp::all(), OCL_PERF_ENUM(3, 5)))
+{
+    const MorphologyExParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), op = get<2>(params), ksize = get<3>(params);
+    const Mat ker = getStructuringElement(MORPH_RECT, Size(ksize, ksize));
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst).in(ker);
+
+    OCL_TEST_CYCLE() cv::morphologyEx(src, dst, op, ker);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Sobel ////////////////////////
+
+typedef Size_MatType SobelFixture;
+
+OCL_PERF_TEST_P(SobelFixture, Sobel,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), dx = 1, dy = 1;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type, sizeof(float) * 2);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::Sobel(src, dst, -1, dx, dy);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Scharr ////////////////////////
+
+typedef Size_MatType ScharrFixture;
+
+OCL_PERF_TEST_P(ScharrFixture, Scharr,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), dx = 1, dy = 0;
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type, sizeof(float) * 2);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::Scharr(src, dst, -1, dx, dy);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// GaussianBlur ////////////////////////
+
+typedef FilterFixture GaussianBlurFixture;
+
+OCL_PERF_TEST_P(GaussianBlurFixture, GaussianBlur,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, OCL_PERF_ENUM(3, 5, 7)))
+{
+    const FilterParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 + DBL_EPSILON : 3e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::GaussianBlur(src, dst, Size(ksize, ksize), 0);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// Filter2D ////////////////////////
+
+typedef FilterFixture Filter2DFixture;
+
+OCL_PERF_TEST_P(Filter2DFixture, Filter2D,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, OCL_PERF_ENUM(3, 5)))
+{
+    const FilterParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type), kernel(ksize, ksize, CV_32SC1);
+    declare.in(src, WARMUP_RNG).in(kernel).out(dst);
+    randu(kernel, -3.0, 3.0);
+
+    OCL_TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// Bilateral ////////////////////////
+
+typedef TestBaseWithParam<Size> BilateralFixture;
+
+OCL_PERF_TEST_P(BilateralFixture, Bilateral, OCL_TEST_SIZES)
+{
+    const Size srcSize = GetParam();
+    const int d = 7;
+    const double sigmacolor = 50.0, sigmaspace = 50.0;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_8UC1);
+
+    UMat src(srcSize, CV_8UC1), dst(srcSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::bilateralFilter(src, dst, d, sigmacolor, sigmaspace);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// MedianBlur ////////////////////////
+
+typedef tuple<Size, int> MedianBlurParams;
+typedef TestBaseWithParam<MedianBlurParams> MedianBlurFixture;
+
+OCL_PERF_TEST_P(MedianBlurFixture, Bilateral, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(3, 5)))
+{
+    MedianBlurParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int ksize = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_8UC1);
+
+    UMat src(srcSize, CV_8UC1), dst(srcSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::medianBlur(src, dst, ksize);
+
+    SANITY_CHECK(dst);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/src/precomp.hpp b/modules/imgproc/perf/opencl/perf_gftt.cpp
similarity index 60%
rename from modules/ocl/src/precomp.hpp
rename to modules/imgproc/perf/opencl/perf_gftt.cpp
index 9cdb07aae..29626c62e 100644
--- a/modules/ocl/src/precomp.hpp
+++ b/modules/imgproc/perf/opencl/perf_gftt.cpp
@@ -1,4 +1,4 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
@@ -9,15 +9,11 @@
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
-//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
-// @Authors
-//    Guoping Long, longguoping@gmail.com
-//    Yao Wang, bitwangyaoyao@gmail.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -45,53 +41,47 @@
 //
 //M*/
 
-#ifndef __OPENCV_PRECOMP_H__
-#define __OPENCV_PRECOMP_H__
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
 
-#if defined _MSC_VER && _MSC_VER >= 1200
-#pragma warning( disable: 4127 4267 4324 4244 4251 4710 4711 4514 4996 )
-#endif
-
-#if defined(_WIN32)
-#include <windows.h>
-#endif
-
-#include "cvconfig.h"
-
-#include <map>
-#include <iostream>
-#include <limits>
-#include <vector>
-#include <algorithm>
 #include <sstream>
-#include <exception>
-#include <stdio.h>
 
-#undef OPENCV_NOSTL
+#ifdef HAVE_OPENCL
 
-#include "opencv2/imgproc.hpp"
-#include "opencv2/objdetect/objdetect_c.h"
-#include "opencv2/ocl.hpp"
+namespace cvtest {
+namespace ocl {
 
-#include "opencv2/core/utility.hpp"
-#include "opencv2/core/private.hpp"
-#include "opencv2/core/ocl.hpp"
+//////////////////////////// GoodFeaturesToTrack //////////////////////////
 
-#define __ATI__
+typedef tuple<String, double, bool> GoodFeaturesToTrackParams;
+typedef TestBaseWithParam<GoodFeaturesToTrackParams> GoodFeaturesToTrackFixture;
 
-#if defined (HAVE_OPENCL)
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-#include "opencv2/ocl/private/util.hpp"
-#include "safe_call.hpp"
-
-#else /* defined(HAVE_OPENCL) */
-
-static inline void throw_nogpu()
+OCL_PERF_TEST_P(GoodFeaturesToTrackFixture, GoodFeaturesToTrack,
+                ::testing::Combine(OCL_PERF_ENUM(String("gpu/opticalflow/rubberwhale1.png")),
+                                   OCL_PERF_ENUM(0.0, 3.0), Bool()))
 {
-    CV_Error(CV_GpuNotSupported, "The library is compilled without OpenCL support.\n");
+    GoodFeaturesToTrackParams params = GetParam();
+    const String fileName = get<0>(params);
+    const double minDistance = get<1>(params), qualityLevel = 0.01;
+    const bool harrisDetector = get<2>(params);
+    const int maxCorners = 1000;
+
+    Mat img = imread(getDataPath(fileName), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty()) << "could not load " << fileName;
+
+    checkDeviceMaxMemoryAllocSize(img.size(), img.type());
+
+    UMat src(img.size(), img.type()), dst(1, maxCorners, CV_32FC2);
+    img.copyTo(src);
+
+    declare.in(src, WARMUP_READ).out(dst);
+
+    OCL_TEST_CYCLE() cv::goodFeaturesToTrack(src, dst, maxCorners, qualityLevel,
+                                             minDistance, noArray(), 3, harrisDetector, 0.04);
+
+    SANITY_CHECK(dst);
 }
 
-#endif /* defined(HAVE_OPENCL) */
+} } // namespace cvtest::ocl
 
-#endif /* __OPENCV_PRECOMP_H__ */
+#endif
diff --git a/modules/imgproc/perf/opencl/perf_imgproc.cpp b/modules/imgproc/perf/opencl/perf_imgproc.cpp
new file mode 100644
index 000000000..ae6112e0d
--- /dev/null
+++ b/modules/imgproc/perf/opencl/perf_imgproc.cpp
@@ -0,0 +1,286 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////// equalizeHist ////////////////////////
+
+typedef TestBaseWithParam<Size> EqualizeHistFixture;
+
+OCL_PERF_TEST_P(EqualizeHistFixture, EqualizeHist, OCL_TEST_SIZES)
+{
+    const Size srcSize = GetParam();
+    const double eps = 1;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_8UC1);
+
+    UMat src(srcSize, CV_8UC1), dst(srcSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::equalizeHist(src, dst);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// calcHist ////////////////////////
+
+typedef TestBaseWithParam<Size> CalcHistFixture;
+
+OCL_PERF_TEST_P(CalcHistFixture, CalcHist, OCL_TEST_SIZES)
+{
+    const Size srcSize = GetParam();
+
+    const std::vector<int> channels(1, 0);
+    std::vector<float> ranges(2);
+    std::vector<int> histSize(1, 256);
+    ranges[0] = 0;
+    ranges[1] = 256;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_8UC1);
+
+    UMat src(srcSize, CV_8UC1), hist(256, 1, CV_32FC1);
+    declare.in(src, WARMUP_RNG).out(hist);
+
+    OCL_TEST_CYCLE() cv::calcHist(std::vector<UMat>(1, src), channels, noArray(), hist, histSize, ranges, false);
+
+    SANITY_CHECK(hist);
+}
+
+/////////// CopyMakeBorder //////////////////////
+
+CV_ENUM(Border, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_WRAP, BORDER_REFLECT_101)
+
+typedef tuple<Size, MatType, Border> CopyMakeBorderParamType;
+typedef TestBaseWithParam<CopyMakeBorderParamType> CopyMakeBorderFixture;
+
+OCL_PERF_TEST_P(CopyMakeBorderFixture, CopyMakeBorder,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, Border::all()))
+{
+    const CopyMakeBorderParamType params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), borderType = get<2>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst;
+    const Size dstSize = srcSize + Size(12, 12);
+    dst.create(dstSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::copyMakeBorder(src, dst, 7, 5, 5, 7, borderType, cv::Scalar(1.0));
+
+    SANITY_CHECK(dst);
+}
+
+///////////// CornerMinEigenVal ////////////////////////
+
+typedef Size_MatType CornerMinEigenValFixture;
+
+OCL_PERF_TEST_P(CornerMinEigenValFixture, CornerMinEigenVal,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), borderType = BORDER_REFLECT;
+    const int blockSize = 7, apertureSize = 1 + 2 * 3;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, CV_32FC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
+
+///////////// CornerHarris ////////////////////////
+
+typedef Size_MatType CornerHarrisFixture;
+
+OCL_PERF_TEST_P(CornerHarrisFixture, CornerHarris,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), borderType = BORDER_REFLECT;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, CV_32FC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::cornerHarris(src, dst, 5, 7, 0.1, borderType);
+
+    SANITY_CHECK(dst, 5e-6, ERROR_RELATIVE);
+}
+
+///////////// PreCornerDetect ////////////////////////
+
+typedef Size_MatType PreCornerDetectFixture;
+
+OCL_PERF_TEST_P(PreCornerDetectFixture, PreCornerDetect,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), borderType = BORDER_REFLECT;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, CV_32FC1);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::preCornerDetect(src, dst, 3, borderType);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
+
+///////////// Integral ////////////////////////
+
+typedef tuple<Size, MatDepth> IntegralParams;
+typedef TestBaseWithParam<IntegralParams> IntegralFixture;
+
+OCL_PERF_TEST_P(IntegralFixture, Integral1, ::testing::Combine(OCL_TEST_SIZES, OCL_PERF_ENUM(CV_32S, CV_32F)))
+{
+    const IntegralParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int ddepth = get<1>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, ddepth);
+
+    UMat src(srcSize, CV_8UC1), dst(srcSize + Size(1, 1), ddepth);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::integral(src, dst, ddepth);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
+
+///////////// Threshold ////////////////////////
+
+CV_ENUM(ThreshType, THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO_INV)
+
+typedef tuple<Size, MatType, ThreshType> ThreshParams;
+typedef TestBaseWithParam<ThreshParams> ThreshFixture;
+
+OCL_PERF_TEST_P(ThreshFixture, Threshold,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, ThreshType::all()))
+{
+    const ThreshParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int srcType = get<1>(params);
+    const int threshType = get<2>(params);
+    const double maxValue = 220.0, threshold = 50;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, srcType);
+
+    UMat src(srcSize, srcType), dst(srcSize, srcType);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::threshold(src, dst, threshold, maxValue, threshType);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// CLAHE ////////////////////////
+
+typedef TestBaseWithParam<Size> CLAHEFixture;
+
+OCL_PERF_TEST_P(CLAHEFixture, CLAHE, OCL_TEST_SIZES)
+{
+    const Size srcSize = GetParam();
+
+    checkDeviceMaxMemoryAllocSize(srcSize, CV_8UC1);
+
+    UMat src(srcSize, CV_8UC1), dst(srcSize, CV_8UC1);
+    const double clipLimit = 40.0;
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    cv::Ptr<cv::CLAHE> clahe = cv::createCLAHE(clipLimit);
+    OCL_TEST_CYCLE() clahe->apply(src, dst);
+
+    SANITY_CHECK(dst);
+}
+
+///////////// Canny ////////////////////////
+
+typedef tuple<int, bool> CannyParams;
+typedef TestBaseWithParam<CannyParams> CannyFixture;
+
+OCL_PERF_TEST_P(CannyFixture, Canny, ::testing::Combine(OCL_PERF_ENUM(3, 5), Bool()))
+{
+    const CannyParams params = GetParam();
+    int apertureSize = get<0>(params);
+    bool L2Grad = get<1>(params);
+
+    Mat _img = imread(getDataPath("gpu/stereobm/aloe-L.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_TRUE(!_img.empty()) << "can't open aloe-L.png";
+
+    UMat img;
+    _img.copyTo(img);
+    UMat edges(img.size(), CV_8UC1);
+
+    declare.in(img, WARMUP_RNG).out(edges);
+
+    OCL_TEST_CYCLE() cv::Canny(img, edges, 50.0, 100.0, apertureSize, L2Grad);
+
+    if (apertureSize == 3)
+        SANITY_CHECK(edges);
+    else
+        SANITY_CHECK_NOTHING();
+}
+
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/imgproc/perf/opencl/perf_imgwarp.cpp b/modules/imgproc/perf/opencl/perf_imgwarp.cpp
new file mode 100644
index 000000000..217882604
--- /dev/null
+++ b/modules/imgproc/perf/opencl/perf_imgwarp.cpp
@@ -0,0 +1,210 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////// WarpAffine ////////////////////////
+
+CV_ENUM(InterType, INTER_NEAREST, INTER_LINEAR)
+
+typedef tuple<Size, MatType, InterType> WarpAffineParams;
+typedef TestBaseWithParam<WarpAffineParams> WarpAffineFixture;
+
+OCL_PERF_TEST_P(WarpAffineFixture, WarpAffine,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134, InterType::all()))
+{
+    static const double coeffs[2][3] =
+    {
+        { cos(CV_PI / 6), -sin(CV_PI / 6), 100.0  },
+        { sin(CV_PI / 6), cos(CV_PI / 6) , -100.0 }
+    };
+    Mat M(2, 3, CV_64F, (void *)coeffs);
+
+    const WarpAffineParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), interpolation = get<2>(params);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::warpAffine(src, dst, M, srcSize, interpolation);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// WarpPerspective ////////////////////////
+
+typedef WarpAffineParams WarpPerspectiveParams;
+typedef TestBaseWithParam<WarpPerspectiveParams> WarpPerspectiveFixture;
+
+OCL_PERF_TEST_P(WarpPerspectiveFixture, WarpPerspective,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134, InterType::all()))
+{
+    static const double coeffs[3][3] =
+    {
+        {cos(CV_PI / 6), -sin(CV_PI / 6), 100.0},
+        {sin(CV_PI / 6), cos(CV_PI / 6), -100.0},
+        {0.0, 0.0, 1.0}
+    };
+    Mat M(3, 3, CV_64F, (void *)coeffs);
+
+    const WarpPerspectiveParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), interpolation = get<2>(params);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::warpPerspective(src, dst, M, srcSize, interpolation);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// Resize ////////////////////////
+
+typedef tuple<Size, MatType, InterType, double> ResizeParams;
+typedef TestBaseWithParam<ResizeParams> ResizeFixture;
+
+OCL_PERF_TEST_P(ResizeFixture, Resize,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134,
+                               InterType::all(), ::testing::Values(0.5, 2.0)))
+{
+    const ResizeParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), interType = get<2>(params);
+    double scale = get<3>(params);
+    const Size dstSize(cvRound(srcSize.width * scale), cvRound(srcSize.height * scale));
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(dstSize, type);
+
+    UMat src(srcSize, type), dst(dstSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::resize(src, dst, Size(), scale, scale, interType);
+
+    SANITY_CHECK(dst, eps);
+}
+
+typedef tuple<Size, MatType, double> ResizeAreaParams;
+typedef TestBaseWithParam<ResizeAreaParams> ResizeAreaFixture;
+
+OCL_PERF_TEST_P(ResizeAreaFixture, Resize,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134, ::testing::Values(0.3, 0.5, 0.6)))
+{
+    const ResizeAreaParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+    double scale = get<2>(params);
+    const Size dstSize(cvRound(srcSize.width * scale), cvRound(srcSize.height * scale));
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+    checkDeviceMaxMemoryAllocSize(dstSize, type);
+
+    UMat src(srcSize, type), dst(dstSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::resize(src, dst, Size(), scale, scale, cv::INTER_AREA);
+
+    SANITY_CHECK(dst, eps);
+}
+
+///////////// Remap ////////////////////////
+
+typedef tuple<Size, MatType, InterType> RemapParams;
+typedef TestBaseWithParam<RemapParams> RemapFixture;
+
+OCL_PERF_TEST_P(RemapFixture, Remap,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, InterType::all()))
+{
+    const RemapParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), interpolation = get<2>(params), borderMode = BORDER_CONSTANT;
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-4;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    UMat src(srcSize, type), dst(srcSize, type);
+    UMat xmap(srcSize, CV_32FC1), ymap(srcSize, CV_32FC1);
+
+    {
+        Mat _xmap = xmap.getMat(ACCESS_WRITE), _ymap = ymap.getMat(ACCESS_WRITE);
+        for (int i = 0; i < srcSize.height; ++i)
+        {
+            float * const xmap_row = _xmap.ptr<float>(i);
+            float * const ymap_row = _ymap.ptr<float>(i);
+
+            for (int j = 0; j < srcSize.width; ++j)
+            {
+                xmap_row[j] = (j - srcSize.width * 0.5f) * 0.75f + srcSize.width * 0.5f;
+                ymap_row[j] = (i - srcSize.height * 0.5f) * 0.75f + srcSize.height * 0.5f;
+            }
+        }
+    }
+    declare.in(src, WARMUP_RNG).in(xmap, ymap, WARMUP_READ).out(dst);
+
+    OCL_TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
+
+    SANITY_CHECK(dst, eps);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/imgproc/perf/opencl/perf_matchTemplate.cpp b/modules/imgproc/perf/opencl/perf_matchTemplate.cpp
new file mode 100644
index 000000000..721b45a60
--- /dev/null
+++ b/modules/imgproc/perf/opencl/perf_matchTemplate.cpp
@@ -0,0 +1,94 @@
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+
+namespace ocl {
+
+CV_ENUM(MethodType, TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED)
+
+typedef std::tr1::tuple<Size, Size, MethodType> ImgSize_TmplSize_Method_t;
+typedef TestBaseWithParam<ImgSize_TmplSize_Method_t> ImgSize_TmplSize_Method;
+
+OCL_PERF_TEST_P(ImgSize_TmplSize_Method, MatchTemplate,
+        ::testing::Combine(
+            testing::Values(szSmall128, cv::Size(320, 240),
+                            cv::Size(640, 480), cv::Size(800, 600),
+                            cv::Size(1024, 768), cv::Size(1280, 1024)),
+            testing::Values(cv::Size(12, 12), cv::Size(28, 9),
+                            cv::Size(8, 30), cv::Size(16, 16)),
+            MethodType::all()
+            )
+        )
+{
+    const ImgSize_TmplSize_Method_t params = GetParam();
+    const Size imgSz = get<0>(params), tmplSz = get<1>(params);
+    const int method = get<2>(params);
+
+    UMat img(imgSz, CV_8UC1), tmpl(tmplSz, CV_8UC1);
+    UMat result(imgSz - tmplSz + Size(1, 1), CV_32F);
+
+    declare.in(img, tmpl, WARMUP_RNG).out(result);
+
+    OCL_TEST_CYCLE() matchTemplate(img, tmpl, result, method);
+
+    bool isNormed =
+        method == TM_CCORR_NORMED ||
+        method == TM_SQDIFF_NORMED ||
+        method == TM_CCOEFF_NORMED;
+    double eps = isNormed ? 3e-2
+        : 255 * 255 * tmpl.total() * 1e-4;
+
+    if (isNormed)
+        SANITY_CHECK(result, eps, ERROR_RELATIVE);
+    else
+        SANITY_CHECK(result, eps);
+}
+
+
+/////////// matchTemplate (performance tests from 2.4) ////////////////////////
+
+typedef Size_MatType CV_TM_CCORRFixture;
+
+OCL_PERF_TEST_P(CV_TM_CCORRFixture, matchTemplate,
+                ::testing::Combine(::testing::Values(Size(1000, 1000), Size(2000, 2000)),
+                               OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params), templSize(5, 5);
+    const int type = get<1>(params);
+
+    UMat src(srcSize, type), templ(templSize, type);
+    const Size dstSize(src.cols - templ.cols + 1, src.rows - templ.rows + 1);
+    UMat dst(dstSize, CV_32F);
+
+    declare.in(src, templ, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::matchTemplate(src, templ, dst, CV_TM_CCORR);
+
+    SANITY_CHECK(dst, 1e-4);
+}
+
+typedef TestBaseWithParam<Size> CV_TM_CCORR_NORMEDFixture;
+
+OCL_PERF_TEST_P(CV_TM_CCORR_NORMEDFixture, matchTemplate,
+                ::testing::Values(Size(1000, 1000), Size(2000, 2000), Size(4000, 4000)))
+{
+    const Size srcSize = GetParam(), templSize(5, 5);
+
+    UMat src(srcSize, CV_8UC1), templ(templSize, CV_8UC1);
+    const Size dstSize(src.cols - templ.cols + 1, src.rows - templ.rows + 1);
+    UMat dst(dstSize, CV_8UC1);
+
+    declare.in(src, templ, WARMUP_RNG).out(dst);
+
+    OCL_TEST_CYCLE() cv::matchTemplate(src, templ, dst, CV_TM_CCORR_NORMED);
+
+    SANITY_CHECK(dst, 3e-2);
+}
+
+} }
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/perf_fft.cpp b/modules/imgproc/perf/opencl/perf_moments.cpp
similarity index 74%
rename from modules/ocl/perf/perf_fft.cpp
rename to modules/imgproc/perf/opencl/perf_moments.cpp
index 49da65936..e77b76850 100644
--- a/modules/ocl/perf/perf_fft.cpp
+++ b/modules/imgproc/perf/opencl/perf_moments.cpp
@@ -26,7 +26,7 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other Materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
@@ -45,44 +45,34 @@
 //M*/
 
 #include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
 
-using namespace perf;
+#ifdef HAVE_OPENCL
 
-///////////// dft ////////////////////////
+namespace cvtest {
+namespace ocl {
 
-typedef TestBaseWithParam<Size> dftFixture;
+///////////// Moments ////////////////////////
 
-#ifdef HAVE_CLAMDFFT
+typedef tuple<Size, bool> MomentsParams;
+typedef TestBaseWithParam<MomentsParams> MomentsFixture;
 
-PERF_TEST_P(dftFixture, dft, OCL_TYPICAL_MAT_SIZES)
+OCL_PERF_TEST_P(MomentsFixture, Moments,
+    ::testing::Combine(OCL_TEST_SIZES, ::testing::Bool()))
 {
-    const Size srcSize = GetParam();
+    const MomentsParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const bool binaryImage = get<1>(params);
 
-    Mat src(srcSize, CV_32FC2), dst;
-    randu(src, 0.0f, 1.0f);
-    declare.in(src);
+    cv::Moments m;
+    UMat src(srcSize, CV_8UC1);
+    declare.in(src, WARMUP_RNG);
 
-    if (srcSize == OCL_SIZE_4000)
-        declare.time(7.4);
+    OCL_TEST_CYCLE() m = cv::moments(src, binaryImage);
 
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst;
-
-        OCL_TEST_CYCLE() cv::ocl::dft(oclSrc, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 1.5);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::dft(src, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
+    SANITY_CHECK_MOMENTS(m, 1e-6, ERROR_RELATIVE);
 }
 
-#endif
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/perf_pyramid.cpp b/modules/imgproc/perf/opencl/perf_pyramid.cpp
similarity index 60%
rename from modules/ocl/perf/perf_pyramid.cpp
rename to modules/imgproc/perf/opencl/perf_pyramid.cpp
index 820dd6062..55bb0679b 100644
--- a/modules/ocl/perf/perf_pyramid.cpp
+++ b/modules/imgproc/perf/opencl/perf_pyramid.cpp
@@ -43,88 +43,92 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
 
-using namespace perf;
-using std::tr1::tuple;
-using std::tr1::get;
+#ifdef HAVE_OPENCL
 
-///////////// pyrDown //////////////////////
+namespace cvtest {
+namespace ocl {
 
-typedef Size_MatType pyrDownFixture;
+///////////// PyrDown //////////////////////
 
-PERF_TEST_P(pyrDownFixture, pyrDown,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+typedef Size_MatType PyrDownFixture;
+
+OCL_PERF_TEST_P(PyrDownFixture, PyrDown,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
 {
     const Size_MatType_t params = GetParam();
     const Size srcSize = get<0>(params);
     const int type = get<1>(params);
-    Size dstSize((srcSize.height + 1) >> 1, (srcSize.width + 1) >> 1);
+    const Size dstSize((srcSize.height + 1) >> 1, (srcSize.width + 1) >> 1);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
 
     checkDeviceMaxMemoryAllocSize(srcSize, type);
     checkDeviceMaxMemoryAllocSize(dstSize, type);
 
-    Mat src(srcSize, type), dst;
-    dst.create(dstSize, type);
+    UMat src(srcSize, type), dst(dstSize, type);
     declare.in(src, WARMUP_RNG).out(dst);
 
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(dstSize, type);
+    OCL_TEST_CYCLE() cv::pyrDown(src, dst);
 
-        OCL_TEST_CYCLE() ocl::pyrDown(oclSrc, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() pyrDown(src, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
+    SANITY_CHECK(dst, eps);
 }
 
-///////////// pyrUp ////////////////////////
+///////////// PyrUp ////////////////////////
 
-typedef Size_MatType pyrUpFixture;
+typedef Size_MatType PyrUpFixture;
 
-PERF_TEST_P(pyrUpFixture, pyrUp,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+OCL_PERF_TEST_P(PyrUpFixture, PyrUp,
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
 {
     const Size_MatType_t params = GetParam();
     const Size srcSize = get<0>(params);
     const int type = get<1>(params);
-    Size dstSize(srcSize.height << 1, srcSize.width << 1);
+    const Size dstSize(srcSize.height << 1, srcSize.width << 1);
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
 
     checkDeviceMaxMemoryAllocSize(srcSize, type);
     checkDeviceMaxMemoryAllocSize(dstSize, type);
 
-    Mat src(srcSize, type), dst;
-    dst.create(dstSize, type);
+    UMat src(srcSize, type), dst(dstSize, type);
     declare.in(src, WARMUP_RNG).out(dst);
 
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(dstSize, type);
+    OCL_TEST_CYCLE() cv::pyrDown(src, dst);
 
-        OCL_TEST_CYCLE() ocl::pyrDown(oclSrc, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() pyrDown(src, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
+    SANITY_CHECK(dst, eps);
 }
+
+///////////// buildPyramid ////////////////////////
+
+typedef Size_MatType BuildPyramidFixture;
+
+OCL_PERF_TEST_P(BuildPyramidFixture, BuildPyramid,
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), maxLevel = 5;
+    const double eps = CV_MAT_DEPTH(type) <= CV_32S ? 1 : 1e-5;
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);
+
+    std::vector<UMat> dst(maxLevel);
+    UMat src(srcSize, type);
+    declare.in(src, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() cv::buildPyramid(src, dst, maxLevel);
+
+    UMat dst0 = dst[0], dst1 = dst[1], dst2 = dst[2], dst3 = dst[3], dst4 = dst[4];
+
+    SANITY_CHECK(dst0, eps);
+    SANITY_CHECK(dst1, eps);
+    SANITY_CHECK(dst2, eps);
+    SANITY_CHECK(dst3, eps);
+    SANITY_CHECK(dst4, eps);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/imgproc/perf/perf_filter2d.cpp b/modules/imgproc/perf/perf_filter2d.cpp
index cfce852b7..3b70d3a00 100644
--- a/modules/imgproc/perf/perf_filter2d.cpp
+++ b/modules/imgproc/perf/perf_filter2d.cpp
@@ -8,7 +8,7 @@ using std::tr1::make_tuple;
 using std::tr1::get;
 
 
-CV_ENUM(BorderMode, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT_101);
+CV_ENUM(BorderMode, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT_101)
 
 typedef TestBaseWithParam< tr1::tuple<Size, int, BorderMode> > TestFilter2d;
 typedef TestBaseWithParam< tr1::tuple<string, int> > Image_KernelSize;
diff --git a/modules/imgproc/perf/perf_phasecorr.cpp b/modules/imgproc/perf/perf_phasecorr.cpp
new file mode 100644
index 000000000..ee9d94e31
--- /dev/null
+++ b/modules/imgproc/perf/perf_phasecorr.cpp
@@ -0,0 +1,22 @@
+#include "perf_precomp.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace perf;
+using namespace testing;
+using std::tr1::make_tuple;
+using std::tr1::get;
+
+typedef TestBaseWithParam<Size > CreateHanningWindowFixture;
+
+PERF_TEST_P( CreateHanningWindowFixture, CreateHanningWindow, Values(szVGA, sz1080p))
+{
+    const Size size = GetParam();
+    Mat dst(size, CV_32FC1);
+
+    declare.in(dst, WARMUP_RNG).out(dst);
+
+    TEST_CYCLE() cv::createHanningWindow(dst, size, CV_32FC1);
+
+    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
+}
diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp
index 0ed2b3fed..f130f34da 100644
--- a/modules/imgproc/src/accum.cpp
+++ b/modules/imgproc/src/accum.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
 
 namespace cv
 {
@@ -352,15 +353,83 @@ inline int getAccTabIdx(int sdepth, int ddepth)
            sdepth == CV_64F && ddepth == CV_64F ? 6 : -1;
 }
 
+#ifdef HAVE_OPENCL
+
+enum
+{
+    ACCUMULATE = 0,
+    ACCUMULATE_SQUARE = 1,
+    ACCUMULATE_PRODUCT = 2,
+    ACCUMULATE_WEIGHTED = 3
+};
+
+static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray _dst, double alpha,
+                            InputArray _mask, int op_type )
+{
+    CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE ||
+              op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED);
+
+    int stype = _src.type(), cn = CV_MAT_CN(stype);
+    int sdepth = CV_MAT_DEPTH(stype), ddepth = _dst.depth();
+
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
+            haveMask = !_mask.empty();
+
+    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
+        return false;
+
+    const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT",
+                                   "ACCUMULATE_WEIGHTED" };
+
+    ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc,
+                  format("-D %s%s -D srcT=%s -D cn=%d -D dstT=%s%s",
+                         opMap[op_type], haveMask ? " -D HAVE_MASK" : "",
+                         ocl::typeToStr(sdepth), cn, ocl::typeToStr(ddepth),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (k.empty())
+        return false;
+
+    UMat src = _src.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat(), mask = _mask.getUMat();
+
+    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
+            dstarg = ocl::KernelArg::ReadWrite(dst),
+            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
+
+    int argidx = k.set(0, srcarg);
+    if (op_type == ACCUMULATE_PRODUCT)
+        argidx = k.set(argidx, src2arg);
+    argidx = k.set(argidx, dstarg);
+    if (op_type == ACCUMULATE_WEIGHTED)
+    {
+        if (ddepth == CV_32F)
+            argidx = k.set(argidx, (float)alpha);
+        else
+            argidx = k.set(argidx, alpha);
+    }
+    if (haveMask)
+        k.set(argidx, maskarg);
+
+    size_t globalsize[2] = { src.cols, src.rows };
+    return k.run(2, globalsize, NULL, false);
+}
+
+#endif
+
 }
 
 void cv::accumulate( InputArray _src, InputOutputArray _dst, InputArray _mask )
 {
-    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
-    int sdepth = src.depth(), ddepth = dst.depth(), cn = src.channels();
+    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
+    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
 
-    CV_Assert( dst.size == src.size && dst.channels() == cn );
-    CV_Assert( mask.empty() || (mask.size == src.size && mask.type() == CV_8U) );
+    CV_Assert( _src.sameSize(_dst) && dcn == scn );
+    CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );
+
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_accumulate(_src, noArray(), _dst, 0.0, _mask, ACCUMULATE))
+
+    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
 
     int fidx = getAccTabIdx(sdepth, ddepth);
     AccFunc func = fidx >= 0 ? accTab[fidx] : 0;
@@ -372,17 +441,21 @@ void cv::accumulate( InputArray _src, InputOutputArray _dst, InputArray _mask )
     int len = (int)it.size;
 
     for( size_t i = 0; i < it.nplanes; i++, ++it )
-        func(ptrs[0], ptrs[1], ptrs[2], len, cn);
+        func(ptrs[0], ptrs[1], ptrs[2], len, scn);
 }
 
-
 void cv::accumulateSquare( InputArray _src, InputOutputArray _dst, InputArray _mask )
 {
-    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
-    int sdepth = src.depth(), ddepth = dst.depth(), cn = src.channels();
+    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
+    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
 
-    CV_Assert( dst.size == src.size && dst.channels() == cn );
-    CV_Assert( mask.empty() || (mask.size == src.size && mask.type() == CV_8U) );
+    CV_Assert( _src.sameSize(_dst) && dcn == scn );
+    CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );
+
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_accumulate(_src, noArray(), _dst, 0.0, _mask, ACCUMULATE_SQUARE))
+
+    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
 
     int fidx = getAccTabIdx(sdepth, ddepth);
     AccFunc func = fidx >= 0 ? accSqrTab[fidx] : 0;
@@ -394,18 +467,23 @@ void cv::accumulateSquare( InputArray _src, InputOutputArray _dst, InputArray _m
     int len = (int)it.size;
 
     for( size_t i = 0; i < it.nplanes; i++, ++it )
-        func(ptrs[0], ptrs[1], ptrs[2], len, cn);
+        func(ptrs[0], ptrs[1], ptrs[2], len, scn);
 }
 
 void cv::accumulateProduct( InputArray _src1, InputArray _src2,
                             InputOutputArray _dst, InputArray _mask )
 {
-    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
-    int sdepth = src1.depth(), ddepth = dst.depth(), cn = src1.channels();
+    int stype = _src1.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
+    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
 
-    CV_Assert( src2.size && src1.size && src2.type() == src1.type() );
-    CV_Assert( dst.size == src1.size && dst.channels() == cn );
-    CV_Assert( mask.empty() || (mask.size == src1.size && mask.type() == CV_8U) );
+    CV_Assert( _src1.sameSize(_src2) && stype == _src2.type() );
+    CV_Assert( _src1.sameSize(_dst) && dcn == scn );
+    CV_Assert( _mask.empty() || (_src1.sameSize(_mask) && _mask.type() == CV_8U) );
+
+    CV_OCL_RUN(_src1.dims() <= 2 && _dst.isUMat(),
+               ocl_accumulate(_src1, _src2, _dst, 0.0, _mask, ACCUMULATE_PRODUCT))
+
+    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
 
     int fidx = getAccTabIdx(sdepth, ddepth);
     AccProdFunc func = fidx >= 0 ? accProdTab[fidx] : 0;
@@ -417,18 +495,22 @@ void cv::accumulateProduct( InputArray _src1, InputArray _src2,
     int len = (int)it.size;
 
     for( size_t i = 0; i < it.nplanes; i++, ++it )
-        func(ptrs[0], ptrs[1], ptrs[2], ptrs[3], len, cn);
+        func(ptrs[0], ptrs[1], ptrs[2], ptrs[3], len, scn);
 }
 
-
 void cv::accumulateWeighted( InputArray _src, InputOutputArray _dst,
                              double alpha, InputArray _mask )
 {
-    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
-    int sdepth = src.depth(), ddepth = dst.depth(), cn = src.channels();
+    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
+    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
 
-    CV_Assert( dst.size == src.size && dst.channels() == cn );
-    CV_Assert( mask.empty() || (mask.size == src.size && mask.type() == CV_8U) );
+    CV_Assert( _src.sameSize(_dst) && dcn == scn );
+    CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );
+
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_accumulate(_src, noArray(), _dst, alpha, _mask, ACCUMULATE_WEIGHTED))
+
+    Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
 
     int fidx = getAccTabIdx(sdepth, ddepth);
     AccWFunc func = fidx >= 0 ? accWTab[fidx] : 0;
@@ -440,7 +522,7 @@ void cv::accumulateWeighted( InputArray _src, InputOutputArray _dst,
     int len = (int)it.size;
 
     for( size_t i = 0; i < it.nplanes; i++, ++it )
-        func(ptrs[0], ptrs[1], ptrs[2], len, cn, alpha);
+        func(ptrs[0], ptrs[1], ptrs[2], len, scn, alpha);
 }
 
 
diff --git a/modules/imgproc/src/blend.cpp b/modules/imgproc/src/blend.cpp
index 91b261dfa..4fbdff9c3 100644
--- a/modules/imgproc/src/blend.cpp
+++ b/modules/imgproc/src/blend.cpp
@@ -91,6 +91,8 @@ private:
     Mat * dst;
 };
 
+#ifdef HAVE_OPENCL
+
 static bool ocl_blendLinear( InputArray _src1, InputArray _src2, InputArray _weights1, InputArray _weights2, OutputArray _dst )
 {
     int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
@@ -113,6 +115,8 @@ static bool ocl_blendLinear( InputArray _src1, InputArray _src2, InputArray _wei
     return k.run(2, globalsize, NULL, false);
 }
 
+#endif
+
 }
 
 void cv::blendLinear( InputArray _src1, InputArray _src2, InputArray _weights1, InputArray _weights2, OutputArray _dst )
@@ -126,8 +130,8 @@ void cv::blendLinear( InputArray _src1, InputArray _src2, InputArray _weights1,
 
     _dst.create(size, type);
 
-    if (ocl::useOpenCL() && _dst.isUMat() && ocl_blendLinear(_src1, _src2, _weights1, _weights2, _dst))
-        return;
+    CV_OCL_RUN(_dst.isUMat(),
+               ocl_blendLinear(_src1, _src2, _weights1, _weights2, _dst))
 
     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), weights1 = _weights1.getMat(),
             weights2 = _weights2.getMat(), dst = _dst.getMat();
diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp
index dfa7953b1..990074d49 100644
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@@ -40,16 +40,20 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
 
+/*
 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
 #define USE_IPP_CANNY 1
 #else
 #undef USE_IPP_CANNY
 #endif
+*/
 
-#ifdef USE_IPP_CANNY
 namespace cv
 {
+
+#ifdef USE_IPP_CANNY
 static bool ippCanny(const Mat& _src, Mat& _dst, float low,  float high)
 {
     int size = 0, size1 = 0;
@@ -82,22 +86,169 @@ static bool ippCanny(const Mat& _src, Mat& _dst, float low,  float high)
         return false;
     return true;
 }
-}
 #endif
 
+#ifdef HAVE_OPENCL
+
+static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float high_thresh,
+                      int aperture_size, bool L2gradient, int cn, const Size & size)
+{
+    UMat dx(size, CV_16SC(cn)), dy(size, CV_16SC(cn));
+
+    if (L2gradient)
+    {
+        low_thresh = std::min(32767.0f, low_thresh);
+        high_thresh = std::min(32767.0f, high_thresh);
+
+        if (low_thresh > 0) low_thresh *= low_thresh;
+        if (high_thresh > 0) high_thresh *= high_thresh;
+    }
+    int low = cvFloor(low_thresh), high = cvFloor(high_thresh);
+    Size esize(size.width + 2, size.height + 2);
+
+    UMat mag;
+    size_t globalsize[2] = { size.width * cn, size.height }, localsize[2] = { 16, 16 };
+
+    if (aperture_size == 3 && !_src.isSubmatrix())
+    {
+        // Sobel calculation
+        ocl::Kernel calcSobelRowPassKernel("calcSobelRowPass", ocl::imgproc::canny_oclsrc);
+        if (calcSobelRowPassKernel.empty())
+            return false;
+
+        UMat src = _src.getUMat(), dxBuf(size, CV_16SC(cn)), dyBuf(size, CV_16SC(cn));
+        calcSobelRowPassKernel.args(ocl::KernelArg::ReadOnly(src),
+                                    ocl::KernelArg::WriteOnlyNoSize(dxBuf),
+                                    ocl::KernelArg::WriteOnlyNoSize(dyBuf));
+
+        if (!calcSobelRowPassKernel.run(2, globalsize, localsize, false))
+            return false;
+
+        // magnitude calculation
+        ocl::Kernel magnitudeKernel("calcMagnitude_buf", ocl::imgproc::canny_oclsrc,
+                                    L2gradient ? " -D L2GRAD" : "");
+        if (magnitudeKernel.empty())
+            return false;
+
+        mag = UMat(esize, CV_32SC(cn), Scalar::all(0));
+        dx.create(size, CV_16SC(cn));
+        dy.create(size, CV_16SC(cn));
+
+        magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dxBuf), ocl::KernelArg::ReadOnlyNoSize(dyBuf),
+                             ocl::KernelArg::WriteOnlyNoSize(dx), ocl::KernelArg::WriteOnlyNoSize(dy),
+                             ocl::KernelArg::WriteOnlyNoSize(mag, cn), size.height, size.width);
+
+        if (!magnitudeKernel.run(2, globalsize, localsize, false))
+            return false;
+    }
+    else
+    {
+        dx.create(size, CV_16SC(cn));
+        dy.create(size, CV_16SC(cn));
+
+        Sobel(_src, dx, CV_16SC1, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
+        Sobel(_src, dy, CV_16SC1, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
+
+        // magnitude calculation
+        ocl::Kernel magnitudeKernel("calcMagnitude", ocl::imgproc::canny_oclsrc,
+                                    L2gradient ? " -D L2GRAD" : "");
+        if (magnitudeKernel.empty())
+            return false;
+
+        mag = UMat(esize, CV_32SC(cn), Scalar::all(0));
+        magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
+                             ocl::KernelArg::WriteOnlyNoSize(mag, cn), size.height, size.width);
+
+        if (!magnitudeKernel.run(2, globalsize, NULL, false))
+            return false;
+    }
+
+    // map calculation
+    ocl::Kernel calcMapKernel("calcMap", ocl::imgproc::canny_oclsrc);
+    if (calcMapKernel.empty())
+        return false;
+
+    UMat map(esize, CV_32SC(cn));
+    calcMapKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
+                       ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::WriteOnlyNoSize(map, cn),
+                       size.height, size.width, low, high);
+
+    if (!calcMapKernel.run(2, globalsize, localsize, false))
+        return false;
+
+    // local hysteresis thresholding
+    ocl::Kernel edgesHysteresisLocalKernel("edgesHysteresisLocal", ocl::imgproc::canny_oclsrc);
+    if (edgesHysteresisLocalKernel.empty())
+        return false;
+
+    UMat stack(1, size.area(), CV_16UC2), counter(1, 1, CV_32SC1, Scalar::all(0));
+    edgesHysteresisLocalKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::PtrReadWrite(stack),
+                                    ocl::KernelArg::PtrReadWrite(counter), size.height, size.width);
+    if (!edgesHysteresisLocalKernel.run(2, globalsize, localsize, false))
+        return false;
+
+    // global hysteresis thresholding
+    UMat stack2(1, size.area(), CV_16UC2);
+    int count;
+
+    for ( ; ; )
+    {
+        ocl::Kernel edgesHysteresisGlobalKernel("edgesHysteresisGlobal", ocl::imgproc::canny_oclsrc);
+        if (edgesHysteresisGlobalKernel.empty())
+            return false;
+
+        {
+            Mat _counter = counter.getMat(ACCESS_RW);
+            count = _counter.at<int>(0, 0);
+            if (count == 0)
+                break;
+
+            _counter.at<int>(0, 0) = 0;
+        }
+
+        edgesHysteresisGlobalKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::PtrReadWrite(stack),
+                                         ocl::KernelArg::PtrReadWrite(stack2), ocl::KernelArg::PtrReadWrite(counter),
+                                         size.height, size.width, count);
+
+#define divUp(total, grain) ((total + grain - 1) / grain)
+        size_t localsize2[2] = { 128, 1 }, globalsize2[2] = { std::min(count, 65535) * 128, divUp(count, 65535) };
+#undef divUp
+
+        if (!edgesHysteresisGlobalKernel.run(2, globalsize2, localsize2, false))
+            return false;
+
+        std::swap(stack, stack2);
+    }
+
+    // get edges
+    ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc);
+    if (getEdgesKernel.empty())
+        return false;
+
+    _dst.create(size, CV_8UC(cn));
+    UMat dst = _dst.getUMat();
+
+    getEdgesKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::WriteOnly(dst));
+    return getEdgesKernel.run(2, globalsize, NULL, false);
+}
+
+#endif
+
+}
+
 void cv::Canny( InputArray _src, OutputArray _dst,
                 double low_thresh, double high_thresh,
                 int aperture_size, bool L2gradient )
 {
-    Mat src = _src.getMat();
-    CV_Assert( src.depth() == CV_8U );
+    const int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    const Size size = _src.size();
 
-    _dst.create(src.size(), CV_8U);
-    Mat dst = _dst.getMat();
+    CV_Assert( depth == CV_8U );
+    _dst.create(size, CV_8U);
 
     if (!L2gradient && (aperture_size & CV_CANNY_L2_GRADIENT) == CV_CANNY_L2_GRADIENT)
     {
-        //backward compatibility
+        // backward compatibility
         aperture_size &= ~CV_CANNY_L2_GRADIENT;
         L2gradient = true;
     }
@@ -108,6 +259,11 @@ void cv::Canny( InputArray _src, OutputArray _dst,
     if (low_thresh > high_thresh)
         std::swap(low_thresh, high_thresh);
 
+    CV_OCL_RUN(_dst.isUMat() && cn == 1,
+               ocl_Canny(_src, _dst, (float)low_thresh, (float)high_thresh, aperture_size, L2gradient, cn, size))
+
+    Mat src = _src.getMat(), dst = _dst.getMat();
+
 #ifdef HAVE_TEGRA_OPTIMIZATION
     if (tegra::canny(src, dst, low_thresh, high_thresh, aperture_size, L2gradient))
         return;
@@ -119,12 +275,11 @@ void cv::Canny( InputArray _src, OutputArray _dst,
         return;
 #endif
 
-    const int cn = src.channels();
     Mat dx(src.rows, src.cols, CV_16SC(cn));
     Mat dy(src.rows, src.cols, CV_16SC(cn));
 
-    Sobel(src, dx, CV_16S, 1, 0, aperture_size, 1, 0, cv::BORDER_REPLICATE);
-    Sobel(src, dy, CV_16S, 0, 1, aperture_size, 1, 0, cv::BORDER_REPLICATE);
+    Sobel(src, dx, CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
+    Sobel(src, dy, CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
 
     if (L2gradient)
     {
diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp
index 89fb62bd0..f19f19243 100644
--- a/modules/imgproc/src/clahe.cpp
+++ b/modules/imgproc/src/clahe.cpp
@@ -40,17 +40,97 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
 
 // ----------------------------------------------------------------------
 // CLAHE
 
+#ifdef HAVE_OPENCL
+
+namespace clahe
+{
+    static bool calcLut(cv::InputArray _src, cv::OutputArray _dst,
+        const int tilesX, const int tilesY, const cv::Size tileSize,
+        const int clipLimit, const float lutScale)
+    {
+        cv::ocl::Kernel _k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
+
+        bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
+        cv::String opts;
+        if(is_cpu)
+            opts = "-D CPU ";
+        else
+            opts = cv::format("-D WAVE_SIZE=%d", _k.preferedWorkGroupSizeMultiple());
+
+        cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc, opts);
+        if(k.empty())
+            return false;
+
+        cv::UMat src = _src.getUMat();
+        _dst.create(tilesX * tilesY, 256, CV_8UC1);
+        cv::UMat dst = _dst.getUMat();
+
+        int tile_size[2];
+        tile_size[0] = tileSize.width;
+        tile_size[1] = tileSize.height;
+
+        size_t localThreads[3]  = { 32, 8, 1 };
+        size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
+
+        int idx = 0;
+        idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(src));
+        idx = k.set(idx, cv::ocl::KernelArg::WriteOnlyNoSize(dst));
+        idx = k.set(idx, tile_size);
+        idx = k.set(idx, tilesX);
+        idx = k.set(idx, clipLimit);
+        k.set(idx, lutScale);
+
+        return k.run(2, globalThreads, localThreads, false);
+    }
+
+    static bool transform(cv::InputArray _src, cv::OutputArray _dst, cv::InputArray _lut,
+        const int tilesX, const int tilesY, const cv::Size & tileSize)
+    {
+
+        cv::ocl::Kernel k("transform", cv::ocl::imgproc::clahe_oclsrc);
+        if(k.empty())
+            return false;
+
+        int tile_size[2];
+        tile_size[0] = tileSize.width;
+        tile_size[1] = tileSize.height;
+
+        cv::UMat src = _src.getUMat();
+        _dst.create(src.size(), src.type());
+        cv::UMat dst = _dst.getUMat();
+        cv::UMat lut = _lut.getUMat();
+
+        size_t localThreads[3]  = { 32, 8, 1 };
+        size_t globalThreads[3] = { src.cols, src.rows, 1 };
+
+        int idx = 0;
+        idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(src));
+        idx = k.set(idx, cv::ocl::KernelArg::WriteOnlyNoSize(dst));
+        idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(lut));
+        idx = k.set(idx, src.cols);
+        idx = k.set(idx, src.rows);
+        idx = k.set(idx, tile_size);
+        idx = k.set(idx, tilesX);
+        k.set(idx, tilesY);
+
+        return k.run(2, globalThreads, localThreads, false);
+    }
+}
+
+#endif
+
 namespace
 {
     class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
     {
     public:
-        CLAHE_CalcLut_Body(const cv::Mat& src, cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY, int clipLimit, float lutScale) :
-            src_(src), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY), clipLimit_(clipLimit), lutScale_(lutScale)
+        CLAHE_CalcLut_Body(const cv::Mat& src, cv::Mat& lut, cv::Size tileSize, int tilesX, int clipLimit, float lutScale) :
+            src_(src), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), clipLimit_(clipLimit), lutScale_(lutScale)
         {
         }
 
@@ -62,7 +142,6 @@ namespace
 
         cv::Size tileSize_;
         int tilesX_;
-        int tilesY_;
         int clipLimit_;
         float lutScale_;
     };
@@ -242,6 +321,11 @@ namespace
 
         cv::Mat srcExt_;
         cv::Mat lut_;
+
+#ifdef HAVE_OPENCL
+        cv::UMat usrcExt_;
+        cv::UMat ulut_;
+#endif
     };
 
     CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
@@ -256,31 +340,38 @@ namespace
 
     void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
     {
-        cv::Mat src = _src.getMat();
+        CV_Assert( _src.type() == CV_8UC1 );
 
-        CV_Assert( src.type() == CV_8UC1 );
-
-        _dst.create( src.size(), src.type() );
-        cv::Mat dst = _dst.getMat();
+#ifdef HAVE_OPENCL
+        bool useOpenCL = cv::ocl::useOpenCL() && _src.isUMat() && _src.dims()<=2;
+#endif
 
         const int histSize = 256;
 
-        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
-
         cv::Size tileSize;
-        cv::Mat srcForLut;
+        cv::_InputArray _srcForLut;
 
-        if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
+        if (_src.size().width % tilesX_ == 0 && _src.size().height % tilesY_ == 0)
         {
-            tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
-            srcForLut = src;
+            tileSize = cv::Size(_src.size().width / tilesX_, _src.size().height / tilesY_);
+            _srcForLut = _src;
         }
         else
         {
-            cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101);
-
-            tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
-            srcForLut = srcExt_;
+#ifdef HAVE_OPENCL
+            if(useOpenCL)
+            {
+                cv::copyMakeBorder(_src, usrcExt_, 0, tilesY_ - (_src.size().height % tilesY_), 0, tilesX_ - (_src.size().width % tilesX_), cv::BORDER_REFLECT_101);
+                tileSize = cv::Size(usrcExt_.size().width / tilesX_, usrcExt_.size().height / tilesY_);
+                _srcForLut = usrcExt_;
+            }
+            else
+#endif
+            {
+                cv::copyMakeBorder(_src, srcExt_, 0, tilesY_ - (_src.size().height % tilesY_), 0, tilesX_ - (_src.size().width % tilesX_), cv::BORDER_REFLECT_101);
+                tileSize = cv::Size(srcExt_.size().width / tilesX_, srcExt_.size().height / tilesY_);
+                _srcForLut = srcExt_;
+            }
         }
 
         const int tileSizeTotal = tileSize.area();
@@ -293,7 +384,19 @@ namespace
             clipLimit = std::max(clipLimit, 1);
         }
 
-        CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale);
+#ifdef HAVE_OPENCL
+        if (useOpenCL && clahe::calcLut(_srcForLut, ulut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale) )
+            if( clahe::transform(_src, _dst, ulut_, tilesX_, tilesY_, tileSize) )
+                return;
+#endif
+
+        cv::Mat src = _src.getMat();
+        _dst.create( src.size(), src.type() );
+        cv::Mat dst = _dst.getMat();
+        cv::Mat srcForLut = _srcForLut.getMat();
+        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
+
+        CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, clipLimit, lutScale);
         cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody);
 
         CLAHE_Interpolation_Body interpolationBody(src, dst, lut_, tileSize, tilesX_, tilesY_);
@@ -325,6 +428,10 @@ namespace
     {
         srcExt_.release();
         lut_.release();
+#ifdef HAVE_OPENCL
+        usrcExt_.release();
+        ulut_.release();
+#endif
     }
 }
 
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index 6d5845ec1..3822ab3c1 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -1594,8 +1594,11 @@ struct RGB2Lab_b
         static volatile int _3 = 3;
         initLabTabs();
 
-        if(!_coeffs) _coeffs = sRGB2XYZ_D65;
-        if(!_whitept) _whitept = D65;
+        if (!_coeffs)
+            _coeffs = sRGB2XYZ_D65;
+        if (!_whitept)
+            _whitept = D65;
+
         float scale[] =
         {
             (1 << lab_shift)/_whitept[0],
@@ -1699,10 +1702,6 @@ struct RGB2Lab_f
             float G = clip(src[1]);
             float B = clip(src[2]);
 
-//            CV_Assert(R >= 0.0f && R <= 1.0f);
-//            CV_Assert(G >= 0.0f && G <= 1.0f);
-//            CV_Assert(B >= 0.0f && B <= 1.0f);
-
             if (gammaTab)
             {
                 R = splineInterpolate(R * gscale, gammaTab, GAMMA_TAB_SIZE);
@@ -1738,7 +1737,7 @@ struct Lab2RGB_f
 
     Lab2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
               const float* _whitept, bool _srgb )
-    : dstcn(_dstcn), srgb(_srgb), blueInd(blueIdx)
+    : dstcn(_dstcn), srgb(_srgb)
     {
         initLabTabs();
 
@@ -1796,13 +1795,12 @@ struct Lab2RGB_f
 
 
             float x = fxz[0], z = fxz[1];
-            float ro = clip(C0 * x + C1 * y + C2 * z);
-            float go = clip(C3 * x + C4 * y + C5 * z);
-            float bo = clip(C6 * x + C7 * y + C8 * z);
-
-//            CV_Assert(ro >= 0.0f && ro <= 1.0f);
-//            CV_Assert(go >= 0.0f && go <= 1.0f);
-//            CV_Assert(bo >= 0.0f && bo <= 1.0f);
+            float ro = C0 * x + C1 * y + C2 * z;
+            float go = C3 * x + C4 * y + C5 * z;
+            float bo = C6 * x + C7 * y + C8 * z;
+            ro = clip(ro);
+            go = clip(go);
+            bo = clip(bo);
 
             if (gammaTab)
             {
@@ -1820,7 +1818,6 @@ struct Lab2RGB_f
     int dstcn;
     float coeffs[9];
     bool srgb;
-    int blueInd;
 };
 
 #undef clip
@@ -2275,7 +2272,7 @@ struct YUV420p2RGB888Invoker : ParallelLoopBody
         const int rangeBegin = range.start * 2;
         const int rangeEnd = range.end * 2;
 
-        size_t uvsteps[2] = {width/2, stride - width/2};
+        int uvsteps[2] = {width/2, stride - width/2};
         int usIdx = ustepIdx, vsIdx = vstepIdx;
 
         const uchar* y1 = my1 + rangeBegin * stride;
@@ -2343,7 +2340,7 @@ struct YUV420p2RGBA8888Invoker : ParallelLoopBody
         int rangeBegin = range.start * 2;
         int rangeEnd = range.end * 2;
 
-        size_t uvsteps[2] = {width/2, stride - width/2};
+        int uvsteps[2] = {width/2, stride - width/2};
         int usIdx = ustepIdx, vsIdx = vstepIdx;
 
         const uchar* y1 = my1 + rangeBegin * stride;
@@ -2688,6 +2685,7 @@ struct mRGBA2RGBA
     }
 };
 
+#ifdef HAVE_OPENCL
 
 static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 {
@@ -2699,7 +2697,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
     size_t globalsize[] = { src.cols, src.rows };
     ocl::Kernel k;
 
-    if(depth != CV_8U && depth != CV_16U && depth != CV_32F)
+    if (depth != CV_8U && depth != CV_16U && depth != CV_32F)
         return false;
 
     switch (code)
@@ -2878,6 +2876,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 
         k.create("RGB2XYZ", ocl::imgproc::cvtcolor_oclsrc,
                  format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
+        if (k.empty())
+            return false;
         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
         return k.run(2, globalsize, 0, false);
     }
@@ -2927,6 +2927,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 
         k.create("XYZ2RGB", ocl::imgproc::cvtcolor_oclsrc,
                  format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx));
+        if (k.empty())
+            return false;
         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
         return k.run(2, globalsize, 0, false);
     }
@@ -2981,6 +2983,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 
             k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc, format("-D depth=%d -D hrange=%d -D bidx=%d -D dcn=3 -D scn=%d",
                                                                       depth, hrange, bidx, scn));
+            if (k.empty())
+                return false;
 
             k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst),
                    ocl::KernelArg::PtrReadOnly(sdiv_data), hrange == 256 ? ocl::KernelArg::PtrReadOnly(hdiv_data256) :
@@ -2990,7 +2994,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         }
         else
             k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
-                     format("-D depth=%d -D hscale=%f -D bidx=%d -D scn=%d -D dcn=3", depth, hrange*(1.f/360.f), bidx, scn));
+                     format("-D depth=%d -D hscale=%ff -D bidx=%d -D scn=%d -D dcn=3", depth, hrange*(1.f/360.f), bidx, scn));
         break;
     }
     case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
@@ -3008,7 +3012,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 
         String kernelName = String(is_hsv ? "HSV" : "HLS") + "2RGB";
         k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
-                 format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d -D hrange=%d -D hscale=%f",
+                 format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d -D hrange=%d -D hscale=%ff",
                         depth, dcn, bidx, hrange, 6.f/hrange));
         break;
     }
@@ -3021,8 +3025,162 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
                  format("-D depth=%d -D dcn=4 -D scn=4 -D bidx=3", depth));
         break;
     }
+    case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
+    {
+        CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
+
+        bidx = code == CV_BGR2Lab || code == CV_LBGR2Lab ? 0 : 2;
+        bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab;
+        dcn = 3;
+
+        k.create("BGR2Lab", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=%d -D dcn=3 -D scn=%d -D bidx=%d%s",
+                        depth, scn, bidx, srgb ? " -D SRGB" : ""));
+        if (k.empty())
+            return false;
+
+        initLabTabs();
+
+        _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
+        dst = _dst.getUMat();
+
+        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+                dstarg = ocl::KernelArg::WriteOnly(dst);
+
+        if (depth == CV_8U)
+        {
+            static UMat usRGBGammaTab, ulinearGammaTab, uLabCbrtTab, ucoeffs;
+
+            if (srgb && usRGBGammaTab.empty())
+                Mat(1, 256, CV_16UC1, sRGBGammaTab_b).copyTo(usRGBGammaTab);
+            else if (ulinearGammaTab.empty())
+                Mat(1, 256, CV_16UC1, linearGammaTab_b).copyTo(ulinearGammaTab);
+            if (uLabCbrtTab.empty())
+                Mat(1, LAB_CBRT_TAB_SIZE_B, CV_16UC1, LabCbrtTab_b).copyTo(uLabCbrtTab);
+
+            {
+                int coeffs[9];
+                const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65;
+                const float scale[] =
+                {
+                    (1 << lab_shift)/_whitept[0],
+                    (float)(1 << lab_shift),
+                    (1 << lab_shift)/_whitept[2]
+                };
+
+                for (int i = 0; i < 3; i++ )
+                {
+                    coeffs[i*3+(bidx^2)] = cvRound(_coeffs[i*3]*scale[i]);
+                    coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);
+                    coeffs[i*3+bidx] = cvRound(_coeffs[i*3+2]*scale[i]);
+
+                    CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
+                              coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
+                }
+                Mat(1, 9, CV_32SC1, coeffs).copyTo(ucoeffs);
+            }
+
+            const int Lscale = (116*255+50)/100;
+            const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
+
+            k.args(srcarg, dstarg,
+                   ocl::KernelArg::PtrReadOnly(srgb ? usRGBGammaTab : ulinearGammaTab),
+                   ocl::KernelArg::PtrReadOnly(uLabCbrtTab), ocl::KernelArg::PtrReadOnly(ucoeffs),
+                   Lscale, Lshift);
+        }
+        else
+        {
+            static UMat usRGBGammaTab, ucoeffs;
+
+            if (srgb && usRGBGammaTab.empty())
+                Mat(1, GAMMA_TAB_SIZE * 4, CV_32FC1, sRGBGammaTab).copyTo(usRGBGammaTab);
+
+            {
+                float coeffs[9];
+                const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65;
+                float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] };
+
+                for (int i = 0; i < 3; i++)
+                {
+                    int j = i * 3;
+                    coeffs[j + (bidx ^ 2)] = _coeffs[j] * scale[i];
+                    coeffs[j + 1] = _coeffs[j + 1] * scale[i];
+                    coeffs[j + bidx] = _coeffs[j + 2] * scale[i];
+
+                    CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 &&
+                               coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*LabCbrtTabScale );
+                }
+
+                Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
+            }
+
+            float _1_3 = 1.0f / 3.0f, _a = 16.0f / 116.0f;
+            ocl::KernelArg ucoeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);
+
+            if (srgb)
+                k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBGammaTab),
+                       ucoeffsarg, _1_3, _a);
+            else
+                k.args(srcarg, dstarg, ucoeffsarg, _1_3, _a);
+        }
+
+        return k.run(dims, globalsize, NULL, false);
+    }
+    case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:
+    {
+        if( dcn <= 0 )
+            dcn = 3;
+        CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
+
+        bidx = code == CV_Lab2BGR || code == CV_Lab2LBGR ? 0 : 2;
+        bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB;
+
+        k.create("Lab2BGR", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d%s",
+                        depth, dcn, bidx, srgb ? " -D SRGB" : ""));
+        if (k.empty())
+            return false;
+
+        initLabTabs();
+        static UMat ucoeffs, usRGBInvGammaTab;
+
+        if (srgb && usRGBInvGammaTab.empty())
+            Mat(1, GAMMA_TAB_SIZE*4, CV_32FC1, sRGBInvGammaTab).copyTo(usRGBInvGammaTab);
+
+        {
+            float coeffs[9];
+            const float * const _coeffs = XYZ2sRGB_D65, * const _whitept = D65;
+
+            for( int i = 0; i < 3; i++ )
+            {
+                coeffs[i+(bidx^2)*3] = _coeffs[i]*_whitept[i];
+                coeffs[i+3] = _coeffs[i+3]*_whitept[i];
+                coeffs[i+bidx*3] = _coeffs[i+6]*_whitept[i];
+            }
+
+            Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
+        }
+
+        _dst.create(sz, CV_MAKETYPE(depth, dcn));
+        dst = _dst.getUMat();
+
+        float lThresh = 0.008856f * 903.3f;
+        float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
+
+        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+                dstarg = ocl::KernelArg::WriteOnly(dst),
+                coeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);
+
+        if (srgb)
+            k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBInvGammaTab),
+                   coeffsarg, lThresh, fThresh);
+        else
+            k.args(srcarg, dstarg, coeffsarg, lThresh, fThresh);
+
+        return k.run(dims, globalsize, NULL, false);
+    }
     default:
-        ;
+        break;
     }
 
     if( !k.empty() )
@@ -3030,11 +3188,13 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
         dst = _dst.getUMat();
         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
-        ok = k.run(dims, globalsize, 0, false);
+        ok = k.run(dims, globalsize, NULL, false);
     }
     return ok;
 }
 
+#endif
+
 }//namespace cv
 
 //////////////////////////////////////////////////////////////////////////////////////////
@@ -3043,12 +3203,11 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 
 void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 {
-    bool use_opencl = ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT;
     int stype = _src.type();
     int scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype), bidx;
 
-    if( use_opencl && ocl_cvtColor(_src, _dst, code, dcn) )
-        return;
+    CV_OCL_RUN( _src.dims() <= 2 && _dst.isUMat(),
+                ocl_cvtColor(_src, _dst, code, dcn) )
 
     Mat src = _src.getMat(), dst;
     Size sz = src.size();
@@ -3151,7 +3310,7 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
             CV_Assert( scn == 3 || scn == 4 );
             _dst.create(sz, CV_MAKETYPE(depth, 1));
             dst = _dst.getMat();
-
+/*
 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
             if( code == CV_BGR2GRAY )
             {
@@ -3174,7 +3333,7 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
                     return;
             }
 #endif
-
+*/
             bidx = code == CV_BGR2GRAY || code == CV_BGRA2GRAY ? 0 : 2;
 
             if( depth == CV_8U )
@@ -3652,7 +3811,7 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
                 int ustepIdx = 0;
                 int vstepIdx = dstSz.height % 4 == 2 ? 1 : 0;
 
-                if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); };
+                if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); }
 
                 switch(dcn*10 + bIdx)
                 {
@@ -3763,9 +3922,9 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
                 dst = _dst.getMat();
 
                 if( depth == CV_8U )
-                {
                     CvtColorLoop(src, dst, RGBA2mRGBA<uchar>());
-                } else {
+                else
+                {
                     CV_Error( CV_StsBadArg, "Unsupported image depth" );
                 }
             }
@@ -3779,9 +3938,9 @@ void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
                 dst = _dst.getMat();
 
                 if( depth == CV_8U )
-                {
                     CvtColorLoop(src, dst, mRGBA2RGBA<uchar>());
-                } else {
+                else
+                {
                     CV_Error( CV_StsBadArg, "Unsupported image depth" );
                 }
             }
diff --git a/modules/imgproc/src/corner.cpp b/modules/imgproc/src/corner.cpp
index 8f8c77006..172a531a3 100644
--- a/modules/imgproc/src/corner.cpp
+++ b/modules/imgproc/src/corner.cpp
@@ -41,14 +41,12 @@
 //M*/
 
 #include "precomp.hpp"
-#include <stdio.h>
-
+#include "opencl_kernels.hpp"
 
 namespace cv
 {
 
-static void
-calcMinEigenVal( const Mat& _cov, Mat& _dst )
+static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
 {
     int i, j;
     Size size = _cov.size();
@@ -104,8 +102,7 @@ calcMinEigenVal( const Mat& _cov, Mat& _dst )
 }
 
 
-static void
-calcHarris( const Mat& _cov, Mat& _dst, double k )
+static void calcHarris( const Mat& _cov, Mat& _dst, double k )
 {
     int i, j;
     Size size = _cov.size();
@@ -219,8 +216,7 @@ static void eigen2x2( const float* cov, float* dst, int n )
     }
 }
 
-static void
-calcEigenValsVecs( const Mat& _cov, Mat& _dst )
+static void calcEigenValsVecs( const Mat& _cov, Mat& _dst )
 {
     Size size = _cov.size();
     if( _cov.isContinuous() && _dst.isContinuous() )
@@ -306,12 +302,110 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
         calcEigenValsVecs( cov, eigenv );
 }
 
+#ifdef HAVE_OPENCL
+
+static bool ocl_cornerMinEigenValVecs(InputArray _src, OutputArray _dst, int block_size,
+                                      int aperture_size, double k, int borderType, int op_type)
+{
+    CV_Assert(op_type == HARRIS || op_type == MINEIGENVAL);
+
+    if ( !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE ||
+           borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101) )
+        return false;
+
+    int type = _src.type(), depth = CV_MAT_DEPTH(type);
+    double scale = (double)(1 << ((aperture_size > 0 ? aperture_size : 3) - 1)) * block_size;
+    if( aperture_size < 0 )
+        scale *= 2.0;
+    if( depth == CV_8U )
+        scale *= 255.0;
+    scale = 1.0 / scale;
+
+    if ( !(type == CV_8UC1 || type == CV_32FC1) )
+        return false;
+
+    UMat Dx, Dy;
+    if (aperture_size > 0)
+    {
+        Sobel(_src, Dx, CV_32F, 1, 0, aperture_size, scale, 0, borderType);
+        Sobel(_src, Dy, CV_32F, 0, 1, aperture_size, scale, 0, borderType);
+    }
+    else
+    {
+        Scharr(_src, Dx, CV_32F, 1, 0, scale, 0, borderType);
+        Scharr(_src, Dy, CV_32F, 0, 1, scale, 0, borderType);
+    }
+
+    const char * const borderTypes[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT",
+                                         0, "BORDER_REFLECT101" };
+    const char * const cornerType[] = { "CORNER_MINEIGENVAL", "CORNER_HARRIS", 0 };
+
+    ocl::Kernel cornelKernel("corner", ocl::imgproc::corner_oclsrc,
+                             format("-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s -D %s",
+                                    block_size / 2, block_size / 2, block_size, block_size,
+                                    borderTypes[borderType], cornerType[op_type]));
+    if (cornelKernel.empty())
+        return false;
+
+    _dst.createSameSize(_src, CV_32FC1);
+    UMat dst = _dst.getUMat();
+
+    cornelKernel.args(ocl::KernelArg::ReadOnly(Dx), ocl::KernelArg::ReadOnly(Dy),
+                      ocl::KernelArg::WriteOnly(dst), (float)k);
+
+    size_t blockSizeX = 256, blockSizeY = 1;
+    size_t gSize = blockSizeX - block_size / 2 * 2;
+    size_t globalSizeX = (Dx.cols) % gSize == 0 ? Dx.cols / gSize * blockSizeX : (Dx.cols / gSize + 1) * blockSizeX;
+    size_t rows_per_thread = 2;
+    size_t globalSizeY = ((Dx.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ?
+                         ((Dx.rows + rows_per_thread - 1) / rows_per_thread) :
+                         (((Dx.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
+
+    size_t globalsize[2] = { globalSizeX, globalSizeY }, localsize[2] = { blockSizeX, blockSizeY };
+    return cornelKernel.run(2, globalsize, localsize, false);
+}
+
+static bool ocl_preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int borderType, int depth )
+{
+    UMat Dx, Dy, D2x, D2y, Dxy;
+
+    Sobel( _src, Dx, CV_32F, 1, 0, ksize, 1, 0, borderType );
+    Sobel( _src, Dy, CV_32F, 0, 1, ksize, 1, 0, borderType );
+    Sobel( _src, D2x, CV_32F, 2, 0, ksize, 1, 0, borderType );
+    Sobel( _src, D2y, CV_32F, 0, 2, ksize, 1, 0, borderType );
+    Sobel( _src, Dxy, CV_32F, 1, 1, ksize, 1, 0, borderType );
+
+    _dst.create( _src.size(), CV_32FC1 );
+    UMat dst = _dst.getUMat();
+
+    double factor = 1 << (ksize - 1);
+    if( depth == CV_8U )
+        factor *= 255;
+    factor = 1./(factor * factor * factor);
+
+    ocl::Kernel k("preCornerDetect", ocl::imgproc::precornerdetect_oclsrc);
+    if (k.empty())
+        return false;
+
+    k.args(ocl::KernelArg::ReadOnlyNoSize(Dx), ocl::KernelArg::ReadOnlyNoSize(Dy),
+           ocl::KernelArg::ReadOnlyNoSize(D2x), ocl::KernelArg::ReadOnlyNoSize(D2y),
+           ocl::KernelArg::ReadOnlyNoSize(Dxy), ocl::KernelArg::WriteOnly(dst), (float)factor);
+
+    size_t globalsize[2] = { dst.cols, dst.rows };
+    return k.run(2, globalsize, NULL, false);
+}
+
+#endif
+
 }
 
 void cv::cornerMinEigenVal( InputArray _src, OutputArray _dst, int blockSize, int ksize, int borderType )
 {
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_cornerMinEigenValVecs(_src, _dst, blockSize, ksize, 0.0, borderType, MINEIGENVAL))
+
     Mat src = _src.getMat();
-    _dst.create( src.size(), CV_32F );
+    _dst.create( src.size(), CV_32FC1 );
     Mat dst = _dst.getMat();
     cornerEigenValsVecs( src, dst, blockSize, ksize, MINEIGENVAL, 0, borderType );
 }
@@ -319,8 +413,11 @@ void cv::cornerMinEigenVal( InputArray _src, OutputArray _dst, int blockSize, in
 
 void cv::cornerHarris( InputArray _src, OutputArray _dst, int blockSize, int ksize, double k, int borderType )
 {
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_cornerMinEigenValVecs(_src, _dst, blockSize, ksize, k, borderType, HARRIS))
+
     Mat src = _src.getMat();
-    _dst.create( src.size(), CV_32F );
+    _dst.create( src.size(), CV_32FC1 );
     Mat dst = _dst.getMat();
     cornerEigenValsVecs( src, dst, blockSize, ksize, HARRIS, k, borderType );
 }
@@ -341,10 +438,14 @@ void cv::cornerEigenValsAndVecs( InputArray _src, OutputArray _dst, int blockSiz
 
 void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int borderType )
 {
-    Mat Dx, Dy, D2x, D2y, Dxy, src = _src.getMat();
+    int type = _src.type();
+    CV_Assert( type == CV_8UC1 || type == CV_32FC1 );
 
-    CV_Assert( src.type() == CV_8UC1 || src.type() == CV_32FC1 );
-    _dst.create( src.size(), CV_32F );
+    CV_OCL_RUN( _src.dims() <= 2 && _dst.isUMat(),
+                ocl_preCornerDetect(_src, _dst, ksize, borderType, CV_MAT_DEPTH(type)))
+
+    Mat Dx, Dy, D2x, D2y, Dxy, src = _src.getMat();
+    _dst.create( src.size(), CV_32FC1 );
     Mat dst = _dst.getMat();
 
     Sobel( src, Dx, CV_32F, 1, 0, ksize, 1, 0, borderType );
diff --git a/modules/imgproc/src/deriv.cpp b/modules/imgproc/src/deriv.cpp
index 6dccef99f..31a8b1b93 100644
--- a/modules/imgproc/src/deriv.cpp
+++ b/modules/imgproc/src/deriv.cpp
@@ -540,8 +540,6 @@ void cv::Laplacian( InputArray _src, OutputArray _dst, int ddepth, int ksize,
         int wtype = CV_MAKETYPE(wdepth, src.channels());
         Mat kd, ks;
         getSobelKernels( kd, ks, 2, 0, ksize, false, ktype );
-        if( ddepth < 0 )
-            ddepth = src.depth();
         int dtype = CV_MAKETYPE(ddepth, src.channels());
 
         int dy0 = std::min(std::max((int)(STRIPE_SIZE/(getElemSize(src.type())*src.cols)), 1), src.rows);
diff --git a/modules/imgproc/src/featureselect.cpp b/modules/imgproc/src/featureselect.cpp
index 8c740382f..53743c6a8 100644
--- a/modules/imgproc/src/featureselect.cpp
+++ b/modules/imgproc/src/featureselect.cpp
@@ -38,18 +38,184 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
 #include <cstdio>
 #include <vector>
+#include <iostream>
+#include <functional>
 
 namespace cv
 {
 
-template<typename T> struct greaterThanPtr
+struct greaterThanPtr :
+        public std::binary_function<const float *, const float *, bool>
 {
-    bool operator()(const T* a, const T* b) const { return *a > *b; }
+    bool operator () (const float * a, const float * b) const
+    { return *a > *b; }
 };
 
+struct Corner
+{
+    float val;
+    short y;
+    short x;
+
+    bool operator < (const Corner & c) const
+    {  return val > c.val; }
+};
+
+#ifdef HAVE_OPENCL
+
+static bool ocl_goodFeaturesToTrack( InputArray _image, OutputArray _corners,
+                                     int maxCorners, double qualityLevel, double minDistance,
+                                     InputArray _mask, int blockSize,
+                                     bool useHarrisDetector, double harrisK )
+{
+    UMat eig, tmp;
+    if( useHarrisDetector )
+        cornerHarris( _image, eig, blockSize, 3, harrisK );
+    else
+        cornerMinEigenVal( _image, eig, blockSize, 3 );
+
+    double maxVal = 0;
+    minMaxLoc( eig, NULL, &maxVal, NULL, NULL, _mask );
+    threshold( eig, eig, maxVal*qualityLevel, 0, THRESH_TOZERO );
+    dilate( eig, tmp, Mat());
+
+    Size imgsize = _image.size();
+    std::vector<Corner> tmpCorners;
+    size_t total, i, j, ncorners = 0, possibleCornersCount =
+            std::max(1024, static_cast<int>(imgsize.area() * 0.1));
+    bool haveMask = !_mask.empty();
+
+    // collect list of pointers to features - put them into temporary image
+    {
+        ocl::Kernel k("findCorners", ocl::imgproc::gftt_oclsrc,
+                      format(haveMask ? "-D HAVE_MASK" : ""));
+        if (k.empty())
+            return false;
+
+        UMat counter(1, 1, CV_32SC1, Scalar::all(0)),
+                corners(1, (int)(possibleCornersCount * sizeof(Corner)), CV_8UC1);
+        ocl::KernelArg eigarg = ocl::KernelArg::ReadOnlyNoSize(eig),
+                tmparg = ocl::KernelArg::ReadOnlyNoSize(tmp),
+                cornersarg = ocl::KernelArg::PtrWriteOnly(corners),
+                counterarg = ocl::KernelArg::PtrReadWrite(counter);
+
+        if (!haveMask)
+            k.args(eigarg, tmparg, cornersarg, counterarg,
+                   imgsize.height - 2, imgsize.width - 2);
+        else
+        {
+            UMat mask = _mask.getUMat();
+            k.args(eigarg, ocl::KernelArg::ReadOnlyNoSize(mask), tmparg,
+                   cornersarg, counterarg, imgsize.height - 2, imgsize.width - 2);
+        }
+
+        size_t globalsize[2] = { imgsize.width - 2, imgsize.height - 2 };
+        if (!k.run(2, globalsize, NULL, false))
+            return false;
+
+        total = counter.getMat(ACCESS_READ).at<int>(0, 0);
+        int totalb = (int)(sizeof(Corner) * total);
+
+        tmpCorners.resize(total);
+        Mat mcorners(1, totalb, CV_8UC1, &tmpCorners[0]);
+        corners.colRange(0, totalb).copyTo(mcorners);
+    }
+
+    std::sort( tmpCorners.begin(), tmpCorners.end() );
+    std::vector<Point2f> corners;
+    corners.reserve(total);
+
+    if (minDistance >= 1)
+    {
+         // Partition the image into larger grids
+        int w = imgsize.width, h = imgsize.height;
+
+        const int cell_size = cvRound(minDistance);
+        const int grid_width = (w + cell_size - 1) / cell_size;
+        const int grid_height = (h + cell_size - 1) / cell_size;
+
+        std::vector<std::vector<Point2f> > grid(grid_width*grid_height);
+        minDistance *= minDistance;
+
+        for( i = 0; i < total; i++ )
+        {
+            const Corner & c = tmpCorners[i];
+            bool good = true;
+
+            int x_cell = c.x / cell_size;
+            int y_cell = c.y / cell_size;
+
+            int x1 = x_cell - 1;
+            int y1 = y_cell - 1;
+            int x2 = x_cell + 1;
+            int y2 = y_cell + 1;
+
+            // boundary check
+            x1 = std::max(0, x1);
+            y1 = std::max(0, y1);
+            x2 = std::min(grid_width-1, x2);
+            y2 = std::min(grid_height-1, y2);
+
+            for( int yy = y1; yy <= y2; yy++ )
+                for( int xx = x1; xx <= x2; xx++ )
+                {
+                    std::vector<Point2f> &m = grid[yy*grid_width + xx];
+
+                    if( m.size() )
+                    {
+                        for(j = 0; j < m.size(); j++)
+                        {
+                            float dx = c.x - m[j].x;
+                            float dy = c.y - m[j].y;
+
+                            if( dx*dx + dy*dy < minDistance )
+                            {
+                                good = false;
+                                goto break_out;
+                            }
+                        }
+                    }
+                }
+
+            break_out:
+
+            if (good)
+            {
+                grid[y_cell*grid_width + x_cell].push_back(Point2f((float)c.x, (float)c.y));
+
+                corners.push_back(Point2f((float)c.x, (float)c.y));
+                ++ncorners;
+
+                if( maxCorners > 0 && (int)ncorners == maxCorners )
+                    break;
+            }
+        }
+    }
+    else
+    {
+        for( i = 0; i < total; i++ )
+        {
+            const Corner & c = tmpCorners[i];
+
+            corners.push_back(Point2f((float)c.x, (float)c.y));
+            ++ncorners;
+            if( maxCorners > 0 && (int)ncorners == maxCorners )
+                break;
+        }
+    }
+
+    Mat(corners).convertTo(_corners, _corners.fixedType() ? _corners.type() : CV_32F);
+    return true;
+}
+
+#endif
+
 }
 
 void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
@@ -57,27 +223,29 @@ void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
                               InputArray _mask, int blockSize,
                               bool useHarrisDetector, double harrisK )
 {
-    Mat image = _image.getMat(), mask = _mask.getMat();
-
     CV_Assert( qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0 );
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) );
+    CV_Assert( _mask.empty() || (_mask.type() == CV_8UC1 && _mask.sameSize(_image)) );
 
-    Mat eig, tmp;
+    CV_OCL_RUN(_image.dims() <= 2 && _image.isUMat(),
+               ocl_goodFeaturesToTrack(_image, _corners, maxCorners, qualityLevel, minDistance,
+                                    _mask, blockSize, useHarrisDetector, harrisK))
+
+    Mat image = _image.getMat(), eig, tmp;
     if( useHarrisDetector )
         cornerHarris( image, eig, blockSize, 3, harrisK );
     else
         cornerMinEigenVal( image, eig, blockSize, 3 );
 
     double maxVal = 0;
-    minMaxLoc( eig, 0, &maxVal, 0, 0, mask );
+    minMaxLoc( eig, 0, &maxVal, 0, 0, _mask );
     threshold( eig, eig, maxVal*qualityLevel, 0, THRESH_TOZERO );
     dilate( eig, tmp, Mat());
 
     Size imgsize = image.size();
-
     std::vector<const float*> tmpCorners;
 
     // collect list of pointers to features - put them into temporary image
+    Mat mask = _mask.getMat();
     for( int y = 1; y < imgsize.height - 1; y++ )
     {
         const float* eig_data = (const float*)eig.ptr(y);
@@ -92,11 +260,11 @@ void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
         }
     }
 
-    std::sort( tmpCorners.begin(), tmpCorners.end(), greaterThanPtr<float>() );
+    std::sort( tmpCorners.begin(), tmpCorners.end(), greaterThanPtr() );
     std::vector<Point2f> corners;
     size_t i, j, total = tmpCorners.size(), ncorners = 0;
 
-    if(minDistance >= 1)
+    if (minDistance >= 1)
     {
          // Partition the image into larger grids
         int w = image.cols;
@@ -133,7 +301,6 @@ void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
             y2 = std::min(grid_height-1, y2);
 
             for( int yy = y1; yy <= y2; yy++ )
-            {
                 for( int xx = x1; xx <= x2; xx++ )
                 {
                     std::vector <Point2f> &m = grid[yy*grid_width + xx];
@@ -153,14 +320,11 @@ void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
                         }
                     }
                 }
-            }
 
             break_out:
 
-            if(good)
+            if (good)
             {
-                // printf("%d: %d %d -> %d %d, %d, %d -- %d %d %d %d, %d %d, c=%d\n",
-                //    i,x, y, x_cell, y_cell, (int)minDistance, cell_size,x1,y1,x2,y2, grid_width,grid_height,c);
                 grid[y_cell*grid_width + x_cell].push_back(Point2f((float)x, (float)y));
 
                 corners.push_back(Point2f((float)x, (float)y));
@@ -187,33 +351,6 @@ void cv::goodFeaturesToTrack( InputArray _image, OutputArray _corners,
     }
 
     Mat(corners).convertTo(_corners, _corners.fixedType() ? _corners.type() : CV_32F);
-
-    /*
-    for( i = 0; i < total; i++ )
-    {
-        int ofs = (int)((const uchar*)tmpCorners[i] - eig.data);
-        int y = (int)(ofs / eig.step);
-        int x = (int)((ofs - y*eig.step)/sizeof(float));
-
-        if( minDistance > 0 )
-        {
-            for( j = 0; j < ncorners; j++ )
-            {
-                float dx = x - corners[j].x;
-                float dy = y - corners[j].y;
-                if( dx*dx + dy*dy < minDistance )
-                    break;
-            }
-            if( j < ncorners )
-                continue;
-        }
-
-        corners.push_back(Point2f((float)x, (float)y));
-        ++ncorners;
-        if( maxCorners > 0 && (int)ncorners == maxCorners )
-            break;
-    }
-*/
 }
 
 CV_IMPL void
diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp
index d54816849..8a4c74273 100644
--- a/modules/imgproc/src/filter.cpp
+++ b/modules/imgproc/src/filter.cpp
@@ -42,6 +42,7 @@
 
 #include "precomp.hpp"
 #include "opencl_kernels.hpp"
+#include <sstream>
 
 /****************************************************************************************\
                                     Base Image Filter
@@ -1404,7 +1405,11 @@ struct SymmColumnVec_32f16s
 
 struct RowVec_32f
 {
-    RowVec_32f() {}
+    RowVec_32f()
+    {
+        haveSSE = checkHardwareSupport(CV_CPU_SSE);
+    }
+
     RowVec_32f( const Mat& _kernel )
     {
         kernel = _kernel;
@@ -3114,10 +3119,7 @@ template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFi
     VecOp vecOp;
 };
 
-}
-
-namespace cv
-{
+#ifdef HAVE_OPENCL
 
 #define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
 #define ROUNDUP(sz, n)      ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
@@ -3314,6 +3316,247 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
     }
     return kernel.run(2, globalsize, localsize, true);
 }
+
+static bool ocl_sepRowFilter2D( UMat &src, UMat &buf, Mat &kernelX, int anchor, int borderType, bool sync)
+{
+    int type = src.type();
+    int cn = CV_MAT_CN(type);
+    int sdepth = CV_MAT_DEPTH(type);
+    Size bufSize = buf.size();
+
+#ifdef ANDROID
+    size_t localsize[2] = {16, 10};
+#else
+    size_t localsize[2] = {16, 16};
+#endif
+    size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]};
+    if (CV_8U == sdepth)
+    {
+        switch (cn)
+        {
+        case 1:
+            globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];
+            break;
+        case 2:
+            globalsize[0] = DIVUP((bufSize.width + 1) >> 1, localsize[0]) * localsize[0];
+            break;
+        case 4:
+            globalsize[0] = DIVUP(bufSize.width, localsize[0]) * localsize[0];
+            break;
+        }
+    }
+
+    int radiusX = anchor;
+    int radiusY = (int)((buf.rows - src.rows) >> 1);
+
+    bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
+    const char* btype = NULL;
+    switch (borderType & ~BORDER_ISOLATED)
+    {
+    case BORDER_CONSTANT:
+        btype = "BORDER_CONSTANT";
+        break;
+    case BORDER_REPLICATE:
+        btype = "BORDER_REPLICATE";
+        break;
+    case BORDER_REFLECT:
+        btype = "BORDER_REFLECT";
+        break;
+    case BORDER_WRAP:
+        btype = "BORDER_WRAP";
+        break;
+    case BORDER_REFLECT101:
+        btype = "BORDER_REFLECT_101";
+        break;
+    default:
+        return false;
+    }
+
+    bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1;
+    extra_extrapolation |= src.rows < radiusY;
+    extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1;
+    extra_extrapolation |= src.cols < radiusX;
+
+    cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s",
+        radiusX, (int)localsize[0], (int)localsize[1], cn,
+        btype,
+        extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
+        isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
+    build_options += ocl::kernelToStr(kernelX, CV_32F);
+
+    Size srcWholeSize; Point srcOffset;
+    src.locateROI(srcWholeSize, srcOffset);
+
+    std::stringstream strKernel;
+    strKernel << "row_filter";
+    if (-1 != cn)
+        strKernel << "_C" << cn;
+    if (-1 != sdepth)
+        strKernel << "_D" << sdepth;
+
+    ocl::Kernel kernelRow;
+    if (!kernelRow.create(strKernel.str().c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
+                          build_options))
+        return false;
+
+    int idxArg = 0;
+    idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
+    idxArg = kernelRow.set(idxArg, (int)(src.step / src.elemSize()));
+
+    idxArg = kernelRow.set(idxArg, srcOffset.x);
+    idxArg = kernelRow.set(idxArg, srcOffset.y);
+    idxArg = kernelRow.set(idxArg, src.cols);
+    idxArg = kernelRow.set(idxArg, src.rows);
+    idxArg = kernelRow.set(idxArg, srcWholeSize.width);
+    idxArg = kernelRow.set(idxArg, srcWholeSize.height);
+
+    idxArg = kernelRow.set(idxArg, ocl::KernelArg::PtrWriteOnly(buf));
+    idxArg = kernelRow.set(idxArg, (int)(buf.step / buf.elemSize()));
+    idxArg = kernelRow.set(idxArg, buf.cols);
+    idxArg = kernelRow.set(idxArg, buf.rows);
+    idxArg = kernelRow.set(idxArg, radiusY);
+
+    return kernelRow.run(2, globalsize, localsize, sync);
+}
+
+static bool ocl_sepColFilter2D(UMat &buf, UMat &dst, Mat &kernelY, int anchor, bool sync)
+{
+#ifdef ANDROID
+    size_t localsize[2] = {16, 10};
+#else
+    size_t localsize[2] = {16, 16};
+#endif
+    size_t globalsize[2] = {0, 0};
+
+    int type = dst.type();
+    int cn = CV_MAT_CN(type);
+    int ddepth = CV_MAT_DEPTH(type);
+    Size sz = dst.size();
+
+    globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
+
+    cv::String build_options;
+    if (CV_8U == ddepth)
+    {
+        switch (cn)
+        {
+        case 1:
+            globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
+            build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+                    anchor, (int)localsize[0], (int)localsize[1], cn, "float", "uchar", "convert_uchar_sat");
+            break;
+        case 2:
+            globalsize[0] = DIVUP((sz.width + 1) / 2, localsize[0]) * localsize[0];
+            build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+                    anchor, (int)localsize[0], (int)localsize[1], cn, "float2", "uchar2", "convert_uchar2_sat");
+            break;
+        case 3:
+        case 4:
+            globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
+            build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+                    anchor, (int)localsize[0], (int)localsize[1], cn, "float4", "uchar4", "convert_uchar4_sat");
+            break;
+        }
+    }
+    else
+    {
+        globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
+        switch (dst.type())
+        {
+        case CV_32SC1:
+            build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+                    anchor, (int)localsize[0], (int)localsize[1], cn, "float", "int", "convert_int_sat");
+            break;
+        case CV_32SC3:
+        case CV_32SC4:
+            build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+                    anchor, (int)localsize[0], (int)localsize[1], cn, "float4", "int4", "convert_int4_sat");
+            break;
+        case CV_32FC1:
+            build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+                    anchor, (int)localsize[0], (int)localsize[1], cn, "float", "float", "");
+            break;
+        case CV_32FC3:
+        case CV_32FC4:
+            build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
+                    anchor, (int)localsize[0], (int)localsize[1], cn, "float4", "float4", "");
+            break;
+        }
+    }
+
+    build_options += ocl::kernelToStr(kernelY, CV_32F);
+
+    ocl::Kernel kernelCol;
+    if (!kernelCol.create("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, build_options))
+        return false;
+
+    int idxArg = 0;
+    idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrReadOnly(buf));
+    idxArg = kernelCol.set(idxArg, (int)(buf.step / buf.elemSize()));
+    idxArg = kernelCol.set(idxArg, buf.cols);
+    idxArg = kernelCol.set(idxArg, buf.rows);
+
+    idxArg = kernelCol.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
+    idxArg = kernelCol.set(idxArg, (int)(dst.offset / dst.elemSize()));
+    idxArg = kernelCol.set(idxArg, (int)(dst.step / dst.elemSize()));
+    idxArg = kernelCol.set(idxArg, dst.cols);
+    idxArg = kernelCol.set(idxArg, dst.rows);
+
+    return kernelCol.run(2, globalsize, localsize, sync);
+}
+
+static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
+                      InputArray _kernelX, InputArray _kernelY, Point anchor,
+                      double delta, int borderType )
+{
+    if (abs(delta)> FLT_MIN)
+        return false;
+
+    int type = _src.type();
+    if ( !( (CV_8UC1 == type || CV_8UC4 == type || CV_32FC1 == type || CV_32FC4 == type) &&
+            (ddepth == CV_32F || ddepth == CV_8U || ddepth < 0) ) )
+        return false;
+
+    int cn = CV_MAT_CN(type);
+
+    Mat kernelX = _kernelX.getMat().reshape(1, 1);
+    if (1 != (kernelX.cols % 2))
+        return false;
+    Mat kernelY = _kernelY.getMat().reshape(1, 1);
+    if (1 != (kernelY.cols % 2))
+        return false;
+
+    int sdepth = CV_MAT_DEPTH(type);
+    if( anchor.x < 0 )
+        anchor.x = kernelX.cols >> 1;
+    if( anchor.y < 0 )
+        anchor.y = kernelY.cols >> 1;
+
+    if( ddepth < 0 )
+        ddepth = sdepth;
+
+    UMat src = _src.getUMat();
+    Size srcWholeSize; Point srcOffset;
+    src.locateROI(srcWholeSize, srcOffset);
+    if ( (0 != (srcOffset.x % 4))   ||
+         (0 != (src.cols % 4))      ||
+         (0 != ((src.step / src.elemSize()) % 4))
+       )
+        return false;
+
+    Size srcSize = src.size();
+    Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
+    UMat buf; buf.create(bufSize, CV_MAKETYPE(CV_32F, cn));
+    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, false))
+        return false;
+
+    _dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
+    UMat dst = _dst.getUMat();
+    return ocl_sepColFilter2D(buf, dst, kernelY, anchor.y, false);
+}
+
+#endif
+
 }
 
 cv::Ptr<cv::BaseFilter> cv::getLinearFilter(int srcType, int dstType,
@@ -3431,9 +3674,8 @@ void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
                    InputArray _kernel, Point anchor,
                    double delta, int borderType )
 {
-    bool use_opencl = ocl::useOpenCL() && _dst.isUMat();
-    if( use_opencl && ocl_filter2D(_src, _dst, ddepth, _kernel, anchor, delta, borderType))
-        return;
+    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
+               ocl_filter2D(_src, _dst, ddepth, _kernel, anchor, delta, borderType))
 
     Mat src = _src.getMat(), kernel = _kernel.getMat();
 
@@ -3481,6 +3723,9 @@ void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
                       InputArray _kernelX, InputArray _kernelY, Point anchor,
                       double delta, int borderType )
 {
+    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
+               ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType))
+
     Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();
 
     if( ddepth < 0 )
diff --git a/modules/imgproc/src/floodfill.cpp b/modules/imgproc/src/floodfill.cpp
index c49611f60..9e5067a87 100644
--- a/modules/imgproc/src/floodfill.cpp
+++ b/modules/imgproc/src/floodfill.cpp
@@ -42,7 +42,7 @@
 
 #include "precomp.hpp"
 
-#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
 # pragma GCC diagnostic ignored "-Warray-bounds"
 #endif
 
diff --git a/modules/imgproc/src/gcgraph.hpp b/modules/imgproc/src/gcgraph.hpp
index f93bd19a2..16c25dea3 100644
--- a/modules/imgproc/src/gcgraph.hpp
+++ b/modules/imgproc/src/gcgraph.hpp
@@ -380,6 +380,6 @@ bool GCGraph<TWeight>::inSourceSegment( int i )
 {
     CV_Assert( i>=0 && i<(int)vtcs.size() );
     return vtcs[i].t == 0;
-};
+}
 
 #endif
diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp
index 7849d5175..6fc23d2ce 100644
--- a/modules/imgproc/src/histogram.cpp
+++ b/modules/imgproc/src/histogram.cpp
@@ -1399,6 +1399,61 @@ static void calcHist( const Mat* images, int nimages, const int* channels,
     }
 }
 
+#ifdef HAVE_OPENCL
+
+enum
+{
+    BINS = 256
+};
+
+static bool ocl_calcHist1(InputArray _src, OutputArray _hist, int ddepth = CV_32S)
+{
+    int compunits = ocl::Device::getDefault().maxComputeUnits();
+    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+
+    ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc,
+                  format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, wgs));
+    if (k1.empty())
+        return false;
+
+    _hist.create(BINS, 1, ddepth);
+    UMat src = _src.getUMat(), ghist(1, BINS * compunits, CV_32SC1),
+            hist = ddepth == CV_32S ? _hist.getUMat() : UMat(BINS, 1, CV_32SC1);
+
+    k1.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(ghist),
+            (int)src.total());
+
+    size_t globalsize = compunits * wgs;
+    if (!k1.run(1, &globalsize, &wgs, false))
+        return false;
+
+    ocl::Kernel k2("merge_histogram", ocl::imgproc::histogram_oclsrc,
+                   format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, (int)wgs));
+    if (k2.empty())
+        return false;
+
+    k2.args(ocl::KernelArg::PtrReadOnly(ghist), ocl::KernelArg::PtrWriteOnly(hist));
+    if (!k2.run(1, &wgs, &wgs, false))
+        return false;
+
+    if (hist.depth() != ddepth)
+        hist.convertTo(_hist, ddepth);
+    else
+        _hist.getUMatRef() = hist;
+
+    return true;
+}
+
+static bool ocl_calcHist(InputArrayOfArrays images, OutputArray hist)
+{
+    std::vector<UMat> v;
+    images.getUMatVector(v);
+
+    return ocl_calcHist1(v[0], hist, CV_32F);
+}
+
+#endif
+
 }
 
 void cv::calcHist( const Mat* images, int nimages, const int* channels,
@@ -1417,6 +1472,12 @@ void cv::calcHist( InputArrayOfArrays images, const std::vector<int>& channels,
                    const std::vector<float>& ranges,
                    bool accumulate )
 {
+    CV_OCL_RUN(images.total() == 1 && channels.size() == 1 && images.channels(0) == 1 &&
+               channels[0] == 0 && images.isUMatVector() && mask.empty() && !accumulate &&
+               histSize.size() == 1 && histSize[0] == BINS && ranges.size() == 2 &&
+               ranges[0] == 0 && ranges[1] == 256,
+               ocl_calcHist(images, hist))
+
     int i, dims = (int)histSize.size(), rsz = (int)ranges.size(), csz = (int)channels.size();
     int nimages = (int)images.total();
 
@@ -1929,14 +1990,166 @@ void cv::calcBackProject( const Mat* images, int nimages, const int* channels,
         CV_Error(CV_StsUnsupportedFormat, "");
 }
 
+#ifdef HAVE_OPENCL
+
+namespace cv {
+
+static void getUMatIndex(const std::vector<UMat> & um, int cn, int & idx, int & cnidx)
+{
+    int totalChannels = 0;
+    for (size_t i = 0, size = um.size(); i < size; ++i)
+    {
+        int ccn = um[i].channels();
+        totalChannels += ccn;
+
+        if (totalChannels == cn)
+        {
+            idx = (int)(i + 1);
+            cnidx = 0;
+            return;
+        }
+        else if (totalChannels > cn)
+        {
+            idx = (int)i;
+            cnidx = i == 0 ? cn : (cn - totalChannels + ccn);
+            return;
+        }
+    }
+
+    idx = cnidx = -1;
+}
+
+static bool ocl_calcBackProject( InputArrayOfArrays _images, std::vector<int> channels,
+                                 InputArray _hist, OutputArray _dst,
+                                 const std::vector<float>& ranges,
+                                 float scale, size_t histdims )
+{
+    std::vector<UMat> images;
+    _images.getUMatVector(images);
+
+    size_t nimages = images.size(), totalcn = images[0].channels();
+
+    CV_Assert(nimages > 0);
+    Size size = images[0].size();
+    int depth = images[0].depth();
+
+    for (size_t i = 1; i < nimages; ++i)
+    {
+        const UMat & m = images[i];
+        totalcn += m.channels();
+        CV_Assert(size == m.size() && depth == m.depth());
+    }
+
+    std::sort(channels.begin(), channels.end());
+    for (size_t i = 0; i < histdims; ++i)
+        CV_Assert(channels[i] < (int)totalcn);
+
+    if (histdims == 1)
+    {
+        int idx, cnidx;
+        getUMatIndex(images, channels[0], idx, cnidx);
+        CV_Assert(idx >= 0);
+        UMat im = images[idx];
+
+        String opts = format("-D histdims=1 -D scn=%d", im.channels());
+        ocl::Kernel lutk("calcLUT", ocl::imgproc::calc_back_project_oclsrc, opts);
+        if (lutk.empty())
+            return false;
+
+        size_t lsize = 256;
+        UMat lut(1, (int)lsize, CV_32SC1), hist = _hist.getUMat(), uranges(ranges, true);
+
+        lutk.args(ocl::KernelArg::ReadOnlyNoSize(hist), hist.rows,
+                  ocl::KernelArg::PtrWriteOnly(lut), scale, ocl::KernelArg::PtrReadOnly(uranges));
+        if (!lutk.run(1, &lsize, NULL, false))
+            return false;
+
+        ocl::Kernel mapk("LUT", ocl::imgproc::calc_back_project_oclsrc, opts);
+        if (mapk.empty())
+            return false;
+
+        _dst.create(size, depth);
+        UMat dst = _dst.getUMat();
+
+        im.offset += cnidx;
+        mapk.args(ocl::KernelArg::ReadOnlyNoSize(im), ocl::KernelArg::PtrReadOnly(lut),
+                  ocl::KernelArg::WriteOnly(dst));
+
+        size_t globalsize[2] = { size.width, size.height };
+        return mapk.run(2, globalsize, NULL, false);
+    }
+    else if (histdims == 2)
+    {
+        int idx0, idx1, cnidx0, cnidx1;
+        getUMatIndex(images, channels[0], idx0, cnidx0);
+        getUMatIndex(images, channels[1], idx1, cnidx1);
+        CV_Assert(idx0 >= 0 && idx1 >= 0);
+        UMat im0 = images[idx0], im1 = images[idx1];
+
+        // Lut for the first dimension
+        String opts = format("-D histdims=2 -D scn1=%d -D scn2=%d", im0.channels(), im1.channels());
+        ocl::Kernel lutk1("calcLUT", ocl::imgproc::calc_back_project_oclsrc, opts);
+        if (lutk1.empty())
+            return false;
+
+        size_t lsize = 256;
+        UMat lut(1, (int)lsize<<1, CV_32SC1), uranges(ranges, true), hist = _hist.getUMat();
+
+        lutk1.args(hist.rows, ocl::KernelArg::PtrWriteOnly(lut), (int)0, ocl::KernelArg::PtrReadOnly(uranges), (int)0);
+        if (!lutk1.run(1, &lsize, NULL, false))
+            return false;
+
+        // lut for the second dimension
+        ocl::Kernel lutk2("calcLUT", ocl::imgproc::calc_back_project_oclsrc, opts);
+        if (lutk2.empty())
+            return false;
+
+        lut.offset += lsize * sizeof(int);
+        lutk2.args(hist.cols, ocl::KernelArg::PtrWriteOnly(lut), (int)256, ocl::KernelArg::PtrReadOnly(uranges), (int)2);
+        if (!lutk2.run(1, &lsize, NULL, false))
+            return false;
+
+        // perform lut
+        ocl::Kernel mapk("LUT", ocl::imgproc::calc_back_project_oclsrc, opts);
+        if (mapk.empty())
+            return false;
+
+        _dst.create(size, depth);
+        UMat dst = _dst.getUMat();
+
+        im0.offset += cnidx0;
+        im1.offset += cnidx1;
+        mapk.args(ocl::KernelArg::ReadOnlyNoSize(im0), ocl::KernelArg::ReadOnlyNoSize(im1),
+               ocl::KernelArg::ReadOnlyNoSize(hist), ocl::KernelArg::PtrReadOnly(lut), scale, ocl::KernelArg::WriteOnly(dst));
+
+        size_t globalsize[2] = { size.width, size.height };
+        return mapk.run(2, globalsize, NULL, false);
+    }
+    return false;
+}
+
+}
+
+#endif
 
 void cv::calcBackProject( InputArrayOfArrays images, const std::vector<int>& channels,
                           InputArray hist, OutputArray dst,
                           const std::vector<float>& ranges,
                           double scale )
 {
+    Size histSize = hist.size();
+#ifdef HAVE_OPENCL
+    bool _1D = histSize.height == 1 || histSize.width == 1;
+    size_t histdims = _1D ? 1 : hist.dims();
+#endif
+
+    CV_OCL_RUN(dst.isUMat() && hist.type() == CV_32FC1 &&
+               histdims <= 2 && ranges.size() == histdims * 2 && histdims == channels.size(),
+               ocl_calcBackProject(images, channels, hist, dst, ranges, (float)scale, histdims))
+
     Mat H0 = hist.getMat(), H;
     int hcn = H0.channels();
+
     if( hcn > 1 )
     {
         CV_Assert( H0.isContinuous() );
@@ -1947,12 +2160,15 @@ void cv::calcBackProject( InputArrayOfArrays images, const std::vector<int>& cha
     }
     else
         H = H0;
+
     bool _1d = H.rows == 1 || H.cols == 1;
     int i, dims = H.dims, rsz = (int)ranges.size(), csz = (int)channels.size();
     int nimages = (int)images.total();
+
     CV_Assert(nimages > 0);
     CV_Assert(rsz == dims*2 || (rsz == 2 && _1d) || (rsz == 0 && images.depth(0) == CV_8U));
     CV_Assert(csz == 0 || csz == dims || (csz == 1 && _1d));
+
     float* _ranges[CV_MAX_DIM];
     if( rsz > 0 )
     {
@@ -3131,49 +3347,17 @@ CV_IMPL void cvEqualizeHist( const CvArr* srcarr, CvArr* dstarr )
     cv::equalizeHist(cv::cvarrToMat(srcarr), cv::cvarrToMat(dstarr));
 }
 
+#ifdef HAVE_OPENCL
+
 namespace cv {
 
-enum
-{
-    BINS = 256
-};
-
-static bool ocl_calcHist(InputArray _src, OutputArray _hist)
-{
-    int compunits = ocl::Device::getDefault().maxComputeUnits();
-    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
-
-    ocl::Kernel k1("calculate_histogram", ocl::imgproc::histogram_oclsrc,
-                  format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, wgs));
-    if (k1.empty())
-        return false;
-
-    _hist.create(1, BINS, CV_32SC1);
-    UMat src = _src.getUMat(), hist = _hist.getUMat(), ghist(1, BINS * compunits, CV_32SC1);
-
-    k1.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::PtrWriteOnly(ghist),
-            (int)src.total());
-
-    size_t globalsize = compunits * wgs;
-    if (!k1.run(1, &globalsize, &wgs, false))
-        return false;
-
-    ocl::Kernel k2("merge_histogram", ocl::imgproc::histogram_oclsrc,
-                   format("-D BINS=%d -D HISTS_COUNT=%d -D WGS=%d", BINS, compunits, (int)wgs));
-    if (k2.empty())
-        return false;
-
-    k2.args(ocl::KernelArg::PtrReadOnly(ghist), ocl::KernelArg::PtrWriteOnly(hist));
-    return k2.run(1, &wgs, &wgs, false);
-}
-
 static bool ocl_equalizeHist(InputArray _src, OutputArray _dst)
 {
-    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+    size_t wgs = std::min<size_t>(ocl::Device::getDefault().maxWorkGroupSize(), BINS);
 
     // calculation of histogram
     UMat hist;
-    if (!ocl_calcHist(_src, hist))
+    if (!ocl_calcHist1(_src, hist))
         return false;
 
     UMat lut(1, 256, CV_8UC1);
@@ -3191,6 +3375,8 @@ static bool ocl_equalizeHist(InputArray _src, OutputArray _dst)
 
 }
 
+#endif
+
 void cv::equalizeHist( InputArray _src, OutputArray _dst )
 {
     CV_Assert( _src.type() == CV_8UC1 );
@@ -3198,8 +3384,8 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
     if (_src.empty())
         return;
 
-    if (ocl::useOpenCL() && _dst.isUMat() && ocl_equalizeHist(_src, _dst))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_equalizeHist(_src, _dst))
 
     Mat src = _src.getMat();
     _dst.create( src.size(), src.type() );
diff --git a/modules/imgproc/src/hough.cpp b/modules/imgproc/src/hough.cpp
index 9c8eaca8f..061835cc0 100644
--- a/modules/imgproc/src/hough.cpp
+++ b/modules/imgproc/src/hough.cpp
@@ -75,7 +75,8 @@ Functions return the actual number of found lines.
 */
 static void
 HoughLinesStandard( const Mat& img, float rho, float theta,
-                    int threshold, std::vector<Vec2f>& lines, int linesMax )
+                    int threshold, std::vector<Vec2f>& lines, int linesMax,
+                    double min_theta, double max_theta )
 {
     int i, j;
     float irho = 1 / rho;
@@ -87,7 +88,13 @@ HoughLinesStandard( const Mat& img, float rho, float theta,
     int width = img.cols;
     int height = img.rows;
 
-    int numangle = cvRound(CV_PI / theta);
+    if (max_theta < 0 || max_theta > CV_PI ) {
+        CV_Error( CV_StsBadArg, "max_theta must fall between 0 and pi" );
+    }
+    if (min_theta < 0 || min_theta > max_theta ) {
+        CV_Error( CV_StsBadArg, "min_theta must fall between 0 and max_theta" );
+    }
+    int numangle = cvRound((max_theta - min_theta) / theta);
     int numrho = cvRound(((width + height) * 2 + 1) / rho);
 
     AutoBuffer<int> _accum((numangle+2) * (numrho+2));
@@ -99,7 +106,7 @@ HoughLinesStandard( const Mat& img, float rho, float theta,
 
     memset( accum, 0, sizeof(accum[0]) * (numangle+2) * (numrho+2) );
 
-    float ang = 0;
+    float ang = static_cast<float>(min_theta);
     for(int n = 0; n < numangle; ang += theta, n++ )
     {
         tabSin[n] = (float)(sin((double)ang) * irho);
@@ -166,7 +173,8 @@ static void
 HoughLinesSDiv( const Mat& img,
                 float rho, float theta, int threshold,
                 int srn, int stn,
-                std::vector<Vec2f>& lines, int linesMax )
+                std::vector<Vec2f>& lines, int linesMax,
+                double min_theta, double max_theta )
 {
     #define _POINT(row, column)\
         (image_src[(row)*step+(column)])
@@ -293,7 +301,7 @@ HoughLinesSDiv( const Mat& img,
 
     if( count * 100 > rn * tn )
     {
-        HoughLinesStandard( img, rho, theta, threshold, lines, linesMax );
+        HoughLinesStandard( img, rho, theta, threshold, lines, linesMax, min_theta, max_theta );
         return;
     }
 
@@ -601,15 +609,15 @@ HoughLinesProbabilistic( Mat& image,
 
 void cv::HoughLines( InputArray _image, OutputArray _lines,
                     double rho, double theta, int threshold,
-                    double srn, double stn )
+                    double srn, double stn, double min_theta, double max_theta )
 {
     Mat image = _image.getMat();
     std::vector<Vec2f> lines;
 
     if( srn == 0 && stn == 0 )
-        HoughLinesStandard(image, (float)rho, (float)theta, threshold, lines, INT_MAX);
+        HoughLinesStandard(image, (float)rho, (float)theta, threshold, lines, INT_MAX, min_theta, max_theta );
     else
-        HoughLinesSDiv(image, (float)rho, (float)theta, threshold, cvRound(srn), cvRound(stn), lines, INT_MAX);
+        HoughLinesSDiv(image, (float)rho, (float)theta, threshold, cvRound(srn), cvRound(stn), lines, INT_MAX, min_theta, max_theta);
 
     Mat(lines).copyTo(_lines);
 }
@@ -631,7 +639,8 @@ void cv::HoughLinesP(InputArray _image, OutputArray _lines,
 CV_IMPL CvSeq*
 cvHoughLines2( CvArr* src_image, void* lineStorage, int method,
                double rho, double theta, int threshold,
-               double param1, double param2 )
+               double param1, double param2,
+               double min_theta, double max_theta )
 {
     cv::Mat image = cv::cvarrToMat(src_image);
     std::vector<cv::Vec2f> l2;
@@ -694,11 +703,11 @@ cvHoughLines2( CvArr* src_image, void* lineStorage, int method,
     {
     case CV_HOUGH_STANDARD:
         HoughLinesStandard( image, (float)rho,
-                (float)theta, threshold, l2, linesMax );
+                (float)theta, threshold, l2, linesMax, min_theta, max_theta );
         break;
     case CV_HOUGH_MULTI_SCALE:
         HoughLinesSDiv( image, (float)rho, (float)theta,
-                threshold, iparam1, iparam2, l2, linesMax );
+                threshold, iparam1, iparam2, l2, linesMax, min_theta, max_theta );
         break;
     case CV_HOUGH_PROBABILISTIC:
         HoughLinesProbabilistic( image, (float)rho, (float)theta,
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 875813068..2d401275c 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -1216,8 +1216,13 @@ public:
         alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
         ksize(_ksize), xmin(_xmin), xmax(_xmax)
     {
+        CV_Assert(ksize <= MAX_ESIZE);
     }
 
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
     virtual void operator() (const Range& range) const
     {
         int dy, cn = src.channels();
@@ -1266,6 +1271,9 @@ public:
             vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
         }
     }
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
+# pragma GCC diagnostic pop
+#endif
 
 private:
     Mat src;
@@ -1273,7 +1281,9 @@ private:
     const int* xofs, *yofs;
     const AT* alpha, *_beta;
     Size ssize, dsize;
-    int ksize, xmin, xmax;
+    const int ksize, xmin, xmax;
+
+    resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
 };
 
 template<class HResize, class VResize>
@@ -1900,6 +1910,8 @@ private:
 };
 #endif
 
+#ifdef HAVE_OPENCL
+
 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
                                           float * const alpha_tab, int * const ofs_tab)
 {
@@ -1955,7 +1967,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
     double inv_fx = 1. / fx, inv_fy = 1. / fy;
     float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
 
-    if( cn == 3 || !(cn <= 4 &&
+    if( !(cn <= 4 &&
            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
             (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
         return false;
@@ -1973,15 +1985,18 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         int wtype = CV_MAKETYPE(wdepth, cn);
         char buf[2][32];
         k.create("resizeLN", ocl::imgproc::resize_oclsrc,
-                 format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s",
-                        depth, ocl::typeToStr(type), ocl::typeToStr(wtype),
+                 format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D PIXTYPE1=%s "
+                        "-D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d",
+                        depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
                         ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
-                        ocl::convertTypeStr(wdepth, depth, cn, buf[1])));
+                        ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
+                        cn));
     }
     else if (interpolation == INTER_NEAREST)
     {
         k.create("resizeNN", ocl::imgproc::resize_oclsrc,
-                 format("-D INTER_NEAREST -D PIXTYPE=%s -D cn", ocl::memopTypeToStr(type), cn));
+                 format("-D INTER_NEAREST -D PIXTYPE=%s -D PIXTYPE1=%s -D cn=%d",
+                        ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth), cn));
     }
     else if (interpolation == INTER_AREA)
     {
@@ -1993,9 +2008,9 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         int wtype = CV_MAKE_TYPE(wdepth, cn);
 
         char cvt[2][40];
-        String buildOption = format("-D INTER_AREA -D T=%s -D WTV=%s -D convertToWTV=%s",
-                                    ocl::typeToStr(type), ocl::typeToStr(wtype),
-                                    ocl::convertTypeStr(depth, wdepth, cn, cvt[0]));
+        String buildOption = format("-D INTER_AREA -D PIXTYPE=%s -D PIXTYPE1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
+                                    ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
+                                    ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
 
         UMat alphaOcl, tabofsOcl, mapOcl;
         UMat dmap, smap;
@@ -2003,8 +2018,8 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         if (is_area_fast)
         {
             int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
-            buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
-                                               " -D XSCALE=%d -D YSCALE=%d -D SCALE=%f",
+            buildOption = buildOption + format(" -D convertToPIXTYPE=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
+                                               " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
                                                ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
                                                ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
                                   iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
@@ -2026,7 +2041,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
         }
         else
         {
-            buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
+            buildOption = buildOption + format(" -D convertToPIXTYPE=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
             k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
             if (k.empty())
                 return false;
@@ -2069,6 +2084,8 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
     return k.run(2, globalsize, 0, false);
 }
 
+#endif
+
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////
@@ -2196,9 +2213,8 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
         inv_scale_y = (double)dsize.height/ssize.height;
     }
 
-    if( ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT &&
-            ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
 
     Mat src = _src.getMat();
     _dst.create(dsize, src.type());
@@ -2212,7 +2228,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
     int depth = src.depth(), cn = src.channels();
     double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
     int k, sx, sy, dx, dy;
-
+/*
 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
     int mode = interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : 0;
     int type = src.type();
@@ -2240,7 +2256,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
             return;
     }
 #endif
-
+*/
     if( interpolation == INTER_NEAREST )
     {
         resizeNN( src, dst, inv_scale_x, inv_scale_y );
@@ -2565,15 +2581,15 @@ struct RemapVec_8u
     int operator()( const Mat& _src, void* _dst, const short* XY,
                     const ushort* FXY, const void* _wtab, int width ) const
     {
-        int cn = _src.channels();
+        int cn = _src.channels(), x = 0, sstep = (int)_src.step;
 
-        if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) )
+        if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||
+                sstep > 0x8000 )
             return 0;
 
         const uchar *S0 = _src.data, *S1 = _src.data + _src.step;
         const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
         uchar* D = (uchar*)_dst;
-        int x = 0, sstep = (int)_src.step;
         __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
         __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
         __m128i z = _mm_setzero_si128();
@@ -3299,7 +3315,10 @@ public:
                     if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
                     {
                         bufxy = (*m1)(Rect(x, y, bcols, brows));
-                        bufa = (*m2)(Rect(x, y, bcols, brows));
+
+                        const ushort* sA = (const ushort*)(m2->data + m2->step*(y+y1)) + x;
+                        for( x1 = 0; x1 < bcols; x1++ )
+                            A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
                     }
                     else if( planar_input )
                     {
@@ -3387,6 +3406,8 @@ private:
     const void *ctab;
 };
 
+#ifdef HAVE_OPENCL
+
 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
                       int interpolation, int borderType, const Scalar& borderValue)
 {
@@ -3459,6 +3480,8 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
     return k.run(2, globalThreads, NULL, false);
 }
 
+#endif
+
 }
 
 void cv::remap( InputArray _src, OutputArray _dst,
@@ -3501,8 +3524,8 @@ void cv::remap( InputArray _src, OutputArray _dst,
     CV_Assert( _map1.size().area() > 0 );
     CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
 
-    if (ocl::useOpenCL() && _dst.isUMat() && ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
 
     Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
     _dst.create( map1.size(), src.type() );
@@ -3680,7 +3703,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
         {
             for( x = 0; x < size.width; x++ )
             {
-                int fxy = src2 ? src2[x] : 0;
+                int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
                 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
                 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
             }
@@ -3689,7 +3712,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
         {
             for( x = 0; x < size.width; x++ )
             {
-                int fxy = src2 ? src2[x] : 0;
+                int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
                 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
                 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
             }
@@ -3867,6 +3890,8 @@ private:
 };
 #endif
 
+#ifdef HAVE_OPENCL
+
 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
 
 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
@@ -3875,7 +3900,7 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
 {
     CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
 
-    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = depth;
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     double doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 
     int interpolation = flags & INTER_MAX;
@@ -3884,36 +3909,48 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
 
     if ( !(borderType == cv::BORDER_CONSTANT &&
            (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
-         (!doubleSupport && depth == CV_64F) || cn > 4 || cn == 3)
+         (!doubleSupport && depth == CV_64F) || cn > 4)
         return false;
 
     const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
-    ocl::ProgramSource2 program = op_type == OCL_OP_AFFINE ?
+    ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
                 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
     const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
 
+    int scalarcn = cn == 3 ? 4 : cn;
+    int wdepth = interpolation == INTER_NEAREST ? depth : std::max(CV_32S, depth);
+    int sctype = CV_MAKETYPE(wdepth, scalarcn);
+
     ocl::Kernel k;
+    String opts;
     if (interpolation == INTER_NEAREST)
     {
-        k.create(kernelName, program,
-                 format("-D INTER_NEAREST -D T=%s%s", ocl::typeToStr(type),
-                        doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+        opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d", ocl::typeToStr(type),
+                      doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                      ocl::typeToStr(CV_MAT_DEPTH(type)),
+                      ocl::typeToStr(sctype),
+                      cn);
     }
     else
     {
         char cvt[2][50];
-        wdepth = std::max(CV_32S, depth);
-        k.create(kernelName, program,
-                  format("-D INTER_%s -D T=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s",
-                         interpolationMap[interpolation], ocl::typeToStr(type),
-                         ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
-                         ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
-                         ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+        opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s -D cn=%d",
+                      interpolationMap[interpolation], ocl::typeToStr(type),
+                      ocl::typeToStr(CV_MAT_DEPTH(type)),
+                      ocl::typeToStr(sctype),
+                      ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
+                      ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
+                      ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
+                      doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn);
     }
+
+    k.create(kernelName, program, opts);
     if (k.empty())
         return false;
 
+    double borderBuf[] = {0, 0, 0, 0};
+    scalarToRawData(borderValue, borderBuf, sctype);
+
     UMat src = _src.getUMat(), M0;
     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
     UMat dst = _dst.getUMat();
@@ -3944,12 +3981,14 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
     matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
 
     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
-           ocl::KernelArg::Constant(Mat(1, 1, CV_MAKE_TYPE(wdepth, cn), borderValue)));
+           ocl::KernelArg(0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
 
     size_t globalThreads[2] = { dst.cols, dst.rows };
     return k.run(2, globalThreads, NULL, false);
 }
 
+#endif
+
 }
 
 
@@ -3957,10 +3996,9 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
                      InputArray _M0, Size dsize,
                      int flags, int borderType, const Scalar& borderValue )
 {
-    if (ocl::useOpenCL() && _dst.isUMat() &&
-        ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
-                                            borderValue, OCL_OP_AFFINE))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
+                                 borderValue, OCL_OP_AFFINE))
 
     Mat src = _src.getMat(), M0 = _M0.getMat();
     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
@@ -4000,7 +4038,7 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
     int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
     const int AB_BITS = MAX(10, (int)INTER_BITS);
     const int AB_SCALE = 1 << AB_BITS;
-
+/*
 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
     int depth = src.depth();
     int channels = src.channels();
@@ -4044,7 +4082,7 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
         }
     }
 #endif
-
+*/
     for( x = 0; x < dst.cols; x++ )
     {
         adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
@@ -4203,10 +4241,9 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
 {
     CV_Assert( _src.total() > 0 );
 
-    if (ocl::useOpenCL() && _dst.isUMat() &&
-            ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
                               OCL_OP_PERSPECTIVE))
-        return;
 
     Mat src = _src.getMat(), M0 = _M0.getMat();
     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
@@ -4231,7 +4268,7 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
 
     if( !(flags & WARP_INVERSE_MAP) )
          invert(matM, matM);
-
+/*
 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
     int depth = src.depth();
     int channels = src.channels();
@@ -4275,7 +4312,7 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
         }
     }
 #endif
-
+*/
     Range range(0, dst.rows);
     warpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
diff --git a/modules/imgproc/src/intersection.cpp b/modules/imgproc/src/intersection.cpp
index cdee36663..dfebd260e 100644
--- a/modules/imgproc/src/intersection.cpp
+++ b/modules/imgproc/src/intersection.cpp
@@ -231,7 +231,7 @@ int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& r
                 // Found a dupe, remove it
                 std::swap(intersection[j], intersection.back());
                 intersection.pop_back();
-                i--; // restart check
+                j--; // restart check
             }
         }
     }
diff --git a/modules/imgproc/src/min_enclosing_triangle.cpp b/modules/imgproc/src/min_enclosing_triangle.cpp
index 98bfd46e5..fb94fa0ba 100644
--- a/modules/imgproc/src/min_enclosing_triangle.cpp
+++ b/modules/imgproc/src/min_enclosing_triangle.cpp
@@ -287,7 +287,7 @@ static void updateSidesCA(const std::vector<cv::Point2f> &polygon,
                           cv::Point2f &sideAStartVertex, cv::Point2f &sideAEndVertex,
                           cv::Point2f &sideCStartVertex, cv::Point2f &sideCEndVertex);
 
-};
+}
 
 
 ///////////////////////////////////// Main functions /////////////////////////////////////
@@ -1560,4 +1560,4 @@ static bool lessOrEqual(double number1, double number2) {
     return ((number1 < number2) || (almostEqual(number1, number2)));
 }
 
-};
\ No newline at end of file
+}
diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp
index 14e672abd..a8cfc9613 100644
--- a/modules/imgproc/src/moments.cpp
+++ b/modules/imgproc/src/moments.cpp
@@ -39,6 +39,7 @@
 //
 //M*/
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
 
 namespace cv
 {
@@ -202,6 +203,10 @@ static Moments contourMoments( const Mat& contour )
 \****************************************************************************************/
 
 template<typename T, typename WT, typename MT>
+#if defined __GNUC__ && __GNUC__ == 4 && __GNUC_MINOR__ >= 5 && __GNUC_MINOR__ < 9
+// Workaround for http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60196
+__attribute__((optimize("no-tree-vectorize")))
+#endif
 static void momentsInTile( const Mat& img, double* moments )
 {
     Size size = img.size();
@@ -362,106 +367,181 @@ Moments::Moments( double _m00, double _m10, double _m01, double _m20, double _m1
     nu30 = mu30*s3; nu21 = mu21*s3; nu12 = mu12*s3; nu03 = mu03*s3;
 }
 
+#ifdef HAVE_OPENCL
+
+static bool ocl_moments( InputArray _src, Moments& m)
+{
+    const int TILE_SIZE = 32;
+    const int K = 10;
+    ocl::Kernel k("moments", ocl::imgproc::moments_oclsrc, format("-D TILE_SIZE=%d", TILE_SIZE));
+    if( k.empty() )
+        return false;
+
+    UMat src = _src.getUMat();
+    Size sz = src.size();
+    int xtiles = (sz.width + TILE_SIZE-1)/TILE_SIZE;
+    int ytiles = (sz.height + TILE_SIZE-1)/TILE_SIZE;
+    int ntiles = xtiles*ytiles;
+    UMat umbuf(1, ntiles*K, CV_32S);
+
+    size_t globalsize[] = {xtiles, sz.height}, localsize[] = {1, TILE_SIZE};
+    bool ok = k.args(ocl::KernelArg::ReadOnly(src),
+                     ocl::KernelArg::PtrWriteOnly(umbuf),
+                     xtiles).run(2, globalsize, localsize, true);
+    if(!ok)
+        return false;
+    Mat mbuf = umbuf.getMat(ACCESS_READ);
+    for( int i = 0; i < ntiles; i++ )
+    {
+        double x = (i % xtiles)*TILE_SIZE, y = (i / xtiles)*TILE_SIZE;
+        const int* mom = mbuf.ptr<int>() + i*K;
+        double xm = x * mom[0], ym = y * mom[0];
+
+        // accumulate moments computed in each tile
+
+        // + m00 ( = m00' )
+        m.m00 += mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        m.m10 += mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        m.m01 += mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        m.m20 += mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        m.m11 += mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        m.m02 += mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        m.m30 += mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        m.m21 += mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        m.m12 += mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        m.m03 += mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+
+    return true;
+}
+
+#endif
+
 }
 
 
 cv::Moments cv::moments( InputArray _src, bool binary )
 {
     const int TILE_SIZE = 32;
-    Mat mat = _src.getMat();
     MomentsInTileFunc func = 0;
     uchar nzbuf[TILE_SIZE*TILE_SIZE];
     Moments m;
-    int type = mat.type();
+    int type = _src.type();
     int depth = CV_MAT_DEPTH( type );
     int cn = CV_MAT_CN( type );
-
-    if( mat.checkVector(2) >= 0 && (depth == CV_32F || depth == CV_32S))
-        return contourMoments(mat);
-
-    Size size = mat.size();
-
-    if( cn > 1 )
-        CV_Error( CV_StsBadArg, "Invalid image type" );
+    Size size = _src.size();
 
     if( size.width <= 0 || size.height <= 0 )
         return m;
 
-    if( binary || depth == CV_8U )
-        func = momentsInTile<uchar, int, int>;
-    else if( depth == CV_16U )
-        func = momentsInTile<ushort, int, int64>;
-    else if( depth == CV_16S )
-        func = momentsInTile<short, int, int64>;
-    else if( depth == CV_32F )
-        func = momentsInTile<float, double, double>;
-    else if( depth == CV_64F )
-        func = momentsInTile<double, double, double>;
+#ifdef HAVE_OPENCL
+    if( ocl::useOpenCL() && type == CV_8UC1 && !binary &&
+        _src.isUMat() && ocl_moments(_src, m) )
+        ;
     else
-        CV_Error( CV_StsUnsupportedFormat, "" );
-
-    Mat src0(mat);
-
-    for( int y = 0; y < size.height; y += TILE_SIZE )
+#endif
     {
-        Size tileSize;
-        tileSize.height = std::min(TILE_SIZE, size.height - y);
+        Mat mat = _src.getMat();
+        if( mat.checkVector(2) >= 0 && (depth == CV_32F || depth == CV_32S))
+            return contourMoments(mat);
 
-        for( int x = 0; x < size.width; x += TILE_SIZE )
+        if( cn > 1 )
+            CV_Error( CV_StsBadArg, "Invalid image type (must be single-channel)" );
+
+        if( binary || depth == CV_8U )
+            func = momentsInTile<uchar, int, int>;
+        else if( depth == CV_16U )
+            func = momentsInTile<ushort, int, int64>;
+        else if( depth == CV_16S )
+            func = momentsInTile<short, int, int64>;
+        else if( depth == CV_32F )
+            func = momentsInTile<float, double, double>;
+        else if( depth == CV_64F )
+            func = momentsInTile<double, double, double>;
+        else
+            CV_Error( CV_StsUnsupportedFormat, "" );
+
+        Mat src0(mat);
+
+        for( int y = 0; y < size.height; y += TILE_SIZE )
         {
-            tileSize.width = std::min(TILE_SIZE, size.width - x);
-            Mat src(src0, cv::Rect(x, y, tileSize.width, tileSize.height));
+            Size tileSize;
+            tileSize.height = std::min(TILE_SIZE, size.height - y);
 
-            if( binary )
+            for( int x = 0; x < size.width; x += TILE_SIZE )
             {
-                cv::Mat tmp(tileSize, CV_8U, nzbuf);
-                cv::compare( src, 0, tmp, CV_CMP_NE );
-                src = tmp;
+                tileSize.width = std::min(TILE_SIZE, size.width - x);
+                Mat src(src0, cv::Rect(x, y, tileSize.width, tileSize.height));
+
+                if( binary )
+                {
+                    cv::Mat tmp(tileSize, CV_8U, nzbuf);
+                    cv::compare( src, 0, tmp, CV_CMP_NE );
+                    src = tmp;
+                }
+
+                double mom[10];
+                func( src, mom );
+
+                if(binary)
+                {
+                    double s = 1./255;
+                    for( int k = 0; k < 10; k++ )
+                        mom[k] *= s;
+                }
+
+                double xm = x * mom[0], ym = y * mom[0];
+
+                // accumulate moments computed in each tile
+
+                // + m00 ( = m00' )
+                m.m00 += mom[0];
+
+                // + m10 ( = m10' + x*m00' )
+                m.m10 += mom[1] + xm;
+
+                // + m01 ( = m01' + y*m00' )
+                m.m01 += mom[2] + ym;
+
+                // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+                m.m20 += mom[3] + x * (mom[1] * 2 + xm);
+
+                // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+                m.m11 += mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+                // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+                m.m02 += mom[5] + y * (mom[2] * 2 + ym);
+
+                // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+                m.m30 += mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+                // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+                m.m21 += mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+                // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+                m.m12 += mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+                // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+                m.m03 += mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
             }
-
-            double mom[10];
-            func( src, mom );
-
-            if(binary)
-            {
-                double s = 1./255;
-                for( int k = 0; k < 10; k++ )
-                    mom[k] *= s;
-            }
-
-            double xm = x * mom[0], ym = y * mom[0];
-
-            // accumulate moments computed in each tile
-
-            // + m00 ( = m00' )
-            m.m00 += mom[0];
-
-            // + m10 ( = m10' + x*m00' )
-            m.m10 += mom[1] + xm;
-
-            // + m01 ( = m01' + y*m00' )
-            m.m01 += mom[2] + ym;
-
-            // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
-            m.m20 += mom[3] + x * (mom[1] * 2 + xm);
-
-            // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
-            m.m11 += mom[4] + x * (mom[2] + ym) + y * mom[1];
-
-            // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
-            m.m02 += mom[5] + y * (mom[2] * 2 + ym);
-
-            // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
-            m.m30 += mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
-
-            // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
-            m.m21 += mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
-
-            // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
-            m.m12 += mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
-
-            // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
-            m.m03 += mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
         }
     }
 
diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp
index 845e00124..4cf0d0103 100644
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@ -43,6 +43,7 @@
 #include "precomp.hpp"
 #include <limits.h>
 #include <stdio.h>
+#include "opencl_kernels.hpp"
 
 /****************************************************************************************\
                      Basic Morphological Operations: Erosion & Dilation
@@ -1283,11 +1284,131 @@ static bool IPPMorphOp(int op, InputArray _src, OutputArray _dst,
 }
 #endif
 
+#ifdef HAVE_OPENCL
+
+static bool ocl_morphology_op(InputArray _src, OutputArray _dst, InputArray _kernel, Size &ksize, const Point anchor, int iterations, int op)
+{
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    if (_src.depth() == CV_64F && !doubleSupport)
+        return false;
+
+    UMat kernel8U;
+    _kernel.getUMat().convertTo(kernel8U, CV_8U);
+    UMat kernel = kernel8U.reshape(1, 1);
+
+    bool rectKernel = true;
+    for(int i = 0; i < kernel.rows * kernel.cols; ++i)
+        if(kernel.getMat(ACCESS_READ).at<uchar>(i) != 1)
+            rectKernel = false;
+
+    UMat src = _src.getUMat();
+
+#ifdef ANDROID
+    size_t localThreads[3] = {16, 8, 1};
+#else
+    size_t localThreads[3] = {16, 16, 1};
+#endif
+    size_t globalThreads[3] = {(src.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0], (src.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1};
+
+    if(localThreads[0]*localThreads[1] * 2 < (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1))
+        return false;
+
+    char compile_option[128];
+    static const char* op2str[] = {"ERODE", "DILATE"};
+    sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D %s %s %s -D GENTYPE=%s -D DEPTH_%d",
+        anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1], op2str[op], doubleSupport?"-D DOUBLE_SUPPORT" :"", rectKernel?"-D RECTKERNEL":"",
+        ocl::typeToStr(_src.type()), _src.depth() );
+
+    std::vector<ocl::Kernel> kernels;
+    for(int i = 0; i<iterations; i++)
+    {
+        ocl::Kernel k( "morph", ocl::imgproc::morph_oclsrc, compile_option);
+        if (k.empty())
+            return false;
+        kernels.push_back(k);
+    }
+
+    _dst.create(src.size(), src.type());
+    UMat dst = _dst.getUMat();
+
+    if( iterations== 1 && src.u != dst.u)
+    {
+        Size wholesize;
+        Point ofs;
+        src.locateROI(wholesize, ofs);
+        int wholecols = wholesize.width, wholerows = wholesize.height;
+
+        int idxArg = 0;
+        idxArg = kernels[0].set(idxArg, ocl::KernelArg::ReadOnlyNoSize(src));
+        idxArg = kernels[0].set(idxArg, ocl::KernelArg::WriteOnlyNoSize(dst));
+        idxArg = kernels[0].set(idxArg, ofs.x);
+        idxArg = kernels[0].set(idxArg, ofs.y);
+        idxArg = kernels[0].set(idxArg, src.cols);
+        idxArg = kernels[0].set(idxArg, src.rows);
+        idxArg = kernels[0].set(idxArg, ocl::KernelArg::PtrReadOnly(kernel));
+        idxArg = kernels[0].set(idxArg, wholecols);
+        idxArg = kernels[0].set(idxArg, wholerows);
+
+        return kernels[0].run(2, globalThreads, localThreads, false);
+    }
+
+    for(int i = 0; i< iterations; i++)
+    {
+        UMat source;
+        Size wholesize;
+        Point ofs;
+        if( i == 0)
+        {
+            int cols =  src.cols, rows = src.rows;
+            src.locateROI(wholesize,ofs);
+            src.adjustROI(ofs.y, wholesize.height - rows - ofs.y, ofs.x, wholesize.width - cols - ofs.x);
+            src.copyTo(source);
+            src.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
+            source.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
+        }
+        else
+        {
+            int cols =  dst.cols, rows = dst.rows;
+            dst.locateROI(wholesize,ofs);
+            dst.adjustROI(ofs.y, wholesize.height - rows - ofs.y, ofs.x, wholesize.width - cols - ofs.x);
+            dst.copyTo(source);
+            dst.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
+            source.adjustROI(-ofs.y, -wholesize.height + rows + ofs.y, -ofs.x, -wholesize.width + cols + ofs.x);
+        }
+
+        source.locateROI(wholesize, ofs);
+        int wholecols = wholesize.width, wholerows = wholesize.height;
+
+        int idxArg = 0;
+        idxArg = kernels[i].set(idxArg, ocl::KernelArg::ReadOnlyNoSize(source));
+        idxArg = kernels[i].set(idxArg, ocl::KernelArg::WriteOnlyNoSize(dst));
+        idxArg = kernels[i].set(idxArg, ofs.x);
+        idxArg = kernels[i].set(idxArg, ofs.y);
+        idxArg = kernels[i].set(idxArg, source.cols);
+        idxArg = kernels[i].set(idxArg, source.rows);
+        idxArg = kernels[i].set(idxArg, ocl::KernelArg::PtrReadOnly(kernel));
+        idxArg = kernels[i].set(idxArg, wholecols);
+        idxArg = kernels[i].set(idxArg, wholerows);
+
+        if (!kernels[i].run(2, globalThreads, localThreads, false))
+            return false;
+    }
+    return true;
+}
+
+#endif
+
 static void morphOp( int op, InputArray _src, OutputArray _dst,
                      InputArray _kernel,
                      Point anchor, int iterations,
                      int borderType, const Scalar& borderValue )
 {
+#ifdef HAVE_OPENCL
+    int src_type = _src.type(), dst_type = _dst.type(),
+        src_cn = CV_MAT_CN(src_type), src_depth = CV_MAT_DEPTH(src_type);
+#endif
+
     Mat kernel = _kernel.getMat();
     Size ksize = kernel.data ? kernel.size() : Size(3,3);
     anchor = normalizeAnchor(anchor, ksize);
@@ -1299,14 +1420,9 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
         return;
 #endif
 
-    Mat src = _src.getMat();
-
-    _dst.create( src.size(), src.type() );
-    Mat dst = _dst.getMat();
-
     if( iterations == 0 || kernel.rows*kernel.cols == 1 )
     {
-        src.copyTo(dst);
+        _src.copyTo(_dst);
         return;
     }
 
@@ -1326,6 +1442,18 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
         iterations = 1;
     }
 
+    CV_OCL_RUN(_dst.isUMat() && _src.size() == _dst.size() && src_type == dst_type &&
+               _src.dims() <= 2 && (src_cn == 1 || src_cn == 4) && anchor.x == -1 && anchor.y == -1 &&
+               (src_depth == CV_8U || src_depth == CV_32F || src_depth == CV_64F ) &&
+               borderType == cv::BORDER_CONSTANT && borderValue == morphologyDefaultBorderValue() &&
+               (op == MORPH_ERODE || op == MORPH_DILATE),
+               ocl_morphology_op(_src, _dst, kernel, ksize, anchor, iterations, op) )
+
+    Mat src = _src.getMat();
+
+    _dst.create( src.size(), src.type() );
+    Mat dst = _dst.getMat();
+
     int nStripes = 1;
 #if defined HAVE_TEGRA_OPTIMIZATION
     if (src.data != dst.data && iterations == 1 &&  //NOTE: threads are not used for inplace processing
@@ -1362,49 +1490,97 @@ void cv::dilate( InputArray src, OutputArray dst, InputArray kernel,
     morphOp( MORPH_DILATE, src, dst, kernel, anchor, iterations, borderType, borderValue );
 }
 
-
 void cv::morphologyEx( InputArray _src, OutputArray _dst, int op,
                        InputArray kernel, Point anchor, int iterations,
                        int borderType, const Scalar& borderValue )
 {
-    Mat src = _src.getMat(), temp;
-    _dst.create(src.size(), src.type());
-    Mat dst = _dst.getMat();
+    int src_type = _src.type(), dst_type = _dst.type(),
+        src_cn = CV_MAT_CN(src_type), src_depth = CV_MAT_DEPTH(src_type);
+
+    bool use_opencl = cv::ocl::useOpenCL() && _src.isUMat() && _src.size() == _dst.size() && src_type == dst_type &&
+        _src.dims()<=2 && (src_cn == 1 || src_cn == 4) && (anchor.x == -1) && (anchor.y == -1) &&
+        (src_depth == CV_8U || src_depth == CV_32F || src_depth == CV_64F ) &&
+        (borderType == cv::BORDER_CONSTANT) && (borderValue == morphologyDefaultBorderValue());
+
+    _dst.create(_src.size(), _src.type());
+    Mat src, dst, temp;
+    UMat usrc, udst, utemp;
 
     switch( op )
     {
     case MORPH_ERODE:
-        erode( src, dst, kernel, anchor, iterations, borderType, borderValue );
+        erode( _src, _dst, kernel, anchor, iterations, borderType, borderValue );
         break;
     case MORPH_DILATE:
-        dilate( src, dst, kernel, anchor, iterations, borderType, borderValue );
+        dilate( _src, _dst, kernel, anchor, iterations, borderType, borderValue );
         break;
     case MORPH_OPEN:
-        erode( src, dst, kernel, anchor, iterations, borderType, borderValue );
-        dilate( dst, dst, kernel, anchor, iterations, borderType, borderValue );
+        erode( _src, _dst, kernel, anchor, iterations, borderType, borderValue );
+        dilate( _dst, _dst, kernel, anchor, iterations, borderType, borderValue );
         break;
     case CV_MOP_CLOSE:
-        dilate( src, dst, kernel, anchor, iterations, borderType, borderValue );
-        erode( dst, dst, kernel, anchor, iterations, borderType, borderValue );
+        dilate( _src, _dst, kernel, anchor, iterations, borderType, borderValue );
+        erode( _dst, _dst, kernel, anchor, iterations, borderType, borderValue );
         break;
     case CV_MOP_GRADIENT:
-        erode( src, temp, kernel, anchor, iterations, borderType, borderValue );
-        dilate( src, dst, kernel, anchor, iterations, borderType, borderValue );
-        dst -= temp;
+        erode( _src, use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, kernel, anchor, iterations, borderType, borderValue );
+        dilate( _src, _dst, kernel, anchor, iterations, borderType, borderValue );
+        if(use_opencl)
+        {
+            udst = _dst.getUMat();
+            subtract(udst, utemp, udst);
+        }
+        else
+        {
+            dst = _dst.getMat();
+            dst -= temp;
+        }
         break;
     case CV_MOP_TOPHAT:
-        if( src.data != dst.data )
-            temp = dst;
-        erode( src, temp, kernel, anchor, iterations, borderType, borderValue );
-        dilate( temp, temp, kernel, anchor, iterations, borderType, borderValue );
-        dst = src - temp;
+        if(use_opencl)
+        {
+            usrc = _src.getUMat();
+            udst = _dst.getUMat();
+            if( usrc.u != udst.u )
+                utemp = udst;
+        }
+        else
+        {
+            src = _src.getMat();
+            dst = _dst.getMat();
+            if( src.data != dst.data )
+                temp = dst;
+        }
+        erode( _src, use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, kernel, anchor, iterations, borderType, borderValue );
+        dilate( use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, kernel,
+            anchor, iterations, borderType, borderValue );
+        if(use_opencl)
+            subtract(usrc, utemp, udst);
+        else
+            dst = src - temp;
         break;
     case CV_MOP_BLACKHAT:
-        if( src.data != dst.data )
-            temp = dst;
-        dilate( src, temp, kernel, anchor, iterations, borderType, borderValue );
-        erode( temp, temp, kernel, anchor, iterations, borderType, borderValue );
-        dst = temp - src;
+        if(use_opencl)
+        {
+            usrc = _src.getUMat();
+            udst = _dst.getUMat();
+            if( usrc.u != udst.u )
+                utemp = udst;
+        }
+        else
+        {
+            src = _src.getMat();
+            dst = _dst.getMat();
+            if( src.data != dst.data )
+                temp = dst;
+        }
+        dilate( _src, use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, kernel, anchor, iterations, borderType, borderValue );
+        erode( use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, use_opencl ? (cv::OutputArray)utemp : (cv::OutputArray)temp, kernel,
+            anchor, iterations, borderType, borderValue );
+        if(use_opencl)
+            subtract(utemp, usrc, udst);
+        else
+            dst = temp - src;
         break;
     default:
         CV_Error( CV_StsBadArg, "unknown morphological operation" );
diff --git a/modules/imgproc/src/opencl/accumulate.cl b/modules/imgproc/src/opencl/accumulate.cl
new file mode 100644
index 000000000..a60d4d6d9
--- /dev/null
+++ b/modules/imgproc/src/opencl/accumulate.cl
@@ -0,0 +1,65 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+#endif
+
+__kernel void accumulate(__global const uchar * srcptr, int src_step, int src_offset,
+#ifdef ACCUMULATE_PRODUCT
+                         __global const uchar * src2ptr, int src2_step, int src2_offset,
+#endif
+                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
+#ifdef ACCUMULATE_WEIGHTED
+                         , dstT alpha
+#endif
+#ifdef HAVE_MASK
+                         , __global const uchar * mask, int mask_step, int mask_offset
+#endif
+                         )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < dst_cols && y < dst_rows)
+    {
+        int src_index = mad24(y, src_step, src_offset + x * cn * (int)sizeof(srcT));
+#ifdef HAVE_MASK
+        int mask_index = mad24(y, mask_step, mask_offset + x);
+        mask += mask_index;
+#endif
+        int dst_index = mad24(y, dst_step, dst_offset + x * cn * (int)sizeof(dstT));
+
+        __global const srcT * src = (__global const srcT *)(srcptr + src_index);
+#ifdef ACCUMULATE_PRODUCT
+        int src2_index = mad24(y, src2_step, src2_offset + x * cn * (int)sizeof(srcT));
+        __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index);
+#endif
+        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+#ifdef HAVE_MASK
+            if (mask[0])
+#endif
+#ifdef ACCUMULATE
+                dst[c] += src[c];
+#elif defined ACCUMULATE_SQUARE
+                dst[c] += src[c] * src[c];
+#elif defined ACCUMULATE_PRODUCT
+                dst[c] += src[c] * src2[c];
+#elif defined ACCUMULATE_WEIGHTED
+                dst[c] = (1 - alpha) * dst[c] + src[c] * alpha;
+#else
+#error "Unknown accumulation type"
+#endif
+    }
+}
diff --git a/modules/imgproc/src/opencl/boxFilter.cl b/modules/imgproc/src/opencl/boxFilter.cl
index b65934ad4..986fc785c 100644
--- a/modules/imgproc/src/opencl/boxFilter.cl
+++ b/modules/imgproc/src/opencl/boxFilter.cl
@@ -39,45 +39,15 @@
 //
 //M*/
 
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////Macro for border type////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef BORDER_REPLICATE
-//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
 #endif
 
-#ifdef BORDER_REFLECT
-//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_REFLECT_101
-//BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
-#endif
-
-//blur function does not support BORDER_WRAP
-#ifdef BORDER_WRAP
-//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
-#endif
-
-#ifdef EXTRA_EXTRAPOLATION // border > src image size
 #ifdef BORDER_CONSTANT
-// None
 #elif defined BORDER_REPLICATE
 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
     { \
@@ -131,248 +101,120 @@
 #else
 #error No extrapolation method
 #endif
+
+#define noconvert
+
+#ifdef SQR
+#define PROCESS_ELEM(value) (value * value)
 #else
-#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
-    { \
-        int _row = y - minY, _col = x - minX; \
-        _row = ADDR_H(_row, 0, maxY - minY); \
-        _row = ADDR_B(_row, maxY - minY, _row); \
-        y = _row + minY; \
-        \
-        _col = ADDR_L(_col, 0, maxX - minX); \
-        _col = ADDR_R(_col, maxX - minX, _col); \
-        x = _col + minX; \
-    }
+#define PROCESS_ELEM(value) value
 #endif
 
-#if USE_DOUBLE
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define FPTYPE double
-#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE)
-#else
-#define FPTYPE float
-#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE)
-#endif
-
-#if DATA_DEPTH == 0
-#define BASE_TYPE uchar
-#elif DATA_DEPTH == 1
-#define BASE_TYPE char
-#elif DATA_DEPTH == 2
-#define BASE_TYPE ushort
-#elif DATA_DEPTH == 3
-#define BASE_TYPE short
-#elif DATA_DEPTH == 4
-#define BASE_TYPE int
-#elif DATA_DEPTH == 5
-#define BASE_TYPE float
-#elif DATA_DEPTH == 6
-#define BASE_TYPE double
-#else
-#error data_depth
-#endif
-
-#define __CAT(x, y) x##y
-#define CAT(x, y) __CAT(x, y)
-
-#define uchar1 uchar
-#define char1 char
-#define ushort1 ushort
-#define short1 short
-#define int1 int
-#define float1 float
-#define double1 double
-
-#define convert_uchar1_sat_rte convert_uchar_sat_rte
-#define convert_char1_sat_rte convert_char_sat_rte
-#define convert_ushort1_sat_rte convert_ushort_sat_rte
-#define convert_short1_sat_rte convert_short_sat_rte
-#define convert_int1_sat_rte convert_int_sat_rte
-#define convert_float1
-#define convert_double1
-
-#if DATA_DEPTH == 5 || DATA_DEPTH == 6
-#define CONVERT_TO_TYPE CAT(CAT(convert_, BASE_TYPE), VEC_SIZE)
-#else
-#define CONVERT_TO_TYPE CAT(CAT(CAT(convert_, BASE_TYPE), VEC_SIZE), _sat_rte)
-#endif
-
-#define VEC_SIZE DATA_CHAN
-
-#define VEC_TYPE CAT(BASE_TYPE, VEC_SIZE)
-#define TYPE VEC_TYPE
-
-#define SCALAR_TYPE CAT(FPTYPE, VEC_SIZE)
-
-#define INTERMEDIATE_TYPE CAT(FPTYPE, VEC_SIZE)
-
-#define TYPE_SIZE   (VEC_SIZE*sizeof(BASE_TYPE))
-
 struct RectCoords
 {
     int x1, y1, x2, y2;
 };
 
-//#define DEBUG
-#ifdef DEBUG
-#define DEBUG_ONLY(x) x
-#define ASSERT(condition) do { if (!(condition)) { printf("BUG in boxFilter kernel (global=%d,%d): " #condition "\n", get_global_id(0), get_global_id(1)); } } while (0)
-#else
-#define DEBUG_ONLY(x)
-#define ASSERT(condition)
-#endif
-
-
-inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global const uchar* srcptr, int srcstep, const struct RectCoords srcCoords
-#ifdef BORDER_CONSTANT
-               , SCALAR_TYPE borderValue
-#endif
-    )
+inline WT readSrcPixel(int2 pos, __global const uchar * srcptr, int src_step, const struct RectCoords srcCoords)
 {
 #ifdef BORDER_ISOLATED
-    if(pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
+    if (pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
 #else
-    if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
+    if (pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
 #endif
     {
-        __global TYPE* ptr = (__global TYPE*)(srcptr + pos.y * srcstep + pos.x * sizeof(TYPE));
-        return CONVERT_TO_FPTYPE(*ptr);
+        int src_index = mad24(pos.y, src_step, pos.x * (int)sizeof(ST));
+        WT value = convertToWT(*(__global const ST *)(srcptr + src_index));
+
+        return PROCESS_ELEM(value);
     }
     else
     {
 #ifdef BORDER_CONSTANT
-        return borderValue;
+        return (WT)(0);
 #else
-        int selected_col = pos.x;
-        int selected_row = pos.y;
+        int selected_col = pos.x, selected_row = pos.y;
 
         EXTRAPOLATE(selected_col, selected_row,
 #ifdef BORDER_ISOLATED
-                srcCoords.x1, srcCoords.y1,
+            srcCoords.x1, srcCoords.y1,
 #else
-                0, 0,
+            0, 0,
 #endif
-                srcCoords.x2, srcCoords.y2
-         );
+            srcCoords.x2, srcCoords.y2);
 
-        // debug border mapping
-        //printf("pos=%d,%d --> %d, %d\n", pos.x, pos.y, selected_col, selected_row);
+        int src_index = mad24(selected_row, src_step, selected_col * (int)sizeof(ST));
+        WT value = convertToWT(*(__global const ST *)(srcptr + src_index));
 
-        pos = (int2)(selected_col, selected_row);
-        if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
-        {
-            __global TYPE* ptr = (__global TYPE*)(srcptr + pos.y * srcstep + pos.x * sizeof(TYPE));
-            return CONVERT_TO_FPTYPE(*ptr);
-        }
-        else
-        {
-            // for debug only
-            DEBUG_ONLY(printf("BUG in boxFilter kernel\n"));
-            return (FPTYPE)(0.0f);
-        }
+        return PROCESS_ELEM(value);
 #endif
     }
 }
 
-// INPUT PARAMETER: BLOCK_SIZE_Y (via defines)
-
-__kernel
-__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
-void boxFilter(__global const uchar* srcptr, int srcstep, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,
-                __global uchar* dstptr, int dststep, int dstoffset,
-               int rows, int cols,
-#ifdef BORDER_CONSTANT
-               SCALAR_TYPE borderValue,
+__kernel void boxFilter(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,
+                        __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols
+#ifdef NORMALIZE
+                        , float alpha
 #endif
-               FPTYPE alpha
-               )
+                       )
 {
-    const struct RectCoords srcCoords = {srcOffsetX, srcOffsetY, srcEndX, srcEndY}; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
+    const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY }; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
 
-    const int x = get_local_id(0) + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
-    const int y = get_global_id(1) * BLOCK_SIZE_Y;
-
-    const int local_id = get_local_id(0);
-
-    INTERMEDIATE_TYPE data[KERNEL_SIZE_Y];
-    __local INTERMEDIATE_TYPE sumOfCols[LOCAL_SIZE];
+    int x = get_local_id(0) + (LOCAL_SIZE_X - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
+    int y = get_global_id(1) * BLOCK_SIZE_Y;
+    int local_id = get_local_id(0);
 
+    WT data[KERNEL_SIZE_Y];
+    __local WT sumOfCols[LOCAL_SIZE_X];
     int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - ANCHOR_Y);
-    for(int sy = 0; sy < KERNEL_SIZE_Y; sy++, srcPos.y++)
-    {
-        data[sy] = readSrcPixel(srcPos, srcptr, srcstep, srcCoords
-#ifdef BORDER_CONSTANT
-                , borderValue
-#endif
-                );
-    }
 
-    INTERMEDIATE_TYPE tmp_sum = 0;
-    for(int sy = 0; sy < KERNEL_SIZE_Y; sy++)
-    {
-        tmp_sum += (data[sy]);
-    }
+    #pragma unroll
+    for (int sy = 0; sy < KERNEL_SIZE_Y; sy++, srcPos.y++)
+        data[sy] = readSrcPixel(srcPos, srcptr, src_step, srcCoords);
+
+    WT tmp_sum = (WT)(0);
+    #pragma unroll
+    for (int sy = 0; sy < KERNEL_SIZE_Y; sy++)
+        tmp_sum += data[sy];
 
     sumOfCols[local_id] = tmp_sum;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    int2 pos = (int2)(x, y);
-    __global TYPE* dstPtr = (__global TYPE*)(dstptr + pos.y * dststep + dstoffset + pos.x * TYPE_SIZE/*sizeof(TYPE)*/); // Pointer can be out of bounds!
+    int dst_index = mad24(y, dst_step, x * (int)sizeof(DT) + dst_offset);
+    __global DT * dst = (__global DT *)(dstptr + dst_index);
 
     int sy_index = 0; // current index in data[] array
-    int stepsY = min(rows - pos.y, BLOCK_SIZE_Y);
-    ASSERT(stepsY > 0);
-    for (; ;)
+    for (int i = 0, stepY = min(rows - y, BLOCK_SIZE_Y); i < stepY; ++i)
     {
-        ASSERT(pos.y < rows);
-
-        if(local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&
-            pos.x >= 0 && pos.x < cols)
+        if (local_id >= ANCHOR_X && local_id < LOCAL_SIZE_X - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&
+            x >= 0 && x < cols)
         {
-            ASSERT(pos.y >= 0 && pos.y < rows);
+            WT total_sum = (WT)(0);
 
-            INTERMEDIATE_TYPE total_sum = 0;
-#pragma unroll
+            #pragma unroll
             for (int sx = 0; sx < KERNEL_SIZE_X; sx++)
-            {
                 total_sum += sumOfCols[local_id + sx - ANCHOR_X];
-            }
-            *dstPtr = CONVERT_TO_TYPE(((INTERMEDIATE_TYPE)alpha) * total_sum);
-        }
 
-#if BLOCK_SIZE_Y == 1
-        break;
+#ifdef NORMALIZE
+            dst[0] = convertToDT((WT)(alpha) * total_sum);
 #else
-        if (--stepsY == 0)
-            break;
-
+            dst[0] = convertToDT(total_sum);
+#endif
+        }
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        tmp_sum = sumOfCols[local_id]; // TODO FIX IT: workaround for BUG in OpenCL compiler
-        // only works with scalars: ASSERT(fabs(tmp_sum - sumOfCols[local_id]) < (INTERMEDIATE_TYPE)1e-6);
+        tmp_sum = sumOfCols[local_id];
         tmp_sum -= data[sy_index];
 
-        data[sy_index] = readSrcPixel(srcPos, srcptr, srcstep, srcCoords
-#ifdef BORDER_CONSTANT
-                , borderValue
-#endif
-                );
+        data[sy_index] = readSrcPixel(srcPos, srcptr, src_step, srcCoords);
         srcPos.y++;
 
         tmp_sum += data[sy_index];
         sumOfCols[local_id] = tmp_sum;
 
-        sy_index = (sy_index + 1 < KERNEL_SIZE_Y) ? sy_index + 1 : 0;
-
+        sy_index = sy_index + 1 < KERNEL_SIZE_Y ? sy_index + 1 : 0;
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        // next line
-        DEBUG_ONLY(pos.y++);
-        dstPtr = (__global TYPE*)((__global char*)dstPtr + dststep); // Pointer can be out of bounds!
-#endif // BLOCK_SIZE_Y == 1
+        dst = (__global DT *)((__global uchar *)dst + dst_step);
     }
 }
diff --git a/modules/imgproc/src/opencl/calc_back_project.cl b/modules/imgproc/src/opencl/calc_back_project.cl
new file mode 100644
index 000000000..ec9247154
--- /dev/null
+++ b/modules/imgproc/src/opencl/calc_back_project.cl
@@ -0,0 +1,135 @@
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Xu Pang, pangxu010@163.com
+//    Wenju He, wenju@multicorewareinc.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//
+
+#define OUT_OF_RANGE -1
+
+#if histdims == 1
+
+__kernel void calcLUT(__global const uchar * histptr, int hist_step, int hist_offset, int hist_bins,
+                      __global int * lut, float scale, __constant float * ranges)
+{
+    int x = get_global_id(0);
+    float value = convert_float(x);
+
+    if (value > ranges[1] || value < ranges[0])
+        lut[x] = OUT_OF_RANGE;
+    else
+    {
+        float lb = ranges[0], ub = ranges[1], gap = (ub - lb) / hist_bins;
+        value -= lb;
+        int bin = convert_int_sat_rtn(value / gap);
+
+        if (bin >= hist_bins)
+            lut[x] = OUT_OF_RANGE;
+        else
+        {
+            int hist_index = mad24(hist_step, bin, hist_offset);
+            __global const float * hist = (__global const float *)(histptr + hist_index);
+
+            lut[x] = (int)convert_uchar_sat_rte(hist[0] * scale);
+        }
+    }
+}
+
+__kernel void LUT(__global const uchar * src, int src_step, int src_offset,
+                  __constant int * lut,
+                  __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < dst_cols && y < dst_rows)
+    {
+        int src_index = mad24(y, src_step, src_offset + x * scn);
+        int dst_index = mad24(y, dst_step, dst_offset + x);
+
+        int value = lut[src[src_index]];
+        dst[dst_index] = value == OUT_OF_RANGE ? 0 : convert_uchar(value);
+    }
+}
+
+#elif histdims == 2
+
+__kernel void calcLUT(int hist_bins, __global int * lut, int lut_offset,
+                      __constant float * ranges, int roffset)
+{
+    int x = get_global_id(0);
+    float value = convert_float(x);
+
+    ranges += roffset;
+    lut += lut_offset;
+
+    if (value > ranges[1] || value < ranges[0])
+        lut[x] = OUT_OF_RANGE;
+    else
+    {
+        float lb = ranges[0], ub = ranges[1], gap = (ub - lb) / hist_bins;
+        value -= lb;
+        int bin = convert_int_sat_rtn(value / gap);
+
+        lut[x] = bin >= hist_bins ? OUT_OF_RANGE : bin;
+    }
+}
+
+__kernel void LUT(__global const uchar * src1, int src1_step, int src1_offset,
+                  __global const uchar * src2, int src2_step, int src2_offset,
+                  __global const uchar * histptr, int hist_step, int hist_offset,
+                  __constant int * lut, float scale,
+                  __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < dst_cols && y < dst_rows)
+    {
+        int src1_index = mad24(y, src1_step, src1_offset + x * scn1);
+        int src2_index = mad24(y, src2_step, src2_offset + x * scn2);
+        int dst_index = mad24(y, dst_step, dst_offset + x);
+
+        int bin1 = lut[src1[src1_index]];
+        int bin2 = lut[src2[src2_index] + 256];
+        dst[dst_index] = bin1 == OUT_OF_RANGE || bin2 == OUT_OF_RANGE ? 0 :
+                        convert_uchar_sat_rte(*(__global const float *)(histptr +
+                        mad24(hist_step, bin1, hist_offset + bin2 * (int)sizeof(float))) * scale);
+    }
+}
+
+#else
+#error "(nimages <= 2) should be true"
+#endif
diff --git a/modules/ocl/src/opencl/imgproc_canny.cl b/modules/imgproc/src/opencl/canny.cl
similarity index 55%
rename from modules/ocl/src/opencl/imgproc_canny.cl
rename to modules/imgproc/src/opencl/canny.cl
index 0a54f1468..88b406f40 100644
--- a/modules/ocl/src/opencl/imgproc_canny.cl
+++ b/modules/imgproc/src/opencl/canny.cl
@@ -43,18 +43,6 @@
 //
 //M*/
 
-#ifdef L2GRAD
-inline float calc(int x, int y)
-{
-    return sqrt((float)(x * x + y * y));
-}
-#else
-inline float calc(int x, int y)
-{
-    return (float)abs(x) + abs(y);
-}
-#endif //
-
 // Smoothing perpendicular to the derivative direction with a triangle filter
 // only support 3x3 Sobel kernel
 // h (-1) =  1, h (0) =  2, h (1) =  1
@@ -65,29 +53,13 @@ inline float calc(int x, int y)
 // src		input 8bit single channel image data
 // dx_buf	output dx buffer
 // dy_buf	output dy buffer
-__kernel
-void
-__attribute__((reqd_work_group_size(16,16,1)))
-calcSobelRowPass
-(
-    __global const uchar * src,
-    __global int * dx_buf,
-    __global int * dy_buf,
-    int rows,
-    int cols,
-    int src_step,
-    int src_offset,
-    int dx_buf_step,
-    int dx_buf_offset,
-    int dy_buf_step,
-    int dy_buf_offset
-)
-{
-    dx_buf_step   /= sizeof(*dx_buf);
-    dx_buf_offset /= sizeof(*dx_buf);
-    dy_buf_step   /= sizeof(*dy_buf);
-    dy_buf_offset /= sizeof(*dy_buf);
 
+__kernel void __attribute__((reqd_work_group_size(16, 16, 1)))
+calcSobelRowPass
+    (__global const uchar * src, int src_step, int src_offset, int rows, int cols,
+     __global uchar * dx_buf, int dx_buf_step, int dx_buf_offset,
+     __global uchar * dy_buf, int dy_buf_step, int dy_buf_offset)
+{
     int gidx = get_global_id(0);
     int gidy = get_global_id(1);
 
@@ -96,26 +68,62 @@ calcSobelRowPass
 
     __local int smem[16][18];
 
-    smem[lidy][lidx + 1] =
-        src[gidx + min(gidy, rows - 1) * src_step + src_offset];
-    if(lidx == 0)
+    smem[lidy][lidx + 1] = src[mad24(src_step, min(gidy, rows - 1), gidx + src_offset)];
+    if (lidx == 0)
     {
-        smem[lidy][0]  =
-            src[max(gidx - 1,  0)        + min(gidy, rows - 1) * src_step + src_offset];
-        smem[lidy][17] =
-            src[min(gidx + 16, cols - 1) + min(gidy, rows - 1) * src_step + src_offset];
+        smem[lidy][0]  = src[mad24(src_step, min(gidy, rows - 1), max(gidx - 1,  0)        + src_offset)];
+        smem[lidy][17] = src[mad24(src_step, min(gidy, rows - 1), min(gidx + 16, cols - 1) + src_offset)];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if(gidy < rows && gidx < cols)
+    if (gidy < rows && gidx < cols)
     {
-        dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] =
-            -smem[lidy][lidx] + smem[lidy][lidx + 2];
-        dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] =
+        *(__global short *)(dx_buf + mad24(gidy, dx_buf_step, gidx * (int)sizeof(short) + dx_buf_offset)) =
+            smem[lidy][lidx + 2] - smem[lidy][lidx];
+        *(__global short *)(dy_buf + mad24(gidy, dy_buf_step, gidx * (int)sizeof(short) + dy_buf_offset)) =
             smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
     }
 }
 
+inline int calc(short x, short y)
+{
+#ifdef L2GRAD
+    return x * x + y * y;
+#else
+    return (x >= 0 ? x : -x) + (y >= 0 ? y : -y);
+#endif
+}
+
+// calculate the magnitude of the filter pass combining both x and y directions
+// This is the non-buffered version(non-3x3 sobel)
+//
+// dx_buf		dx buffer, calculated from calcSobelRowPass
+// dy_buf		dy buffer, calculated from calcSobelRowPass
+// dx			direvitive in x direction output
+// dy			direvitive in y direction output
+// mag			magnitude direvitive of xy output
+
+__kernel void calcMagnitude(__global const uchar * dxptr, int dx_step, int dx_offset,
+                            __global const uchar * dyptr, int dy_step, int dy_offset,
+                            __global uchar * magptr, int mag_step, int mag_offset, int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y < rows && x < cols)
+    {
+        int dx_index = mad24(dx_step, y, x * (int)sizeof(short) + dx_offset);
+        int dy_index = mad24(dy_step, y, x * (int)sizeof(short) + dy_offset);
+        int mag_index = mad24(mag_step, y + 1, (x + 1) * (int)sizeof(int) + mag_offset);
+
+        __global const short * dx = (__global const short *)(dxptr + dx_index);
+        __global const short * dy = (__global const short *)(dyptr + dy_index);
+        __global int * mag = (__global int *)(magptr + mag_index);
+
+        mag[0] = calc(dx[0], dy[0]);
+    }
+}
+
 // calculate the magnitude of the filter pass combining both x and y directions
 // This is the buffered version(3x3 sobel)
 //
@@ -124,29 +132,14 @@ calcSobelRowPass
 // dx			direvitive in x direction output
 // dy			direvitive in y direction output
 // mag			magnitude direvitive of xy output
-__kernel
-void
-__attribute__((reqd_work_group_size(16,16,1)))
+__kernel void __attribute__((reqd_work_group_size(16, 16, 1)))
 calcMagnitude_buf
-(
-    __global const int * dx_buf,
-    __global const int * dy_buf,
-    __global int * dx,
-    __global int * dy,
-    __global float * mag,
-    int rows,
-    int cols,
-    int dx_buf_step,
-    int dx_buf_offset,
-    int dy_buf_step,
-    int dy_buf_offset,
-    int dx_step,
-    int dx_offset,
-    int dy_step,
-    int dy_offset,
-    int mag_step,
-    int mag_offset
-)
+    (__global const short * dx_buf, int dx_buf_step, int dx_buf_offset,
+     __global const short * dy_buf, int dy_buf_step, int dy_buf_offset,
+     __global short * dx, int dx_step, int dx_offset,
+     __global short * dy, int dy_step, int dy_offset,
+     __global int * mag, int mag_step, int mag_offset,
+     int rows, int cols)
 {
     dx_buf_step    /= sizeof(*dx_buf);
     dx_buf_offset  /= sizeof(*dx_buf);
@@ -165,31 +158,25 @@ calcMagnitude_buf
     int lidx = get_local_id(0);
     int lidy = get_local_id(1);
 
-    __local int sdx[18][16];
-    __local int sdy[18][16];
+    __local short sdx[18][16];
+    __local short sdy[18][16];
 
-    sdx[lidy + 1][lidx] =
-        dx_buf[gidx + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset];
-    sdy[lidy + 1][lidx] =
-        dy_buf[gidx + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset];
-    if(lidy == 0)
+    sdx[lidy + 1][lidx] = dx_buf[gidx + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset];
+    sdy[lidy + 1][lidx] = dy_buf[gidx + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset];
+    if (lidy == 0)
     {
-        sdx[0][lidx]  =
-            dx_buf[gidx + min(max(gidy-1,0),rows-1) * dx_buf_step + dx_buf_offset];
-        sdx[17][lidx] =
-            dx_buf[gidx + min(gidy + 16, rows - 1)  * dx_buf_step + dx_buf_offset];
+        sdx[0][lidx]  = dx_buf[gidx + min(max(gidy - 1, 0), rows - 1) * dx_buf_step + dx_buf_offset];
+        sdx[17][lidx] = dx_buf[gidx + min(gidy + 16, rows - 1)        * dx_buf_step + dx_buf_offset];
 
-        sdy[0][lidx]  =
-            dy_buf[gidx + min(max(gidy-1,0),rows-1) * dy_buf_step + dy_buf_offset];
-        sdy[17][lidx] =
-            dy_buf[gidx + min(gidy + 16, rows - 1)  * dy_buf_step + dy_buf_offset];
+        sdy[0][lidx]  = dy_buf[gidx + min(max(gidy - 1, 0), rows - 1) * dy_buf_step + dy_buf_offset];
+        sdy[17][lidx] = dy_buf[gidx + min(gidy + 16, rows - 1)        * dy_buf_step + dy_buf_offset];
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if(gidx < cols && gidy < rows)
+    if (gidx < cols && gidy < rows)
     {
-        int x =  sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
-        int y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
+        short x =  sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
+        short y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
 
         dx[gidx + gidy * dx_step + dx_offset] = x;
         dy[gidx + gidy * dy_step + dy_offset] = y;
@@ -198,61 +185,14 @@ calcMagnitude_buf
     }
 }
 
-// calculate the magnitude of the filter pass combining both x and y directions
-// This is the non-buffered version(non-3x3 sobel)
-//
-// dx_buf		dx buffer, calculated from calcSobelRowPass
-// dy_buf		dy buffer, calculated from calcSobelRowPass
-// dx			direvitive in x direction output
-// dy			direvitive in y direction output
-// mag			magnitude direvitive of xy output
-__kernel
-void calcMagnitude
-(
-    __global const int * dx,
-    __global const int * dy,
-    __global float * mag,
-    int rows,
-    int cols,
-    int dx_step,
-    int dx_offset,
-    int dy_step,
-    int dy_offset,
-    int mag_step,
-    int mag_offset
-)
-{
-    dx_step    /= sizeof(*dx);
-    dx_offset  /= sizeof(*dx);
-    dy_step    /= sizeof(*dy);
-    dy_offset  /= sizeof(*dy);
-    mag_step   /= sizeof(*mag);
-    mag_offset /= sizeof(*mag);
-
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    if(gidy < rows && gidx < cols)
-    {
-        mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] =
-            calc(
-                dx[gidx + gidy * dx_step + dx_offset],
-                dy[gidx + gidy * dy_step + dy_offset]
-            );
-    }
-}
 
 //////////////////////////////////////////////////////////////////////////////////////////
 // 0.4142135623730950488016887242097 is tan(22.5)
+
 #define CANNY_SHIFT 15
+#define TG22        (int)(0.4142135623730950488016887242097f*(1<<CANNY_SHIFT) + 0.5f)
 
-#ifdef DOUBLE_SUPPORT
-    #define TG22        (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
-#else
-    #define TG22        (int)(0.4142135623730950488016887242097f*(1<<CANNY_SHIFT) + 0.5f)
-#endif
-
-//First pass of edge detection and non-maximum suppression
+// First pass of edge detection and non-maximum suppression
 // edgetype is set to for each pixel:
 // 0 - below low thres, not an edge
 // 1 - maybe an edge
@@ -267,154 +207,15 @@ void calcMagnitude
 // dx, dy		direvitives of x and y direction
 // mag			magnitudes calculated from calcMagnitude function
 // map			output containing raw edge types
-__kernel
-void
-__attribute__((reqd_work_group_size(16,16,1)))
-calcMap
-(
-    __global const int * dx,
-    __global const int * dy,
-    __global const float * mag,
-    __global int * map,
-    int rows,
-    int cols,
-    float low_thresh,
-    float high_thresh,
-    int dx_step,
-    int dx_offset,
-    int dy_step,
-    int dy_offset,
-    int mag_step,
-    int mag_offset,
-    int map_step,
-    int map_offset
-)
+
+__kernel void __attribute__((reqd_work_group_size(16,16,1)))
+calcMap(
+    __global const uchar * dx, int dx_step, int dx_offset,
+    __global const uchar * dy, int dy_step, int dy_offset,
+    __global const uchar * mag, int mag_step, int mag_offset,
+    __global uchar * map, int map_step, int map_offset,
+    int rows, int cols, int low_thresh, int high_thresh)
 {
-    dx_step    /= sizeof(*dx);
-    dx_offset  /= sizeof(*dx);
-    dy_step    /= sizeof(*dy);
-    dy_offset  /= sizeof(*dy);
-    mag_step   /= sizeof(*mag);
-    mag_offset /= sizeof(*mag);
-    map_step   /= sizeof(*map);
-    map_offset /= sizeof(*map);
-
-    mag += mag_offset;
-    map += map_offset;
-
-    __local float smem[18][18];
-
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    int lidx = get_local_id(0);
-    int lidy = get_local_id(1);
-
-    int grp_idx = get_global_id(0) & 0xFFFFF0;
-    int grp_idy = get_global_id(1) & 0xFFFFF0;
-
-    int tid = lidx + lidy * 16;
-    int lx = tid % 18;
-    int ly = tid / 18;
-    if(ly < 14)
-    {
-        smem[ly][lx] =
-            mag[grp_idx + lx + min(grp_idy + ly, rows - 1) * mag_step];
-    }
-    if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
-    {
-        smem[ly + 14][lx] =
-            mag[grp_idx + lx + min(grp_idy + ly + 14, rows -1) * mag_step];
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(gidy < rows && gidx < cols)
-    {
-        int x = dx[gidx + gidy * dx_step];
-        int y = dy[gidx + gidy * dy_step];
-        const int s = (x ^ y) < 0 ? -1 : 1;
-        const float m = smem[lidy + 1][lidx + 1];
-        x = abs(x);
-        y = abs(y);
-
-        // 0 - the pixel can not belong to an edge
-        // 1 - the pixel might belong to an edge
-        // 2 - the pixel does belong to an edge
-        int edge_type = 0;
-        if(m > low_thresh)
-        {
-            const int tg22x = x * TG22;
-            const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
-            y <<= CANNY_SHIFT;
-            if(y < tg22x)
-            {
-                if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
-                {
-                    edge_type = 1 + (int)(m > high_thresh);
-                }
-            }
-            else if (y > tg67x)
-            {
-                if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
-                {
-                    edge_type = 1 + (int)(m > high_thresh);
-                }
-            }
-            else
-            {
-                if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
-                {
-                    edge_type = 1 + (int)(m > high_thresh);
-                }
-            }
-        }
-        map[gidx + 1 + (gidy + 1) * map_step] = edge_type;
-    }
-}
-
-#undef CANNY_SHIFT
-#undef TG22
-
-struct PtrStepSz {
-    __global int *ptr;
-    int step;
-    int rows, cols;
-};
-inline int get(struct PtrStepSz data, int y, int x) { return *((__global int *)((__global char*)data.ptr + data.step * y + sizeof(int) * x)); }
-inline void set(struct PtrStepSz data, int y, int x, int value) { *((__global int *)((__global char*)data.ptr + data.step * y + sizeof(int) * x)) = value; }
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// do Hysteresis for pixel whose edge type is 1
-//
-// If candidate pixel (edge type is 1) has a neighbour pixel (in 3x3 area) with type 2, it is believed to be part of an edge and
-// marked as edge. Each thread will iterate for 16 times to connect local edges.
-// Candidate pixel being identified as edge will then be tested if there is nearby potiential edge points. If there is, counter will
-// be incremented by 1 and the point location is stored. These potiential candidates will be processed further in next kernel.
-//
-// map		raw edge type results calculated from calcMap.
-// st		the potiential edge points found in this kernel call
-// counter	the number of potiential edge points
-__kernel
-void
-__attribute__((reqd_work_group_size(16,16,1)))
-edgesHysteresisLocal
-(
-    __global int * map_ptr,
-    __global ushort2 * st,
-    __global unsigned int * counter,
-    int rows,
-    int cols,
-    int map_step,
-    int map_offset
-)
-{
-#if 0
-    map_step   /= sizeof(*map);
-    map_offset /= sizeof(*map);
-
-    const __global int* map = map_ptr + map_offset;
-
     __local int smem[18][18];
 
     int gidx = get_global_id(0);
@@ -429,72 +230,95 @@ edgesHysteresisLocal
     int tid = lidx + lidy * 16;
     int lx = tid % 18;
     int ly = tid / 18;
-    if(ly < 14)
-    {
-        smem[ly][lx] =
-            map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step];
-    }
-    if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
-    {
-        smem[ly + 14][lx] =
-            map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step];
-    }
 
+    mag += mag_offset;
+    if (ly < 14)
+        smem[ly][lx] = *(__global const int *)(mag +
+            mad24(mag_step, min(grp_idy + ly, rows - 1), (int)sizeof(int) * (grp_idx + lx)));
+    if (ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
+        smem[ly + 14][lx] = *(__global const int *)(mag +
+            mad24(mag_step, min(grp_idy + ly + 14, rows - 1), (int)sizeof(int) * (grp_idx + lx)));
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if(gidy < rows && gidx < cols)
+    if (gidy < rows && gidx < cols)
     {
-        int n;
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
+        // 2 - the pixel does belong to an edge
+        int edge_type = 0;
+        int m = smem[lidy + 1][lidx + 1];
 
-        #pragma unroll
-        for (int k = 0; k < 16; ++k)
+        if (m > low_thresh)
         {
-            n = 0;
+            short xs = *(__global const short *)(dx + mad24(gidy, dx_step, dx_offset + (int)sizeof(short) * gidx));
+            short ys = *(__global const short *)(dy + mad24(gidy, dy_step, dy_offset + (int)sizeof(short) * gidx));
+            int x = abs(xs), y = abs(ys);
 
-            if (smem[lidy + 1][lidx + 1] == 1)
+            int tg22x = x * TG22;
+            y <<= CANNY_SHIFT;
+
+            if (y < tg22x)
             {
-                n += smem[lidy    ][lidx    ] == 2;
-                n += smem[lidy    ][lidx + 1] == 2;
-                n += smem[lidy    ][lidx + 2] == 2;
-
-                n += smem[lidy + 1][lidx    ] == 2;
-                n += smem[lidy + 1][lidx + 2] == 2;
-
-                n += smem[lidy + 2][lidx    ] == 2;
-                n += smem[lidy + 2][lidx + 1] == 2;
-                n += smem[lidy + 2][lidx + 2] == 2;
+                if (m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
+                    edge_type = 1 + (int)(m > high_thresh);
+            }
+            else
+            {
+                int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
+                if (y > tg67x)
+                {
+                    if (m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
+                        edge_type = 1 + (int)(m > high_thresh);
+                }
+                else
+                {
+                    int s = (xs ^ ys) < 0 ? -1 : 1;
+                    if (m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
+                        edge_type = 1 + (int)(m > high_thresh);
+                }
             }
-
-            if (n > 0)
-                smem[lidy + 1][lidx + 1] = 2;
-        }
-
-        const int e = smem[lidy + 1][lidx + 1];
-        map[gidx + 1 + (gidy + 1) * map_step] = e;
-
-        n = 0;
-        if(e == 2)
-        {
-            n += smem[lidy    ][lidx    ] == 1;
-            n += smem[lidy    ][lidx + 1] == 1;
-            n += smem[lidy    ][lidx + 2] == 1;
-
-            n += smem[lidy + 1][lidx    ] == 1;
-            n += smem[lidy + 1][lidx + 2] == 1;
-
-            n += smem[lidy + 2][lidx    ] == 1;
-            n += smem[lidy + 2][lidx + 1] == 1;
-            n += smem[lidy + 2][lidx + 2] == 1;
-        }
-
-        if(n > 0)
-        {
-            unsigned int ind = atomic_inc(counter);
-            st[ind] = (ushort2)(gidx + 1, gidy + 1);
         }
+        *(__global int *)(map + mad24(map_step, gidy + 1, (gidx + 1) * (int)sizeof(int) + map_offset)) = edge_type;
     }
-#else
-    struct PtrStepSz map = {((__global int *)((__global char*)map_ptr + map_offset)), map_step, rows, cols};
+}
+
+#undef CANNY_SHIFT
+#undef TG22
+
+struct PtrStepSz
+{
+    __global uchar * ptr;
+    int step, rows, cols;
+};
+
+inline int get(struct PtrStepSz data, int y, int x)
+{
+    return *(__global int *)(data.ptr + mad24(data.step, y + 1, (int)sizeof(int) * (x + 1)));
+}
+
+inline void set(struct PtrStepSz data, int y, int x, int value)
+{
+    *(__global int *)(data.ptr + mad24(data.step, y + 1, (int)sizeof(int) * (x + 1))) = value;
+}
+
+// perform Hysteresis for pixel whose edge type is 1
+//
+// If candidate pixel (edge type is 1) has a neighbour pixel (in 3x3 area) with type 2, it is believed to be part of an edge and
+// marked as edge. Each thread will iterate for 16 times to connect local edges.
+// Candidate pixel being identified as edge will then be tested if there is nearby potiential edge points. If there is, counter will
+// be incremented by 1 and the point location is stored. These potiential candidates will be processed further in next kernel.
+//
+// map		raw edge type results calculated from calcMap.
+// stack	the potiential edge points found in this kernel call
+// counter	the number of potiential edge points
+
+__kernel void __attribute__((reqd_work_group_size(16,16,1)))
+edgesHysteresisLocal
+    (__global uchar * map_ptr, int map_step, int map_offset,
+     __global ushort2 * st, __global unsigned int * counter,
+    int rows, int cols)
+{
+    struct PtrStepSz map = { map_ptr + map_offset, map_step, rows + 1, cols + 1 };
 
     __local int smem[18][18];
 
@@ -507,13 +331,13 @@ edgesHysteresisLocal
 
     smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? get(map, y, x) : 0;
     if (threadIdx.y == 0)
-        smem[0][threadIdx.x + 1] = y > 0 ? get(map, y - 1, x) : 0;
+        smem[0][threadIdx.x + 1] = x < map.cols ? get(map, y - 1, x) : 0;
     if (threadIdx.y == blockDim.y - 1)
         smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? get(map, y + 1, x) : 0;
     if (threadIdx.x == 0)
-        smem[threadIdx.y + 1][0] = x > 0 ? get(map, y, x - 1) : 0;
+        smem[threadIdx.y + 1][0] = y < map.rows ? get(map, y, x - 1) : 0;
     if (threadIdx.x == blockDim.x - 1)
-        smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? get(map, y, x + 1) : 0;
+        smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols && y < map.rows ? get(map, y, x + 1) : 0;
     if (threadIdx.x == 0 && threadIdx.y == 0)
         smem[0][0] = y > 0 && x > 0 ? get(map, y - 1, x - 1) : 0;
     if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
@@ -525,7 +349,7 @@ edgesHysteresisLocal
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (x >= map.cols || y >= map.rows)
+    if (x >= cols || y >= rows)
         return;
 
     int n;
@@ -554,9 +378,7 @@ edgesHysteresisLocal
     }
 
     const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
-
     set(map, y, x, e);
-
     n = 0;
 
     if (e == 2)
@@ -576,9 +398,8 @@ edgesHysteresisLocal
     if (n > 0)
     {
         const int ind = atomic_inc(counter);
-        st[ind] = (ushort2)(x, y);
+        st[ind] = (ushort2)(x + 1, y + 1);
     }
-#endif
 }
 
 __constant int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
@@ -586,25 +407,13 @@ __constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
 
 
 #define stack_size 512
-__kernel
-void
-__attribute__((reqd_work_group_size(128,1,1)))
-edgesHysteresisGlobal
-(
-    __global int * map,
-    __global ushort2 * st1,
-    __global ushort2 * st2,
-    __global int * counter,
-    int rows,
-    int cols,
-    int count,
-    int map_step,
-    int map_offset
-)
-{
-    map_step   /= sizeof(*map);
-    map_offset /= sizeof(*map);
+#define map_index mad24(map_step, pos.y, pos.x * (int)sizeof(int))
 
+__kernel void __attribute__((reqd_work_group_size(128, 1, 1)))
+edgesHysteresisGlobal(__global uchar * map, int map_step, int map_offset,
+    __global ushort2 * st1, __global ushort2 * st2, __global int * counter,
+    int rows, int cols, int count)
+{
     map += map_offset;
 
     int lidx = get_local_id(0);
@@ -612,32 +421,26 @@ edgesHysteresisGlobal
     int grp_idx = get_group_id(0);
     int grp_idy = get_group_id(1);
 
-    __local unsigned int s_counter;
-    __local unsigned int s_ind;
-
+    __local unsigned int s_counter, s_ind;
     __local ushort2 s_st[stack_size];
 
-    if(lidx == 0)
-    {
+    if (lidx == 0)
         s_counter = 0;
-    }
     barrier(CLK_LOCAL_MEM_FENCE);
 
     int ind = mad24(grp_idy, (int)get_local_size(0), grp_idx);
 
-    if(ind < count)
+    if (ind < count)
     {
         ushort2 pos = st1[ind];
         if (lidx < 8)
         {
             pos.x += c_dx[lidx];
             pos.y += c_dy[lidx];
-            if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && map[pos.x + pos.y * map_step] == 1)
+            if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && *(__global int *)(map + map_index) == 1)
             {
-                map[pos.x + pos.y * map_step] = 2;
-
+                *(__global int *)(map + map_index) = 2;
                 ind = atomic_inc(&s_counter);
-
                 s_st[ind] = pos;
             }
         }
@@ -660,12 +463,10 @@ edgesHysteresisGlobal
             {
                 pos.x += c_dx[lidx & 7];
                 pos.y += c_dy[lidx & 7];
-                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && map[pos.x + pos.y * map_step] == 1)
+                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && *(__global int *)(map + map_index) == 1)
                 {
-                    map[pos.x + pos.y * map_step] = 2;
-
+                    *(__global int *)(map + map_index) = 2;
                     ind = atomic_inc(&s_counter);
-
                     s_st[ind] = pos;
                 }
             }
@@ -682,40 +483,32 @@ edgesHysteresisGlobal
             barrier(CLK_LOCAL_MEM_FENCE);
 
             ind = s_ind;
-
             for (int i = lidx; i < (int)s_counter; i += get_local_size(0))
-            {
                 st2[ind + i] = s_st[i];
-            }
         }
     }
 }
+
+#undef map_index
 #undef stack_size
 
-//Get the edge result. egde type of value 2 will be marked as an edge point and set to 255. Otherwise 0.
+// Get the edge result. egde type of value 2 will be marked as an edge point and set to 255. Otherwise 0.
 // map		edge type mappings
 // dst		edge output
-__kernel
-void getEdges
-(
-    __global const int * map,
-    __global uchar * dst,
-    int rows,
-    int cols,
-    int map_step,
-    int map_offset,
-    int dst_step,
-    int dst_offset
-)
+
+__kernel void getEdges(__global const uchar * mapptr, int map_step, int map_offset,
+                       __global uchar * dst, int dst_step, int dst_offset, int rows, int cols)
 {
-    map_step   /= sizeof(*map);
-    map_offset /= sizeof(*map);
+    int x = get_global_id(0);
+    int y = get_global_id(1);
 
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    if(gidy < rows && gidx < cols)
+    if (y < rows && x < cols)
     {
-        dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1));
+        int map_index = mad24(map_step, y + 1, (x + 1) * (int)sizeof(int) + map_offset);
+        int dst_index = mad24(dst_step, y, x + dst_offset);
+
+        __global const int * map = (__global const int *)(mapptr + map_index);
+
+        dst[dst_index] = (uchar)(-(map[0] >> 1));
     }
 }
diff --git a/modules/ocl/src/opencl/imgproc_clahe.cl b/modules/imgproc/src/opencl/clahe.cl
similarity index 91%
rename from modules/ocl/src/opencl/imgproc_clahe.cl
rename to modules/imgproc/src/opencl/clahe.cl
index 71a6f895d..9f88b20bf 100644
--- a/modules/ocl/src/opencl/imgproc_clahe.cl
+++ b/modules/imgproc/src/opencl/clahe.cl
@@ -139,11 +139,11 @@ inline void reduce(__local volatile int* smem, int val, int tid)
 }
 #endif
 
-__kernel void calcLut(__global __const uchar * src, __global uchar * lut,
-                      const int srcStep, const int dstStep,
+__kernel void calcLut(__global __const uchar * src, const int srcStep,
+                      const int src_offset, __global uchar * lut,
+                      const int dstStep, const int dst_offset,
                       const int2 tileSize, const int tilesX,
-                      const int clipLimit, const float lutScale,
-                      const int src_offset, const int dst_offset)
+                      const int clipLimit, const float lutScale)
 {
     __local int smem[512];
 
@@ -151,7 +151,6 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
     int ty = get_group_id(1);
     int tid = get_local_id(1) * get_local_size(0)
                              + get_local_id(0);
-
     smem[tid] = 0;
     barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -212,14 +211,12 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
         convert_uchar(clamp(ires, (uint)0, (uint)255));
 }
 
-__kernel void transform(__global __const uchar * src,
-                        __global uchar * dst,
-                        __global uchar * lut,
-                        const int srcStep, const int dstStep, const int lutStep,
+__kernel void transform(__global __const uchar * src, const int srcStep, const int src_offset,
+                        __global uchar * dst, const int dstStep, const int dst_offset,
+                        __global uchar * lut, const int lutStep, int lut_offset,
                         const int cols, const int rows,
                         const int2 tileSize,
-                        const int tilesX, const int tilesY,
-                        const int src_offset, const int dst_offset, int lut_offset)
+                        const int tilesX, const int tilesY)
 {
     const int x = get_global_id(0);
     const int y = get_global_id(1);
diff --git a/modules/ocl/src/opencl/imgproc_calcHarris.cl b/modules/imgproc/src/opencl/corner.cl
similarity index 79%
rename from modules/ocl/src/opencl/imgproc_calcHarris.cl
rename to modules/imgproc/src/opencl/corner.cl
index 4fc179260..563cb9808 100644
--- a/modules/ocl/src/opencl/imgproc_calcHarris.cl
+++ b/modules/imgproc/src/opencl/corner.cl
@@ -47,13 +47,6 @@
 /////////////////////////////////Macro for border type////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-#if defined (DOUBLE_SUPPORT) && defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#define FPTYPE double
-#else
-#define FPTYPE float
-#endif
-
 #ifdef BORDER_CONSTANT
 #elif defined BORDER_REPLICATE
 #define EXTRAPOLATE(x, maxV) \
@@ -98,10 +91,9 @@
 /////////////////////////////////////calcHarris////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-__kernel void calcHarris(__global const float *Dx, __global const float *Dy, __global float *dst,
-                         int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
-                         int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
-                         int dst_offset, int dst_rows, int dst_cols, int dst_step, float k)
+__kernel void corner(__global const float * Dx, int dx_step, int dx_offset, int dx_whole_rows, int dx_whole_cols,
+                     __global const float * Dy, int dy_step, int dy_offset, int dy_whole_rows, int dy_whole_cols,
+                     __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float k)
 {
     int col = get_local_id(0);
     int gX = get_group_id(0);
@@ -123,7 +115,7 @@ __kernel void calcHarris(__global const float *Dx, __global const float *Dy, __g
     int dst_startY = (gY << 1) + dst_y_off;
 
     float dx_data[ksY+1],dy_data[ksY+1], data[3][ksY+1];
-    __local FPTYPE temp[6][THREADS];
+    __local float temp[6][THREADS];
 
 #ifdef BORDER_CONSTANT
     for (int i=0; i < ksY+1; i++)
@@ -161,7 +153,7 @@ __kernel void calcHarris(__global const float *Dx, __global const float *Dy, __g
         data[2][i] = dy_data[i] * dy_data[i];
     }
 #endif
-    FPTYPE sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;
+    float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;
     for (int i=1; i < ksY; i++)
     {
         sum0 += data[0][i];
@@ -169,21 +161,21 @@ __kernel void calcHarris(__global const float *Dx, __global const float *Dy, __g
         sum2 += data[2][i];
     }
 
-    FPTYPE sum01 = sum0 + data[0][0];
-    FPTYPE sum02 = sum0 + data[0][ksY];
+    float sum01 = sum0 + data[0][0];
+    float sum02 = sum0 + data[0][ksY];
     temp[0][col] = sum01;
     temp[1][col] = sum02;
-    FPTYPE sum11 = sum1 + data[1][0];
-    FPTYPE sum12 = sum1 + data[1][ksY];
+    float sum11 = sum1 + data[1][0];
+    float sum12 = sum1 + data[1][ksY];
     temp[2][col] = sum11;
     temp[3][col] = sum12;
-    FPTYPE sum21 = sum2 + data[2][0];
-    FPTYPE sum22 = sum2 + data[2][ksY];
+    float sum21 = sum2 + data[2][0];
+    float sum22 = sum2 + data[2][ksY];
     temp[4][col] = sum21;
     temp[5][col] = sum22;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (col < (THREADS- (ksX - 1)))
+    if (col < (THREADS - (ksX - 1)))
     {
         col += anX;
         int posX = dst_startX - dst_x_off + col - anX;
@@ -192,23 +184,44 @@ __kernel void calcHarris(__global const float *Dx, __global const float *Dy, __g
         float tmp_sum[6] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
         for (int k=0; k<6; k++)
         {
-            FPTYPE temp_sum = 0;
+            float temp_sum = 0;
             for (int i=-anX; i<=anX - till; i++)
-            {
                 temp_sum += temp[k][col+i];
-            }
             tmp_sum[k] = temp_sum;
         }
 
+#ifdef CORNER_HARRIS
         if (posX < dst_cols && (posY) < dst_rows)
         {
-            dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] =
+            int dst_index = mad24(dst_step, dst_startY, (int)sizeof(float) * (dst_startX + col - anX));
+            *(__global float *)(dst + dst_index) =
                     tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);
         }
         if (posX < dst_cols && (posY + 1) < dst_rows)
         {
-            dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] =
+            int dst_index = mad24(dst_step, dst_startY + 1, (int)sizeof(float) * (dst_startX + col - anX));
+            *(__global float *)(dst + dst_index) =
                     tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);
         }
+#elif defined CORNER_MINEIGENVAL
+        if (posX < dst_cols && (posY) < dst_rows)
+        {
+            int dst_index = mad24(dst_step, dst_startY, (int)sizeof(float) * (dst_startX + col - anX));
+            float a = tmp_sum[0] * 0.5f;
+            float b = tmp_sum[2];
+            float c = tmp_sum[4] * 0.5f;
+            *(__global float *)(dst + dst_index) = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
+        }
+        if (posX < dst_cols && (posY + 1) < dst_rows)
+        {
+            int dst_index = mad24(dst_step, dst_startY + 1, (int)sizeof(float) * (dst_startX + col - anX));
+            float a = tmp_sum[1] * 0.5f;
+            float b = tmp_sum[3];
+            float c = tmp_sum[5] * 0.5f;
+            *(__global float *)(dst + dst_index) = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
+        }
+#else
+#error "No such corners type"
+#endif
     }
 }
diff --git a/modules/imgproc/src/opencl/cvtcolor.cl b/modules/imgproc/src/opencl/cvtcolor.cl
index ad7562f3d..115bfbd7a 100644
--- a/modules/imgproc/src/opencl/cvtcolor.cl
+++ b/modules/imgproc/src/opencl/cvtcolor.cl
@@ -46,10 +46,6 @@
 
 /**************************************PUBLICFUNC*************************************/
 
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-
 #if depth == 0
     #define DATA_TYPE uchar
     #define MAX_NUM  255
@@ -472,7 +468,7 @@ __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
         dst[0] = src[2];
         dst[1] = src[1];
         dst[2] = src[0];
-#elif defined ORDER
+#else
         dst[0] = src[0];
         dst[1] = src[1];
         dst[2] = src[2];
@@ -728,7 +724,7 @@ __kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset
 
         diff = v - vmin;
         s = diff/(float)(fabs(v) + FLT_EPSILON);
-        diff = (float)(60./(diff + FLT_EPSILON));
+        diff = (float)(60.f/(diff + FLT_EPSILON));
         if( v == r )
             h = (g - b)*diff;
         else if( v == g )
@@ -1065,3 +1061,234 @@ __kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset
 }
 
 #endif
+
+/////////////////////////////////// [l|s]RGB <-> Lab ///////////////////////////
+
+#define lab_shift xyz_shift
+#define gamma_shift 3
+#define lab_shift2 (lab_shift + gamma_shift)
+#define GAMMA_TAB_SIZE 1024
+#define GammaTabScale (float)GAMMA_TAB_SIZE
+
+inline float splineInterpolate(float x, __global const float * tab, int n)
+{
+    int ix = clamp(convert_int_sat_rtn(x), 0, n-1);
+    x -= ix;
+    tab += ix*4;
+    return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
+}
+
+#ifdef DEPTH_0
+
+__kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,
+                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
+                      __global const ushort * gammaTab, __global ushort * LabCbrtTab_b,
+                      __constant int * coeffs, int Lscale, int Lshift)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y < rows && x < cols)
+    {
+        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+
+        src += src_idx;
+        dst += dst_idx;
+
+        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+            C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+            C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+
+        int R = gammaTab[src[0]], G = gammaTab[src[1]], B = gammaTab[src[2]];
+        int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
+        int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
+        int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
+
+        int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
+        int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
+        int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
+
+        dst[0] = SAT_CAST(L);
+        dst[1] = SAT_CAST(a);
+        dst[2] = SAT_CAST(b);
+    }
+}
+
+#elif defined DEPTH_5
+
+__kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float _1_3, float _a)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y < rows && x < cols)
+    {
+        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+
+        __global const float * src = (__global const float *)(srcptr + src_idx);
+        __global float * dst = (__global float *)(dstptr + dst_idx);
+
+        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+
+        float R = clamp(src[0], 0.0f, 1.0f);
+        float G = clamp(src[1], 0.0f, 1.0f);
+        float B = clamp(src[2], 0.0f, 1.0f);
+
+#ifdef SRGB
+        R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+        G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+        B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+
+        float X = R*C0 + G*C1 + B*C2;
+        float Y = R*C3 + G*C4 + B*C5;
+        float Z = R*C6 + G*C7 + B*C8;
+
+        float FX = X > 0.008856f ? pow(X, _1_3) : (7.787f * X + _a);
+        float FY = Y > 0.008856f ? pow(Y, _1_3) : (7.787f * Y + _a);
+        float FZ = Z > 0.008856f ? pow(Z, _1_3) : (7.787f * Z + _a);
+
+        float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
+        float a = 500.f * (FX - FY);
+        float b = 200.f * (FY - FZ);
+
+        dst[0] = L;
+        dst[1] = a;
+        dst[2] = b;
+    }
+}
+
+#endif
+
+inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float lThresh, float fThresh)
+{
+    float li = srcbuf[0], ai = srcbuf[1], bi = srcbuf[2];
+
+    float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
+          C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
+          C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
+
+    float y, fy;
+    if (li <= lThresh)
+    {
+        y = li / 903.3f;
+        fy = 7.787f * y + 16.0f / 116.0f;
+    }
+    else
+    {
+        fy = (li + 16.0f) / 116.0f;
+        y = fy * fy * fy;
+    }
+
+    float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
+
+    for (int j = 0; j < 2; j++)
+        if (fxz[j] <= fThresh)
+            fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
+        else
+            fxz[j] = fxz[j] * fxz[j] * fxz[j];
+
+    float x = fxz[0], z = fxz[1];
+    float ro = clamp(C0 * x + C1 * y + C2 * z, 0.0f, 1.0f);
+    float go = clamp(C3 * x + C4 * y + C5 * z, 0.0f, 1.0f);
+    float bo = clamp(C6 * x + C7 * y + C8 * z, 0.0f, 1.0f);
+
+#ifdef SRGB
+    ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+    go = splineInterpolate(go * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+    bo = splineInterpolate(bo * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);
+#endif
+
+    dstbuf[0] = ro, dstbuf[1] = go, dstbuf[2] = bo;
+}
+
+#ifdef DEPTH_0
+
+__kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
+                      __global uchar * dst, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float lThresh, float fThresh)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y < rows && x < cols)
+    {
+        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+
+        src += src_idx;
+        dst += dst_idx;
+
+        float srcbuf[3], dstbuf[3];
+        srcbuf[0] = src[0]*(100.f/255.f);
+        srcbuf[1] = convert_float(src[1] - 128);
+        srcbuf[2] = convert_float(src[2] - 128);
+
+        Lab2BGR_f(&srcbuf[0], &dstbuf[0],
+#ifdef SRGB
+            gammaTab,
+#endif
+            coeffs, lThresh, fThresh);
+
+        dst[0] = SAT_CAST(dstbuf[0] * 255.0f);
+        dst[1] = SAT_CAST(dstbuf[1] * 255.0f);
+        dst[2] = SAT_CAST(dstbuf[2] * 255.0f);
+#if dcn == 4
+        dst[3] = MAX_NUM;
+#endif
+    }
+}
+
+#elif defined DEPTH_5
+
+__kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,
+#ifdef SRGB
+                      __global const float * gammaTab,
+#endif
+                      __constant float * coeffs, float lThresh, float fThresh)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y < rows && x < cols)
+    {
+        int src_idx = mad24(y, src_step, src_offset + x * scnbytes);
+        int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes);
+
+        __global const float * src = (__global const float *)(srcptr + src_idx);
+        __global float * dst = (__global float *)(dstptr + dst_idx);
+
+        float srcbuf[3], dstbuf[3];
+        srcbuf[0] = src[0], srcbuf[1] = src[1], srcbuf[2] = src[2];
+
+        Lab2BGR_f(&srcbuf[0], &dstbuf[0],
+#ifdef SRGB
+            gammaTab,
+#endif
+            coeffs, lThresh, fThresh);
+
+        dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
+#if dcn == 4
+        dst[3] = MAX_NUM;
+#endif
+    }
+}
+
+#endif
diff --git a/modules/ocl/src/opencl/filter_sep_col.cl b/modules/imgproc/src/opencl/filterSepCol.cl
similarity index 93%
rename from modules/ocl/src/opencl/filter_sep_col.cl
rename to modules/imgproc/src/opencl/filterSepCol.cl
index 0d1998ce9..2657ae931 100644
--- a/modules/ocl/src/opencl/filter_sep_col.cl
+++ b/modules/imgproc/src/opencl/filterSepCol.cl
@@ -60,20 +60,19 @@ Niko
 The info above maybe obsolete.
 ***********************************************************************************/
 
+#define DIG(a) a,
+__constant float mat_kernel[] = { COEFF };
 
 __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
                         (__global const GENTYPE_SRC * restrict src,
-                         __global GENTYPE_DST * dst,
-                         const int dst_cols,
-                         const int dst_rows,
+                         const int src_step_in_pixel,
                          const int src_whole_cols,
                          const int src_whole_rows,
-                         const int src_step_in_pixel,
-                         //const int src_offset_x,
-                         //const int src_offset_y,
-                         const int dst_step_in_pixel,
+                         __global GENTYPE_DST * dst,
                          const int dst_offset_in_pixel,
-                         __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSY+1)))))
+                         const int dst_step_in_pixel,
+                         const int dst_cols,
+                         const int dst_rows)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -112,7 +111,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void col_filter
     //write the result to dst
     if((x<dst_cols) & (y<dst_rows))
     {
-        start_addr = mad24(y,dst_step_in_pixel,x+dst_offset_in_pixel);
+        start_addr = mad24(y, dst_step_in_pixel, x + dst_offset_in_pixel);
         dst[start_addr] = convert_to_DST(sum);
     }
 }
diff --git a/modules/ocl/src/opencl/filter_sep_row.cl b/modules/imgproc/src/opencl/filterSepRow.cl
similarity index 63%
rename from modules/ocl/src/opencl/filter_sep_row.cl
rename to modules/imgproc/src/opencl/filterSepRow.cl
index d74540966..d0623f590 100644
--- a/modules/ocl/src/opencl/filter_sep_row.cl
+++ b/modules/imgproc/src/opencl/filterSepRow.cl
@@ -48,45 +48,85 @@
 #define ALIGN (RADIUS)
 #endif
 
-#ifdef BORDER_CONSTANT
-#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
-#elif defined BORDER_REPLICATE
-#define EXTRAPOLATE(x, maxV) \
-    { \
-        x = max(min(x, maxV - 1), 0); \
-    }
-#elif defined BORDER_WRAP
-#define EXTRAPOLATE(x, maxV) \
-    { \
-        if (x < 0) \
-            x -= ((x - maxV + 1) / maxV) * maxV; \
-        if (x >= maxV) \
-            x %= maxV; \
-    }
-#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
-#define EXTRAPOLATE_(x, maxV, delta) \
-    { \
-        if (maxV == 1) \
-            x = 0; \
-        else \
-            do \
-            { \
-                if ( x < 0 ) \
-                    x = -x - 1 + delta; \
-                else \
-                    x = maxV - 1 - (x - maxV) - delta; \
-            } \
-            while (x >= maxV || x < 0); \
-    }
+#ifdef BORDER_REPLICATE
+//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
+#endif
+
 #ifdef BORDER_REFLECT
-#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0)
-#else
-#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1)
+//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
 #endif
-#else
-#error No extrapolation method
+
+#ifdef BORDER_REFLECT_101
+//BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
 #endif
 
+//blur function does not support BORDER_WRAP
+#ifdef BORDER_WRAP
+//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
+#endif
+
+#ifdef EXTRA_EXTRAPOLATION // border > src image size
+    #ifdef BORDER_CONSTANT
+        #define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
+    #elif defined BORDER_REPLICATE
+        #define EXTRAPOLATE(t, minT, maxT) \
+        { \
+            t = max(min(t, (maxT) - 1), (minT)); \
+        }
+    #elif defined BORDER_WRAP
+        #define EXTRAPOLATE(x, minT, maxT) \
+        { \
+            if (t < (minT)) \
+                t -= ((t - (maxT) + 1) / (maxT)) * (maxT); \
+            if (t >= (maxT)) \
+                t %= (maxT); \
+        }
+    #elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
+        #define EXTRAPOLATE_(t, minT, maxT, delta) \
+        { \
+            if ((maxT) - (minT) == 1) \
+                t = (minT); \
+            else \
+                do \
+                { \
+                    if (t < (minT)) \
+                        t = (minT) - (t - (minT)) - 1 + delta; \
+                    else \
+                        t = (maxT) - 1 - (t - (maxT)) - delta; \
+                } \
+                while (t >= (maxT) || t < (minT)); \
+            \
+        }
+        #ifdef BORDER_REFLECT
+            #define EXTRAPOLATE(t, minT, maxT) EXTRAPOLATE_(t, minT, maxT, 0)
+        #elif defined(BORDER_REFLECT_101)
+            #define EXTRAPOLATE(t, minT, maxT) EXTRAPOLATE_(t, minT, maxT, 1)
+        #endif
+    #else
+        #error No extrapolation method
+    #endif //BORDER_....
+#else //EXTRA_EXTRAPOLATION
+    #ifdef BORDER_CONSTANT
+        #define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
+    #else
+        #define EXTRAPOLATE(t, minT, maxT) \
+        { \
+            int _delta = t - (minT); \
+            _delta = ADDR_L(_delta, 0, (maxT) - (minT)); \
+            _delta = ADDR_R(_delta, (maxT) - (minT), _delta); \
+            t = _delta + (minT); \
+        }
+    #endif //BORDER_CONSTANT
+#endif //EXTRA_EXTRAPOLATION
+
 /**********************************************************************************
 These kernels are written for separable filters such as Sobel, Scharr, GaussianBlur.
 Now(6/29/2011) the kernels only support 8U data type and the anchor of the convovle
@@ -104,15 +144,19 @@ Niko
 The info above maybe obsolete.
 ***********************************************************************************/
 
+#define DIG(a) a,
+__constant float mat_kernel[] = { COEFF };
+
 __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D0
     (__global uchar * restrict src,
-     __global float * dst,
-     int dst_cols, int dst_rows,
-     int src_whole_cols, int src_whole_rows,
      int src_step_in_pixel,
      int src_offset_x, int src_offset_y,
-     int dst_step_in_pixel, int radiusy,
-     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
+     int src_cols, int src_rows,
+     int src_whole_cols, int src_whole_rows,
+     __global float * dst,
+     int dst_step_in_pixel,
+     int dst_cols, int dst_rows,
+     int radiusy)
 {
     int x = get_global_id(0)<<2;
     int y = get_global_id(1);
@@ -140,16 +184,31 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
     }
 
     // judge if read out of boundary
+#ifdef BORDER_ISOLATED
     for (i = 0; i<READ_TIMES_ROW; i++)
     {
-        temp[i].x = ELEM(start_x+i*LSIZE0*4,0,src_whole_cols,0,temp[i].x);
-        temp[i].y = ELEM(start_x+i*LSIZE0*4+1,0,src_whole_cols,0,temp[i].y);
-        temp[i].z = ELEM(start_x+i*LSIZE0*4+2,0,src_whole_cols,0,temp[i].z);
-        temp[i].w = ELEM(start_x+i*LSIZE0*4+3,0,src_whole_cols,0,temp[i].w);
-        temp[i]   = ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
+        temp[i].x = ELEM(start_x+i*LSIZE0*4,   src_offset_x, src_offset_x + src_cols, 0,         temp[i].x);
+        temp[i].y = ELEM(start_x+i*LSIZE0*4+1, src_offset_x, src_offset_x + src_cols, 0,         temp[i].y);
+        temp[i].z = ELEM(start_x+i*LSIZE0*4+2, src_offset_x, src_offset_x + src_cols, 0,         temp[i].z);
+        temp[i].w = ELEM(start_x+i*LSIZE0*4+3, src_offset_x, src_offset_x + src_cols, 0,         temp[i].w);
+        temp[i]   = ELEM(start_y,              src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]);
     }
+#else
+    for (i = 0; i<READ_TIMES_ROW; i++)
+    {
+        temp[i].x = ELEM(start_x+i*LSIZE0*4,   0, src_whole_cols, 0,         temp[i].x);
+        temp[i].y = ELEM(start_x+i*LSIZE0*4+1, 0, src_whole_cols, 0,         temp[i].y);
+        temp[i].z = ELEM(start_x+i*LSIZE0*4+2, 0, src_whole_cols, 0,         temp[i].z);
+        temp[i].w = ELEM(start_x+i*LSIZE0*4+3, 0, src_whole_cols, 0,         temp[i].w);
+        temp[i]   = ELEM(start_y,              0, src_whole_rows, (uchar4)0, temp[i]);
+    }
+#endif
+#else // BORDER_CONSTANT
+#ifdef BORDER_ISOLATED
+    int not_all_in_range = (start_x<src_offset_x) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_offset_x + src_cols)| (start_y<src_offset_y) | (start_y >= src_offset_y + src_rows);
 #else
     int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);
+#endif
     int4 index[READ_TIMES_ROW];
     int4 addr;
     int s_y;
@@ -160,13 +219,24 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
         for (i = 0; i < READ_TIMES_ROW; i++)
         {
             index[i] = (int4)(start_x+i*LSIZE0*4) + (int4)(0, 1, 2, 3);
-            EXTRAPOLATE(index[i].x, src_whole_cols);
-            EXTRAPOLATE(index[i].y, src_whole_cols);
-            EXTRAPOLATE(index[i].z, src_whole_cols);
-            EXTRAPOLATE(index[i].w, src_whole_cols);
+#ifdef BORDER_ISOLATED
+            EXTRAPOLATE(index[i].x, src_offset_x, src_offset_x + src_cols);
+            EXTRAPOLATE(index[i].y, src_offset_x, src_offset_x + src_cols);
+            EXTRAPOLATE(index[i].z, src_offset_x, src_offset_x + src_cols);
+            EXTRAPOLATE(index[i].w, src_offset_x, src_offset_x + src_cols);
+#else
+            EXTRAPOLATE(index[i].x, 0, src_whole_cols);
+            EXTRAPOLATE(index[i].y, 0, src_whole_cols);
+            EXTRAPOLATE(index[i].z, 0, src_whole_cols);
+            EXTRAPOLATE(index[i].w, 0, src_whole_cols);
+#endif
         }
         s_y = start_y;
-        EXTRAPOLATE(s_y, src_whole_rows);
+#ifdef BORDER_ISOLATED
+        EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
+#else
+        EXTRAPOLATE(s_y, 0, src_whole_rows);
+#endif
 
         // read pixels from src
         for (i = 0; i<READ_TIMES_ROW; i++)
@@ -184,7 +254,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
         for (i = 0; i<READ_TIMES_ROW; i++)
             temp[i] = *(__global uchar4*)&src[start_addr+i*LSIZE0*4];
     }
-#endif
+#endif //BORDER_CONSTANT
 
     // save pixels to lds
     for (i = 0; i<READ_TIMES_ROW; i++)
@@ -222,13 +292,14 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
 
 __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D0
     (__global uchar4 * restrict src,
-     __global float4 * dst,
-     int dst_cols, int dst_rows,
-     int src_whole_cols, int src_whole_rows,
      int src_step_in_pixel,
      int src_offset_x, int src_offset_y,
-     int dst_step_in_pixel, int radiusy,
-     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
+     int src_cols, int src_rows,
+     int src_whole_cols, int src_whole_rows,
+     __global float4 * dst,
+     int dst_step_in_pixel,
+     int dst_cols, int dst_rows,
+     int radiusy)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -254,11 +325,19 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
     }
 
     //judge if read out of boundary
+#ifdef BORDER_ISOLATED
     for (i = 0; i<READ_TIMES_ROW; i++)
     {
-        temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(uchar4)0,temp[i]);
-        temp[i]= ELEM(start_y,0,src_whole_rows,(uchar4)0,temp[i]);
+        temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (uchar4)0, temp[i]);
+        temp[i]= ELEM(start_y,          src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]);
     }
+#else
+    for (i = 0; i<READ_TIMES_ROW; i++)
+    {
+        temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (uchar4)0, temp[i]);
+        temp[i]= ELEM(start_y,          0, src_whole_rows, (uchar4)0, temp[i]);
+    }
+#endif
 #else
     int index[READ_TIMES_ROW];
     int s_x,s_y;
@@ -267,16 +346,21 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
     for (i = 0; i<READ_TIMES_ROW; i++)
     {
         s_x = start_x+i*LSIZE0;
-        EXTRAPOLATE(s_x, src_whole_cols);
         s_y = start_y;
-        EXTRAPOLATE(s_y, src_whole_rows);
-        index[i]=mad24(s_y,src_step_in_pixel,s_x);
+#ifdef BORDER_ISOLATED
+        EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);
+        EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
+#else
+        EXTRAPOLATE(s_x, 0, src_whole_cols);
+        EXTRAPOLATE(s_y, 0, src_whole_rows);
+#endif
+        index[i]=mad24(s_y, src_step_in_pixel, s_x);
     }
 
     //read pixels from src
     for (i = 0; i<READ_TIMES_ROW; i++)
         temp[i] = src[index[i]];
-#endif
+#endif //BORDER_CONSTANT
 
     //save pixels to lds
     for (i = 0; i<READ_TIMES_ROW; i++)
@@ -301,13 +385,14 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
 
 __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C1_D5
     (__global float * restrict src,
-     __global float * dst,
-     int dst_cols, int dst_rows,
-     int src_whole_cols, int src_whole_rows,
      int src_step_in_pixel,
      int src_offset_x, int src_offset_y,
-     int dst_step_in_pixel, int radiusy,
-     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
+     int src_cols, int src_rows,
+     int src_whole_cols, int src_whole_rows,
+     __global float * dst,
+     int dst_step_in_pixel,
+     int dst_cols, int dst_rows,
+     int radiusy)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -333,27 +418,40 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
     }
 
     // judge if read out of boundary
+#ifdef BORDER_ISOLATED
     for (i = 0; i<READ_TIMES_ROW; i++)
     {
-        temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(float)0,temp[i]);
-        temp[i]= ELEM(start_y,0,src_whole_rows,(float)0,temp[i]);
+        temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (float)0,temp[i]);
+        temp[i]= ELEM(start_y,          src_offset_y, src_offset_y + src_rows, (float)0,temp[i]);
     }
 #else
+    for (i = 0; i<READ_TIMES_ROW; i++)
+    {
+        temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (float)0,temp[i]);
+        temp[i]= ELEM(start_y,          0, src_whole_rows, (float)0,temp[i]);
+    }
+#endif
+#else // BORDER_CONSTANT
     int index[READ_TIMES_ROW];
     int s_x,s_y;
     // judge if read out of boundary
     for (i = 0; i<READ_TIMES_ROW; i++)
     {
         s_x = start_x + i*LSIZE0, s_y = start_y;
-        EXTRAPOLATE(s_x, src_whole_cols);
-        EXTRAPOLATE(s_y, src_whole_rows);
+#ifdef BORDER_ISOLATED
+        EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);
+        EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
+#else
+        EXTRAPOLATE(s_x, 0, src_whole_cols);
+        EXTRAPOLATE(s_y, 0, src_whole_rows);
+#endif
 
         index[i]=mad24(s_y, src_step_in_pixel, s_x);
     }
     // read pixels from src
     for (i = 0; i<READ_TIMES_ROW; i++)
         temp[i] = src[index[i]];
-#endif
+#endif// BORDER_CONSTANT
 
     //save pixels to lds
     for (i = 0; i<READ_TIMES_ROW; i++)
@@ -379,13 +477,14 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
 
 __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_C4_D5
     (__global float4 * restrict src,
-     __global float4 * dst,
-     int dst_cols, int dst_rows,
-     int src_whole_cols, int src_whole_rows,
      int src_step_in_pixel,
      int src_offset_x, int src_offset_y,
-     int dst_step_in_pixel, int radiusy,
-     __constant float * mat_kernel __attribute__((max_constant_size(4*(2*RADIUSX+1)))))
+     int src_cols, int src_rows,
+     int src_whole_cols, int src_whole_rows,
+     __global float4 * dst,
+     int dst_step_in_pixel,
+     int dst_cols, int dst_rows,
+     int radiusy)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
@@ -411,11 +510,19 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
     }
 
     // judge if read out of boundary
+#ifdef BORDER_ISOLATED
     for (i = 0; i<READ_TIMES_ROW; i++)
     {
-        temp[i]= ELEM(start_x+i*LSIZE0,0,src_whole_cols,(float4)0,temp[i]);
-        temp[i]= ELEM(start_y,0,src_whole_rows,(float4)0,temp[i]);
+        temp[i]= ELEM(start_x+i*LSIZE0, src_offset_x, src_offset_x + src_cols, (float4)0,temp[i]);
+        temp[i]= ELEM(start_y,          src_offset_y, src_offset_y + src_rows, (float4)0,temp[i]);
     }
+#else
+    for (i = 0; i<READ_TIMES_ROW; i++)
+    {
+        temp[i]= ELEM(start_x+i*LSIZE0, 0, src_whole_cols, (float4)0,temp[i]);
+        temp[i]= ELEM(start_y,          0, src_whole_rows, (float4)0,temp[i]);
+    }
+#endif
 #else
     int index[READ_TIMES_ROW];
     int s_x,s_y;
@@ -424,8 +531,13 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
     for (i = 0; i<READ_TIMES_ROW; i++)
     {
         s_x = start_x + i*LSIZE0, s_y = start_y;
-        EXTRAPOLATE(s_x, src_whole_cols);
-        EXTRAPOLATE(s_y, src_whole_rows);
+#ifdef BORDER_ISOLATED
+        EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);
+        EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);
+#else
+        EXTRAPOLATE(s_x, 0, src_whole_cols);
+        EXTRAPOLATE(s_y, 0, src_whole_rows);
+#endif
 
         index[i]=mad24(s_y,src_step_in_pixel,s_x);
     }
diff --git a/modules/ocl/src/opencl/arithm_magnitude.cl b/modules/imgproc/src/opencl/gftt.cl
similarity index 66%
rename from modules/ocl/src/opencl/arithm_magnitude.cl
rename to modules/imgproc/src/opencl/gftt.cl
index 1053efd00..46e37990d 100644
--- a/modules/ocl/src/opencl/arithm_magnitude.cl
+++ b/modules/imgproc/src/opencl/gftt.cl
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Zhang Ying, zhangying913@gmail.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -43,32 +43,39 @@
 //
 //M*/
 
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
+__kernel void findCorners(__global const uchar * eigptr, int eig_step, int eig_offset,
+#ifdef HAVE_MASK
+                          __global const uchar * mask, int mask_step, int mask_offset,
 #endif
-#endif
-
-__kernel void arithm_magnitude(__global T *src1, int src1_step, int src1_offset,
-                               __global T *src2, int src2_step, int src2_offset,
-                               __global T *dst,  int dst_step,  int dst_offset,
-                               int rows, int cols)
+                          __global const uchar * tmpptr, int tmp_step, int tmp_offset,
+                          __global uchar * cornersptr, __global int * counter,
+                          int rows, int cols)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
     if (x < cols && y < rows)
     {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int src2_index = mad24(y, src2_step, x + src2_offset);
-        int dst_index  = mad24(y, dst_step,  x + dst_offset);
+        ++x, ++y;
 
-        T data1 = src1[src1_index];
-        T data2 = src2[src2_index];
+        int eig_index = mad24(y, eig_step, eig_offset + x * (int)sizeof(float));
+        int tmp_index = mad24(y, tmp_step, tmp_offset + x * (int)sizeof(float));
+#ifdef HAVE_MASK
+        int mask_index = mad24(y, mask_step, mask_offset + x);
+        mask += mask_index;
+#endif
 
-        T tmp = hypot(data1, data2);
-        dst[dst_index] = tmp;
+        float val = *(__global const float *)(eigptr + eig_index);
+        float tmp = *(__global const float *)(tmpptr + tmp_index);
+
+        if (val != 0 && val == tmp
+#ifdef HAVE_MASK
+            && mask[0] != 0
+#endif
+            )
+        {
+            __global float2 * corners = (__global float2 *)(cornersptr + (int)sizeof(float2) * atomic_inc(counter));
+            corners[0] = (float2)(val, as_float( (x<<16) | y ));
+        }
     }
 }
diff --git a/modules/imgproc/src/opencl/match_template.cl b/modules/imgproc/src/opencl/match_template.cl
new file mode 100644
index 000000000..123c1d6f8
--- /dev/null
+++ b/modules/imgproc/src/opencl/match_template.cl
@@ -0,0 +1,397 @@
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+
+#define DATA_SIZE ((int)sizeof(type))
+#define ELEM_TYPE elem_type
+#define ELEM_SIZE ((int)sizeof(elem_type))
+#define CN cn
+
+#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, gidx*CN + img_sqsums_offset + ox*CN)
+#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step,   gidx*CN + img_sums_offset + ox*CN)
+
+inline float normAcc(float num, float denum)
+{
+    if(fabs(num) < denum)
+    {
+        return num / denum;
+    }
+    if(fabs(num) < denum * 1.125f)
+    {
+        return num > 0 ? 1 : -1;
+    }
+    return 0;
+}
+
+inline float normAcc_SQDIFF(float num, float denum)
+{
+    if(fabs(num) < denum)
+    {
+        return num / denum;
+    }
+    if(fabs(num) < denum * 1.125f)
+    {
+        return num > 0 ? 1 : -1;
+    }
+    return 1;
+}
+
+//////////////////////////////////////////CCORR/////////////////////////////////////////////////////////////////////////
+
+__kernel void matchTemplate_Naive_CCORR (__global const uchar * img,int img_step,int img_offset,
+                                         __global const uchar * tpl,int tpl_step,int tpl_offset,int tpl_rows, int tpl_cols,
+                                         __global uchar * res,int res_step,int res_offset,int res_rows,int res_cols)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+    int i,j;
+    float sum = 0;
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        for(i = 0; i < tpl_rows; i ++)
+        {
+            __global const ELEM_TYPE * img_ptr = (__global const ELEM_TYPE *)(img + mad24(gidy + i, img_step, gidx*DATA_SIZE + img_offset));
+            __global const ELEM_TYPE * tpl_ptr = (__global const ELEM_TYPE *)(tpl + mad24(i, tpl_step, tpl_offset));
+
+            for(j = 0; j < tpl_cols; j ++)
+
+#pragma unroll
+                for (int c = 0; c < CN; c++)
+
+                    sum += (float)(img_ptr[j*CN+c] * tpl_ptr[j*CN+c]);
+
+        }
+        __global float * result = (__global float *)(res+res_idx);
+        *result = sum;
+    }
+}
+
+__kernel void matchTemplate_CCORR_NORMED ( __global const uchar * img_sqsums, int img_sqsums_step, int img_sqsums_offset,
+                                           __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                           int tpl_rows, int tpl_cols, float tpl_sqsum)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sqsums_step /= sizeof(float);
+    img_sqsums_offset /= sizeof(float);
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global float * sqsum = (__global float*)(img_sqsums);
+        float image_sqsum_ = (float)(
+                                 (sqsum[SQSUMS_PTR(tpl_cols, tpl_rows)] - sqsum[SQSUMS_PTR(tpl_cols, 0)]) -
+                                 (sqsum[SQSUMS_PTR(0, tpl_rows)] - sqsum[SQSUMS_PTR(0, 0)]));
+
+        __global float * result = (__global float *)(res+res_idx);
+        *result = normAcc(*result, sqrt(image_sqsum_ * tpl_sqsum));
+    }
+}
+
+////////////////////////////////////////////SQDIFF////////////////////////////////////////////////////////////////////////
+
+__kernel void matchTemplate_Naive_SQDIFF(__global const uchar * img,int img_step,int img_offset,
+                                         __global const uchar * tpl,int tpl_step,int tpl_offset,int tpl_rows, int tpl_cols,
+                                         __global uchar * res,int res_step,int res_offset,int res_rows,int res_cols)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+    int i,j;
+    float delta;
+    float sum = 0;
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        for(i = 0; i < tpl_rows; i ++)
+        {
+            __global const ELEM_TYPE * img_ptr = (__global const ELEM_TYPE *)(img + mad24(gidy + i, img_step, gidx*DATA_SIZE + img_offset));
+            __global const ELEM_TYPE * tpl_ptr = (__global const ELEM_TYPE *)(tpl + mad24(i, tpl_step, tpl_offset));
+
+            for(j = 0; j < tpl_cols; j ++)
+
+#pragma unroll
+                for (int c = 0; c < CN; c++)
+                {
+                    delta = (float)(img_ptr[j*CN+c] - tpl_ptr[j*CN+c]);
+                    sum += delta*delta;
+                }
+        }
+        __global float * result = (__global float *)(res+res_idx);
+        *result = sum;
+    }
+}
+
+__kernel void matchTemplate_SQDIFF_NORMED ( __global const uchar * img_sqsums, int img_sqsums_step, int img_sqsums_offset,
+                                            __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                            int tpl_rows, int tpl_cols, float tpl_sqsum)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sqsums_step /= sizeof(float);
+    img_sqsums_offset /= sizeof(float);
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global float * sqsum = (__global float*)(img_sqsums);
+        float image_sqsum_ = (float)(
+                                 (sqsum[SQSUMS_PTR(tpl_cols, tpl_rows)] - sqsum[SQSUMS_PTR(tpl_cols, 0)]) -
+                                 (sqsum[SQSUMS_PTR(0, tpl_rows)] - sqsum[SQSUMS_PTR(0, 0)]));
+
+        __global float * result = (__global float *)(res+res_idx);
+
+        *result = normAcc_SQDIFF(image_sqsum_ - 2.f * result[0] + tpl_sqsum, sqrt(image_sqsum_ * tpl_sqsum));
+    }
+}
+
+////////////////////////////////////////////CCOEFF/////////////////////////////////////////////////////////////////
+
+__kernel void matchTemplate_Prepared_CCOEFF_C1 (__global const uchar * img_sums, int img_sums_step, int img_sums_offset,
+                                                  __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                                  int tpl_rows, int tpl_cols, float tpl_sum)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_step /= ELEM_SIZE;
+    img_sums_offset /= ELEM_SIZE;
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+    float image_sum_ = 0;
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(img_sums);
+
+        image_sum_ += (float)((sum[SUMS_PTR(tpl_cols, tpl_rows)] - sum[SUMS_PTR(tpl_cols, 0)])-
+                              (sum[SUMS_PTR(0, tpl_rows)] - sum[SUMS_PTR(0, 0)])) * tpl_sum;
+
+        __global float * result = (__global float *)(res+res_idx);
+        *result -= image_sum_;
+    }
+}
+
+__kernel void matchTemplate_Prepared_CCOEFF_C2 (__global const uchar * img_sums, int img_sums_step, int img_sums_offset,
+                                                  __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                                  int tpl_rows, int tpl_cols, float tpl_sum_0,float tpl_sum_1)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_step /= ELEM_SIZE;
+    img_sums_offset /= ELEM_SIZE;
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+    float image_sum_ = 0;
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(img_sums);
+
+        image_sum_ += tpl_sum_0 * (float)((sum[SUMS_PTR(tpl_cols, tpl_rows)] - sum[SUMS_PTR(tpl_cols, 0)])    -(sum[SUMS_PTR(0, tpl_rows)] - sum[SUMS_PTR(0, 0)]));
+        image_sum_ += tpl_sum_1 * (float)((sum[SUMS_PTR(tpl_cols, tpl_rows)+1] - sum[SUMS_PTR(tpl_cols, 0)+1])-(sum[SUMS_PTR(0, tpl_rows)+1] - sum[SUMS_PTR(0, 0)+1]));
+
+        __global float * result = (__global float *)(res+res_idx);
+
+        *result -= image_sum_;
+    }
+}
+
+__kernel void matchTemplate_Prepared_CCOEFF_C4 (__global const uchar * img_sums, int img_sums_step, int img_sums_offset,
+                                                __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                                int tpl_rows, int tpl_cols, float tpl_sum_0,float tpl_sum_1,float tpl_sum_2,float tpl_sum_3)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_step /= ELEM_SIZE;
+    img_sums_offset /= ELEM_SIZE;
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+    float image_sum_ = 0;
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(img_sums);
+
+        int c_r = SUMS_PTR(tpl_cols, tpl_rows);
+        int c_o = SUMS_PTR(tpl_cols, 0);
+        int o_r = SUMS_PTR(0,tpl_rows);
+        int oo = SUMS_PTR(0, 0);
+
+        image_sum_ += tpl_sum_0 * (float)((sum[c_r]   - sum[c_o])  -(sum[o_r]   - sum[oo]));
+        image_sum_ += tpl_sum_1 * (float)((sum[c_r+1] - sum[c_o+1])-(sum[o_r+1] - sum[oo+1]));
+        image_sum_ += tpl_sum_2 * (float)((sum[c_r+2] - sum[c_o+2])-(sum[o_r+2] - sum[oo+2]));
+        image_sum_ += tpl_sum_3 * (float)((sum[c_r+3] - sum[c_o+3])-(sum[o_r+3] - sum[oo+3]));
+
+        __global float * result = (__global float *)(res+res_idx);
+
+        *result -= image_sum_;
+    }
+}
+
+__kernel void matchTemplate_CCOEFF_NORMED_C1 (__global const uchar * img_sums, int img_sums_step, int img_sums_offset,
+                                              __global const uchar * img_sqsums, int img_sqsums_step, int img_sqsums_offset,
+                                              __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                              int t_rows, int t_cols, float weight, float tpl_sum, float tpl_sqsum)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_offset   /= ELEM_SIZE;
+    img_sums_step     /= ELEM_SIZE;
+    img_sqsums_step   /= sizeof(float);
+    img_sqsums_offset /= sizeof(float);
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(img_sums);
+        __global float * sqsum = (__global float*)(img_sqsums);
+
+        float image_sum_ =  (float)((sum[SUMS_PTR(t_cols, t_rows)] - sum[SUMS_PTR(t_cols, 0)]) -
+                                    (sum[SUMS_PTR(0, t_rows)] - sum[SUMS_PTR(0, 0)]));
+
+        float image_sqsum_ = (float)((sqsum[SQSUMS_PTR(t_cols, t_rows)] - sqsum[SQSUMS_PTR(t_cols, 0)]) -
+                                     (sqsum[SQSUMS_PTR(0, t_rows)] - sqsum[SQSUMS_PTR(0, 0)]));
+
+        __global float * result = (__global float *)(res+res_idx);
+
+        *result = normAcc((*result) - image_sum_ * tpl_sum,
+                          sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_)));
+    }
+}
+
+__kernel void matchTemplate_CCOEFF_NORMED_C2 (__global const uchar * img_sums, int img_sums_step, int img_sums_offset,
+                                              __global const uchar * img_sqsums, int img_sqsums_step, int img_sqsums_offset,
+                                              __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                              int t_rows, int t_cols, float weight, float tpl_sum_0, float tpl_sum_1, float tpl_sqsum)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_offset   /= ELEM_SIZE;
+    img_sums_step     /= ELEM_SIZE;
+    img_sqsums_step   /= sizeof(float);
+    img_sqsums_offset /= sizeof(float);
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    float sum_[2];
+    float sqsum_[2];
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(img_sums);
+        __global float * sqsum = (__global float*)(img_sqsums);
+
+        sum_[0] =  (float)((sum[SUMS_PTR(t_cols, t_rows)] - sum[SUMS_PTR(t_cols, 0)])-(sum[SUMS_PTR(0, t_rows)] - sum[SUMS_PTR(0, 0)]));
+        sum_[1] =  (float)((sum[SUMS_PTR(t_cols, t_rows)+1] - sum[SUMS_PTR(t_cols, 0)+1])-(sum[SUMS_PTR(0, t_rows)+1] - sum[SUMS_PTR(0, 0)+1]));
+
+        sqsum_[0] = (float)((sqsum[SQSUMS_PTR(t_cols, t_rows)] - sqsum[SQSUMS_PTR(t_cols, 0)])-(sqsum[SQSUMS_PTR(0, t_rows)] - sqsum[SQSUMS_PTR(0, 0)]));
+        sqsum_[1] = (float)((sqsum[SQSUMS_PTR(t_cols, t_rows)+1] - sqsum[SQSUMS_PTR(t_cols, 0)+1])-(sqsum[SQSUMS_PTR(0, t_rows)+1] - sqsum[SQSUMS_PTR(0, 0)+1]));
+
+        float num = sum_[0]*tpl_sum_0 + sum_[1]*tpl_sum_1;
+
+        float denum = sqrt( tpl_sqsum * (sqsum_[0] - weight * sum_[0]* sum_[0] +
+                                         sqsum_[1] - weight * sum_[1]* sum_[1]));
+
+        __global float * result = (__global float *)(res+res_idx);
+        *result = normAcc((*result) - num, denum);
+    }
+}
+
+__kernel void matchTemplate_CCOEFF_NORMED_C4 (__global const uchar * img_sums, int img_sums_step, int img_sums_offset,
+                                              __global const uchar * img_sqsums, int img_sqsums_step, int img_sqsums_offset,
+                                              __global uchar * res, int res_step, int res_offset, int res_rows, int res_cols,
+                                              int t_rows, int t_cols, float weight,
+                                              float tpl_sum_0,float tpl_sum_1,float tpl_sum_2,float tpl_sum_3,
+                                              float tpl_sqsum)
+{
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_offset   /= ELEM_SIZE;
+    img_sums_step     /= ELEM_SIZE;
+    img_sqsums_step   /= sizeof(float);
+    img_sqsums_offset /= sizeof(float);
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx * (int)sizeof(float));
+
+    float sum_[4];
+    float sqsum_[4];
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(img_sums);
+        __global float * sqsum = (__global float*)(img_sqsums);
+
+        int c_r = SUMS_PTR(t_cols, t_rows);
+        int c_o = SUMS_PTR(t_cols, 0);
+        int o_r = SUMS_PTR(0, t_rows);
+        int o_o = SUMS_PTR(0, 0);
+
+        sum_[0] =  (float)((sum[c_r]   - sum[c_o])  -(sum[o_r]   - sum[o_o ]));
+        sum_[1] =  (float)((sum[c_r+1] - sum[c_o+1])-(sum[o_r+1] - sum[o_o +1]));
+        sum_[2] =  (float)((sum[c_r+2] - sum[c_o+2])-(sum[o_r+2] - sum[o_o +2]));
+        sum_[3] =  (float)((sum[c_r+3] - sum[c_o+3])-(sum[o_r+3] - sum[o_o +3]));
+
+        c_r = SQSUMS_PTR(t_cols, t_rows);
+        c_o = SQSUMS_PTR(t_cols, 0);
+        o_r = SQSUMS_PTR(0, t_rows);
+        o_o = SQSUMS_PTR(0, 0);
+
+        sqsum_[0] = (float)((sqsum[c_r]   - sqsum[c_o])  -(sqsum[o_r]   - sqsum[o_o]));
+        sqsum_[1] = (float)((sqsum[c_r+1] - sqsum[c_o+1])-(sqsum[o_r+1] - sqsum[o_o+1]));
+        sqsum_[2] = (float)((sqsum[c_r+2] - sqsum[c_o+2])-(sqsum[o_r+2] - sqsum[o_o+2]));
+        sqsum_[3] = (float)((sqsum[c_r+3] - sqsum[c_o+3])-(sqsum[o_r+3] - sqsum[o_o+3]));
+
+        float num = sum_[0]*tpl_sum_0 + sum_[1]*tpl_sum_1 + sum_[2]*tpl_sum_2 + sum_[3]*tpl_sum_3;
+
+        float denum = sqrt( tpl_sqsum * (
+                                sqsum_[0] - weight * sum_[0]* sum_[0] +
+                                sqsum_[1] - weight * sum_[1]* sum_[1] +
+                                sqsum_[2] - weight * sum_[2]* sum_[2] +
+                                sqsum_[3] - weight * sum_[3]* sum_[3] ));
+
+        __global float * result = (__global float *)(res+res_idx);
+        *result = normAcc((*result) - num, denum);
+    }
+}
diff --git a/modules/imgproc/src/opencl/moments.cl b/modules/imgproc/src/opencl/moments.cl
new file mode 100644
index 000000000..0cf5b3544
--- /dev/null
+++ b/modules/imgproc/src/opencl/moments.cl
@@ -0,0 +1,147 @@
+/* See LICENSE file in the root OpenCV directory */
+
+#if TILE_SIZE != 32
+#error "TILE SIZE should be 32"
+#endif
+
+__kernel void moments(__global const uchar* src, int src_step, int src_offset,
+                      int src_rows, int src_cols, __global int* mom0, int xtiles)
+{
+    int x0 = get_global_id(0);
+    int y0 = get_group_id(1);
+    int x, y = get_local_id(1);
+    int x_min = x0*TILE_SIZE;
+    int ypix = y0*TILE_SIZE + y;
+    __local int mom[TILE_SIZE][10];
+
+    if( x_min < src_cols && y0*TILE_SIZE < src_rows )
+    {
+        if( ypix < src_rows )
+        {
+            int x_max = min(src_cols - x_min, TILE_SIZE);
+            __global const uchar* ptr = src + src_offset + ypix*src_step + x_min;
+            int4 S = (int4)(0,0,0,0), p;
+
+            #define SUM_ELEM(elem, ofs) \
+                (int4)(1, (ofs), (ofs)*(ofs), (ofs)*(ofs)*(ofs))*elem
+
+            x = x_max & -4;
+            if( x_max >= 4 )
+            {
+                p = convert_int4(vload4(0, ptr));
+                S += SUM_ELEM(p.s0, 0) + SUM_ELEM(p.s1, 1) + SUM_ELEM(p.s2, 2) + SUM_ELEM(p.s3, 3);
+
+                if( x_max >= 8 )
+                {
+                    p = convert_int4(vload4(0, ptr+4));
+                    S += SUM_ELEM(p.s0, 4) + SUM_ELEM(p.s1, 5) + SUM_ELEM(p.s2, 6) + SUM_ELEM(p.s3, 7);
+
+                    if( x_max >= 12 )
+                    {
+                        p = convert_int4(vload4(0, ptr+8));
+                        S += SUM_ELEM(p.s0, 8) + SUM_ELEM(p.s1, 9) + SUM_ELEM(p.s2, 10) + SUM_ELEM(p.s3, 11);
+
+                        if( x_max >= 16 )
+                        {
+                            p = convert_int4(vload4(0, ptr+12));
+                            S += SUM_ELEM(p.s0, 12) + SUM_ELEM(p.s1, 13) + SUM_ELEM(p.s2, 14) + SUM_ELEM(p.s3, 15);
+                        }
+                    }
+                }
+            }
+
+            if( x_max >= 20 )
+            {
+                p = convert_int4(vload4(0, ptr+16));
+                S += SUM_ELEM(p.s0, 16) + SUM_ELEM(p.s1, 17) + SUM_ELEM(p.s2, 18) + SUM_ELEM(p.s3, 19);
+
+                if( x_max >= 24 )
+                {
+                    p = convert_int4(vload4(0, ptr+20));
+                    S += SUM_ELEM(p.s0, 20) + SUM_ELEM(p.s1, 21) + SUM_ELEM(p.s2, 22) + SUM_ELEM(p.s3, 23);
+
+                    if( x_max >= 28 )
+                    {
+                        p = convert_int4(vload4(0, ptr+24));
+                        S += SUM_ELEM(p.s0, 24) + SUM_ELEM(p.s1, 25) + SUM_ELEM(p.s2, 26) + SUM_ELEM(p.s3, 27);
+
+                        if( x_max >= 32 )
+                        {
+                            p = convert_int4(vload4(0, ptr+28));
+                            S += SUM_ELEM(p.s0, 28) + SUM_ELEM(p.s1, 29) + SUM_ELEM(p.s2, 30) + SUM_ELEM(p.s3, 31);
+                        }
+                    }
+                }
+            }
+
+            if( x < x_max )
+            {
+                int ps = ptr[x];
+                S += SUM_ELEM(ps, x);
+                if( x+1 < x_max )
+                {
+                    ps = ptr[x+1];
+                    S += SUM_ELEM(ps, x+1);
+                    if( x+2 < x_max )
+                    {
+                        ps = ptr[x+2];
+                        S += SUM_ELEM(ps, x+2);
+                    }
+                }
+            }
+
+            int sy = y*y;
+
+            mom[y][0] = S.s0;
+            mom[y][1] = S.s1;
+            mom[y][2] = y*S.s0;
+            mom[y][3] = S.s2;
+            mom[y][4] = y*S.s1;
+            mom[y][5] = sy*S.s0;
+            mom[y][6] = S.s3;
+            mom[y][7] = y*S.s2;
+            mom[y][8] = sy*S.s1;
+            mom[y][9] = y*sy*S.s0;
+        }
+        else
+            mom[y][0] = mom[y][1] = mom[y][2] = mom[y][3] = mom[y][4] =
+                mom[y][5] = mom[y][6] = mom[y][7] = mom[y][8] = mom[y][9] = 0;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        #define REDUCE(d) \
+        if( y < d ) \
+        { \
+            mom[y][0] += mom[y+d][0]; \
+            mom[y][1] += mom[y+d][1]; \
+            mom[y][2] += mom[y+d][2]; \
+            mom[y][3] += mom[y+d][3]; \
+            mom[y][4] += mom[y+d][4]; \
+            mom[y][5] += mom[y+d][5]; \
+            mom[y][6] += mom[y+d][6]; \
+            mom[y][7] += mom[y+d][7]; \
+            mom[y][8] += mom[y+d][8]; \
+            mom[y][9] += mom[y+d][9]; \
+        } \
+        barrier(CLK_LOCAL_MEM_FENCE)
+
+        REDUCE(16);
+        REDUCE(8);
+        REDUCE(4);
+        REDUCE(2);
+
+        if( y == 0 )
+        {
+            __global int* momout = mom0 + (y0*xtiles + x0)*10;
+            momout[0] = mom[0][0] + mom[1][0];
+            momout[1] = mom[0][1] + mom[1][1];
+            momout[2] = mom[0][2] + mom[1][2];
+            momout[3] = mom[0][3] + mom[1][3];
+            momout[4] = mom[0][4] + mom[1][4];
+            momout[5] = mom[0][5] + mom[1][5];
+            momout[6] = mom[0][6] + mom[1][6];
+            momout[7] = mom[0][7] + mom[1][7];
+            momout[8] = mom[0][8] + mom[1][8];
+            momout[9] = mom[0][9] + mom[1][9];
+        }
+    }
+}
diff --git a/modules/imgproc/src/opencl/morph.cl b/modules/imgproc/src/opencl/morph.cl
new file mode 100644
index 000000000..cb6e733ed
--- /dev/null
+++ b/modules/imgproc/src/opencl/morph.cl
@@ -0,0 +1,152 @@
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Zero Lin, zero.lin@amd.com
+//    Yao Wang, bitwangyaoyao@gmail.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//
+
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+#endif
+
+#ifdef DEPTH_0
+#ifdef ERODE
+#define VAL 255
+#endif
+#ifdef DILATE
+#define VAL 0
+#endif
+#endif
+#ifdef DEPTH_5
+#ifdef ERODE
+#define VAL FLT_MAX
+#endif
+#ifdef DILATE
+#define VAL -FLT_MAX
+#endif
+#endif
+#ifdef DEPTH_6
+#ifdef ERODE
+#define VAL DBL_MAX
+#endif
+#ifdef DILATE
+#define VAL -DBL_MAX
+#endif
+#endif
+
+#ifdef ERODE
+#define MORPH_OP(A,B) min((A),(B))
+#endif
+#ifdef DILATE
+#define MORPH_OP(A,B) max((A),(B))
+#endif
+//BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii
+#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
+
+__kernel void morph(__global const uchar * restrict srcptr, int src_step, int src_offset,
+                    __global uchar * dstptr, int dst_step, int dst_offset,
+                    int src_offset_x, int src_offset_y,
+                    int cols, int rows,
+                    __constant uchar * mat_kernel,
+                    int src_whole_cols, int src_whole_rows)
+{
+    int l_x = get_local_id(0);
+    int l_y = get_local_id(1);
+    int x = get_group_id(0)*LSIZE0;
+    int y = get_group_id(1)*LSIZE1;
+    int start_x = x+src_offset_x-RADIUSX;
+    int end_x = x + src_offset_x+LSIZE0+RADIUSX;
+    int width = end_x -(x+src_offset_x-RADIUSX)+1;
+    int start_y = y+src_offset_y-RADIUSY;
+    int point1 = mad24(l_y,LSIZE0,l_x);
+    int point2 = point1 + LSIZE0*LSIZE1;
+    int tl_x = point1 % width;
+    int tl_y = point1 / width;
+    int tl_x2 = point2 % width;
+    int tl_y2 = point2 / width;
+    int cur_x = start_x + tl_x;
+    int cur_y = start_y + tl_y;
+    int cur_x2 = start_x + tl_x2;
+    int cur_y2 = start_y + tl_y2;
+    int start_addr = mad24(cur_y,src_step, cur_x*(int)sizeof(GENTYPE));
+    int start_addr2 = mad24(cur_y2,src_step, cur_x2*(int)sizeof(GENTYPE));
+    GENTYPE temp0,temp1;
+    __local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0];
+
+    int end_addr = mad24(src_whole_rows - 1,src_step,src_whole_cols*(int)sizeof(GENTYPE));
+    //read pixels from src
+    start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
+    start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
+    __global const GENTYPE * src;
+    src = (__global const GENTYPE *)(srcptr+start_addr);
+    temp0 = src[0];
+    src = (__global const GENTYPE *)(srcptr+start_addr2);
+    temp1 = src[0];
+    //judge if read out of boundary
+    temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0);
+    temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0);
+
+    temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1);
+    temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1);
+
+    LDS_DAT[point1] = temp0;
+    LDS_DAT[point2] = temp1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    GENTYPE res = (GENTYPE)VAL;
+    for(int i=0; i<2*RADIUSY+1; i++)
+        for(int j=0; j<2*RADIUSX+1; j++)
+        {
+            res =
+#ifndef RECTKERNEL
+                mat_kernel[i*(2*RADIUSX+1)+j] ?
+#endif
+                MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)])
+#ifndef RECTKERNEL
+                :res
+#endif
+                ;
+        }
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+    if(gidx<cols && gidy<rows)
+    {
+        int dst_index = mad24(gidy, dst_step, dst_offset + gidx * (int)sizeof(GENTYPE));
+        __global GENTYPE * dst = (__global GENTYPE *)(dstptr + dst_index);
+        dst[0] = res;
+    }
+
+}
diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl b/modules/imgproc/src/opencl/precornerdetect.cl
similarity index 58%
rename from modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
rename to modules/imgproc/src/opencl/precornerdetect.cl
index 756f20165..32e6c2ce5 100644
--- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
+++ b/modules/imgproc/src/opencl/precornerdetect.cl
@@ -15,7 +15,7 @@
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Jiang Liyuan, jlyuan001.good@163.com
+//    Shengen Yan,yanshengen@gmail.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -43,44 +43,33 @@
 //
 //M*/
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_step, int src1_offset,
-        __global uchar *src2,
-        __global uchar *mask, int mask_step, int mask_offset,
-        __global uchar *dst,  int dst_step,  int dst_offset,
-        int cols, int rows)
+__kernel void preCornerDetect(__global const uchar * Dxptr, int dx_step, int dx_offset,
+                              __global const uchar * Dyptr, int dy_step, int dy_offset,
+                              __global const uchar * D2xptr, int d2x_step, int d2x_offset,
+                              __global const uchar * D2yptr, int d2y_step, int d2y_offset,
+                              __global const uchar * Dxyptr, int dxy_step, int dxy_offset,
+                              __global uchar * dstptr, int dst_step, int dst_offset,
+                              int dst_rows, int dst_cols, float factor)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if (x < cols && y < rows)
+    if (x < dst_cols && y < dst_rows)
     {
-        int mask_index = mad24(y, mask_step, x + mask_offset);
+        int dx_index = mad24(dx_step, y, (int)sizeof(float) * x + dx_offset);
+        int dy_index = mad24(dy_step, y, (int)sizeof(float) * x + dy_offset);
+        int d2x_index = mad24(d2x_step, y, (int)sizeof(float) * x + d2x_offset);
+        int d2y_index = mad24(d2y_step, y, (int)sizeof(float) * x + d2y_offset);
+        int dxy_index = mad24(dxy_step, y, (int)sizeof(float) * x + dxy_offset);
+        int dst_index = mad24(dst_step, y, (int)sizeof(float) * x + dst_offset);
 
-        if (mask[mask_index])
-        {
-#if elemSize > 1
-            x *= elemSize;
-#endif
-            int src1_index = mad24(y, src1_step, x + src1_offset);
-            int dst_index = mad24(y, dst_step, x + dst_offset);
+        float dx = *(__global const float *)(Dxptr + dx_index);
+        float dy = *(__global const float *)(Dyptr + dy_index);
+        float d2x = *(__global const float *)(D2xptr + d2x_index);
+        float d2y = *(__global const float *)(D2yptr + d2y_index);
+        float dxy = *(__global const float *)(Dxyptr + dxy_index);
+        __global float * dst = (__global float *)(dstptr + dst_index);
 
-#if elemSize > 1
-            #pragma unroll
-            for (int i = 0; i < elemSize; i += vlen)
-            {
-                ucharv t0 = vloadn(0, src1 + src1_index + i);
-                ucharv t1 = vloadn(0, src2 + i);
-                ucharv t2 = t0 Operation t1;
-
-                vstoren(t2, 0, dst + dst_index + i);
-            }
-#else
-            dst[dst_index] = src1[src1_index] Operation src2[0];
-#endif
-        }
+        dst[0] = factor * (dx*dx*d2y + dy*dy*d2x - 2*dx*dy*dxy);
     }
 }
diff --git a/modules/imgproc/src/opencl/resize.cl b/modules/imgproc/src/opencl/resize.cl
index 92491615e..a142d781c 100644
--- a/modules/imgproc/src/opencl/resize.cl
+++ b/modules/imgproc/src/opencl/resize.cl
@@ -50,12 +50,21 @@
 #define INTER_RESIZE_COEF_BITS 11
 #define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
 #define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
-#define CAST_SCALE (1.0f/(1<<CAST_BITS))
 #define INC(x,l) min(x+1,l-1)
 
-#define PIXSIZE ((int)sizeof(PIXTYPE))
+
 #define noconvert(x) (x)
 
+#if cn != 3
+#define loadpix(addr)  *(__global const PIXTYPE*)(addr)
+#define storepix(val, addr)  *(__global PIXTYPE*)(addr) = val
+#define PIXSIZE ((int)sizeof(PIXTYPE))
+#else
+#define loadpix(addr)  vload3(0, (__global const PIXTYPE1*)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global PIXTYPE1*)(addr))
+#define PIXSIZE ((int)sizeof(PIXTYPE1)*3)
+#endif
+
 #if defined INTER_LINEAR
 
 __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
@@ -79,7 +88,6 @@ __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
 
     int y_ = INC(y,srcrows);
     int x_ = INC(x,srccols);
-    __global const PIXTYPE* src = (__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE));
 
 #if depth <= 4
 
@@ -91,10 +99,10 @@ __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
     int U1 = rint(INTER_RESIZE_COEF_SCALE - u);
     int V1 = rint(INTER_RESIZE_COEF_SCALE - v);
 
-    WORKTYPE data0 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
-    WORKTYPE data1 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
-    WORKTYPE data2 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
-    WORKTYPE data3 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
+    WORKTYPE data0 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
+    WORKTYPE data1 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
+    WORKTYPE data2 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
+    WORKTYPE data3 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
 
     WORKTYPE val = mul24((WORKTYPE)mul24(U1, V1), data0) + mul24((WORKTYPE)mul24(U, V1), data1) +
                mul24((WORKTYPE)mul24(U1, V), data2) + mul24((WORKTYPE)mul24(U, V), data3);
@@ -104,10 +112,10 @@ __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
 #else
     float u1 = 1.f - u;
     float v1 = 1.f - v;
-    WORKTYPE data0 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
-    WORKTYPE data1 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
-    WORKTYPE data2 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
-    WORKTYPE data3 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
+    WORKTYPE data0 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
+    WORKTYPE data1 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
+    WORKTYPE data2 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
+    WORKTYPE data3 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
 
     PIXTYPE uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v *data2 + u * v *data3;
 
@@ -115,8 +123,7 @@ __kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
 
     if(dx < dstcols && dy < dstrows)
     {
-        __global PIXTYPE* dst = (__global PIXTYPE*)(dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
-        dst[0] = uval;
+        storepix(uval, dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
     }
 }
 
@@ -138,17 +145,13 @@ __kernel void resizeNN(__global const uchar* srcptr, int srcstep, int srcoffset,
         int sx = min(convert_int_rtz(s1), srccols-1);
         int sy = min(convert_int_rtz(s2), srcrows-1);
 
-        __global PIXTYPE* dst = (__global PIXTYPE*)(dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
-        __global const PIXTYPE* src = (__global const PIXTYPE*)(srcptr + mad24(sy, srcstep, srcoffset + sx*PIXSIZE));
-
-        dst[0] = src[0];
+        storepix(loadpix(srcptr + mad24(sy, srcstep, srcoffset + sx*PIXSIZE)),
+                 dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
     }
 }
 
 #elif defined INTER_AREA
 
-#define TSIZE ((int)(sizeof(T)))
-
 #ifdef INTER_AREA_FAST
 
 __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
@@ -176,10 +179,10 @@ __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_
             int src_index = mad24(symap_tab[y + sy], src_step, src_offset);
             #pragma unroll
             for (int x = 0; x < XSCALE; ++x)
-                sum += convertToWTV(((__global const T*)(src + src_index))[sxmap_tab[sx + x]]);
+                sum += convertToWTV(loadpix(src + src_index + sxmap_tab[sx + x]*PIXSIZE));
         }
 
-        ((__global T*)(dst + dst_index))[dx] = convertToT(convertToWT2V(sum) * (WT2V)(SCALE));
+        storepix(convertToPIXTYPE(convertToWT2V(sum) * (WT2V)(SCALE)), dst + dst_index + dx*PIXSIZE);
     }
 }
 
@@ -221,12 +224,12 @@ __kernel void resizeAREA(__global const uchar * src, int src_step, int src_offse
             for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
             {
                 WTV alpha = (WTV)(xalpha_tab[xk]);
-                buf += convertToWTV(((__global const T*)(src + src_index))[sx]) * alpha;
+                buf += convertToWTV(loadpix(src + src_index + sx*PIXSIZE)) * alpha;
             }
             sum += buf * beta;
         }
 
-        ((__global T*)(dst + dst_index))[dx] = convertToT(sum);
+        storepix(convertToPIXTYPE(sum), dst + dst_index + dx*PIXSIZE);
     }
 }
 
diff --git a/modules/imgproc/src/opencl/warp_affine.cl b/modules/imgproc/src/opencl/warp_affine.cl
index 340cfdd8e..028e8736e 100644
--- a/modules/imgproc/src/opencl/warp_affine.cl
+++ b/modules/imgproc/src/opencl/warp_affine.cl
@@ -64,11 +64,31 @@
 
 #define noconvert
 
+#ifndef ST
+#define ST T
+#endif
+
+#if cn != 3
+#define loadpix(addr)  *(__global const T*)(addr)
+#define storepix(val, addr)  *(__global T*)(addr) = val
+#define scalar scalar_
+#define pixsize (int)sizeof(T)
+#else
+#define loadpix(addr)  vload3(0, (__global const T1*)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))
+#ifdef INTER_NEAREST
+#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
+#else
+#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)
+#endif
+#define pixsize ((int)sizeof(T1)*3)
+#endif
+
 #ifdef INTER_NEAREST
 
 __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                         __constant CT * M, T scalar)
+                         __constant CT * M, ST scalar_)
 {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
@@ -85,17 +105,15 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
         short sx = convert_short_sat(X0 >> AB_BITS);
         short sy = convert_short_sat(Y0 >> AB_BITS);
 
-        int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
-        __global T * dst = (__global T *)(dstptr + dst_index);
+        int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
 
         if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
         {
-            int src_index = mad24(sy, src_step, src_offset + sx * (int)sizeof(T));
-            __global const T * src = (__global const T *)(srcptr + src_index);
-            dst[0] = src[0];
+            int src_index = mad24(sy, src_step, src_offset + sx * pixsize);
+            storepix(loadpix(srcptr + src_index), dstptr + dst_index);
         }
         else
-            dst[0] = scalar;
+            storepix(scalar, dstptr + dst_index);
     }
 }
 
@@ -103,7 +121,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
 
 __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                         __constant CT * M, WT scalar)
+                         __constant CT * M, ST scalar_)
 {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
@@ -126,19 +144,18 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
         short ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
 
         WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ?
-            convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + sx * (int)sizeof(T)))) : scalar;
+            convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar;
         WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ?
-            convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar;
+            convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar;
         WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
-            convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + sx * (int)sizeof(T)))) : scalar;
+            convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar;
         WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
-            convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar;
+            convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar;
 
         float taby = 1.f/INTER_TAB_SIZE*ay;
         float tabx = 1.f/INTER_TAB_SIZE*ax;
 
-        int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
-        __global T * dst = (__global T *)(dstptr + dst_index);
+        int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
 
 #if depth <= 4
         int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
@@ -147,11 +164,11 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
         int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
 
         WT val = v0 * itab0 +  v1 * itab1 + v2 * itab2 + v3 * itab3;
-        dst[0] = convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS);
+        storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
 #else
         float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
         WT val = v0 * tabx2 * taby2 +  v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby;
-        dst[0] = convertToT(val);
+        storepix(convertToT(val), dstptr + dst_index);
 #endif
     }
 }
@@ -170,7 +187,7 @@ inline void interpolateCubic( float x, float* coeffs )
 
 __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                         __constant CT * M, WT scalar)
+                         __constant CT * M, ST scalar_)
 {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
@@ -198,7 +215,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
             #pragma unroll
             for (int x = 0; x < 4; x++)
                 v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ?
-                    convertToWT(*(__global const T *)(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * (int)sizeof(T)))) : scalar;
+                    convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar;
 
         float tab1y[4], tab1x[4];
 
@@ -207,8 +224,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
         interpolateCubic(ayy, tab1y);
         interpolateCubic(axx, tab1x);
 
-        int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
-        __global T * dst = (__global T *)(dstptr + dst_index);
+        int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
 
         WT sum = (WT)(0);
 #if depth <= 4
@@ -221,12 +237,12 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
         #pragma unroll
         for (int i = 0; i < 16; i++)
             sum += v[i] * itab[i];
-        dst[0] = convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS );
+        storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
 #else
         #pragma unroll
         for (int i = 0; i < 16; i++)
             sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)];
-        dst[0] = convertToT( sum );
+        storepix(convertToT( sum ), dstptr + dst_index);
 #endif
     }
 }
diff --git a/modules/imgproc/src/opencl/warp_perspective.cl b/modules/imgproc/src/opencl/warp_perspective.cl
index 211433e70..211f45b5b 100644
--- a/modules/imgproc/src/opencl/warp_perspective.cl
+++ b/modules/imgproc/src/opencl/warp_perspective.cl
@@ -64,11 +64,31 @@
 
 #define noconvert
 
+#ifndef ST
+#define ST T
+#endif
+
+#if cn != 3
+#define loadpix(addr)  *(__global const T*)(addr)
+#define storepix(val, addr)  *(__global T*)(addr) = val
+#define scalar scalar_
+#define pixsize (int)sizeof(T)
+#else
+#define loadpix(addr)  vload3(0, (__global const T1*)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))
+#ifdef INTER_NEAREST
+#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
+#else
+#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)
+#endif
+#define pixsize ((int)sizeof(T1)*3)
+#endif
+
 #ifdef INTER_NEAREST
 
 __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                               __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                              __constant CT * M, T scalar)
+                              __constant CT * M, ST scalar_)
 {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
@@ -82,17 +102,15 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
         short sx = convert_short_sat_rte(X0*W);
         short sy = convert_short_sat_rte(Y0*W);
 
-        int dst_index = mad24(dy, dst_step, dx * (int)sizeof(T) + dst_offset);
-        __global T * dst = (__global T *)(dstptr + dst_index);
+        int dst_index = mad24(dy, dst_step, dx * pixsize + dst_offset);
 
         if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
         {
-            int src_index = mad24(sy, src_step, sx * (int)sizeof(T) + src_offset);
-            __global const T * src = (__global const T *)(srcptr + src_index);
-            dst[0] = src[0];
+            int src_index = mad24(sy, src_step, sx * pixsize + src_offset);
+            storepix(loadpix(srcptr + src_index), dstptr + dst_index);
         }
         else
-            dst[0] = scalar;
+            storepix(scalar, dstptr + dst_index);
     }
 }
 
@@ -100,7 +118,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
 
 __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                               __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                              __constant CT * M, WT scalar)
+                              __constant CT * M, ST scalar_)
 {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
@@ -119,19 +137,18 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
         short ax = (short)(X & (INTER_TAB_SIZE - 1));
 
         WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ?
-            convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + sx * (int)sizeof(T)))) : scalar;
+            convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar;
         WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ?
-            convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar;
+            convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar;
         WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
-            convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + sx * (int)sizeof(T)))) : scalar;
+            convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar;
         WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
-            convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar;
+            convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar;
 
         float taby = 1.f/INTER_TAB_SIZE*ay;
         float tabx = 1.f/INTER_TAB_SIZE*ax;
 
-        int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
-        __global T * dst = (__global T *)(dstptr + dst_index);
+        int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
 
 #if depth <= 4
         int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
@@ -140,11 +157,11 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
         int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
 
         WT val = v0 * itab0 +  v1 * itab1 + v2 * itab2 + v3 * itab3;
-        dst[0] = convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS);
+        storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
 #else
         float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
         WT val = v0 * tabx2 * taby2 +  v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby;
-        dst[0] = convertToT(val);
+        storepix(convertToT(val), dstptr + dst_index);
 #endif
     }
 }
@@ -163,7 +180,7 @@ inline void interpolateCubic( float x, float* coeffs )
 
 __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                               __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                              __constant CT * M, WT scalar)
+                              __constant CT * M, ST scalar_)
 {
     int dx = get_global_id(0);
     int dy = get_global_id(1);
@@ -187,7 +204,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
             #pragma unroll
             for (int x = 0; x < 4; x++)
                 v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ?
-                    convertToWT(*(__global const T *)(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * (int)sizeof(T)))) : scalar;
+                    convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar;
 
         float tab1y[4], tab1x[4];
 
@@ -196,8 +213,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
         interpolateCubic(ayy, tab1y);
         interpolateCubic(axx, tab1x);
 
-        int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
-        __global T * dst = (__global T *)(dstptr + dst_index);
+        int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
 
         WT sum = (WT)(0);
 #if depth <= 4
@@ -210,12 +226,12 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
         #pragma unroll
         for (int i = 0; i < 16; i++)
             sum += v[i] * itab[i];
-        dst[0] = convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS );
+        storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
 #else
         #pragma unroll
         for (int i = 0; i < 16; i++)
             sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)];
-        dst[0] = convertToT( sum );
+        storepix(convertToT( sum ), dstptr + dst_index);
 #endif
     }
 }
diff --git a/modules/imgproc/src/phasecorr.cpp b/modules/imgproc/src/phasecorr.cpp
index d21a4938f..f513e84e2 100644
--- a/modules/imgproc/src/phasecorr.cpp
+++ b/modules/imgproc/src/phasecorr.cpp
@@ -576,20 +576,23 @@ void cv::createHanningWindow(OutputArray _dst, cv::Size winSize, int type)
     _dst.create(winSize, type);
     Mat dst = _dst.getMat();
 
-    int rows = dst.rows;
-    int cols = dst.cols;
+    int rows = dst.rows, cols = dst.cols;
+
+    AutoBuffer<double> _wc(cols);
+    double * const wc = (double *)_wc;
+
+    double coeff0 = 2.0 * CV_PI / (double)(cols - 1), coeff1 = 2.0f * CV_PI / (double)(rows - 1);
+    for(int j = 0; j < cols; j++)
+        wc[j] = 0.5 * (1.0 - cos(coeff0 * j));
 
     if(dst.depth() == CV_32F)
     {
         for(int i = 0; i < rows; i++)
         {
             float* dstData = dst.ptr<float>(i);
-            double wr = 0.5 * (1.0f - cos(2.0f * CV_PI * (double)i / (double)(rows - 1)));
+            double wr = 0.5 * (1.0 - cos(coeff1 * i));
             for(int j = 0; j < cols; j++)
-            {
-                double wc = 0.5 * (1.0f - cos(2.0f * CV_PI * (double)j / (double)(cols - 1)));
-                dstData[j] = (float)(wr * wc);
-            }
+                dstData[j] = (float)(wr * wc[j]);
         }
     }
     else
@@ -597,12 +600,9 @@ void cv::createHanningWindow(OutputArray _dst, cv::Size winSize, int type)
         for(int i = 0; i < rows; i++)
         {
             double* dstData = dst.ptr<double>(i);
-            double wr = 0.5 * (1.0 - cos(2.0 * CV_PI * (double)i / (double)(rows - 1)));
+            double wr = 0.5 * (1.0 - cos(coeff1 * i));
             for(int j = 0; j < cols; j++)
-            {
-                double wc = 0.5 * (1.0 - cos(2.0 * CV_PI * (double)j / (double)(cols - 1)));
-                dstData[j] = wr * wc;
-            }
+                dstData[j] = wr * wc[j];
         }
     }
 
diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp
index 6802e9eeb..23a132f9b 100644
--- a/modules/imgproc/src/pyramids.cpp
+++ b/modules/imgproc/src/pyramids.cpp
@@ -401,12 +401,13 @@ pyrUp_( const Mat& _src, Mat& _dst, int)
 
 typedef void (*PyrFunc)(const Mat&, Mat&, int);
 
+#ifdef HAVE_OPENCL
+
 static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type);
 
-    if (((channels != 1) && (channels != 2) && (channels != 4))
-        || (borderType != BORDER_DEFAULT))
+    if ((channels != 1 && channels != 2 && channels != 4) || borderType != BORDER_DEFAULT)
         return false;
 
     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
@@ -423,18 +424,16 @@ static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, in
     _dst.create( dsize, src.type() );
     UMat dst = _dst.getUMat();
 
-    const char * const kernelName = "pyrDown";
-    ocl::ProgramSource2 program = ocl::imgproc::pyr_down_oclsrc;
-    ocl::Kernel k;
-
     int float_depth = depth == CV_64F ? CV_64F : CV_32F;
     char cvt[2][50];
-    k.create(kernelName, program,
+    ocl::Kernel k("pyrDown", ocl::imgproc::pyr_down_oclsrc,
                  format("-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s",
                  ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, channels)),
                  ocl::convertTypeStr(float_depth, depth, channels, cvt[0]),
                  ocl::convertTypeStr(depth, float_depth, channels, cvt[1]),
                  doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (k.empty())
+        return false;
 
     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));
 
@@ -447,12 +446,11 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type);
 
-    if (((channels != 1) && (channels != 2) && (channels != 4))
-        || (borderType != BORDER_DEFAULT))
+    if ((channels != 1 && channels != 2 && channels != 4) || borderType != BORDER_DEFAULT)
         return false;
 
     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
-    if ((depth == CV_64F) && !(doubleSupport))
+    if (depth == CV_64F && !doubleSupport)
         return false;
 
     Size ssize = _src.size();
@@ -464,18 +462,16 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int
     _dst.create( dsize, src.type() );
     UMat dst = _dst.getUMat();
 
-    const char * const kernelName = "pyrUp";
-    ocl::ProgramSource2 program = ocl::imgproc::pyr_up_oclsrc;
-    ocl::Kernel k;
-
     int float_depth = depth == CV_64F ? CV_64F : CV_32F;
     char cvt[2][50];
-    k.create(kernelName, program,
+    ocl::Kernel k("pyrUp", ocl::imgproc::pyr_up_oclsrc,
                  format("-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s",
                  ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, channels)),
                  ocl::convertTypeStr(float_depth, depth, channels, cvt[0]),
                  ocl::convertTypeStr(depth, float_depth, channels, cvt[1]),
                  doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+    if (k.empty())
+        return false;
 
     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));
     size_t globalThreads[2] = {dst.cols, dst.rows};
@@ -484,13 +480,14 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int
     return k.run(2, globalThreads, localThreads, false);
 }
 
+#endif
+
 }
 
 void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
 {
-    if (ocl::useOpenCL() && _dst.isUMat() &&
-        ocl_pyrDown(_src, _dst, _dsz, borderType))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_pyrDown(_src, _dst, _dsz, borderType))
 
     Mat src = _src.getMat();
     Size dsz = _dsz.area() == 0 ? Size((src.cols + 1)/2, (src.rows + 1)/2) : _dsz;
@@ -522,9 +519,8 @@ void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borde
 
 void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
 {
-    if (ocl::useOpenCL() && _dst.isUMat() &&
-        ocl_pyrUp(_src, _dst, _dsz, borderType))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_pyrUp(_src, _dst, _dsz, borderType))
 
     Mat src = _src.getMat();
     Size dsz = _dsz.area() == 0 ? Size(src.cols*2, src.rows*2) : _dsz;
@@ -556,6 +552,16 @@ void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderT
 
 void cv::buildPyramid( InputArray _src, OutputArrayOfArrays _dst, int maxlevel, int borderType )
 {
+    if (_src.dims() <= 2 && _dst.isUMatVector())
+    {
+        UMat src = _src.getUMat();
+        _dst.create( maxlevel + 1, 1, 0 );
+        _dst.getUMatRef(0) = src;
+        for( int i = 1; i <= maxlevel; i++ )
+            pyrDown( _dst.getUMatRef(i-1), _dst.getUMatRef(i), Size(), borderType );
+        return;
+    }
+
     Mat src = _src.getMat();
     _dst.create( maxlevel + 1, 1, 0 );
     _dst.getMatRef(0) = src;
diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.cpp
index bc621be3b..40687a226 100644
--- a/modules/imgproc/src/smooth.cpp
+++ b/modules/imgproc/src/smooth.cpp
@@ -67,15 +67,18 @@ namespace cv
                                          Box Filter
 \****************************************************************************************/
 
-template<typename T, typename ST> struct RowSum : public BaseRowFilter
+template<typename T, typename ST>
+struct RowSum :
+        public BaseRowFilter
 {
-    RowSum( int _ksize, int _anchor )
+    RowSum( int _ksize, int _anchor ) :
+        BaseRowFilter()
     {
         ksize = _ksize;
         anchor = _anchor;
     }
 
-    void operator()(const uchar* src, uchar* dst, int width, int cn)
+    virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
     {
         const T* S = (const T*)src;
         ST* D = (ST*)dst;
@@ -98,9 +101,12 @@ template<typename T, typename ST> struct RowSum : public BaseRowFilter
 };
 
 
-template<typename ST, typename T> struct ColumnSum : public BaseColumnFilter
+template<typename ST, typename T>
+struct ColumnSum :
+        public BaseColumnFilter
 {
-    ColumnSum( int _ksize, int _anchor, double _scale )
+    ColumnSum( int _ksize, int _anchor, double _scale ) :
+        BaseColumnFilter()
     {
         ksize = _ksize;
         anchor = _anchor;
@@ -108,9 +114,9 @@ template<typename ST, typename T> struct ColumnSum : public BaseColumnFilter
         sumCount = 0;
     }
 
-    void reset() { sumCount = 0; }
+    virtual void reset() { sumCount = 0; }
 
-    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
+    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
     {
         int i;
         ST* SUM;
@@ -198,9 +204,12 @@ template<typename ST, typename T> struct ColumnSum : public BaseColumnFilter
 };
 
 
-template<> struct ColumnSum<int, uchar> : public BaseColumnFilter
+template<>
+struct ColumnSum<int, uchar> :
+        public BaseColumnFilter
 {
-    ColumnSum( int _ksize, int _anchor, double _scale )
+    ColumnSum( int _ksize, int _anchor, double _scale ) :
+        BaseColumnFilter()
     {
         ksize = _ksize;
         anchor = _anchor;
@@ -208,9 +217,9 @@ template<> struct ColumnSum<int, uchar> : public BaseColumnFilter
         sumCount = 0;
     }
 
-    void reset() { sumCount = 0; }
+    virtual void reset() { sumCount = 0; }
 
-    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
+    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
     {
         int i;
         int* SUM;
@@ -339,9 +348,12 @@ template<> struct ColumnSum<int, uchar> : public BaseColumnFilter
     std::vector<int> sum;
 };
 
-template<> struct ColumnSum<int, short> : public BaseColumnFilter
+template<>
+struct ColumnSum<int, short> :
+        public BaseColumnFilter
 {
-    ColumnSum( int _ksize, int _anchor, double _scale )
+    ColumnSum( int _ksize, int _anchor, double _scale ) :
+        BaseColumnFilter()
     {
         ksize = _ksize;
         anchor = _anchor;
@@ -349,9 +361,9 @@ template<> struct ColumnSum<int, short> : public BaseColumnFilter
         sumCount = 0;
     }
 
-    void reset() { sumCount = 0; }
+    virtual void reset() { sumCount = 0; }
 
-    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
+    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
     {
         int i;
         int* SUM;
@@ -477,9 +489,12 @@ template<> struct ColumnSum<int, short> : public BaseColumnFilter
 };
 
 
-template<> struct ColumnSum<int, ushort> : public BaseColumnFilter
+template<>
+struct ColumnSum<int, ushort> :
+        public BaseColumnFilter
 {
-    ColumnSum( int _ksize, int _anchor, double _scale )
+    ColumnSum( int _ksize, int _anchor, double _scale ) :
+        BaseColumnFilter()
     {
         ksize = _ksize;
         anchor = _anchor;
@@ -487,9 +502,9 @@ template<> struct ColumnSum<int, ushort> : public BaseColumnFilter
         sumCount = 0;
     }
 
-    void reset() { sumCount = 0; }
+    virtual void reset() { sumCount = 0; }
 
-    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
+    virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
     {
         int i;
         int* SUM;
@@ -611,155 +626,114 @@ template<> struct ColumnSum<int, ushort> : public BaseColumnFilter
     std::vector<int> sum;
 };
 
+#ifdef HAVE_OPENCL
+
 #define DIVUP(total, grain) ((total + grain - 1) / (grain))
 
 static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth,
-                           Size ksize, Point anchor, int borderType )
+                           Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false )
 {
-    int type = _src.type();
-    int cn = CV_MAT_CN(type);
-    if ((1 != cn) && (2 != cn) && (4 != cn))
-        return false;//TODO
+    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type);
+    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 
-    int sdepth = CV_MAT_DEPTH(type);
-    if( ddepth < 0 )
+    if (ddepth < 0)
         ddepth = sdepth;
-    else if (ddepth != sdepth)
+
+    if (!(cn == 1 || cn == 2 || cn == 4) || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
+        _src.offset() % esz != 0 || _src.step() % esz != 0)
         return false;
-    if( anchor.x < 0 )
+
+    if (anchor.x < 0)
         anchor.x = ksize.width / 2;
-    if( anchor.y < 0 )
+    if (anchor.y < 0)
         anchor.y = ksize.height / 2;
 
-    ocl::Kernel kernel;
-
-    //Normalize the result by default
+    int computeUnits = ocl::Device::getDefault().maxComputeUnits();
     float alpha = 1.0f / (ksize.height * ksize.width);
-    bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
-    bool useDouble = (CV_64F == sdepth);
-    const cv::ocl::Device &device = cv::ocl::Device::getDefault();
-    int doubleFPConfig = device.doubleFPConfig();
-    if (useDouble && (0 == doubleFPConfig))
-        return false;// may be we have to check is  (0 != (CL_FP_SOFT_FLOAT & doubleFPConfig)) ?
+    Size size = _src.size(), wholeSize;
+    bool isolated = (borderType & BORDER_ISOLATED) != 0;
+    borderType &= ~BORDER_ISOLATED;
+    int wdepth = std::max(CV_32F, std::max(ddepth, sdepth));
 
-    const char* btype = NULL;
-    switch (borderType & ~BORDER_ISOLATED)
+    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" };
+    size_t globalsize[2] = { size.width, size.height };
+    size_t localsize[2] = { 0, 1 };
+
+    UMat src = _src.getUMat();
+    if (!isolated)
     {
-    case BORDER_CONSTANT:
-        btype = "BORDER_CONSTANT";
-        break;
-    case BORDER_REPLICATE:
-        btype = "BORDER_REPLICATE";
-        break;
-    case BORDER_REFLECT:
-        btype = "BORDER_REFLECT";
-        break;
-    case BORDER_WRAP:
-        //CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
-        return false;
-    case BORDER_REFLECT101:
-        btype = "BORDER_REFLECT_101";
-        break;
-    }
-
-    cv::Size sz = _src.size();
-
-    size_t globalsize[2] = {sz.width, sz.height};
-    size_t localsize[2] = {0, 1};
-
-    UMat src; Size wholeSize;
-    if (!isIsolatedBorder)
-    {
-        src = _src.getUMat();
         Point ofs;
         src.locateROI(wholeSize, ofs);
     }
 
-    size_t maxWorkItemSizes[32]; device.maxWorkItemSizes(maxWorkItemSizes);
-    size_t tryWorkItems = maxWorkItemSizes[0];
-    for (;;)
+    int h = isolated ? size.height : wholeSize.height;
+    int w = isolated ? size.width : wholeSize.width;
+
+    size_t maxWorkItemSizes[32];
+    ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes);
+    int tryWorkItems = (int)maxWorkItemSizes[0];
+
+    ocl::Kernel kernel;
+    for ( ; ; )
     {
-        size_t BLOCK_SIZE = tryWorkItems;
-        while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2)
-            BLOCK_SIZE /= 2;
-        size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
-        while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * device.maxComputeUnits() * 32 < (size_t)sz.height)
+        int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height);
+
+        while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2)
+            BLOCK_SIZE_X /= 2;
+        while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height)
             BLOCK_SIZE_Y *= 2;
 
-        if ((size_t)ksize.width > BLOCK_SIZE)
+        if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height)
             return false;
 
-        int requiredTop = anchor.y;
-        int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x;
-        int requiredBottom = ksize.height - 1 - anchor.y;
-        int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
-        int h = isIsolatedBorder ? sz.height : wholeSize.height;
-        int w = isIsolatedBorder ? sz.width : wholeSize.width;
+        char cvt[2][50];
+        String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s "
+                             "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s",
+                             BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)),
+                             ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
+                             ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]),
+                             ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]),
+                             anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType],
+                             isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                             normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "");
 
-        bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
+        localsize[0] = BLOCK_SIZE_X;
+        globalsize[0] = DIVUP(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X;
+        globalsize[1] = DIVUP(size.height, BLOCK_SIZE_Y);
 
-        if ((w < ksize.width) || (h < ksize.height))
-            return false;
-
-        char build_options[1024];
-        sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s",
-                (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
-                sdepth, cn, useDouble ? 1 : 0,
-                anchor.x, anchor.y, ksize.width, ksize.height,
-                btype,
-                extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
-                isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
-
-        localsize[0] = BLOCK_SIZE;
-        globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE;
-        globalsize[1] = DIVUP(sz.height, BLOCK_SIZE_Y);
-
-        cv::String errmsg;
-        kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, build_options);
+        kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts);
 
         size_t kernelWorkGroupSize = kernel.workGroupSize();
         if (localsize[0] <= kernelWorkGroupSize)
             break;
-
-        if (BLOCK_SIZE < kernelWorkGroupSize)
+        if (BLOCK_SIZE_X < (int)kernelWorkGroupSize)
             return false;
-        tryWorkItems = kernelWorkGroupSize;
+
+        tryWorkItems = (int)kernelWorkGroupSize;
     }
 
-    _dst.create(sz, CV_MAKETYPE(ddepth, cn));
+    _dst.create(size, CV_MAKETYPE(ddepth, cn));
     UMat dst = _dst.getUMat();
-    if (src.empty())
-        src = _src.getUMat();
-    int idxArg = 0;
-    idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
+
+    int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
     idxArg = kernel.set(idxArg, (int)src.step);
     int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
     int srcOffsetY = (int)(src.offset / src.step);
-    int srcEndX = (isIsolatedBorder ? (srcOffsetX + sz.width) : wholeSize.width);
-    int srcEndY = (isIsolatedBorder ? (srcOffsetY + sz.height) : wholeSize.height);
+    int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width;
+    int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height;
     idxArg = kernel.set(idxArg, srcOffsetX);
     idxArg = kernel.set(idxArg, srcOffsetY);
     idxArg = kernel.set(idxArg, srcEndX);
     idxArg = kernel.set(idxArg, srcEndY);
     idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst));
-    float borderValue[4] = {0, 0, 0, 0};
-    double borderValueDouble[4] = {0, 0, 0, 0};
-    if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
-    {
-        int cnocl = (3 == cn) ? 4 : cn;
-        if (useDouble)
-            idxArg = kernel.set(idxArg, (void *)&borderValueDouble[0], sizeof(double) * cnocl);
-        else
-            idxArg = kernel.set(idxArg, (void *)&borderValue[0], sizeof(float) * cnocl);
-    }
-    if (useDouble)
-        idxArg = kernel.set(idxArg, (double)alpha);
-    else
+    if (normalize)
         idxArg = kernel.set(idxArg, (float)alpha);
 
-    return kernel.run(2, globalsize, localsize, true);
+    return kernel.run(2, globalsize, localsize, false);
 }
 
+#endif
+
 }
 
 
@@ -862,9 +836,7 @@ void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
                 Size ksize, Point anchor,
                 bool normalize, int borderType )
 {
-    bool use_opencl = ocl::useOpenCL() && _dst.isUMat() && normalize;
-    if( use_opencl && ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType) )
-        return;
+    CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize))
 
     Mat src = _src.getMat();
     int sdepth = src.depth(), cn = src.channels();
@@ -872,7 +844,7 @@ void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth,
         ddepth = sdepth;
     _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
     Mat dst = _dst.getMat();
-    if( borderType != BORDER_CONSTANT && normalize )
+    if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 )
     {
         if( src.rows == 1 )
             ksize.height = 1;
@@ -903,15 +875,18 @@ void cv::blur( InputArray src, OutputArray dst,
 namespace cv
 {
 
-template<typename T, typename ST> struct SqrRowSum : public BaseRowFilter
+template<typename T, typename ST>
+struct SqrRowSum :
+        public BaseRowFilter
 {
-    SqrRowSum( int _ksize, int _anchor )
+    SqrRowSum( int _ksize, int _anchor ) :
+        BaseRowFilter()
     {
         ksize = _ksize;
         anchor = _anchor;
     }
 
-    void operator()(const uchar* src, uchar* dst, int width, int cn)
+    virtual void operator()(const uchar* src, uchar* dst, int width, int cn)
     {
         const T* S = (const T*)src;
         ST* D = (ST*)dst;
@@ -971,26 +946,31 @@ void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth,
                        Size ksize, Point anchor,
                        bool normalize, int borderType )
 {
-    Mat src = _src.getMat();
-    int sdepth = src.depth(), cn = src.channels();
+    int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType);
+    Size size = _src.size();
+
     if( ddepth < 0 )
         ddepth = sdepth < CV_32F ? CV_32F : CV_64F;
-    _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) );
-    Mat dst = _dst.getMat();
+
     if( borderType != BORDER_CONSTANT && normalize )
     {
-        if( src.rows == 1 )
+        if( size.height == 1 )
             ksize.height = 1;
-        if( src.cols == 1 )
+        if( size.width == 1 )
             ksize.width = 1;
     }
 
-    int sumType = CV_64F;
+    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
+               ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true))
+
+    int sumDepth = CV_64F;
     if( sdepth == CV_8U )
-        sumType = CV_32S;
-    sumType = CV_MAKETYPE( sumType, cn );
-    int srcType = CV_MAKETYPE(sdepth, cn);
-    int dstType = CV_MAKETYPE(ddepth, cn);
+        sumDepth = CV_32S;
+    int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn);
+
+    Mat src = _src.getMat();
+    _dst.create( size, dstType );
+    Mat dst = _dst.getMat();
 
     Ptr<BaseRowFilter> rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x );
     Ptr<BaseColumnFilter> columnFilter = getColumnSumFilter(sumType,
@@ -1920,39 +1900,41 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m )
     }
 }
 
+#ifdef HAVE_OPENCL
+
+static bool ocl_medianFilter ( InputArray _src, OutputArray _dst, int m)
+{
+    int type = _src.type();
+    int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    if (!((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && (cn != 3 && cn <= 4)))
+        return false;
+
+    const char * kernelName;
+
+    if (m == 3)
+        kernelName = "medianFilter3";
+    else if (m == 5)
+        kernelName = "medianFilter5";
+    else
+        return false;
+
+    ocl::Kernel k(kernelName,ocl::imgproc::medianFilter_oclsrc,format("-D type=%s",ocl::typeToStr(type)));
+    if (k.empty())
+        return false;
+
+    UMat src = _src.getUMat();
+    _dst.create(_src.size(),type);
+    UMat dst = _dst.getUMat();
+
+    size_t globalsize[2] = {(src.cols + 18) / 16 * 16, (src.rows + 15) / 16 * 16};
+    size_t localsize[2] = {16, 16};
+
+    return k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst)).run(2,globalsize,localsize,false);
 }
 
-namespace cv
-{
-    static bool ocl_medianFilter ( InputArray _src, OutputArray _dst, int m)
-    {
-        int type = _src.type();
-        int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+#endif
 
-        if (!((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && (cn != 3 && cn <= 4)))
-            return false;
-
-        const char * kernelName;
-
-        if (m==3)
-            kernelName = "medianFilter3";
-        else if (m==5)
-            kernelName = "medianFilter5";
-        else
-            return false;
-
-        ocl::Kernel k(kernelName,ocl::imgproc::medianFilter_oclsrc,format("-D type=%s",ocl::typeToStr(type)));
-        if (k.empty())
-            return false;
-
-        _dst.create(_src.size(),type);
-        UMat src = _src.getUMat(), dst = _dst.getUMat();
-
-        size_t globalsize[2] = {(src.cols + 18) / 16 * 16, (src.rows + 15) / 16 * 16};
-        size_t localsize[2] = {16, 16};
-
-        return k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst)).run(2,globalsize,localsize,false);
-    }
 }
 
 void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
@@ -1961,16 +1943,12 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
 
     if( ksize <= 1 )
     {
-        Mat src0 = _src0.getMat();
-        _dst.create( src0.size(), src0.type() );
-        Mat dst = _dst.getMat();
-        src0.copyTo(dst);
+        _src0.copyTo(_dst);
         return;
     }
 
-    bool use_opencl = ocl::useOpenCL() && _dst.isUMat();
-    if ( use_opencl && ocl_medianFilter(_src0,_dst, ksize))
-        return;
+    CV_OCL_RUN(_src0.dims() <= 2 && _dst.isUMat(),
+               ocl_medianFilter(_src0,_dst, ksize))
 
     Mat src0 = _src0.getMat();
     _dst.create( src0.size(), src0.type() );
@@ -2226,6 +2204,8 @@ private:
 };
 #endif
 
+#ifdef HAVE_OPENCL
+
 static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
                                    double sigma_color, double sigma_space,
                                    int borderType)
@@ -2301,6 +2281,8 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d,
     return k.run(2, globalsize, NULL, false);
 }
 
+#endif
+
 static void
 bilateralFilter_8u( const Mat& src, Mat& dst, int d,
     double sigma_color, double sigma_space,
@@ -2651,9 +2633,8 @@ void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
 {
     _dst.create( _src.size(), _src.type() );
 
-    if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat() &&
-            ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))
-        return;
+    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
+               ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType))
 
     Mat src = _src.getMat(), dst = _dst.getMat();
 
@@ -2666,285 +2647,6 @@ void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d,
         "Bilateral filtering is only implemented for 8u and 32f images" );
 }
 
-
-/****************************************************************************************\
-                                  Adaptive Bilateral Filtering
-\****************************************************************************************/
-
-namespace cv
-{
-#ifndef ABF_CALCVAR
-#define ABF_CALCVAR 1
-#endif
-
-#ifndef ABF_FIXED_WEIGHT
-#define ABF_FIXED_WEIGHT 0
-#endif
-
-#ifndef ABF_GAUSSIAN
-#define ABF_GAUSSIAN 1
-#endif
-
-class adaptiveBilateralFilter_8u_Invoker :
-    public ParallelLoopBody
-{
-public:
-    adaptiveBilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, Size _ksize, double _sigma_space, double _maxSigmaColor, Point _anchor) :
-        temp(&_temp), dest(&_dest), ksize(_ksize), sigma_space(_sigma_space), maxSigma_Color(_maxSigmaColor), anchor(_anchor)
-    {
-        if( sigma_space <= 0 )
-            sigma_space = 1;
-        CV_Assert((ksize.width & 1) && (ksize.height & 1));
-        space_weight.resize(ksize.width * ksize.height);
-        double sigma2 = sigma_space * sigma_space;
-        int idx = 0;
-        int w = ksize.width / 2;
-        int h = ksize.height / 2;
-        for(int y=-h; y<=h; y++)
-            for(int x=-w; x<=w; x++)
-        {
-#if ABF_GAUSSIAN
-            space_weight[idx++] = (float)exp ( -0.5*(x * x + y * y)/sigma2);
-#else
-            space_weight[idx++] = (float)(sigma2 / (sigma2 + x * x + y * y));
-#endif
-        }
-    }
-    virtual void operator()(const Range& range) const
-    {
-        int cn = dest->channels();
-        int anX = anchor.x;
-
-        const uchar *tptr;
-
-        for(int i = range.start;i < range.end; i++)
-        {
-            int startY = i;
-            if(cn == 1)
-            {
-                float var;
-                int currVal;
-                int sumVal = 0;
-                int sumValSqr = 0;
-                int currValCenter;
-                int currWRTCenter;
-                float weight;
-                float totalWeight = 0.;
-                float tmpSum = 0.;
-
-                for(int j = 0;j < dest->cols *cn; j+=cn)
-                {
-                    sumVal = 0;
-                    sumValSqr= 0;
-                    totalWeight = 0.;
-                    tmpSum = 0.;
-
-                    // Top row: don't sum the very last element
-                    int startLMJ = 0;
-                    int endLMJ  = ksize.width  - 1;
-                    int howManyAll = (anX *2 +1)*(ksize.width );
-#if ABF_CALCVAR
-                    for(int x = startLMJ; x< endLMJ; x++)
-                    {
-                        tptr = temp->ptr(startY + x) +j;
-                        for(int y=-anX; y<=anX; y++)
-                        {
-                            currVal = tptr[cn*(y+anX)];
-                            sumVal += currVal;
-                            sumValSqr += (currVal *currVal);
-                        }
-                    }
-                    var = ( (sumValSqr * howManyAll)- sumVal * sumVal )  /  ( (float)(howManyAll*howManyAll));
-
-                    if(var < 0.01)
-                        var = 0.01f;
-                    else if(var > (float)(maxSigma_Color*maxSigma_Color) )
-                        var =  (float)(maxSigma_Color*maxSigma_Color) ;
-
-#else
-                    var = maxSigmaColor*maxSigmaColor;
-#endif
-                    startLMJ = 0;
-                    endLMJ = ksize.width;
-                    tptr = temp->ptr(startY + (startLMJ+ endLMJ)/2);
-                    currValCenter =tptr[j+cn*anX];
-                    for(int x = startLMJ; x< endLMJ; x++)
-                    {
-                        tptr = temp->ptr(startY + x) +j;
-                        for(int y=-anX; y<=anX; y++)
-                        {
-#if ABF_FIXED_WEIGHT
-                            weight = 1.0;
-#else
-                            currVal = tptr[cn*(y+anX)];
-                            currWRTCenter = currVal - currValCenter;
-
-#if ABF_GAUSSIAN
-                            weight = exp ( -0.5f * currWRTCenter * currWRTCenter/var ) * space_weight[x*ksize.width+y+anX];
-#else
-                            weight = var / ( var + (currWRTCenter * currWRTCenter) ) * space_weight[x*ksize.width+y+anX];
-#endif
-
-#endif
-                            tmpSum += ((float)tptr[cn*(y+anX)] * weight);
-                            totalWeight += weight;
-                        }
-                    }
-                    tmpSum /= totalWeight;
-
-                   dest->at<uchar>(startY ,j)= static_cast<uchar>(tmpSum);
-                }
-            }
-            else
-            {
-                assert(cn == 3);
-                float var_b, var_g, var_r;
-                int currVal_b, currVal_g, currVal_r;
-                int sumVal_b= 0, sumVal_g= 0, sumVal_r= 0;
-                int sumValSqr_b= 0, sumValSqr_g= 0, sumValSqr_r= 0;
-                int currValCenter_b= 0, currValCenter_g= 0, currValCenter_r= 0;
-                int currWRTCenter_b, currWRTCenter_g, currWRTCenter_r;
-                float weight_b, weight_g, weight_r;
-                float totalWeight_b= 0., totalWeight_g= 0., totalWeight_r= 0.;
-                float tmpSum_b = 0., tmpSum_g= 0., tmpSum_r = 0.;
-
-                for(int j = 0;j < dest->cols *cn; j+=cn)
-                {
-                    sumVal_b= 0, sumVal_g= 0, sumVal_r= 0;
-                    sumValSqr_b= 0, sumValSqr_g= 0, sumValSqr_r= 0;
-                    totalWeight_b= 0., totalWeight_g= 0., totalWeight_r= 0.;
-                    tmpSum_b = 0., tmpSum_g= 0., tmpSum_r = 0.;
-
-                    // Top row: don't sum the very last element
-                    int startLMJ = 0;
-                    int endLMJ  = ksize.width - 1;
-                    int howManyAll = (anX *2 +1)*(ksize.width);
-#if ABF_CALCVAR
-                    float max_var = (float)( maxSigma_Color*maxSigma_Color);
-                    for(int x = startLMJ; x< endLMJ; x++)
-                    {
-                        tptr = temp->ptr(startY + x) +j;
-                        for(int y=-anX; y<=anX; y++)
-                        {
-                            currVal_b = tptr[cn*(y+anX)], currVal_g = tptr[cn*(y+anX)+1], currVal_r =tptr[cn*(y+anX)+2];
-                            sumVal_b += currVal_b;
-                            sumVal_g += currVal_g;
-                            sumVal_r += currVal_r;
-                            sumValSqr_b += (currVal_b *currVal_b);
-                            sumValSqr_g += (currVal_g *currVal_g);
-                            sumValSqr_r += (currVal_r *currVal_r);
-                        }
-                    }
-                    var_b =  ( (sumValSqr_b * howManyAll)- sumVal_b * sumVal_b )  /  ( (float)(howManyAll*howManyAll));
-                    var_g =  ( (sumValSqr_g * howManyAll)- sumVal_g * sumVal_g )  /  ( (float)(howManyAll*howManyAll));
-                    var_r =  ( (sumValSqr_r * howManyAll)- sumVal_r * sumVal_r )  /  ( (float)(howManyAll*howManyAll));
-
-                    if(var_b < 0.01)
-                        var_b = 0.01f;
-                    else if(var_b > max_var )
-                        var_b =  (float)(max_var) ;
-
-                    if(var_g < 0.01)
-                        var_g = 0.01f;
-                    else if(var_g > max_var )
-                        var_g =  (float)(max_var) ;
-
-                    if(var_r < 0.01)
-                        var_r = 0.01f;
-                    else if(var_r > max_var )
-                        var_r =  (float)(max_var) ;
-
-#else
-                    var_b = maxSigma_Color*maxSigma_Color; var_g = maxSigma_Color*maxSigma_Color; var_r = maxSigma_Color*maxSigma_Color;
-#endif
-                    startLMJ = 0;
-                    endLMJ = ksize.width;
-                    tptr = temp->ptr(startY + (startLMJ+ endLMJ)/2) + j;
-                    currValCenter_b =tptr[cn*anX], currValCenter_g =tptr[cn*anX+1], currValCenter_r =tptr[cn*anX+2];
-                    for(int x = startLMJ; x< endLMJ; x++)
-                    {
-                        tptr = temp->ptr(startY + x) +j;
-                        for(int y=-anX; y<=anX; y++)
-                        {
-#if ABF_FIXED_WEIGHT
-                            weight_b = 1.0;
-                            weight_g = 1.0;
-                            weight_r = 1.0;
-#else
-                            currVal_b = tptr[cn*(y+anX)];currVal_g=tptr[cn*(y+anX)+1];currVal_r=tptr[cn*(y+anX)+2];
-                            currWRTCenter_b = currVal_b - currValCenter_b;
-                            currWRTCenter_g = currVal_g - currValCenter_g;
-                            currWRTCenter_r = currVal_r - currValCenter_r;
-
-                            float cur_spw = space_weight[x*ksize.width+y+anX];
-
-#if ABF_GAUSSIAN
-                            weight_b = exp( -0.5f * currWRTCenter_b * currWRTCenter_b/ var_b ) * cur_spw;
-                            weight_g = exp( -0.5f * currWRTCenter_g * currWRTCenter_g/ var_g ) * cur_spw;
-                            weight_r = exp( -0.5f * currWRTCenter_r * currWRTCenter_r/ var_r ) * cur_spw;
-#else
-                            weight_b = var_b / ( var_b + (currWRTCenter_b * currWRTCenter_b) ) * cur_spw;
-                            weight_g = var_g / ( var_g + (currWRTCenter_g * currWRTCenter_g) ) * cur_spw;
-                            weight_r = var_r / ( var_r + (currWRTCenter_r * currWRTCenter_r) ) * cur_spw;
-#endif
-#endif
-                            tmpSum_b += ((float)tptr[cn*(y+anX)]   * weight_b);
-                            tmpSum_g += ((float)tptr[cn*(y+anX)+1] * weight_g);
-                            tmpSum_r += ((float)tptr[cn*(y+anX)+2] * weight_r);
-                            totalWeight_b += weight_b, totalWeight_g += weight_g, totalWeight_r += weight_r;
-                        }
-                    }
-                    tmpSum_b /= totalWeight_b;
-                    tmpSum_g /= totalWeight_g;
-                    tmpSum_r /= totalWeight_r;
-
-                    dest->at<uchar>(startY,j  )= static_cast<uchar>(tmpSum_b);
-                    dest->at<uchar>(startY,j+1)= static_cast<uchar>(tmpSum_g);
-                    dest->at<uchar>(startY,j+2)= static_cast<uchar>(tmpSum_r);
-                }
-            }
-        }
-    }
-private:
-    const Mat *temp;
-    Mat *dest;
-    Size ksize;
-    double sigma_space;
-    double maxSigma_Color;
-    Point anchor;
-    std::vector<float> space_weight;
-};
-static void adaptiveBilateralFilter_8u( const Mat& src, Mat& dst, Size ksize, double sigmaSpace, double maxSigmaColor, Point anchor, int borderType )
-{
-    Size size = src.size();
-
-    CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) &&
-              src.type() == dst.type() && src.size() == dst.size() &&
-              src.data != dst.data );
-    Mat temp;
-    copyMakeBorder(src, temp, anchor.x, anchor.y, anchor.x, anchor.y, borderType);
-
-    adaptiveBilateralFilter_8u_Invoker body(dst, temp, ksize, sigmaSpace, maxSigmaColor, anchor);
-    parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
-}
-}
-void cv::adaptiveBilateralFilter( InputArray _src, OutputArray _dst, Size ksize,
-                                  double sigmaSpace, double maxSigmaColor, Point anchor, int borderType )
-{
-    Mat src = _src.getMat();
-    _dst.create(src.size(), src.type());
-    Mat dst = _dst.getMat();
-
-    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3);
-
-    anchor = normalizeAnchor(anchor,ksize);
-    if( src.depth() == CV_8U )
-        adaptiveBilateralFilter_8u( src, dst, ksize, sigmaSpace, maxSigmaColor, anchor, borderType );
-    else
-        CV_Error( CV_StsUnsupportedFormat,
-        "Adaptive Bilateral filtering is only implemented for 8u images" );
-}
-
 //////////////////////////////////////////////////////////////////////////////////////////
 
 CV_IMPL void
diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp
index c6ce0f296..4e18f119f 100644
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -231,6 +231,8 @@ typedef void (*IntegralFunc)(const uchar* src, size_t srcstep, uchar* sum, size_
                              uchar* sqsum, size_t sqsumstep, uchar* tilted, size_t tstep,
                              Size size, int cn );
 
+#ifdef HAVE_OPENCL
+
 enum { vlen = 4 };
 
 static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth )
@@ -324,6 +326,8 @@ static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum,
     return k2.run(1, &gt2, &lt2, false);
 }
 
+#endif
+
 }
 
 
@@ -336,19 +340,17 @@ void cv::integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, Output
          sqdepth = CV_64F;
     sdepth = CV_MAT_DEPTH(sdepth), sqdepth = CV_MAT_DEPTH(sqdepth);
 
+#ifdef HAVE_OPENCL
     if (ocl::useOpenCL() && _sum.isUMat() && !_tilted.needed())
     {
         if (!_sqsum.needed())
         {
-            if (ocl_integral(_src, _sum, sdepth))
-                return;
+            CV_OCL_RUN(ocl::useOpenCL(), ocl_integral(_src, _sum, sdepth))
         }
         else if (_sqsum.isUMat())
-        {
-            if (ocl_integral(_src, _sum, _sqsum, sdepth, sqdepth))
-                return;
-        }
+            CV_OCL_RUN(ocl::useOpenCL(), ocl_integral(_src, _sum, _sqsum, sdepth, sqdepth))
     }
+#endif
 
     Size ssize = _src.size(), isize(ssize.width + 1, ssize.height + 1);
     _sum.create( isize, CV_MAKETYPE(sdepth, cn) );
diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index bfe7ce600..f138427dc 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -40,10 +40,295 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
+////////////////////////////////////////////////// matchTemplate //////////////////////////////////////////////////////////
 
 namespace cv
 {
 
+#ifdef HAVE_OPENCL
+
+static bool useNaive(int method, int depth, const Size & size)
+{
+#ifdef HAVE_CLAMDFFT
+    if (method == TM_SQDIFF && depth == CV_32F)
+        return true;
+    else if(method == TM_CCORR || (method == TM_SQDIFF && depth == CV_8U))
+        return size.height < 18 && size.width < 18;
+    else
+        return false;
+#else
+    (void)(method);
+    (void)(depth);
+    (void)(size);
+    return true;
+#endif
+}
+
+/////////////////////////////////////////////////// CCORR //////////////////////////////////////////////////////////////
+
+static bool matchTemplateNaive_CCORR(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    int type = _image.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    ocl::Kernel k("matchTemplate_Naive_CCORR", ocl::imgproc::match_template_oclsrc,
+                  format("-D type=%s -D elem_type=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn));
+    if (k.empty())
+        return false;
+
+    UMat image = _image.getUMat(), templ = _templ.getUMat();
+    _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+    UMat result = _result.getUMat();
+
+    size_t globalsize[2] = { result.cols, result.rows };
+    return k.args(ocl::KernelArg::ReadOnlyNoSize(image), ocl::KernelArg::ReadOnly(templ),
+                  ocl::KernelArg::WriteOnly(result)).run(2, globalsize, NULL, false);
+}
+
+static bool matchTemplate_CCORR_NORMED(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    matchTemplate(_image, _templ, _result, CV_TM_CCORR);
+
+    int type = _image.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    ocl::Kernel k("matchTemplate_CCORR_NORMED", ocl::imgproc::match_template_oclsrc,
+                  format("-D type=%s -D elem_type=%s -D cn=%d", ocl::typeToStr(type),
+                         ocl::typeToStr(depth), cn));
+    if (k.empty())
+        return false;
+
+    UMat image = _image.getUMat(), templ = _templ.getUMat();
+    _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+    UMat result = _result.getUMat();
+
+    UMat image_sums, image_sqsums;
+    integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F);
+
+    UMat temp;
+    multiply(templ, templ, temp, 1, CV_32F);
+    Scalar s = sum(temp);
+    float templ_sqsum = 0;
+    for (int i = 0; i < cn; ++i)
+        templ_sqsum += static_cast<float>(s[i]);
+
+    size_t globalsize[2] = { result.cols, result.rows };
+    return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result),
+                  templ.rows, templ.cols, templ_sqsum).run(2, globalsize, NULL, false);
+}
+
+static bool matchTemplate_CCORR(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    if (useNaive(TM_CCORR, _image.depth(), _templ.size())  )
+        return matchTemplateNaive_CCORR(_image, _templ, _result);
+    else
+        return false;
+}
+
+////////////////////////////////////// SQDIFF //////////////////////////////////////////////////////////////
+
+static bool matchTemplateNaive_SQDIFF(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    int type = _image.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    ocl::Kernel k("matchTemplate_Naive_SQDIFF", ocl::imgproc::match_template_oclsrc,
+                  format("-D type=%s -D elem_type=%s -D cn=%d", ocl::typeToStr(type),
+                         ocl::typeToStr(depth), cn));
+    if (k.empty())
+        return false;
+
+    UMat image = _image.getUMat(), templ = _templ.getUMat();
+    _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+    UMat result = _result.getUMat();
+
+    size_t globalsize[2] = { result.cols, result.rows };
+    return k.args(ocl::KernelArg::ReadOnlyNoSize(image), ocl::KernelArg::ReadOnly(templ),
+                  ocl::KernelArg::WriteOnly(result)).run(2, globalsize, NULL, false);
+}
+
+static bool matchTemplate_SQDIFF_NORMED(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    matchTemplate(_image, _templ, _result, CV_TM_CCORR);
+
+    int type = _image.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    ocl::Kernel k("matchTemplate_SQDIFF_NORMED", ocl::imgproc::match_template_oclsrc,
+                  format("-D type=%s -D elem_type=%s -D cn=%d",
+                         ocl::typeToStr(type), ocl::typeToStr(depth), cn));
+    if (k.empty())
+        return false;
+
+    UMat image = _image.getUMat(), templ = _templ.getUMat();
+    _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+    UMat result = _result.getUMat();
+
+    UMat image_sums, image_sqsums;
+    integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F);
+
+    UMat temp;
+    multiply(templ, templ, temp, 1, CV_32F);
+    Scalar s = sum(temp);
+    float templ_sqsum = 0;
+    for (int i = 0; i < cn; ++i)
+        templ_sqsum += (float)s[i];
+
+    size_t globalsize[2] = { result.cols, result.rows };
+    return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result),
+                  templ.rows, templ.cols, templ_sqsum).run(2, globalsize, NULL, false);
+}
+
+static bool matchTemplate_SQDIFF(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    if (useNaive(TM_SQDIFF, _image.depth(), _templ.size()))
+        return matchTemplateNaive_SQDIFF(_image, _templ, _result);
+    else
+        return false;
+}
+
+///////////////////////////////////// CCOEFF /////////////////////////////////////////////////////////////////
+
+static bool matchTemplate_CCOEFF(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    matchTemplate(_image, _templ, _result, CV_TM_CCORR);
+
+    UMat image_sums, temp;
+    integral(_image, temp);
+
+    if(temp.depth() == CV_64F)
+        temp.convertTo(image_sums, CV_32F);
+    else
+        image_sums = temp;
+
+    int type = image_sums.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    ocl::Kernel k(cv::format("matchTemplate_Prepared_CCOEFF_C%d", cn).c_str(), ocl::imgproc::match_template_oclsrc,
+                  format("-D type=%s -D elem_type=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn));
+    if (k.empty())
+        return false;
+
+    UMat templ = _templ.getUMat();
+    Size size = _image.size(), tsize = templ.size();
+    _result.create(size.height - templ.rows + 1, size.width - templ.cols + 1, CV_32F);
+    UMat result = _result.getUMat();
+
+    size_t globalsize[2] = { result.cols, result.rows };
+
+    if (cn == 1)
+    {
+        float templ_sum = static_cast<float>(sum(_templ)[0]) / tsize.area();
+        return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadWrite(result),
+                      templ.rows, templ.cols, templ_sum).run(2, globalsize, NULL, false);
+    }
+    else
+    {
+        Vec4f templ_sum = Vec4f::all(0);
+        templ_sum = sum(templ) / tsize.area();
+        if (cn == 2)
+            return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols,
+                          templ_sum[0], templ_sum[1]).run(2, globalsize, NULL, false);
+
+        return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols,
+                      templ_sum[0], templ_sum[1], templ_sum[2], templ_sum[3]).run(2, globalsize, NULL, false);
+    }
+}
+
+static bool matchTemplate_CCOEFF_NORMED(InputArray _image, InputArray _templ, OutputArray _result)
+{
+    matchTemplate(_image, _templ, _result, CV_TM_CCORR);
+
+    UMat temp, image_sums, image_sqsums;
+    integral(_image, image_sums, image_sqsums, CV_32F, CV_32F);
+
+    int type = image_sums.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+
+    ocl::Kernel k(format("matchTemplate_CCOEFF_NORMED_C%d", cn).c_str(), ocl::imgproc::match_template_oclsrc,
+        format("-D type=%s -D elem_type=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn));
+    if (k.empty())
+        return false;
+
+    UMat templ = _templ.getUMat();
+    Size size = _image.size(), tsize = templ.size();
+    _result.create(size.height - templ.rows + 1, size.width - templ.cols + 1, CV_32F);
+    UMat result = _result.getUMat();
+
+    size_t globalsize[2] = { result.cols, result.rows };
+    float scale = 1.f / tsize.area();
+
+    if (cn == 1)
+    {
+        float templ_sum = (float)sum(templ)[0];
+
+        multiply(templ, templ, temp, 1, CV_32F);
+        float templ_sqsum = (float)sum(temp)[0];
+
+        templ_sqsum -= scale * templ_sum * templ_sum;
+        templ_sum   *= scale;
+
+        if (templ_sqsum < DBL_EPSILON)
+        {
+            result = Scalar::all(1);
+            return true;
+        }
+
+        return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadOnlyNoSize(image_sqsums),
+                      ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols, scale, templ_sum, templ_sqsum)
+                     .run(2,globalsize,NULL,false);
+    }
+    else
+    {
+        Vec4f templ_sum = Vec4f::all(0), templ_sqsum = Vec4f::all(0);
+        templ_sum = sum(templ);
+
+        multiply(templ, templ, temp, 1, CV_32F);
+        templ_sqsum = sum(temp);
+
+        float templ_sqsum_sum = 0;
+        for (int i = 0; i < cn; i ++)
+            templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
+
+        templ_sum *= scale;
+
+        if (templ_sqsum_sum < DBL_EPSILON)
+        {
+            result = Scalar::all(1);
+            return true;
+        }
+
+        if (cn == 2)
+            return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadOnlyNoSize(image_sqsums),
+                          ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols, scale,
+                          templ_sum[0], templ_sum[1], templ_sqsum_sum).run(2, globalsize, NULL, false);
+
+        return k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadOnlyNoSize(image_sqsums),
+                      ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols, scale,
+                      templ_sum[0], templ_sum[1], templ_sum[2], templ_sum[3],
+                      templ_sqsum_sum).run(2, globalsize, NULL, false);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static bool ocl_matchTemplate( InputArray _img, InputArray _templ, OutputArray _result, int method)
+{
+    int cn = _img.channels();
+
+    if (cn == 3 || cn > 4)
+        return false;
+
+    typedef bool (*Caller)(InputArray _img, InputArray _templ, OutputArray _result);
+
+    static const Caller callers[] =
+    {
+        matchTemplate_SQDIFF, matchTemplate_SQDIFF_NORMED, matchTemplate_CCORR,
+        matchTemplate_CCORR_NORMED, matchTemplate_CCOEFF, matchTemplate_CCOEFF_NORMED
+    };
+    const Caller caller = callers[method];
+
+    return caller(_img, _templ, _result);
+}
+
+#endif
+
 void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
                 Size corrsize, int ctype,
                 Point anchor, double delta, int borderType )
@@ -226,14 +511,23 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr,
         }
     }
 }
-
 }
 
-/*****************************************************************************************/
+////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result, int method )
 {
     CV_Assert( CV_TM_SQDIFF <= method && method <= CV_TM_CCOEFF_NORMED );
+    CV_Assert( (_img.depth() == CV_8U || _img.depth() == CV_32F) && _img.type() == _templ.type() && _img.dims() <= 2 );
+
+    bool needswap = _img.size().height < _templ.size().height || _img.size().width < _templ.size().width;
+    if (needswap)
+    {
+        CV_Assert(_img.size().height <= _templ.size().height && _img.size().width <= _templ.size().width);
+    }
+
+    CV_OCL_RUN(_img.dims() <= 2 && _result.isUMat(),
+               (!needswap ? ocl_matchTemplate(_img, _templ, _result, method) : ocl_matchTemplate(_templ, _img, _result, method)))
 
     int numType = method == CV_TM_CCORR || method == CV_TM_CCORR_NORMED ? 0 :
                   method == CV_TM_CCOEFF || method == CV_TM_CCOEFF_NORMED ? 1 : 2;
@@ -242,14 +536,9 @@ void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result,
                     method == CV_TM_CCOEFF_NORMED;
 
     Mat img = _img.getMat(), templ = _templ.getMat();
-    if( img.rows < templ.rows || img.cols < templ.cols )
+    if (needswap)
         std::swap(img, templ);
 
-    CV_Assert( (img.depth() == CV_8U || img.depth() == CV_32F) &&
-               img.type() == templ.type() );
-
-    CV_Assert( img.rows >= templ.rows && img.cols >= templ.cols);
-
     Size corrSize(img.cols - templ.cols + 1, img.rows - templ.rows + 1);
     _result.create(corrSize, CV_32F);
     Mat result = _result.getMat();
diff --git a/modules/imgproc/src/thresh.cpp b/modules/imgproc/src/thresh.cpp
index ce853a783..fc0f6f9e9 100644
--- a/modules/imgproc/src/thresh.cpp
+++ b/modules/imgproc/src/thresh.cpp
@@ -706,6 +706,8 @@ private:
     int thresholdType;
 };
 
+#ifdef HAVE_OPENCL
+
 static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, double maxval, int thresh_type )
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), ktype = CV_MAKE_TYPE(depth, 1);
@@ -739,13 +741,14 @@ static bool ocl_threshold( InputArray _src, OutputArray _dst, double & thresh, d
     return k.run(2, globalsize, NULL, false);
 }
 
+#endif
+
 }
 
 double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double maxval, int type )
 {
-    if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat() &&
-            ocl_threshold(_src, _dst, thresh, maxval, type))
-        return thresh;
+    CV_OCL_RUN_(_src.dims() <= 2 && _dst.isUMat(),
+                ocl_threshold(_src, _dst, thresh, maxval, type), thresh)
 
     Mat src = _src.getMat();
     bool use_otsu = (type & THRESH_OTSU) != 0;
diff --git a/modules/imgproc/test/ocl/test_accumulate.cpp b/modules/imgproc/test/ocl/test_accumulate.cpp
new file mode 100644
index 000000000..586c34b26
--- /dev/null
+++ b/modules/imgproc/test/ocl/test_accumulate.cpp
@@ -0,0 +1,240 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Nathan, liujun@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "cvconfig.h"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+PARAM_TEST_CASE(AccumulateBase, std::pair<MatDepth, MatDepth>, Channels, bool)
+{
+    int sdepth, ddepth, channels;
+    bool useRoi;
+    double alpha;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_INPUT_PARAMETER(mask)
+    TEST_DECLARE_INPUT_PARAMETER(src2)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+
+    virtual void SetUp()
+    {
+        const std::pair<MatDepth, MatDepth> depths = GET_PARAM(0);
+        sdepth = depths.first, ddepth = depths.second;
+        channels = GET_PARAM(1);
+        useRoi = GET_PARAM(2);
+    }
+
+    void random_roi()
+    {
+        const int stype = CV_MAKE_TYPE(sdepth, channels),
+                dtype = CV_MAKE_TYPE(ddepth, channels);
+
+        Size roiSize = randomSize(1, 10);
+        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, stype, -MAX_VALUE, MAX_VALUE);
+
+        Border maskBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(mask, mask_roi, roiSize, maskBorder, CV_8UC1, -MAX_VALUE, MAX_VALUE);
+        threshold(mask, mask, 80, 255, THRESH_BINARY);
+
+        Border src2Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(src2, src2_roi, roiSize, src2Border, stype, -MAX_VALUE, MAX_VALUE);
+
+        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, dtype, -MAX_VALUE, MAX_VALUE);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_INPUT_PARAMETER(mask)
+        UMAT_UPLOAD_INPUT_PARAMETER(src2)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+
+        alpha = randomDouble(-5, 5);
+    }
+};
+
+/////////////////////////////////// Accumulate ///////////////////////////////////
+
+typedef AccumulateBase Accumulate;
+
+OCL_TEST_P(Accumulate, Mat)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulate(src_roi, dst_roi));
+        OCL_ON(cv::accumulate(usrc_roi, udst_roi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-6);
+    }
+}
+
+OCL_TEST_P(Accumulate, Mask)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulate(src_roi, dst_roi, mask_roi));
+        OCL_ON(cv::accumulate(usrc_roi, udst_roi, umask_roi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-6);
+    }
+}
+
+/////////////////////////////////// AccumulateSquare ///////////////////////////////////
+
+typedef AccumulateBase AccumulateSquare;
+
+OCL_TEST_P(AccumulateSquare, Mat)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulateSquare(src_roi, dst_roi));
+        OCL_ON(cv::accumulateSquare(usrc_roi, udst_roi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-2);
+    }
+}
+
+OCL_TEST_P(AccumulateSquare, Mask)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulateSquare(src_roi, dst_roi, mask_roi));
+        OCL_ON(cv::accumulateSquare(usrc_roi, udst_roi, umask_roi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-2);
+    }
+}
+
+/////////////////////////////////// AccumulateProduct ///////////////////////////////////
+
+typedef AccumulateBase AccumulateProduct;
+
+OCL_TEST_P(AccumulateProduct, Mat)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulateProduct(src_roi, src2_roi, dst_roi));
+        OCL_ON(cv::accumulateProduct(usrc_roi, usrc2_roi, udst_roi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-2);
+    }
+}
+
+OCL_TEST_P(AccumulateProduct, Mask)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulateProduct(src_roi, src2_roi, dst_roi, mask_roi));
+        OCL_ON(cv::accumulateProduct(usrc_roi, usrc2_roi, udst_roi, umask_roi));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-2);
+    }
+}
+
+/////////////////////////////////// AccumulateWeighted ///////////////////////////////////
+
+typedef AccumulateBase AccumulateWeighted;
+
+OCL_TEST_P(AccumulateWeighted, Mat)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulateWeighted(src_roi, dst_roi, alpha));
+        OCL_ON(cv::accumulateWeighted(usrc_roi, udst_roi, alpha));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-2);
+    }
+}
+
+OCL_TEST_P(AccumulateWeighted, Mask)
+{
+    for (int i = 0; i < test_loop_times; ++i)
+    {
+        random_roi();
+
+        OCL_OFF(cv::accumulateWeighted(src_roi, dst_roi, alpha));
+        OCL_ON(cv::accumulateWeighted(usrc_roi, udst_roi, alpha));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1e-2);
+    }
+}
+
+/////////////////////////////////// Instantiation ///////////////////////////////////
+
+#define OCL_DEPTH_ALL_COMBINATIONS \
+    testing::Values(std::make_pair<MatDepth, MatDepth>(CV_8U, CV_32F), \
+    std::make_pair<MatDepth, MatDepth>(CV_16U, CV_32F), \
+    std::make_pair<MatDepth, MatDepth>(CV_32F, CV_32F), \
+    std::make_pair<MatDepth, MatDepth>(CV_8U, CV_64F), \
+    std::make_pair<MatDepth, MatDepth>(CV_16U, CV_64F), \
+    std::make_pair<MatDepth, MatDepth>(CV_32F, CV_64F), \
+    std::make_pair<MatDepth, MatDepth>(CV_64F, CV_64F))
+
+OCL_INSTANTIATE_TEST_CASE_P(ImgProc, Accumulate, Combine(OCL_DEPTH_ALL_COMBINATIONS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(ImgProc, AccumulateSquare, Combine(OCL_DEPTH_ALL_COMBINATIONS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(ImgProc, AccumulateProduct, Combine(OCL_DEPTH_ALL_COMBINATIONS, OCL_ALL_CHANNELS, Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(ImgProc, AccumulateWeighted, Combine(OCL_DEPTH_ALL_COMBINATIONS, OCL_ALL_CHANNELS, Bool()));
+
+} } // namespace cvtest::ocl
+
+#endif
diff --git a/modules/imgproc/test/ocl/test_blend.cpp b/modules/imgproc/test/ocl/test_blend.cpp
index 4cfe486d2..17c0b1312 100644
--- a/modules/imgproc/test/ocl/test_blend.cpp
+++ b/modules/imgproc/test/ocl/test_blend.cpp
@@ -75,7 +75,7 @@ PARAM_TEST_CASE(BlendLinear, MatDepth, Channels, bool)
         const int type = CV_MAKE_TYPE(depth, channels);
         const double upValue = 256;
 
-        Size roiSize = randomSize(1, 20);
+        Size roiSize = randomSize(1, MAX_VALUE);
         Border src1Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
         randomSubMat(src1, src1_roi, roiSize, src1Border, type, -upValue, upValue);
 
@@ -104,8 +104,7 @@ PARAM_TEST_CASE(BlendLinear, MatDepth, Channels, bool)
 
     void Near(double eps = 0.0)
     {
-        EXPECT_MAT_NEAR(dst, udst, eps);
-        EXPECT_MAT_NEAR(dst_roi, udst_roi, eps);
+        OCL_EXPECT_MATS_NEAR(dst, eps)
     }
 };
 
diff --git a/modules/imgproc/test/ocl/test_boxfilter.cpp b/modules/imgproc/test/ocl/test_boxfilter.cpp
index 178aef4c2..c95657c9e 100644
--- a/modules/imgproc/test/ocl/test_boxfilter.cpp
+++ b/modules/imgproc/test/ocl/test_boxfilter.cpp
@@ -49,39 +49,34 @@
 namespace cvtest {
 namespace ocl {
 
-enum
-{
-    noType = -1
-};
+////////////////////////////////////////// boxFilter ///////////////////////////////////////////////////////
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// boxFilter
-PARAM_TEST_CASE(BoxFilter, MatDepth, Channels, BorderType, bool)
+PARAM_TEST_CASE(BoxFilterBase, MatDepth, Channels, BorderType, bool, bool)
 {
     static const int kernelMinSize = 2;
     static const int kernelMaxSize = 10;
 
-    int type;
-    Size ksize;
-    Size dsize;
+    int depth, cn, borderType;
+    Size ksize, dsize;
     Point anchor;
-    int borderType;
-    bool useRoi;
+    bool normalize, useRoi;
 
     TEST_DECLARE_INPUT_PARAMETER(src)
     TEST_DECLARE_OUTPUT_PARAMETER(dst)
 
     virtual void SetUp()
     {
-        type = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
+        depth = GET_PARAM(0);
+        cn = GET_PARAM(1);
         borderType = GET_PARAM(2); // only not isolated border tested, because CPU module doesn't support isolated border case.
-        useRoi = GET_PARAM(3);
+        normalize = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
     }
 
     void random_roi()
     {
+        int type = CV_MAKE_TYPE(depth, cn);
         dsize = randomSize(1, MAX_VALUE);
-
         ksize = randomSize(kernelMinSize, kernelMaxSize);
 
         Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
@@ -100,24 +95,41 @@ PARAM_TEST_CASE(BoxFilter, MatDepth, Channels, BorderType, bool)
 
     void Near(double threshold = 0.0)
     {
-        EXPECT_MAT_NEAR(dst, udst, threshold);
-        EXPECT_MAT_NEAR(dst_roi, udst_roi, threshold);
+        OCL_EXPECT_MATS_NEAR(dst, threshold)
     }
 };
 
+typedef BoxFilterBase BoxFilter;
+
 OCL_TEST_P(BoxFilter, Mat)
 {
     for (int j = 0; j < test_loop_times; j++)
     {
         random_roi();
 
-        OCL_OFF(cv::boxFilter(src_roi, dst_roi, -1, ksize, anchor, true, borderType));
-        OCL_ON(cv::boxFilter(usrc_roi, udst_roi, -1, ksize, anchor, true, borderType));
+        OCL_OFF(cv::boxFilter(src_roi, dst_roi, -1, ksize, anchor, normalize, borderType));
+        OCL_ON(cv::boxFilter(usrc_roi, udst_roi, -1, ksize, anchor, normalize, borderType));
 
-        Near(1.0);
+        Near(depth <= CV_32S ? 1 : 1e-3);
     }
 }
 
+typedef BoxFilterBase SqrBoxFilter;
+
+OCL_TEST_P(SqrBoxFilter, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        random_roi();
+
+        int ddepth = depth == CV_8U ? CV_32S : CV_64F;
+
+        OCL_OFF(cv::sqrBoxFilter(src_roi, dst_roi, ddepth, ksize, anchor, normalize, borderType));
+        OCL_ON(cv::sqrBoxFilter(usrc_roi, udst_roi, ddepth, ksize, anchor, normalize, borderType));
+
+        Near(depth <= CV_32S ? 1 : 7e-2);
+    }
+}
 
 OCL_INSTANTIATE_TEST_CASE_P(ImageProc, BoxFilter,
                             Combine(
@@ -127,6 +139,20 @@ OCL_INSTANTIATE_TEST_CASE_P(ImageProc, BoxFilter,
                                        (BorderType)BORDER_REPLICATE,
                                        (BorderType)BORDER_REFLECT,
                                        (BorderType)BORDER_REFLECT_101),
+                                Bool(),
+                                Bool()  // ROI
+                                )
+                           );
+
+OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SqrBoxFilter,
+                            Combine(
+                                Values(CV_8U, CV_16U, CV_16S, CV_32F, CV_64F),
+                                Values(1, 2, 4),
+                                Values((BorderType)BORDER_CONSTANT,
+                                       (BorderType)BORDER_REPLICATE,
+                                       (BorderType)BORDER_REFLECT,
+                                       (BorderType)BORDER_REFLECT_101),
+                                Bool(),
                                 Bool()  // ROI
                                 )
                            );
diff --git a/modules/ocl/test/test_canny.cpp b/modules/imgproc/test/ocl/test_canny.cpp
similarity index 58%
rename from modules/ocl/test/test_canny.cpp
rename to modules/imgproc/test/ocl/test_canny.cpp
index 82286031f..e328d2a2f 100644
--- a/modules/ocl/test/test_canny.cpp
+++ b/modules/imgproc/test/ocl/test_canny.cpp
@@ -44,46 +44,74 @@
 //M*/
 
 #include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
 #ifdef HAVE_OPENCL
 
+namespace cvtest {
+namespace ocl {
+
 ////////////////////////////////////////////////////////
 // Canny
-IMPLEMENT_PARAM_CLASS(AppertureSize, int);
-IMPLEMENT_PARAM_CLASS(L2gradient, bool);
 
-PARAM_TEST_CASE(Canny, AppertureSize, L2gradient)
+IMPLEMENT_PARAM_CLASS(AppertureSize, int)
+IMPLEMENT_PARAM_CLASS(L2gradient, bool)
+IMPLEMENT_PARAM_CLASS(UseRoi, bool)
+
+PARAM_TEST_CASE(Canny, AppertureSize, L2gradient, UseRoi)
 {
     int apperture_size;
-    bool useL2gradient;
+    bool useL2gradient, use_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
 
-    cv::Mat edges_gold;
     virtual void SetUp()
     {
         apperture_size = GET_PARAM(0);
         useL2gradient = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
+    }
+
+    void generateTestData()
+    {
+        Mat img = readImage("shared/fruits.png", IMREAD_GRAYSCALE);
+        ASSERT_FALSE(img.empty()) << "cann't load shared/fruits.png";
+
+        Size roiSize = img.size();
+        int type = img.type();
+        ASSERT_EQ(CV_8UC1, type);
+
+        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, type, 2, 100);
+        img.copyTo(src_roi);
+
+        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 16);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
     }
 };
 
 OCL_TEST_P(Canny, Accuracy)
 {
-    cv::Mat img = readImage("cv/shared/fruits.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
+    generateTestData();
 
-    double low_thresh = 50.0;
-    double high_thresh = 100.0;
+    const double low_thresh = 50.0, high_thresh = 100.0;
 
-    cv::ocl::oclMat ocl_img = cv::ocl::oclMat(img);
+    OCL_OFF(cv::Canny(src_roi, dst_roi, low_thresh, high_thresh, apperture_size, useL2gradient));
+    OCL_ON(cv::Canny(usrc_roi, udst_roi, low_thresh, high_thresh, apperture_size, useL2gradient));
 
-    cv::ocl::oclMat edges;
-    cv::ocl::Canny(ocl_img, edges, low_thresh, high_thresh, apperture_size, useL2gradient);
-
-    cv::Mat edges_gold;
-    cv::Canny(img, edges_gold, low_thresh, high_thresh, apperture_size, useL2gradient);
-
-    EXPECT_MAT_SIMILAR(edges_gold, edges, 1e-2);
+    EXPECT_MAT_SIMILAR(dst_roi, udst_roi, 1e-2);
+    EXPECT_MAT_SIMILAR(dst, udst, 1e-2);
 }
 
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Canny, testing::Combine(
-                            testing::Values(AppertureSize(3), AppertureSize(5)),
-                            testing::Values(L2gradient(false), L2gradient(true))));
-#endif
+OCL_INSTANTIATE_TEST_CASE_P(ImgProc, Canny, testing::Combine(
+                                testing::Values(AppertureSize(3), AppertureSize(5)),
+                                testing::Values(L2gradient(false), L2gradient(true)),
+                                testing::Values(UseRoi(false), UseRoi(true))));
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/imgproc/test/ocl/test_color.cpp b/modules/imgproc/test/ocl/test_color.cpp
index 4c0f8b311..ffd392a03 100644
--- a/modules/imgproc/test/ocl/test_color.cpp
+++ b/modules/imgproc/test/ocl/test_color.cpp
@@ -248,6 +248,26 @@ OCL_TEST_P(CvtColor8u, GRAY2BGR555) { performTest(1, 2, CVTCODE(GRAY2BGR555)); }
 OCL_TEST_P(CvtColor8u, RGBA2mRGBA) { performTest(4, 4, CVTCODE(RGBA2mRGBA)); }
 OCL_TEST_P(CvtColor8u, mRGBA2RGBA) { performTest(4, 4, CVTCODE(mRGBA2RGBA)); }
 
+// RGB <-> Lab
+
+OCL_TEST_P(CvtColor8u32f, BGR2Lab) { performTest(3, 3, CVTCODE(BGR2Lab)); }
+OCL_TEST_P(CvtColor8u32f, RGB2Lab) { performTest(3, 3, CVTCODE(RGB2Lab)); }
+OCL_TEST_P(CvtColor8u32f, LBGR2Lab) { performTest(3, 3, CVTCODE(LBGR2Lab)); }
+OCL_TEST_P(CvtColor8u32f, LRGB2Lab) { performTest(3, 3, CVTCODE(LRGB2Lab)); }
+OCL_TEST_P(CvtColor8u32f, BGRA2Lab) { performTest(4, 3, CVTCODE(BGR2Lab)); }
+OCL_TEST_P(CvtColor8u32f, RGBA2Lab) { performTest(4, 3, CVTCODE(RGB2Lab)); }
+OCL_TEST_P(CvtColor8u32f, LBGRA2Lab) { performTest(4, 3, CVTCODE(LBGR2Lab)); }
+OCL_TEST_P(CvtColor8u32f, LRGBA2Lab) { performTest(4, 3, CVTCODE(LRGB2Lab)); }
+
+OCL_TEST_P(CvtColor8u32f, Lab2BGR) { performTest(3, 3, CVTCODE(Lab2BGR), depth == CV_8U ? 1 : 1e-5); }
+OCL_TEST_P(CvtColor8u32f, Lab2RGB) { performTest(3, 3, CVTCODE(Lab2RGB), depth == CV_8U ? 1 : 1e-5); }
+OCL_TEST_P(CvtColor8u32f, Lab2LBGR) { performTest(3, 3, CVTCODE(Lab2LBGR), depth == CV_8U ? 1 : 1e-5); }
+OCL_TEST_P(CvtColor8u32f, Lab2LRGB) { performTest(3, 3, CVTCODE(Lab2LRGB), depth == CV_8U ? 1 : 1e-5); }
+OCL_TEST_P(CvtColor8u32f, Lab2BGRA) { performTest(3, 4, CVTCODE(Lab2BGR), depth == CV_8U ? 1 : 1e-5); }
+OCL_TEST_P(CvtColor8u32f, Lab2RGBA) { performTest(3, 4, CVTCODE(Lab2RGB), depth == CV_8U ? 1 : 1e-5); }
+OCL_TEST_P(CvtColor8u32f, Lab2LBGRA) { performTest(3, 4, CVTCODE(Lab2LBGR), depth == CV_8U ? 1 : 1e-5); }
+OCL_TEST_P(CvtColor8u32f, Lab2LRGBA) { performTest(3, 4, CVTCODE(Lab2LRGB), depth == CV_8U ? 1 : 1e-5); }
+
 // YUV -> RGBA_NV12
 
 struct CvtColor_YUV420 :
diff --git a/modules/imgproc/test/ocl/test_filters.cpp b/modules/imgproc/test/ocl/test_filters.cpp
index 5953d8070..fe16fe81d 100644
--- a/modules/imgproc/test/ocl/test_filters.cpp
+++ b/modules/imgproc/test/ocl/test_filters.cpp
@@ -229,6 +229,75 @@ OCL_TEST_P(GaussianBlurTest, Mat)
     }
 }
 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Erode
+
+typedef FilterTestBase Erode;
+
+OCL_TEST_P(Erode, Mat)
+{
+    Size kernelSize(ksize, ksize);
+    int iterations = (int)param;
+
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        random_roi();
+        Mat kernel = randomMat(kernelSize, CV_8UC1, 0, 3);
+
+        OCL_OFF(cv::erode(src_roi, dst_roi, kernel, Point(-1,-1), iterations) );
+        OCL_ON(cv::erode(usrc_roi, udst_roi, kernel, Point(-1,-1), iterations) );
+
+        Near();
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Dilate
+
+typedef FilterTestBase Dilate;
+
+OCL_TEST_P(Dilate, Mat)
+{
+    Size kernelSize(ksize, ksize);
+    int iterations = (int)param;
+
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        random_roi();
+        Mat kernel = randomMat(kernelSize, CV_8UC1, 0, 3);
+
+        OCL_OFF(cv::dilate(src_roi, dst_roi, kernel, Point(-1,-1), iterations) );
+        OCL_ON(cv::dilate(usrc_roi, udst_roi, kernel, Point(-1,-1), iterations) );
+
+        Near();
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// MorphologyEx
+
+typedef FilterTestBase MorphologyEx;
+
+OCL_TEST_P(MorphologyEx, Mat)
+{
+    Size kernelSize(ksize, ksize);
+    int iterations = (int)param;
+    int op = size.height;
+
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        random_roi();
+        Mat kernel = randomMat(kernelSize, CV_8UC1, 0, 3);
+
+        OCL_OFF(cv::morphologyEx(src_roi, dst_roi, op, kernel, Point(-1,-1), iterations) );
+        OCL_ON(cv::morphologyEx(usrc_roi, udst_roi, op, kernel, Point(-1,-1), iterations) );
+
+        Near();
+    }
+}
+
+
+
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 #define FILTER_BORDER_SET_NO_ISOLATED \
@@ -285,6 +354,31 @@ OCL_INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine(
                             Values(0.0), // not used
                             Bool()));
 
+OCL_INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(
+                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4),
+                            Values(3, 5, 7),
+                            Values(Size(0,0)),//not used
+                            Values((BorderType)BORDER_CONSTANT),//not used
+                            Values(1.0, 2.0, 3.0),
+                            Bool() ) );
+
+OCL_INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(
+                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4),
+                            Values(3, 5, 7),
+                            Values(Size(0,0)),//not used
+                            Values((BorderType)BORDER_CONSTANT),//not used
+                            Values(1.0, 2.0, 3.0),
+                            Bool() ) );
+
+OCL_INSTANTIATE_TEST_CASE_P(Filter, MorphologyEx, Combine(
+                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, CV_64FC1, CV_64FC4),
+                            Values(3, 5, 7),
+                            Values(Size(0,0), Size(0,1), Size(0,2), Size(0,3), Size(0,4), Size(0,5),Size(0,6)),//uses as generator of operations
+                            Values((BorderType)BORDER_CONSTANT),//not used
+                            Values(1.0, 2.0, 3.0),
+                            Bool() ) );
+
+
 } } // namespace cvtest::ocl
 
 #endif // HAVE_OPENCL
diff --git a/modules/imgproc/test/ocl/test_gftt.cpp b/modules/imgproc/test/ocl/test_gftt.cpp
new file mode 100644
index 000000000..df6fa731d
--- /dev/null
+++ b/modules/imgproc/test/ocl/test_gftt.cpp
@@ -0,0 +1,143 @@
+///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+//////////////////////////// GoodFeaturesToTrack //////////////////////////
+
+
+PARAM_TEST_CASE(GoodFeaturesToTrack, double, bool)
+{
+    double minDistance;
+    bool useRoi;
+
+    static const int maxCorners;
+    static const double qualityLevel;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    UMat points, upoints;
+
+    virtual void SetUp()
+    {
+        minDistance = GET_PARAM(0);
+        useRoi = GET_PARAM(1);
+    }
+
+    void generateTestData()
+    {
+        Mat frame = readImage("../gpu/opticalflow/rubberwhale1.png", IMREAD_GRAYSCALE);
+        ASSERT_FALSE(frame.empty()) << "could not load gpu/opticalflow/rubberwhale1.png";
+
+        Size roiSize = frame.size();
+        Border srcBorder = randomBorder(0, useRoi ? 2 : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, frame.type(), 5, 256);
+        src_roi.copyTo(frame);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+    }
+
+    void UMatToVector(const UMat & um, std::vector<Point2f> & v) const
+    {
+        v.resize(um.size().area());
+        um.copyTo(Mat(um.size(), CV_32FC2, &v[0]));
+    }
+};
+
+const int GoodFeaturesToTrack::maxCorners = 1000;
+const double GoodFeaturesToTrack::qualityLevel = 0.01;
+
+OCL_TEST_P(GoodFeaturesToTrack, Accuracy)
+{
+    for (int j = 0; j < test_loop_times; ++j)
+    {
+        generateTestData();
+
+        std::vector<Point2f> upts, pts;
+
+        OCL_OFF(cv::goodFeaturesToTrack(src_roi, points, maxCorners, qualityLevel, minDistance, noArray()));
+        ASSERT_FALSE(points.empty());
+        UMatToVector(points, pts);
+
+        OCL_ON(cv::goodFeaturesToTrack(usrc_roi, upoints, maxCorners, qualityLevel, minDistance));
+        ASSERT_FALSE(upoints.empty());
+        UMatToVector(upoints, upts);
+
+        ASSERT_EQ(upts.size(), pts.size());
+
+        int mistmatch = 0;
+        for (size_t i = 0; i < pts.size(); ++i)
+        {
+            Point2i a = upts[i], b = pts[i];
+            bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
+
+            if (!eq)
+                ++mistmatch;
+        }
+
+        double bad_ratio = static_cast<double>(mistmatch) / pts.size();
+        ASSERT_GE(1e-2, bad_ratio);
+    }
+}
+
+OCL_TEST_P(GoodFeaturesToTrack, EmptyCorners)
+{
+    generateTestData();
+    usrc_roi.setTo(Scalar::all(0));
+
+    OCL_ON(cv::goodFeaturesToTrack(usrc_roi, upoints, maxCorners, qualityLevel, minDistance));
+
+    ASSERT_TRUE(upoints.empty());
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(Imgproc, GoodFeaturesToTrack,
+                            ::testing::Combine(testing::Values(0.0, 3.0), Bool()));
+
+} } // namespace cvtest::ocl
+
+#endif
diff --git a/modules/imgproc/test/ocl/test_histogram.cpp b/modules/imgproc/test/ocl/test_histogram.cpp
new file mode 100644
index 000000000..b0837eeaa
--- /dev/null
+++ b/modules/imgproc/test/ocl/test_histogram.cpp
@@ -0,0 +1,219 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Shengen Yan, yanshengen@gmail.com
+//    Jiang Liyuan, lyuan001.good@163.com
+//    Rock Li, Rock.Li@amd.com
+//    Wu Zailong, bullet@yeah.net
+//    Xu Pang, pangxu010@163.com
+//    Sen Liu, swjtuls1987@126.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "cvconfig.h"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////////////////////////////////////////////////////////////////////////
+
+PARAM_TEST_CASE(CalcBackProject, MatDepth, int, bool)
+{
+    int depth, N;
+    bool useRoi;
+
+    std::vector<float> ranges;
+    std::vector<int> channels;
+    double scale;
+
+    std::vector<Mat> images;
+    std::vector<Mat> images_roi;
+    std::vector<UMat> uimages;
+    std::vector<UMat> uimages_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(hist)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+
+    virtual void SetUp()
+    {
+        depth = GET_PARAM(0);
+        N = GET_PARAM(1);
+        useRoi = GET_PARAM(2);
+
+        ASSERT_GE(2, N);
+
+        images.resize(N);
+        images_roi.resize(N);
+        uimages.resize(N);
+        uimages_roi.resize(N);
+    }
+
+    virtual void random_roi()
+    {
+        Size roiSize = randomSize(1, MAX_VALUE);
+
+        int totalChannels = 0;
+        for (int i = 0; i < N; ++i)
+        {
+            Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+            int cn = randomInt(1, 5);
+            randomSubMat(images[i], images_roi[i], roiSize, srcBorder, CV_MAKE_TYPE(depth, cn), 0, 125);
+
+            ranges.push_back(10);
+            ranges.push_back(100);
+
+            channels.push_back(randomInt(0, cn) + totalChannels);
+            totalChannels += cn;
+        }
+
+        Mat tmpHist;
+        {
+            std::vector<int> hist_size(N);
+            for (int i = 0 ; i < N; ++i)
+                hist_size[i] = randomInt(10, 50);
+
+            cv::calcHist(images_roi, channels, noArray(), tmpHist, hist_size, ranges);
+            ASSERT_EQ(CV_32FC1, tmpHist.type());
+        }
+
+        Border histBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(hist, hist_roi, tmpHist.size(), histBorder, tmpHist.type(), 0, MAX_VALUE);
+        tmpHist.copyTo(hist_roi);
+
+        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, CV_MAKE_TYPE(depth, 1), 5, 16);
+
+        for (int i = 0; i < N; ++i)
+        {
+            images[i].copyTo(uimages[i]);
+
+            Size _wholeSize;
+            Point ofs;
+            images_roi[i].locateROI(_wholeSize, ofs);
+
+            uimages_roi[i] = uimages[i](Rect(ofs.x, ofs.y, images_roi[i].cols, images_roi[i].rows));
+        }
+
+        UMAT_UPLOAD_INPUT_PARAMETER(hist)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+
+        scale = randomDouble(0.1, 1);
+    }
+};
+
+//////////////////////////////// CalcBackProject //////////////////////////////////////////////
+
+OCL_TEST_P(CalcBackProject, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        random_roi();
+
+        OCL_OFF(cv::calcBackProject(images_roi, channels, hist_roi, dst_roi, ranges, scale));
+        OCL_ON(cv::calcBackProject(uimages_roi, channels, uhist_roi, udst_roi, ranges, scale));
+
+        OCL_EXPECT_MATS_NEAR(dst, 0.0)
+    }
+}
+
+//////////////////////////////// CalcHist //////////////////////////////////////////////
+
+PARAM_TEST_CASE(CalcHist, bool)
+{
+    bool useRoi;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_OUTPUT_PARAMETER(hist)
+
+    virtual void SetUp()
+    {
+        useRoi = GET_PARAM(0);
+    }
+
+    virtual void random_roi()
+    {
+        Size roiSize = randomSize(1, MAX_VALUE);
+
+        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(src, src_roi, roiSize, srcBorder, CV_8UC1, 0, 256);
+
+        Border histBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(hist, hist_roi, Size(1, 256), histBorder, CV_32SC1, 0, MAX_VALUE);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(hist)
+    }
+};
+
+OCL_TEST_P(CalcHist, Mat)
+{
+    const std::vector<int> channels(1, 0);
+    std::vector<float> ranges(2);
+    std::vector<int> histSize(1, 256);
+    ranges[0] = 0;
+    ranges[1] = 256;
+
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        random_roi();
+
+        OCL_OFF(cv::calcHist(std::vector<Mat>(1, src_roi), channels, noArray(), hist_roi, histSize, ranges, false));
+        OCL_ON(cv::calcHist(std::vector<UMat>(1, usrc_roi), channels, noArray(), uhist_roi, histSize, ranges, false));
+
+        OCL_EXPECT_MATS_NEAR(hist, 0.0)
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////
+
+OCL_INSTANTIATE_TEST_CASE_P(Imgproc, CalcBackProject, Combine(Values((MatDepth)CV_8U), Values(1, 2), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Imgproc, CalcHist, Values(true, false));
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/imgproc/test/ocl/test_imgproc.cpp b/modules/imgproc/test/ocl/test_imgproc.cpp
index bf6f8e64a..78b2e573d 100644
--- a/modules/imgproc/test/ocl/test_imgproc.cpp
+++ b/modules/imgproc/test/ocl/test_imgproc.cpp
@@ -103,7 +103,7 @@ PARAM_TEST_CASE(ImgprocTestBase, MatType,
     }
 };
 
-////////////////////////////////copyMakeBorder////////////////////////////////////////////
+//////////////////////////////// copyMakeBorder ////////////////////////////////////////////
 
 PARAM_TEST_CASE(CopyMakeBorder, MatDepth, // depth
                 Channels, // channels
@@ -171,7 +171,7 @@ OCL_TEST_P(CopyMakeBorder, Mat)
     }
 }
 
-////////////////////////////////equalizeHist//////////////////////////////////////////////
+//////////////////////////////// equalizeHist //////////////////////////////////////////////
 
 typedef ImgprocTestBase EqualizeHist;
 
@@ -188,14 +188,14 @@ OCL_TEST_P(EqualizeHist, Mat)
     }
 }
 
-////////////////////////////////cornerMinEigenVal//////////////////////////////////////////
+//////////////////////////////// Corners test //////////////////////////////////////////
 
 struct CornerTestBase :
         public ImgprocTestBase
 {
     virtual void random_roi()
     {
-        Mat image = readImageType("gpu/stereobm/aloe-L.png", type);
+        Mat image = readImageType("../gpu/stereobm/aloe-L.png", type);
         ASSERT_FALSE(image.empty());
 
         bool isFP = CV_MAT_DEPTH(type) >= CV_32F;
@@ -224,7 +224,7 @@ struct CornerTestBase :
 
 typedef CornerTestBase CornerMinEigenVal;
 
-OCL_TEST_P(CornerMinEigenVal, DISABLED_Mat)
+OCL_TEST_P(CornerMinEigenVal, Mat)
 {
     for (int j = 0; j < test_loop_times; j++)
     {
@@ -239,11 +239,11 @@ OCL_TEST_P(CornerMinEigenVal, DISABLED_Mat)
     }
 }
 
-////////////////////////////////cornerHarris//////////////////////////////////////////
+//////////////////////////////// cornerHarris //////////////////////////////////////////
 
 typedef CornerTestBase CornerHarris;
 
-OCL_TEST_P(CornerHarris, DISABLED_Mat)
+OCL_TEST_P(CornerHarris, Mat)
 {
     for (int j = 0; j < test_loop_times; j++)
     {
@@ -255,11 +255,31 @@ OCL_TEST_P(CornerHarris, DISABLED_Mat)
         OCL_OFF(cv::cornerHarris(src_roi, dst_roi, blockSize, apertureSize, k, borderType));
         OCL_ON(cv::cornerHarris(usrc_roi, udst_roi, blockSize, apertureSize, k, borderType));
 
-        Near(1e-5, true);
+        Near(1e-6, true);
     }
 }
 
-//////////////////////////////////integral/////////////////////////////////////////////////
+//////////////////////////////// preCornerDetect //////////////////////////////////////////
+
+typedef ImgprocTestBase PreCornerDetect;
+
+OCL_TEST_P(PreCornerDetect, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        random_roi();
+
+        const int apertureSize = blockSize;
+
+        OCL_OFF(cv::preCornerDetect(src_roi, dst_roi, apertureSize, borderType));
+        OCL_ON(cv::preCornerDetect(usrc_roi, udst_roi, apertureSize, borderType));
+
+        Near(1e-6, true);
+    }
+}
+
+
+////////////////////////////////// integral /////////////////////////////////////////////////
 
 struct Integral :
         public ImgprocTestBase
@@ -331,8 +351,7 @@ OCL_TEST_P(Integral, Mat2)
     }
 }
 
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//// threshold
+////////////////////////////////////////  threshold //////////////////////////////////////////////
 
 struct Threshold :
         public ImgprocTestBase
@@ -342,7 +361,6 @@ struct Threshold :
     virtual void SetUp()
     {
         type = GET_PARAM(0);
-        blockSize = GET_PARAM(1);
         thresholdType = GET_PARAM(2);
         useRoi = GET_PARAM(3);
     }
@@ -364,9 +382,7 @@ OCL_TEST_P(Threshold, Mat)
     }
 }
 
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-//// CLAHE
+/////////////////////////////////////////// CLAHE //////////////////////////////////////////////////
 
 PARAM_TEST_CASE(CLAHETest, Size, double, bool)
 {
@@ -440,6 +456,13 @@ OCL_INSTANTIATE_TEST_CASE_P(Imgproc, CornerHarris, Combine(
                                     (BorderType)BORDER_REFLECT, (BorderType)BORDER_REFLECT_101),
                             Bool()));
 
+OCL_INSTANTIATE_TEST_CASE_P(Imgproc, PreCornerDetect, Combine(
+                            Values((MatType)CV_8UC1, CV_32FC1),
+                            Values(3, 5),
+                            Values( (BorderType)BORDER_CONSTANT, (BorderType)BORDER_REPLICATE,
+                                    (BorderType)BORDER_REFLECT, (BorderType)BORDER_REFLECT_101),
+                            Bool()));
+
 OCL_INSTANTIATE_TEST_CASE_P(Imgproc, Integral, Combine(
                             Values((MatType)CV_8UC1), // TODO does not work with CV_32F, CV_64F
                             Values(CV_32SC1, CV_32FC1), // desired sdepth
diff --git a/modules/imgproc/test/ocl/test_match_template.cpp b/modules/imgproc/test/ocl/test_match_template.cpp
new file mode 100644
index 000000000..507f0a73b
--- /dev/null
+++ b/modules/imgproc/test/ocl/test_match_template.cpp
@@ -0,0 +1,128 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+#include "iostream"
+#include "fstream"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////////////////////////////////////// matchTemplate //////////////////////////////////////////////////////////
+
+CV_ENUM(MatchTemplType, CV_TM_SQDIFF, CV_TM_SQDIFF_NORMED, CV_TM_CCORR,
+        CV_TM_CCORR_NORMED, CV_TM_CCOEFF, CV_TM_CCOEFF_NORMED)
+
+PARAM_TEST_CASE(MatchTemplate, MatDepth, Channels, MatchTemplType, bool)
+{
+    int type;
+    int depth;
+    int method;
+    bool use_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(image)
+    TEST_DECLARE_INPUT_PARAMETER(templ)
+    TEST_DECLARE_OUTPUT_PARAMETER(result)
+
+    virtual void SetUp()
+    {
+        type = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
+        depth = GET_PARAM(0);
+        method = GET_PARAM(2);
+        use_roi = GET_PARAM(3);
+    }
+
+    virtual void generateTestData()
+    {
+        Size image_roiSize = randomSize(2, 100);
+        Size templ_roiSize = Size(randomInt(1, image_roiSize.width), randomInt(1, image_roiSize.height));
+        Size result_roiSize = Size(image_roiSize.width - templ_roiSize.width + 1,
+                                   image_roiSize.height - templ_roiSize.height + 1);
+
+        const double upValue = 256;
+
+        Border imageBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(image, image_roi, image_roiSize, imageBorder, type, -upValue, upValue);
+
+        Border templBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(templ, templ_roi, templ_roiSize, templBorder, type, -upValue, upValue);
+
+        Border resultBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(result, result_roi, result_roiSize, resultBorder, CV_32FC1, -upValue, upValue);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(image)
+        UMAT_UPLOAD_INPUT_PARAMETER(templ)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(result)
+    }
+
+    void Near(double threshold = 0.0)
+    {
+        OCL_EXPECT_MATS_NEAR_RELATIVE(result, threshold);
+    }
+};
+
+OCL_TEST_P(MatchTemplate, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::matchTemplate(image_roi, templ_roi, result_roi, method));
+        OCL_ON(cv::matchTemplate(uimage_roi, utempl_roi, uresult_roi, method));
+
+        Near(1.5e-4);
+    }
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(ImageProc, MatchTemplate, Combine(
+                                Values(CV_8U, CV_32F),
+                                Values(1, 2, 4),
+                                MatchTemplType::all(),
+                                Bool())
+                           );
+} } // namespace cvtest::ocl
+
+#endif
diff --git a/modules/imgproc/test/ocl/test_sepfilter2D.cpp b/modules/imgproc/test/ocl/test_sepfilter2D.cpp
new file mode 100644
index 000000000..5e824d6b2
--- /dev/null
+++ b/modules/imgproc/test/ocl/test_sepfilter2D.cpp
@@ -0,0 +1,147 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// sepFilter2D
+PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
+{
+    static const int kernelMinSize = 2;
+    static const int kernelMaxSize = 10;
+
+    int type;
+    Point anchor;
+    int borderType;
+    bool useRoi;
+    Mat kernelX, kernelY;
+
+    TEST_DECLARE_INPUT_PARAMETER(src)
+    TEST_DECLARE_OUTPUT_PARAMETER(dst)
+
+    virtual void SetUp()
+    {
+        type = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
+        borderType = GET_PARAM(2) | (GET_PARAM(3) ? BORDER_ISOLATED : 0);
+        useRoi = GET_PARAM(4);
+    }
+
+    void random_roi()
+    {
+        Size ksize = randomSize(kernelMinSize, kernelMaxSize);
+        if (1 != (ksize.width % 2))
+            ksize.width++;
+        if (1 != (ksize.height % 2))
+            ksize.height++;
+        Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
+        cv::normalize(temp, kernelX, 1.0, 0.0, NORM_L1);
+        temp = randomMat(Size(1, ksize.height),  CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
+        cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1);
+
+        Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
+        int rest = roiSize.width % 4;
+        if (0 != rest)
+            roiSize.width += (4 - rest);
+        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        rest = srcBorder.lef % 4;
+        if (0 != rest)
+            srcBorder.lef += (4 - rest);
+        rest = srcBorder.rig % 4;
+        if (0 != rest)
+            srcBorder.rig += (4 - rest);
+        randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
+
+        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
+
+        anchor.x = -1;
+        anchor.y = -1;
+
+        UMAT_UPLOAD_INPUT_PARAMETER(src)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(dst)
+    }
+
+    void Near(double threshold = 0.0)
+    {
+        OCL_EXPECT_MATS_NEAR(dst, threshold);
+    }
+};
+
+OCL_TEST_P(SepFilter2D, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        random_roi();
+
+        OCL_OFF(cv::sepFilter2D(src_roi, dst_roi, -1, kernelX, kernelY, anchor, 0.0, borderType));
+        OCL_ON(cv::sepFilter2D(usrc_roi, udst_roi, -1, kernelX, kernelY, anchor, 0.0, borderType));
+
+        Near(1.0);
+    }
+}
+
+
+OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SepFilter2D,
+                            Combine(
+                                Values(CV_8U, CV_32F),
+                                Values(1, 4),
+                                Values(
+                                        (BorderType)BORDER_CONSTANT,
+                                        (BorderType)BORDER_REPLICATE,
+                                        (BorderType)BORDER_REFLECT,
+                                        (BorderType)BORDER_REFLECT_101),
+                                Bool(), // BORDER_ISOLATED
+                                Bool()  // ROI
+                                )
+                           );
+
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/imgproc/test/test_color.cpp b/modules/imgproc/test/test_color.cpp
index 0c94f8f50..cf420fd6a 100644
--- a/modules/imgproc/test/test_color.cpp
+++ b/modules/imgproc/test/test_color.cpp
@@ -99,6 +99,8 @@ CV_ColorCvtBaseTest::CV_ColorCvtBaseTest( bool _custom_inv_transform, bool _allo
 
     test_cpp = false;
     hue_range = 0;
+    blue_idx = 0;
+    inplace = false;
 }
 
 
diff --git a/modules/imgproc/test/test_convhull.cpp b/modules/imgproc/test/test_convhull.cpp
index cb9944fd6..2b6169cf5 100644
--- a/modules/imgproc/test/test_convhull.cpp
+++ b/modules/imgproc/test/test_convhull.cpp
@@ -1380,7 +1380,7 @@ CV_FitLineTest::CV_FitLineTest()
     max_noise = 0.05;
 }
 
-#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
 # pragma GCC diagnostic push
 # pragma GCC diagnostic ignored "-Warray-bounds"
 #endif
@@ -1456,7 +1456,7 @@ void CV_FitLineTest::generate_point_set( void* pointsSet )
     }
 }
 
-#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
 # pragma GCC diagnostic pop
 #endif
 
@@ -1484,7 +1484,7 @@ void CV_FitLineTest::run_func()
         cv::fitLine(cv::cvarrToMat(points), (cv::Vec6f&)line[0], dist_type, 0, reps, aeps);
 }
 
-#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
 # pragma GCC diagnostic push
 # pragma GCC diagnostic ignored "-Warray-bounds"
 #endif
@@ -1567,7 +1567,7 @@ _exit_:
     return code;
 }
 
-#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
 # pragma GCC diagnostic pop
 #endif
 
diff --git a/modules/imgproc/test/test_cvtyuv.cpp b/modules/imgproc/test/test_cvtyuv.cpp
index bd8d95dc7..0cce64cdb 100644
--- a/modules/imgproc/test/test_cvtyuv.cpp
+++ b/modules/imgproc/test/test_cvtyuv.cpp
@@ -603,7 +603,7 @@ CV_ENUM(YUVCVTS, CV_YUV2RGB_NV12, CV_YUV2BGR_NV12, CV_YUV2RGB_NV21, CV_YUV2BGR_N
                  CV_YUV2RGBA_YUY2, CV_YUV2BGRA_YUY2, CV_YUV2RGBA_YVYU, CV_YUV2BGRA_YVYU,
                  CV_YUV2GRAY_420, CV_YUV2GRAY_UYVY, CV_YUV2GRAY_YUY2,
                  CV_YUV2BGR, CV_YUV2RGB, CV_RGB2YUV_YV12, CV_BGR2YUV_YV12, CV_RGBA2YUV_YV12,
-                 CV_BGRA2YUV_YV12, CV_RGB2YUV_I420, CV_BGR2YUV_I420, CV_RGBA2YUV_I420, CV_BGRA2YUV_I420);
+                 CV_BGRA2YUV_YV12, CV_RGB2YUV_I420, CV_BGR2YUV_I420, CV_RGBA2YUV_I420, CV_BGRA2YUV_I420)
 
 typedef ::testing::TestWithParam<YUVCVTS> Imgproc_ColorYUV;
 
diff --git a/modules/imgproc/test/test_filter.cpp b/modules/imgproc/test/test_filter.cpp
index efbad9974..a0927b0f3 100644
--- a/modules/imgproc/test/test_filter.cpp
+++ b/modules/imgproc/test/test_filter.cpp
@@ -898,8 +898,8 @@ struct median_pair
 {
     int col;
     int val;
-    median_pair() {};
-    median_pair( int _col, int _val ) : col(_col), val(_val) {};
+    median_pair() { }
+    median_pair( int _col, int _val ) : col(_col), val(_val) { }
 };
 
 
@@ -1886,3 +1886,35 @@ protected:
 };
 
 TEST(Imgproc_Filtering, supportedFormats) { CV_FilterSupportedFormatsTest test; test.safe_run(); }
+
+TEST(Imgproc_Blur, borderTypes)
+{
+    Size kernelSize(3, 3);
+
+    /// ksize > src_roi.size()
+    Mat src(3, 3, CV_8UC1, cv::Scalar::all(255)), dst;
+    Mat src_roi = src(Rect(1, 1, 1, 1));
+    src_roi.setTo(cv::Scalar::all(0));
+
+    // should work like !BORDER_ISOLATED
+    blur(src_roi, dst, kernelSize, Point(-1, -1), BORDER_REPLICATE);
+    EXPECT_EQ(227, dst.at<uchar>(0, 0));
+
+    // should work like BORDER_ISOLATED
+    blur(src_roi, dst, kernelSize, Point(-1, -1), BORDER_REPLICATE | BORDER_ISOLATED);
+    EXPECT_EQ(0, dst.at<uchar>(0, 0));
+
+    /// ksize <= src_roi.size()
+    src = Mat(5, 5, CV_8UC1, cv::Scalar(255));
+    src_roi = src(Rect(1, 1, 3, 3));
+    src_roi.setTo(0);
+    src.at<uchar>(2, 2) = 255;
+
+    // should work like !BORDER_ISOLATED
+    blur(src_roi, dst, kernelSize, Point(-1, -1), BORDER_REPLICATE);
+    Mat expected_dst =
+            (Mat_<uchar>(3, 3) << 170, 113, 170, 113, 28, 113, 170, 113, 170);
+    EXPECT_EQ(expected_dst.type(), dst.type());
+    EXPECT_EQ(expected_dst.size(), dst.size());
+    EXPECT_DOUBLE_EQ(0.0, cvtest::norm(expected_dst, dst, NORM_INF));
+}
diff --git a/modules/imgproc/test/test_lsd.cpp b/modules/imgproc/test/test_lsd.cpp
index 82f5b0bce..50a353503 100644
--- a/modules/imgproc/test/test_lsd.cpp
+++ b/modules/imgproc/test/test_lsd.cpp
@@ -12,7 +12,7 @@ const int EPOCHS = 20;
 class LSDBase : public testing::Test
 {
 public:
-    LSDBase() {};
+    LSDBase() { }
 
 protected:
     Mat test_image;
@@ -30,7 +30,7 @@ protected:
 class Imgproc_LSD_ADV: public LSDBase
 {
 public:
-    Imgproc_LSD_ADV() {};
+    Imgproc_LSD_ADV() { }
 protected:
 
 };
@@ -38,7 +38,7 @@ protected:
 class Imgproc_LSD_STD: public LSDBase
 {
 public:
-    Imgproc_LSD_STD() {};
+    Imgproc_LSD_STD() { }
 protected:
 
 };
@@ -46,7 +46,7 @@ protected:
 class Imgproc_LSD_NONE: public LSDBase
 {
 public:
-    Imgproc_LSD_NONE() {};
+    Imgproc_LSD_NONE() { }
 protected:
 
 };
diff --git a/modules/imgproc/test/test_moments.cpp b/modules/imgproc/test/test_moments.cpp
index c58d1f53b..b74ee5db8 100644
--- a/modules/imgproc/test/test_moments.cpp
+++ b/modules/imgproc/test/test_moments.cpp
@@ -43,6 +43,13 @@
 using namespace cv;
 using namespace std;
 
+#define OCL_TUNING_MODE 0
+#if OCL_TUNING_MODE
+#define OCL_TUNING_MODE_ONLY(code) code
+#else
+#define OCL_TUNING_MODE_ONLY(code)
+#endif
+
 // image moments
 class CV_MomentsTest : public cvtest::ArrayTest
 {
@@ -60,6 +67,7 @@ protected:
     void run_func();
     int coi;
     bool is_binary;
+    bool try_umat;
 };
 
 
@@ -70,6 +78,7 @@ CV_MomentsTest::CV_MomentsTest()
     test_array[REF_OUTPUT].push_back(NULL);
     coi = -1;
     is_binary = false;
+    OCL_TUNING_MODE_ONLY(test_case_count = 10);
     //element_wise_relative_error = false;
 }
 
@@ -96,25 +105,38 @@ void CV_MomentsTest::get_minmax_bounds( int i, int j, int type, Scalar& low, Sca
     }
 }
 
-
 void CV_MomentsTest::get_test_array_types_and_sizes( int test_case_idx,
                                                 vector<vector<Size> >& sizes, vector<vector<int> >& types )
 {
     RNG& rng = ts->get_rng();
     cvtest::ArrayTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
-    int cn = cvtest::randInt(rng) % 4 + 1;
+    int cn = (cvtest::randInt(rng) % 4) + 1;
     int depth = cvtest::randInt(rng) % 4;
     depth = depth == 0 ? CV_8U : depth == 1 ? CV_16U : depth == 2 ? CV_16S : CV_32F;
-    if( cn == 2 )
+
+    is_binary = cvtest::randInt(rng) % 2 != 0;
+    if( depth == 0 && !is_binary )
+        try_umat = cvtest::randInt(rng) % 5 != 0;
+    else
+        try_umat = cvtest::randInt(rng) % 2 != 0;
+
+    if( cn == 2 || try_umat )
         cn = 1;
 
+    OCL_TUNING_MODE_ONLY(
+    cn = 1;
+    depth = CV_8U;
+    try_umat = true;
+    is_binary = false;
+    sizes[INPUT][0] = Size(1024,768)
+    );
+
     types[INPUT][0] = CV_MAKETYPE(depth, cn);
     types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_64FC1;
     sizes[OUTPUT][0] = sizes[REF_OUTPUT][0] = cvSize(MOMENT_COUNT,1);
     if(CV_MAT_DEPTH(types[INPUT][0])>=CV_32S)
         sizes[INPUT][0].width = MAX(sizes[INPUT][0].width, 3);
 
-    is_binary = cvtest::randInt(rng) % 2 != 0;
     coi = 0;
     cvmat_allowed = true;
     if( cn > 1 )
@@ -149,7 +171,25 @@ void CV_MomentsTest::run_func()
 {
     CvMoments* m = (CvMoments*)test_mat[OUTPUT][0].ptr<double>();
     double* others = (double*)(m + 1);
-    cvMoments( test_array[INPUT][0], m, is_binary );
+    if( try_umat )
+    {
+        UMat u;
+        test_mat[INPUT][0].clone().copyTo(u);
+        OCL_TUNING_MODE_ONLY(
+            static double ttime = 0;
+            static int ncalls = 0;
+            moments(u, is_binary != 0);
+            double t = (double)getTickCount());
+        Moments new_m = moments(u, is_binary != 0);
+        OCL_TUNING_MODE_ONLY(
+            ttime += (double)getTickCount() - t;
+            ncalls++;
+            printf("%g\n", ttime/ncalls/u.total()));
+        *m = new_m;
+    }
+    else
+        cvMoments( test_array[INPUT][0], m, is_binary );
+
     others[0] = cvGetNormalizedCentralMoment( m, 2, 0 );
     others[1] = cvGetNormalizedCentralMoment( m, 1, 1 );
     others[2] = cvGetNormalizedCentralMoment( m, 0, 2 );
diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt
index 3a7c5ae03..1948e2114 100644
--- a/modules/java/CMakeLists.txt
+++ b/modules/java/CMakeLists.txt
@@ -159,7 +159,7 @@ foreach(java_file ${step3_input_files})
 
   if(ANDROID)
     get_filename_component(install_subdir "${java_file_name}" PATH)
-    install(FILES "${output_name}" DESTINATION "${JAVA_INSTALL_ROOT}/src/org/opencv/${install_subdir}" COMPONENT main)
+    install(FILES "${output_name}" DESTINATION "${JAVA_INSTALL_ROOT}/src/org/opencv/${install_subdir}" COMPONENT java)
   endif()
 endforeach()
 
@@ -173,7 +173,7 @@ if(ANDROID)
 
     if(NOT file MATCHES "jni/.+")
       get_filename_component(install_subdir "${file}" PATH)
-      install(FILES "${OpenCV_BINARY_DIR}/${file}" DESTINATION "${JAVA_INSTALL_ROOT}/${install_subdir}" COMPONENT main)
+      install(FILES "${OpenCV_BINARY_DIR}/${file}" DESTINATION "${JAVA_INSTALL_ROOT}/${install_subdir}" COMPONENT java)
     endif()
   endforeach()
 
@@ -209,11 +209,11 @@ if(ANDROID AND ANDROID_EXECUTABLE)
   list(APPEND copied_files ${lib_target_files} "${OpenCV_BINARY_DIR}/${ANDROID_MANIFEST_FILE}")
   list(APPEND step3_input_files "${CMAKE_CURRENT_BINARY_DIR}/${ANDROID_MANIFEST_FILE}")
 
-  install(FILES "${OpenCV_BINARY_DIR}/${ANDROID_PROJECT_PROPERTIES_FILE}" DESTINATION ${JAVA_INSTALL_ROOT} COMPONENT main)
-  install(FILES "${OpenCV_BINARY_DIR}/${ANDROID_MANIFEST_FILE}" DESTINATION ${JAVA_INSTALL_ROOT} COMPONENT main)
+  install(FILES "${OpenCV_BINARY_DIR}/${ANDROID_PROJECT_PROPERTIES_FILE}" DESTINATION ${JAVA_INSTALL_ROOT} COMPONENT java)
+  install(FILES "${OpenCV_BINARY_DIR}/${ANDROID_MANIFEST_FILE}" DESTINATION ${JAVA_INSTALL_ROOT} COMPONENT java)
   # creating empty 'gen' and 'res' folders
-  install(CODE "MAKE_DIRECTORY(\"\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${JAVA_INSTALL_ROOT}/gen\")" COMPONENT main)
-  install(CODE "MAKE_DIRECTORY(\"\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${JAVA_INSTALL_ROOT}/res\")" COMPONENT main)
+  install(CODE "MAKE_DIRECTORY(\"\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${JAVA_INSTALL_ROOT}/gen\")" COMPONENT java)
+  install(CODE "MAKE_DIRECTORY(\"\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${JAVA_INSTALL_ROOT}/res\")" COMPONENT java)
 endif(ANDROID AND ANDROID_EXECUTABLE)
 
 set(step3_depends ${step2_depends} ${step3_input_files} ${copied_files})
@@ -250,7 +250,7 @@ if(ANDROID)
 else(ANDROID)
   set(JAR_NAME opencv-${LIB_NAME_SUFIX}.jar)
   set(JAR_FILE "${OpenCV_BINARY_DIR}/bin/${JAR_NAME}")
-  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/build.xml.in" "${OpenCV_BINARY_DIR}/build.xml" IMMEDIATE @ONLY)
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/build.xml.in" "${OpenCV_BINARY_DIR}/build.xml" @ONLY)
   list(APPEND step3_depends "${OpenCV_BINARY_DIR}/build.xml")
 
   add_custom_command(OUTPUT "${JAR_FILE}" "${JAR_FILE}.dephelper"
@@ -266,7 +266,7 @@ else(ANDROID)
   else(WIN32)
     set(JAR_INSTALL_DIR share/OpenCV/java)
   endif(WIN32)
-  install(FILES ${JAR_FILE} DESTINATION ${JAR_INSTALL_DIR} COMPONENT main)
+  install(FILES ${JAR_FILE} DESTINATION ${JAR_INSTALL_DIR} COMPONENT java)
 endif(ANDROID)
 
 # step 5: build native part
@@ -337,17 +337,17 @@ endif()
 
 if(ANDROID)
   ocv_install_target(${the_module} EXPORT OpenCVModules
-          LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main
-          ARCHIVE DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main)
+          LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT java
+          ARCHIVE DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT java)
 else()
   if(NOT INSTALL_CREATE_DISTRIB)
     ocv_install_target(${the_module} EXPORT OpenCVModules
-            RUNTIME DESTINATION ${JAR_INSTALL_DIR} COMPONENT main
-            LIBRARY DESTINATION ${JAR_INSTALL_DIR} COMPONENT main)
+            RUNTIME DESTINATION ${JAR_INSTALL_DIR} COMPONENT java
+            LIBRARY DESTINATION ${JAR_INSTALL_DIR} COMPONENT java)
   else()
     ocv_install_target(${the_module} EXPORT OpenCVModules
-            RUNTIME DESTINATION ${JAR_INSTALL_DIR}/${OpenCV_ARCH} COMPONENT main
-            LIBRARY DESTINATION ${JAR_INSTALL_DIR}/${OpenCV_ARCH} COMPONENT main)
+            RUNTIME DESTINATION ${JAR_INSTALL_DIR}/${OpenCV_ARCH} COMPONENT java
+            LIBRARY DESTINATION ${JAR_INSTALL_DIR}/${OpenCV_ARCH} COMPONENT java)
   endif()
 endif()
 
diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py
index c41e6336c..cce270828 100755
--- a/modules/java/generator/gen_java.py
+++ b/modules/java/generator/gen_java.py
@@ -18,6 +18,8 @@ class_ignore_list = (
 const_ignore_list = (
     "CV_CAP_OPENNI",
     "CV_CAP_PROP_OPENNI_",
+    "CV_CAP_INTELPERC",
+    "CV_CAP_PROP_INTELPERC_"
     "WINDOW_AUTOSIZE",
     "CV_WND_PROP_",
     "CV_WINDOW_",
@@ -396,7 +398,7 @@ JNIEXPORT jdoubleArray JNICALL Java_org_opencv_core_Core_n_1minMaxLocManual
 
     return result;
 
-    } catch(cv::Exception e) {
+    } catch(const cv::Exception& e) {
         LOGD("Core::n_1minMaxLoc() catched cv::Exception: %s", e.what());
         jclass je = env->FindClass("org/opencv/core/CvException");
         if(!je) je = env->FindClass("java/lang/Exception");
@@ -469,7 +471,7 @@ JNIEXPORT jdoubleArray JNICALL Java_org_opencv_core_Core_n_1getTextSize
 
         return result;
 
-    } catch(cv::Exception e) {
+    } catch(const cv::Exception& e) {
         LOGD("Core::n_1getTextSize() catched cv::Exception: %s", e.what());
         jclass je = env->FindClass("org/opencv/core/CvException");
         if(!je) je = env->FindClass("java/lang/Exception");
diff --git a/modules/java/generator/rst_parser.py b/modules/java/generator/rst_parser.py
index d2c3d4019..750d6f0be 100755
--- a/modules/java/generator/rst_parser.py
+++ b/modules/java/generator/rst_parser.py
@@ -2,8 +2,7 @@
 
 from __future__ import print_function
 import os, sys, re, string, fnmatch
-
-allmodules = ["core", "flann", "imgproc", "ml", "highgui", "video", "features2d", "calib3d", "objdetect", "legacy", "contrib", "cuda", "androidcamera", "java", "python", "stitching", "ts", "photo", "nonfree", "videostab", "ocl", "softcascade", "superres"]
+allmodules = ["core", "flann", "imgproc", "ml", "highgui", "video", "features2d", "calib3d", "objdetect", "legacy", "contrib", "cuda", "androidcamera", "java", "python", "stitching", "ts", "photo", "nonfree", "videostab", "softcascade", "superres"]
 verbose = False
 show_warnings = True
 show_errors = True
diff --git a/modules/java/generator/src/cpp/Mat.cpp b/modules/java/generator/src/cpp/Mat.cpp
index b3b0f66e7..185cb2de9 100644
--- a/modules/java/generator/src/cpp/Mat.cpp
+++ b/modules/java/generator/src/cpp/Mat.cpp
@@ -467,7 +467,7 @@ JNIEXPORT jint JNICALL Java_org_opencv_core_Mat_n_1dims
         LOGD("%s", method_name);
         Mat* me = (Mat*) self; //TODO: check for NULL
         return me->dims;
-    } catch(cv::Exception e) {
+    } catch(const cv::Exception& e) {
         throwJavaException(env, &e, method_name);
     } catch (...) {
         throwJavaException(env, 0, method_name);
diff --git a/modules/java/generator/src/cpp/utils.cpp b/modules/java/generator/src/cpp/utils.cpp
index 40811e8f9..2d409c863 100644
--- a/modules/java/generator/src/cpp/utils.cpp
+++ b/modules/java/generator/src/cpp/utils.cpp
@@ -48,7 +48,7 @@ JNIEXPORT void JNICALL Java_org_opencv_android_Utils_nBitmapToMat2
             }
             AndroidBitmap_unlockPixels(env, bitmap);
             return;
-        } catch(cv::Exception e) {
+        } catch(const cv::Exception& e) {
             AndroidBitmap_unlockPixels(env, bitmap);
             LOGE("nBitmapToMat catched cv::Exception: %s", e.what());
             jclass je = env->FindClass("org/opencv/core/CvException");
@@ -130,7 +130,7 @@ JNIEXPORT void JNICALL Java_org_opencv_android_Utils_nMatToBitmap2
             }
             AndroidBitmap_unlockPixels(env, bitmap);
             return;
-        } catch(cv::Exception e) {
+        } catch(const cv::Exception& e) {
             AndroidBitmap_unlockPixels(env, bitmap);
             LOGE("nMatToBitmap catched cv::Exception: %s", e.what());
             jclass je = env->FindClass("org/opencv/core/CvException");
diff --git a/modules/java/generator/src/java/android+NativeCameraView.java b/modules/java/generator/src/java/android+NativeCameraView.java
index 62d077580..8035d0437 100644
--- a/modules/java/generator/src/java/android+NativeCameraView.java
+++ b/modules/java/generator/src/java/android+NativeCameraView.java
@@ -22,6 +22,7 @@ public class NativeCameraView extends CameraBridgeViewBase {
     private Thread mThread;
 
     protected VideoCapture mCamera;
+    protected NativeCameraFrame mFrame;
 
     public NativeCameraView(Context context, int cameraId) {
         super(context, cameraId);
@@ -97,6 +98,8 @@ public class NativeCameraView extends CameraBridgeViewBase {
             if (mCamera.isOpened() == false)
                 return false;
 
+            mFrame = new NativeCameraFrame(mCamera);
+
             java.util.List<Size> sizes = mCamera.getSupportedPreviewSizes();
 
             /* Select the size that fits surface considering maximum size allowed */
@@ -127,9 +130,8 @@ public class NativeCameraView extends CameraBridgeViewBase {
 
     private void releaseCamera() {
         synchronized (this) {
-            if (mCamera != null) {
-                mCamera.release();
-            }
+            if (mFrame != null) mFrame.release();
+            if (mCamera != null) mCamera.release();
         }
     }
 
@@ -153,6 +155,11 @@ public class NativeCameraView extends CameraBridgeViewBase {
             mRgba = new Mat();
         }
 
+        public void release() {
+            if (mGray != null) mGray.release();
+            if (mRgba != null) mRgba.release();
+        }
+
         private VideoCapture mCapture;
         private Mat mRgba;
         private Mat mGray;
@@ -167,7 +174,7 @@ public class NativeCameraView extends CameraBridgeViewBase {
                     break;
                 }
 
-                deliverAndDrawFrame(new NativeCameraFrame(mCamera));
+                deliverAndDrawFrame(mFrame);
 
             } while (!mStopThread);
         }
diff --git a/modules/java/generator/src/java/android+OpenCVLoader.java b/modules/java/generator/src/java/android+OpenCVLoader.java
index a130ae30f..0892e3af3 100644
--- a/modules/java/generator/src/java/android+OpenCVLoader.java
+++ b/modules/java/generator/src/java/android+OpenCVLoader.java
@@ -37,6 +37,10 @@ public class OpenCVLoader
      */
     public static final String OPENCV_VERSION_2_4_7 = "2.4.7";
 
+    /**
+     * OpenCV Library version 2.4.8.
+     */
+    public static final String OPENCV_VERSION_2_4_8 = "2.4.8";
 
     /**
      * Loads and initializes OpenCV library from current application package. Roughly, it's an analog of system.loadLibrary("opencv_java").
@@ -44,7 +48,17 @@ public class OpenCVLoader
      */
     public static boolean initDebug()
     {
-        return StaticHelper.initOpenCV();
+        return StaticHelper.initOpenCV(false);
+    }
+
+    /**
+     * Loads and initializes OpenCV library from current application package. Roughly, it's an analog of system.loadLibrary("opencv_java").
+     * @param InitCuda load and initialize CUDA runtime libraries.
+     * @return Returns true is initialization of OpenCV was successful.
+     */
+    public static boolean initDebug(boolean InitCuda)
+    {
+        return StaticHelper.initOpenCV(InitCuda);
     }
 
     /**
diff --git a/modules/java/generator/src/java/android+StaticHelper.java b/modules/java/generator/src/java/android+StaticHelper.java
index 8d0629c8d..10442c904 100644
--- a/modules/java/generator/src/java/android+StaticHelper.java
+++ b/modules/java/generator/src/java/android+StaticHelper.java
@@ -7,11 +7,21 @@ import android.util.Log;
 
 class StaticHelper {
 
-    public static boolean initOpenCV()
+    public static boolean initOpenCV(boolean InitCuda)
     {
         boolean result;
         String libs = "";
 
+        if(InitCuda)
+        {
+            loadLibrary("cudart");
+            loadLibrary("nppc");
+            loadLibrary("nppi");
+            loadLibrary("npps");
+            loadLibrary("cufft");
+            loadLibrary("cublas");
+        }
+
         Log.d(TAG, "Trying to get library list");
 
         try
@@ -52,7 +62,7 @@ class StaticHelper {
         try
         {
             System.loadLibrary(Name);
-            Log.d(TAG, "OpenCV libs init was ok!");
+            Log.d(TAG, "Library " + Name + " loaded");
         }
         catch(UnsatisfiedLinkError e)
         {
diff --git a/modules/java/generator/src/java/core+TermCriteria.java b/modules/java/generator/src/java/core+TermCriteria.java
index 98a5e3c39..c67e51ea8 100644
--- a/modules/java/generator/src/java/core+TermCriteria.java
+++ b/modules/java/generator/src/java/core+TermCriteria.java
@@ -87,7 +87,6 @@ public class TermCriteria {
 
     @Override
     public String toString() {
-        if (this == null) return "null";
         return "{ type: " + type + ", maxCount: " + maxCount + ", epsilon: " + epsilon + "}";
     }
 }
diff --git a/modules/legacy/include/opencv2/legacy.hpp b/modules/legacy/include/opencv2/legacy.hpp
index 01f726bc0..e85bfd24d 100644
--- a/modules/legacy/include/opencv2/legacy.hpp
+++ b/modules/legacy/include/opencv2/legacy.hpp
@@ -2672,12 +2672,12 @@ protected:
     // The minimum distance to each training patch with all its affine poses is found over all scales.
     // The class ID of a match is returned for each keypoint. The distance is calculated over PCA components
     // loaded with DescriptorOneWay::Initialize, kd tree is used for finding minimum distances.
-    virtual void knnMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+    virtual void knnMatchImpl( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                               std::vector<std::vector<DMatch> >& matches, int k,
-                              const std::vector<Mat>& masks, bool compactResult );
-    virtual void radiusMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+                              InputArrayOfArrays masks, bool compactResult );
+    virtual void radiusMatchImpl( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                                  std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                                 const std::vector<Mat>& masks, bool compactResult );
+                                 InputArrayOfArrays masks, bool compactResult );
 
     Ptr<OneWayDescriptorBase> base;
     Params params;
@@ -2735,12 +2735,12 @@ public:
     virtual Ptr<GenericDescriptorMatcher> clone( bool emptyTrainData=false ) const;
 
 protected:
-    virtual void knnMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+    virtual void knnMatchImpl( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                               std::vector<std::vector<DMatch> >& matches, int k,
-                              const std::vector<Mat>& masks, bool compactResult );
-    virtual void radiusMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+                              InputArrayOfArrays masks, bool compactResult );
+    virtual void radiusMatchImpl( InputArray queryImage, std::vector<KeyPoint>& queryKeypoints,
                                  std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                                 const std::vector<Mat>& masks, bool compactResult );
+                                 InputArrayOfArrays masks, bool compactResult );
 
     void trainFernClassifier();
     void calcBestProbAndMatchIdx( const Mat& image, const Point2f& pt,
@@ -2770,7 +2770,7 @@ public:
     virtual bool empty() const;
 
 protected:
-    virtual void computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const;
+    virtual void computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
 
     RTreeClassifier classifier_;
     static const int BORDER_SIZE = 16;
@@ -2783,15 +2783,17 @@ CalonderDescriptorExtractor<T>::CalonderDescriptorExtractor(const String& classi
 }
 
 template<typename T>
-void CalonderDescriptorExtractor<T>::computeImpl( const Mat& image,
+void CalonderDescriptorExtractor<T>::computeImpl( InputArray _image,
                                                  std::vector<KeyPoint>& keypoints,
-                                                 Mat& descriptors) const
+                                                 OutputArray _descriptors) const
 {
+    Mat image = _image.getMat(), descriptors;
     // Cannot compute descriptors for keypoints on the image border.
     KeyPointsFilter::runByImageBorder(keypoints, image.size(), BORDER_SIZE);
 
     /// @todo Check 16-byte aligned
-    descriptors.create((int)keypoints.size(), classifier_.classes(), cv::DataType<T>::type);
+    _descriptors.create((int)keypoints.size(), classifier_.classes(), cv::DataType<T>::type);
+     descriptors = _descriptors.getMat();
 
     int patchSize = RandomizedTree::PATCH_SIZE;
     int offset = patchSize / 2;
diff --git a/modules/legacy/src/bgfg_gaussmix.cpp b/modules/legacy/src/bgfg_gaussmix.cpp
index 4a19fde38..415e63a0e 100644
--- a/modules/legacy/src/bgfg_gaussmix.cpp
+++ b/modules/legacy/src/bgfg_gaussmix.cpp
@@ -413,7 +413,7 @@ CV_INLINE int _icvRemoveShadowGMM(float* data, int nD,
 //IEEE Trans. on Pattern Analysis and Machine Intelligence, vol.26, no.5, pages 651-656, 2004
 //http://www.zoranz.net/Publications/zivkovic2004PAMI.pdf
 
-#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
 # pragma GCC diagnostic push
 # pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #endif
@@ -606,7 +606,7 @@ CV_INLINE int _icvUpdateGMM(float* data, int nD,
     return bBackground;
 }
 
-#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
 # pragma GCC diagnostic pop
 #endif
 
diff --git a/modules/legacy/src/blobtrack.cpp b/modules/legacy/src/blobtrack.cpp
index 48b83ef91..00e4905cc 100644
--- a/modules/legacy/src/blobtrack.cpp
+++ b/modules/legacy/src/blobtrack.cpp
@@ -205,7 +205,7 @@ double CvVSModule::GetParam(const char* name)
         if(p->pInt) return p->pInt[0];
     }
     return 0;
-};
+}
 
 const char* CvVSModule::GetParamStr(const char* name)
 {
diff --git a/modules/legacy/src/blobtrackgen1.cpp b/modules/legacy/src/blobtrackgen1.cpp
index 7114f2b4c..af9c3b3a1 100644
--- a/modules/legacy/src/blobtrackgen1.cpp
+++ b/modules/legacy/src/blobtrackgen1.cpp
@@ -98,7 +98,7 @@ public:
         m_pFileName = NULL;
 
         SetModuleName("Gen1");
-    };
+    }
 
     ~CvBlobTrackGen1()
     {
@@ -119,7 +119,7 @@ public:
         }   /* Check next track. */
     }   /*  Destructor. */
 
-    void    SetFileName(char* pFileName){m_pFileName = pFileName;};
+    void    SetFileName(char* pFileName){m_pFileName = pFileName;}
 
     void    AddBlob(CvBlob* pBlob)
     {
@@ -140,7 +140,7 @@ public:
         pTrack->FrameLast = m_Frame;
         assert(pTrack->pSeq);
         pTrack->pSeq->AddBlob(pBlob);
-    };
+    }
 
     void    Process(IplImage* /*pImg*/ = NULL, IplImage* /*pFG*/ = NULL)
     {
diff --git a/modules/legacy/src/blobtrackgenyml.cpp b/modules/legacy/src/blobtrackgenyml.cpp
index 0d9de45ee..c33d16d03 100644
--- a/modules/legacy/src/blobtrackgenyml.cpp
+++ b/modules/legacy/src/blobtrackgenyml.cpp
@@ -147,7 +147,7 @@ public:
         m_Size = cvSize(2,2);
 
         SetModuleName("YML");
-    };
+    }
 
     ~CvBlobTrackGenYML()
     {
@@ -164,7 +164,7 @@ public:
 
     }   /* Destructor. */
 
-    void    SetFileName(char* pFileName){m_pFileName = pFileName;};
+    void    SetFileName(char* pFileName){m_pFileName = pFileName;}
     void    AddBlob(CvBlob* pBlob)
     {
         DefBlobTrack* pTrack = (DefBlobTrack*)m_TrackList.GetBlobByID(CV_BLOB_ID(pBlob));
@@ -184,7 +184,7 @@ public:
         pTrack->FrameLast = m_Frame;
         assert(pTrack->pSeq);
         pTrack->pSeq->AddBlob(pBlob);
-    };
+    }
     void    Process(IplImage* pImg = NULL, IplImage* /*pFG*/ = NULL)
     {
         int i;
diff --git a/modules/legacy/src/blobtrackingauto.cpp b/modules/legacy/src/blobtrackingauto.cpp
index ada0bdff4..1fd695310 100644
--- a/modules/legacy/src/blobtrackingauto.cpp
+++ b/modules/legacy/src/blobtrackingauto.cpp
@@ -97,16 +97,16 @@ class CvBlobTrackerAuto1: public CvBlobTrackerAuto
 public:
     CvBlobTrackerAuto1(CvBlobTrackerAutoParam1* param);
     ~CvBlobTrackerAuto1();
-    CvBlob* GetBlob(int index){return m_BlobList.GetBlob(index);};
-    CvBlob* GetBlobByID(int ID){return m_BlobList.GetBlobByID(ID);};
-    int     GetBlobNum(){return m_BlobList.GetBlobNum();};
-    virtual IplImage* GetFGMask(){return m_pFGMask;};
-    float   GetState(int BlobID){return m_pBTA?m_pBTA->GetState(BlobID):0;};
-    const char*   GetStateDesc(int BlobID){return m_pBTA?m_pBTA->GetStateDesc(BlobID):NULL;};
+    CvBlob* GetBlob(int index){return m_BlobList.GetBlob(index);}
+    CvBlob* GetBlobByID(int ID){return m_BlobList.GetBlobByID(ID);}
+    int     GetBlobNum(){return m_BlobList.GetBlobNum();}
+    virtual IplImage* GetFGMask(){return m_pFGMask;}
+    float   GetState(int BlobID){return m_pBTA?m_pBTA->GetState(BlobID):0;}
+    const char*   GetStateDesc(int BlobID){return m_pBTA?m_pBTA->GetStateDesc(BlobID):NULL;}
     /* Return 0 if trajectory is normal;
        return >0 if trajectory abnormal. */
     void Process(IplImage* pImg, IplImage* pMask = NULL);
-    void Release(){delete this;};
+    void Release(){delete this;}
 
 private:
     IplImage*               m_pFGMask;
diff --git a/modules/legacy/src/blobtrackingcc.cpp b/modules/legacy/src/blobtrackingcc.cpp
index c2279a20b..1a72c7e47 100644
--- a/modules/legacy/src/blobtrackingcc.cpp
+++ b/modules/legacy/src/blobtrackingcc.cpp
@@ -125,23 +125,23 @@ public:
         CommentParam("ConfidenceType","Type of calculated Confidence (NearestBlob, AverFG, BC)");
 
         SetModuleName("CC");
-    };
+    }
 
     ~CvBlobTrackerCC()
     {
         if(m_pMem)cvReleaseMemStorage(&m_pMem);
-    };
+    }
 
     /* Blob functions: */
-    virtual int     GetBlobNum() {return m_BlobList.GetBlobNum();};
-    virtual CvBlob* GetBlob(int BlobIndex){return m_BlobList.GetBlob(BlobIndex);};
+    virtual int     GetBlobNum() {return m_BlobList.GetBlobNum();}
+    virtual CvBlob* GetBlob(int BlobIndex){return m_BlobList.GetBlob(BlobIndex);}
     virtual void    SetBlob(int BlobIndex, CvBlob* pBlob)
     {
         CvBlob* pB = m_BlobList.GetBlob(BlobIndex);
         if(pB) pB[0] = pBlob[0];
-    };
+    }
 
-    virtual CvBlob* GetBlobByID(int BlobID){return m_BlobList.GetBlobByID(BlobID);};
+    virtual CvBlob* GetBlobByID(int BlobID){return m_BlobList.GetBlobByID(BlobID);}
     virtual void    DelBlob(int BlobIndex)
     {
         DefBlobTracker* pBT = (DefBlobTracker*)m_BlobList.GetBlob(BlobIndex);
@@ -156,7 +156,7 @@ public:
         }
         delete pBT->pBlobHyp;
         m_BlobList.DelBlob(BlobIndex);
-    };
+    }
 #if 0
     virtual void    DelBlobByID(int BlobID)
     {
@@ -166,7 +166,7 @@ public:
         m_BlobList.DelBlobByID(BlobID);
     };
 #endif
-    virtual void    Release(){delete this;};
+    virtual void    Release(){delete this;}
 
     /* Add new blob to track it and assign to this blob personal ID */
     /* pBlob - pinter to structure with blob parameters (ID is ignored)*/
@@ -185,7 +185,7 @@ public:
         NewB.AverFG = pImgFG?CalcAverageMask(pB,pImgFG):0;
         m_BlobList.AddBlob((CvBlob*)&NewB);
         return m_BlobList.GetBlob(m_BlobList.GetBlobNum()-1);
-    };
+    }
 
     virtual void    Process(IplImage* pImg, IplImage* pImgFG = NULL)
     {
@@ -390,7 +390,7 @@ public:
 
         pBlob[0] = pB[0];
         pBlob->ID = ID;
-    };
+    }
 
     virtual double  GetConfidence(int BlobIndex, CvBlob* pBlob, IplImage* /*pImg*/, IplImage* pImgFG = NULL)
     {
@@ -443,7 +443,7 @@ public:
         }   /* Calculate sum of mask. */
 
         return W;
-    };
+    }
 
     virtual void UpdateBlob(int BlobIndex, CvBlob* /*pBlob*/, IplImage* /*pImg*/, IplImage* pImgFG = NULL)
     {
@@ -455,7 +455,7 @@ public:
         {
         //pBT->AverFG = pBT->AverFG * (1-m_Alpha) + m_Alpha * CalcAverageMask(pBlob,pImgFG);
         }
-    };
+    }
 
     virtual void ParamUpdate()
     {
@@ -481,7 +481,7 @@ public:
         DefBlobTracker* pBT = (DefBlobTracker*)m_BlobList.GetBlob(BlobIdx);
         assert(pBT->pBlobHyp);
         return pBT->pBlobHyp->GetBlobNum();
-    };  /* CvBlobtrackerList::GetBlobHypNum() */
+    }  /* CvBlobtrackerList::GetBlobHypNum() */
 
     /* Return pointer to specified blob hypothesis by index blob: */
     virtual CvBlob* GetBlobHyp(int BlobIndex, int hypothesis)
@@ -489,7 +489,7 @@ public:
         DefBlobTracker* pBT = (DefBlobTracker*)m_BlobList.GetBlob(BlobIndex);
         assert(pBT->pBlobHyp);
         return pBT->pBlobHyp->GetBlob(hypothesis);
-    };  /* CvBlobtrackerList::GetBlobHyp() */
+    }  /* CvBlobtrackerList::GetBlobHyp() */
 
     /* Set new parameters for specified (by index) blob hypothesis
      * (can be called several times for each hypothesis):
@@ -512,7 +512,7 @@ public:
             assert(pBT->pBlobHyp);
             pBT->pBlobHyp->AddBlob(pBlob);
         }
-    };
+    }
 
 private:
     CvBlob* GetNearestBlob(CvBlob* pB)
@@ -542,7 +542,7 @@ private:
 
         return pBBest;
 
-    }; /* GetNearestBlob */
+    } /* GetNearestBlob */
 
 };
 
diff --git a/modules/legacy/src/blobtrackingccwithcr.cpp b/modules/legacy/src/blobtrackingccwithcr.cpp
index ad00b9438..e8bd4dec8 100644
--- a/modules/legacy/src/blobtrackingccwithcr.cpp
+++ b/modules/legacy/src/blobtrackingccwithcr.cpp
@@ -110,23 +110,23 @@ public:
             pM->Release();
         }
         SetParam("SizeVar",0);
-    };
+    }
 
     ~CvBlobTrackerCCCR()
     {
         if(m_pMem)cvReleaseMemStorage(&m_pMem);
-    };
+    }
 
     /* Blob functions: */
-    virtual int     GetBlobNum() {return m_BlobList.GetBlobNum();};
-    virtual CvBlob* GetBlob(int BlobIndex){return m_BlobList.GetBlob(BlobIndex);};
+    virtual int     GetBlobNum() {return m_BlobList.GetBlobNum();}
+    virtual CvBlob* GetBlob(int BlobIndex){return m_BlobList.GetBlob(BlobIndex);}
     virtual void    SetBlob(int BlobIndex, CvBlob* pBlob)
     {
         CvBlob* pB = m_BlobList.GetBlob(BlobIndex);
         if(pB) pB[0] = pBlob[0];
-    };
+    }
 
-    virtual CvBlob* GetBlobByID(int BlobID){return m_BlobList.GetBlobByID(BlobID);};
+    virtual CvBlob* GetBlobByID(int BlobID){return m_BlobList.GetBlobByID(BlobID);}
     virtual void    DelBlob(int BlobIndex)
     {
         DefBlobTrackerCR* pBT = (DefBlobTrackerCR*)m_BlobList.GetBlob(BlobIndex);
@@ -134,7 +134,7 @@ public:
         if(pBT->pPredictor)pBT->pPredictor->Release();
         delete pBT->pBlobHyp;
         m_BlobList.DelBlob(BlobIndex);
-    };
+    }
 
     virtual void    DelBlobByID(int BlobID)
     {
@@ -143,9 +143,9 @@ public:
         if(pBT->pPredictor)pBT->pPredictor->Release();
         delete pBT->pBlobHyp;
         m_BlobList.DelBlobByID(BlobID);
-    };
+    }
 
-    virtual void    Release(){delete this;};
+    virtual void    Release(){delete this;}
 
     /* Add new blob to track it and assign to this blob personal ID */
     /* pBlob - pinter to structure with blob parameters (ID is ignored)*/
@@ -169,7 +169,7 @@ public:
         }
         m_BlobList.AddBlob((CvBlob*)&NewB);
         return m_BlobList.GetBlob(m_BlobList.GetBlobNum()-1);
-    };
+    }
 
     virtual void    Process(IplImage* pImg, IplImage* pImgFG = NULL)
     {
@@ -480,7 +480,7 @@ public:
         }   /* Read next blob. */
     }   /*  CCwithCR LoadState */
 
-    //void SetCollision(int Collision){m_Collision = Collision;};
+    //void SetCollision(int Collision){m_Collision = Collision;}
 };
 
 CvBlobTrackerOne* cvCreateBlobTrackerOneMSPF();
diff --git a/modules/legacy/src/blobtrackinglist.cpp b/modules/legacy/src/blobtrackinglist.cpp
index e48f6593f..cbf007186 100644
--- a/modules/legacy/src/blobtrackinglist.cpp
+++ b/modules/legacy/src/blobtrackinglist.cpp
@@ -228,7 +228,7 @@ public:
         {
             m_BlobTrackerList.DelBlob(i-1);
         }
-    };
+    }
 
     CvBlob* AddBlob(CvBlob* pBlob, IplImage* pImg, IplImage* pImgFG )
     {   /* Create new tracker: */
@@ -244,7 +244,7 @@ public:
         F.pTracker->Init(pBlob,pImg, pImgFG);
         m_BlobTrackerList.AddBlob((CvBlob*)&F);
         return m_BlobTrackerList.GetBlob(m_BlobTrackerList.GetBlobNum()-1);
-    };
+    }
 
     void DelBlob(int BlobIndex)
     {
@@ -404,7 +404,7 @@ public:
         }   /* Update predictor. */
 #endif
         m_ClearHyp = 1;
-    };
+    }
 
 
     /* Process on blob (for multi hypothesis tracing) */
@@ -421,7 +421,7 @@ public:
             pBlob[0] = pF->blob;
         }
         pBlob->ID = ID;
-    };
+    }
 
     virtual double  GetConfidence(int BlobIndex, CvBlob* pBlob, IplImage* pImg, IplImage* pImgFG = NULL)
     {
@@ -429,7 +429,7 @@ public:
         if(pF==NULL) return 0;
         if(pF->pTracker==NULL) return 0;
         return pF->pTracker->GetConfidence(pBlob?pBlob:(&pF->blob), pImg, pImgFG, NULL);
-    };
+    }
 
     virtual double GetConfidenceList(CvBlobSeq* pBlobList, IplImage* pImg, IplImage* pImgFG = NULL)
     {
@@ -460,7 +460,7 @@ public:
 //            cvWaitKey(0);
         }
         return W;
-    };
+    }
 
     virtual void UpdateBlob(int BlobIndex, CvBlob* pBlob, IplImage* pImg, IplImage* /*pImgFG*/ = NULL)
     {
@@ -469,10 +469,10 @@ public:
         {
             pF->pTracker->Update(pBlob?pBlob:&(pF->blob),pImg,m_pImgFG);
         }
-    };
+    }
 
-    int     GetBlobNum(){return m_BlobTrackerList.GetBlobNum();};
-    CvBlob* GetBlob(int index){return m_BlobTrackerList.GetBlob(index);};
+    int     GetBlobNum(){return m_BlobTrackerList.GetBlobNum();}
+    CvBlob* GetBlob(int index){return m_BlobTrackerList.GetBlob(index);}
 
     void  SetBlob(int BlobIndex, CvBlob* pBlob)
     {
@@ -485,7 +485,7 @@ public:
         }
     }
 
-    void    Release(){delete this;};
+    void    Release(){delete this;}
 
     /* Additional functionality: */
     CvBlob* GetBlobByID(int BlobID){return m_BlobTrackerList.GetBlobByID(BlobID);}
@@ -497,7 +497,7 @@ public:
         DefBlobTrackerL* pF = (DefBlobTrackerL*)m_BlobTrackerList.GetBlob(BlobIdx);
         assert(pF->pBlobHyp);
         return pF->pBlobHyp->GetBlobNum();
-    };  /* CvBlobtrackerList::GetBlobHypNum() */
+    }  /* CvBlobtrackerList::GetBlobHypNum() */
 
     /* Return pointer to specified blob hypothesis by index blob: */
     virtual CvBlob* GetBlobHyp(int BlobIndex, int hypothesis)
@@ -505,7 +505,7 @@ public:
         DefBlobTrackerL* pF = (DefBlobTrackerL*)m_BlobTrackerList.GetBlob(BlobIndex);
         assert(pF->pBlobHyp);
         return pF->pBlobHyp->GetBlob(hypothesis);
-    };  /* CvBlobtrackerList::GetBlobHyp() */
+    }  /* CvBlobtrackerList::GetBlobHyp() */
 
     /* Set new parameters for specified (by index) blob hyp (can be called several times for each hyp )*/
     virtual void    SetBlobHyp(int BlobIndex, CvBlob* pBlob)
@@ -526,7 +526,7 @@ public:
             assert(pF->pBlobHyp);
             pF->pBlobHyp->AddBlob(pBlob);
         }
-    };  /* CvBlobtrackerList::SetBlobHyp */
+    }  /* CvBlobtrackerList::SetBlobHyp */
 
 private:
 public:
diff --git a/modules/legacy/src/blobtrackingmsfg.cpp b/modules/legacy/src/blobtrackingmsfg.cpp
index 5fd9634d4..a26ce029f 100644
--- a/modules/legacy/src/blobtrackingmsfg.cpp
+++ b/modules/legacy/src/blobtrackingmsfg.cpp
@@ -260,7 +260,7 @@ private:
 
         pHist->m_HistVolume = Volume;
 
-    };  /* CollectHist */
+    }  /* CollectHist */
 
     double calcBhattacharyya(DefHist* pHM = NULL, DefHist* pHC = NULL, DefHist* pHT = NULL)
     {
@@ -370,7 +370,7 @@ public:
         if(pImg)
             CollectHist(pImg, pImgFG, pBlobInit, &m_HistModel);
         m_Blob = pBlobInit[0];
-    };
+    }
 
     virtual CvBlob* Process(CvBlob* pBlobPrev, IplImage* pImg, IplImage* pImgFG = NULL)
     {
@@ -603,7 +603,7 @@ public:
 
         return &m_Blob;
 
-    };  /* CvBlobTrackerOneMSFG::Process */
+    }  /* CvBlobTrackerOneMSFG::Process */
 
     virtual double GetConfidence(CvBlob* pBlob, IplImage* pImg, IplImage* /*pImgFG*/ = NULL, IplImage* pImgUnusedReg = NULL)
     {
@@ -611,14 +611,14 @@ public:
         double  B = GetBhattacharyya(pImg, pImgUnusedReg, pBlob, &m_HistTemp);
         return exp((B-1)/(2*S));
 
-    };  /*CvBlobTrackerOneMSFG::*/
+    }  /*CvBlobTrackerOneMSFG::*/
 
     virtual void Update(CvBlob* pBlob, IplImage* pImg, IplImage* pImgFG = NULL)
     {   /* Update histogram: */
         UpdateModelHist(pImg, pImgFG, pBlob?pBlob:&m_Blob);
     }   /*CvBlobTrackerOneMSFG::*/
 
-    virtual void Release(){delete this;};
+    virtual void Release(){delete this;}
     virtual void SetCollision(int CollisionFlag)
     {
         m_Collision = CollisionFlag;
@@ -629,7 +629,7 @@ public:
         cvWriteInt(fs,"Collision", m_Collision);
         cvWriteInt(fs,"HistVolume", cvRound(m_HistModel.m_HistVolume));
         cvWrite(fs,"Hist", m_HistModel.m_pHist);
-    };
+    }
     virtual void LoadState(CvFileStorage* fs, CvFileNode* node)
     {
         CvMat* pM;
@@ -641,7 +641,7 @@ public:
             m_HistModel.m_pHist = pM;
             m_HistModel.m_HistVolume = (float)cvSum(pM).val[0];
         }
-    };
+    }
 
 };  /*CvBlobTrackerOneMSFG*/
 
@@ -782,7 +782,7 @@ public:
         cvWriteInt(fs,"ParticleNum",m_ParticleNum);
         cvWriteStruct(fs,"ParticlesPredicted",m_pParticlesPredicted,"ffffiffd",m_ParticleNum);
         cvWriteStruct(fs,"ParticlesResampled",m_pParticlesResampled,"ffffiffd",m_ParticleNum);
-    };
+    }
 
     virtual void LoadState(CvFileStorage* fs, CvFileNode* node)
     {
@@ -796,7 +796,7 @@ public:
             cvReadStructByName(fs,node,"ParticlesPredicted",m_pParticlesPredicted,"ffffiffd");
             cvReadStructByName(fs,node,"ParticlesResampled",m_pParticlesResampled,"ffffiffd");
         }
-    };
+    }
     CvBlobTrackerOneMSPF()
     {
         m_pParticlesPredicted = NULL;
@@ -847,7 +847,7 @@ private:
         if(m_pParticlesPredicted)cvFree(&m_pParticlesPredicted);
         m_pParticlesPredicted = (DefParticle*)cvAlloc(sizeof(DefParticle)*m_ParticleNum);
         m_pParticlesResampled = (DefParticle*)cvAlloc(sizeof(DefParticle)*m_ParticleNum);
-    };  /* Realloc*/
+    }  /* Realloc*/
 
     void DrawDebug(IplImage* pImg, IplImage* /*pImgFG*/)
     {
@@ -1161,7 +1161,7 @@ public:
         }
     }
 
-    virtual void Release(){delete this;};
+    virtual void Release(){delete this;}
     virtual void ParamUpdate()
     {
         Realloc();
diff --git a/modules/legacy/src/blobtrackingmsfgs.cpp b/modules/legacy/src/blobtrackingmsfgs.cpp
index f3b3cf375..afe33847f 100644
--- a/modules/legacy/src/blobtrackingmsfgs.cpp
+++ b/modules/legacy/src/blobtrackingmsfgs.cpp
@@ -210,7 +210,7 @@ private:
 
         if(pHistVolume)pHistVolume[0] = Volume;
 
-    }; /* calcHist */
+    } /* calcHist */
 
     double calcBhattacharyya()
     {
@@ -440,9 +440,9 @@ public:
 
         return &m_Blob;
 
-    };  /* Process */
+    }  /* Process */
 
-    virtual void Release(){delete this;};
+    virtual void Release(){delete this;}
 }; /*CvBlobTrackerOneMSFGS*/
 
 static CvBlobTrackerOne* cvCreateBlobTrackerOneMSFGS()
diff --git a/modules/legacy/src/blobtrackpostproclist.cpp b/modules/legacy/src/blobtrackpostproclist.cpp
index 33e68fe46..ed03d8e58 100644
--- a/modules/legacy/src/blobtrackpostproclist.cpp
+++ b/modules/legacy/src/blobtrackpostproclist.cpp
@@ -74,7 +74,7 @@ public:
             DefBlobFilter* pF = (DefBlobFilter*)m_BlobFilterList.GetBlob(i-1);
             pF->pFilter->Release();
         }
-    };
+    }
 
     virtual void    AddBlob(CvBlob* pBlob)
     {
@@ -93,7 +93,7 @@ public:
         assert(pF);
         pF->blob = pBlob[0];
         pF->m_LastFrame = m_Frame;
-    };
+    }
 
     virtual void    Process()
     {
@@ -115,11 +115,11 @@ public:
             }
         }   /* Next blob. */
         m_Frame++;
-    };
+    }
 
-    int     GetBlobNum(){return m_BlobFilterList.GetBlobNum();};
-    CvBlob* GetBlob(int index){return m_BlobFilterList.GetBlob(index);};
-    void    Release(){delete this;};
+    int     GetBlobNum(){return m_BlobFilterList.GetBlobNum();}
+    CvBlob* GetBlob(int index){return m_BlobFilterList.GetBlob(index);}
+    void    Release(){delete this;}
 
     /* Additional functionality: */
     CvBlob* GetBlobByID(int BlobID){return m_BlobFilterList.GetBlobByID(BlobID);}
diff --git a/modules/legacy/src/clique.cpp b/modules/legacy/src/clique.cpp
index d8f2f59da..90111b159 100644
--- a/modules/legacy/src/clique.cpp
+++ b/modules/legacy/src/clique.cpp
@@ -343,7 +343,7 @@ int cvFindNextMaximalClique( CvCliqueFinder* finder )
             break;
         case NEXT:
             //here we will look for candidate to translate into not
-            //s[k] now contains index of choosen candidate
+            //s[k] now contains index of chosen candidate
             {
                 int* new_ = All[k+1];
                 if( nod[k] != 0 )
@@ -590,7 +590,7 @@ void cvBronKerbosch( CvGraph* graph )
             break;
         case NEXT:
             //here we will look for candidate to translate into not
-            //s[k] now contains index of choosen candidate
+            //s[k] now contains index of chosen candidate
             {
                 int* new_ = All[k+1];
                 if( nod[k] != 0 )
diff --git a/modules/legacy/src/enteringblobdetection.cpp b/modules/legacy/src/enteringblobdetection.cpp
index d66a997a7..a488e0881 100644
--- a/modules/legacy/src/enteringblobdetection.cpp
+++ b/modules/legacy/src/enteringblobdetection.cpp
@@ -209,7 +209,7 @@ public:
     CvBlobDetectorSimple();
    ~CvBlobDetectorSimple();
     int DetectNewBlob(IplImage* pImg, IplImage* pFGMask, CvBlobSeq* pNewBlobList, CvBlobSeq* pOldBlobList);
-    void Release(){delete this;};
+    void Release(){delete this;}
 
 protected:
     IplImage*       m_pMaskBlobNew;
@@ -219,7 +219,7 @@ protected:
 };
 
 /* Blob detector creator (sole interface function for this file) */
-CvBlobDetector* cvCreateBlobDetectorSimple(){return new CvBlobDetectorSimple;};
+CvBlobDetector* cvCreateBlobDetectorSimple(){return new CvBlobDetectorSimple;}
 
 /* Constructor of BlobDetector: */
 CvBlobDetectorSimple::CvBlobDetectorSimple()
@@ -544,7 +544,7 @@ public:
     CvBlobDetectorCC();
    ~CvBlobDetectorCC();
     int DetectNewBlob(IplImage* pImg, IplImage* pFGMask, CvBlobSeq* pNewBlobList, CvBlobSeq* pOldBlobList);
-    void Release(){delete this;};
+    void Release(){delete this;}
 
     virtual void ParamUpdate()
     {
diff --git a/modules/legacy/src/enteringblobdetectionreal.cpp b/modules/legacy/src/enteringblobdetectionreal.cpp
index 9458ab53e..01a1c7604 100644
--- a/modules/legacy/src/enteringblobdetectionreal.cpp
+++ b/modules/legacy/src/enteringblobdetectionreal.cpp
@@ -156,7 +156,7 @@ public:
 
     }   /* cvDetectNewBlob */
 
-    void Release(){delete this;};
+    void Release(){delete this;}
 };
 
 /* Blob detector constructor: */
diff --git a/modules/legacy/src/epilines.cpp b/modules/legacy/src/epilines.cpp
index 8407e646b..b7652403c 100644
--- a/modules/legacy/src/epilines.cpp
+++ b/modules/legacy/src/epilines.cpp
@@ -3621,7 +3621,7 @@ int cvComputeEpipolesFromFundMatrix(CvMatr32f fundMatr,
     CvMat* matrV = cvCreateMat(3,3,CV_MAT32F);
 
     /* From svd we need just last vector of U and V or last row from U' and V' */
-    /* We get transposed matrixes U and V */
+    /* We get transposed matrices U and V */
     cvSVD(&fundMatrC,matrW,matrU,matrV,CV_SVD_V_T|CV_SVD_U_T);
 
     /* Get last row from U' and compute epipole1 */
diff --git a/modules/legacy/src/face.cpp b/modules/legacy/src/face.cpp
index b188a10de..2132ea8f9 100644
--- a/modules/legacy/src/face.cpp
+++ b/modules/legacy/src/face.cpp
@@ -200,6 +200,7 @@ void RFace::CalculateError(FaceData * lpFaceData)
 void  RFace::CreateFace(void * lpData)
 {
     FaceData Data;
+    memset(&Data, 0, sizeof(FaceData));
 
     double Error = MAX_ERROR;
     double CurError = MAX_ERROR;
diff --git a/modules/legacy/src/facetemplate.h b/modules/legacy/src/facetemplate.h
index 31a3a8303..83f5bc30e 100644
--- a/modules/legacy/src/facetemplate.h
+++ b/modules/legacy/src/facetemplate.h
@@ -98,7 +98,7 @@ inline void FaceFeature::SetWeight(double  dWeight)
 class FaceTemplate
 {
 public:
-    FaceTemplate(long lFeatureCount) {m_lFeturesCount = lFeatureCount;	m_lpFeaturesList = new FaceFeature[lFeatureCount];};
+    FaceTemplate(long lFeatureCount) {m_lFeturesCount = lFeatureCount;	m_lpFeaturesList = new FaceFeature[lFeatureCount];}
     virtual ~FaceTemplate();
 
     inline long GetCount();
diff --git a/modules/legacy/src/lmeds.cpp b/modules/legacy/src/lmeds.cpp
index 33b57a759..f05f2a027 100644
--- a/modules/legacy/src/lmeds.cpp
+++ b/modules/legacy/src/lmeds.cpp
@@ -163,7 +163,7 @@ icvLMedS( int *points1, int *points2, int numPoints, CvMatrix3 * fundamentalMatr
 /*===========================================================================*/
 /*===========================================================================*/
 
-#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
 # pragma GCC diagnostic push
 # pragma GCC diagnostic ignored "-Warray-bounds"
 #endif
@@ -328,7 +328,7 @@ icvCubic( double a2, double a1, double a0, double *squares )
     return CV_NO_ERR;
 }                               /* icvCubic */
 
-#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
 # pragma GCC diagnostic pop
 #endif
 
@@ -1180,7 +1180,7 @@ icvSingularValueDecomposition( int M,
         }                       /* for */
     }                           /* if */
 
-    /*  Iterations QR-algorithm for bidiagonal matrixes
+    /*  Iterations QR-algorithm for bidiagonal matrices
        W[i] - is the main diagonal
        rv1[i] - is the top diagonal, rv1[0]=0.
      */
diff --git a/modules/legacy/src/oneway.cpp b/modules/legacy/src/oneway.cpp
index 43ded8593..09826ae4f 100644
--- a/modules/legacy/src/oneway.cpp
+++ b/modules/legacy/src/oneway.cpp
@@ -57,8 +57,8 @@ namespace cv{
                 cvCopy(translation, m_translation);
             };
 
-            CvMat* GetRotation() {return m_rotation;};
-            CvMat* GetTranslation() {return m_translation;};
+            CvMat* GetRotation() {return m_rotation;}
+            CvMat* GetTranslation() {return m_translation;}
 
         protected:
             CvMat* m_rotation;
@@ -2232,10 +2232,11 @@ namespace cv{
         return false;
     }
 
-    void OneWayDescriptorMatcher::knnMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+    void OneWayDescriptorMatcher::knnMatchImpl( InputArray _queryImage, std::vector<KeyPoint>& queryKeypoints,
                                                std::vector<std::vector<DMatch> >& matches, int knn,
-                                               const std::vector<Mat>& /*masks*/, bool /*compactResult*/ )
+                                               InputArrayOfArrays /*masks*/, bool /*compactResult*/ )
     {
+        Mat queryImage = _queryImage.getMat();
         train();
 
         CV_Assert( knn == 1 ); // knn > 1 unsupported because of bug in OneWayDescriptorBase for this case
@@ -2251,10 +2252,12 @@ namespace cv{
         }
     }
 
-    void OneWayDescriptorMatcher::radiusMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+    void OneWayDescriptorMatcher::radiusMatchImpl( InputArray _queryImage, std::vector<KeyPoint>& queryKeypoints,
                                                   std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                                                  const std::vector<Mat>& /*masks*/, bool /*compactResult*/ )
+                                                   InputArrayOfArrays /*masks*/, bool /*compactResult*/ )
     {
+        Mat queryImage = _queryImage.getMat();
+
         train();
 
         matches.resize( queryKeypoints.size() );
diff --git a/modules/legacy/src/planardetect.cpp b/modules/legacy/src/planardetect.cpp
index fa9152d47..304e800c6 100644
--- a/modules/legacy/src/planardetect.cpp
+++ b/modules/legacy/src/planardetect.cpp
@@ -1297,10 +1297,12 @@ void FernDescriptorMatcher::calcBestProbAndMatchIdx( const Mat& image, const Poi
     }
 }
 
-void FernDescriptorMatcher::knnMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+void FernDescriptorMatcher::knnMatchImpl( InputArray _queryImage, std::vector<KeyPoint>& queryKeypoints,
                                          std::vector<std::vector<DMatch> >& matches, int knn,
-                                         const std::vector<Mat>& /*masks*/, bool /*compactResult*/ )
+                                         InputArrayOfArrays /*masks*/, bool /*compactResult*/ )
 {
+    Mat queryImage = _queryImage.getMat();
+
     train();
 
     matches.resize( queryKeypoints.size() );
@@ -1333,10 +1335,11 @@ void FernDescriptorMatcher::knnMatchImpl( const Mat& queryImage, std::vector<Key
     }
 }
 
-void FernDescriptorMatcher::radiusMatchImpl( const Mat& queryImage, std::vector<KeyPoint>& queryKeypoints,
+void FernDescriptorMatcher::radiusMatchImpl( InputArray _queryImage, std::vector<KeyPoint>& queryKeypoints,
                                             std::vector<std::vector<DMatch> >& matches, float maxDistance,
-                                            const std::vector<Mat>& /*masks*/, bool /*compactResult*/ )
+                                            InputArrayOfArrays /*masks*/, bool /*compactResult*/ )
 {
+    Mat queryImage = _queryImage.getMat();
     train();
     matches.resize( queryKeypoints.size() );
     std::vector<float> signature( (size_t)classifier->getClassCount() );
diff --git a/modules/legacy/src/trifocal.cpp b/modules/legacy/src/trifocal.cpp
index b049ccccf..a5275373a 100644
--- a/modules/legacy/src/trifocal.cpp
+++ b/modules/legacy/src/trifocal.cpp
@@ -902,7 +902,7 @@ int icvComputeProjectMatricesNPoints(  CvMat* points1,CvMat* points2,CvMat* poin
         tmpProjMatr[1] = cvMat(9,4,CV_64F,tmpProjMatr_dat+36);
         tmpProjMatr[2] = cvMat(9,4,CV_64F,tmpProjMatr_dat+72);
 
-        /* choosen points */
+        /* chosen points */
 
         while( wasCount < NumSamples )
         {
@@ -1494,7 +1494,7 @@ void GetGeneratorReduceFundSolution(CvMat* points1,CvMat* points2,CvMat* fundRed
     matrV = cvMat(5,5,CV_64F,matrV_dat);
 
     /* From svd we need just two last vectors of V or two last row V' */
-    /* We get transposed matrixes U and V */
+    /* We get transposed matrices U and V */
 
     cvSVD(&matrA,&matrW,0,&matrV,CV_SVD_V_T);
 
@@ -1529,7 +1529,7 @@ int GetGoodReduceFundamMatrFromTwo(CvMat* fundReduceCoef1,CvMat* fundReduceCoef2
         CV_ERROR( CV_StsUnsupportedFormat, "Input parameters must be a matrices" );
     }
 
-    /* using two fundamental matrix comute matrixes for det(F)=0 */
+    /* using two fundamental matrix comute matrices for det(F)=0 */
     /* May compute 1 or 3 matrices. Returns number of solutions */
     /* Here we will use case F=a*F1+(1-a)*F2  instead of F=m*F1+l*F2 */
 
@@ -1667,7 +1667,7 @@ void GetProjMatrFromReducedFundamental(CvMat* fundReduceCoefs,CvMat* projMatrCoe
     matrV = cvMat(3,3,CV_64F,matrV_dat);
 
     /* From svd we need just last vector of V or last row V' */
-    /* We get transposed matrixes U and V */
+    /* We get transposed matrices U and V */
 
     cvSVD(&matrA,&matrW,0,&matrV,CV_SVD_V_T);
 
@@ -1733,7 +1733,7 @@ void GetProjMatrFromReducedFundamental(CvMat* fundReduceCoefs,CvMat* projMatrCoe
         matrV1 = cvMat(6,6,CV_64F,matrV_dat1);
 
         /* From svd we need just last vector of V or last row V' */
-        /* We get transposed matrixes U and V */
+        /* We get transposed matrices U and V */
 
         cvSVD(&matrK,&matrW1,0,&matrV1,CV_SVD_V_T);
 
@@ -2034,7 +2034,7 @@ void icvComputeTransform4D(CvMat* points1,CvMat* points2,CvMat* transMatr)
     }
 
     /* From svd we need just two last vectors of V or two last row V' */
-    /* We get transposed matrixes U and V */
+    /* We get transposed matrices U and V */
 
     cvSVD(matrA,matrW,0,&matrV,CV_SVD_V_T);
 
diff --git a/modules/legacy/src/vecfacetracking.cpp b/modules/legacy/src/vecfacetracking.cpp
index 81d1a04cb..b2a03b47f 100644
--- a/modules/legacy/src/vecfacetracking.cpp
+++ b/modules/legacy/src/vecfacetracking.cpp
@@ -52,7 +52,7 @@ enum
 {
     MOUTH = 0,
     LEYE = 1,
-    REYE = 2,
+    REYE = 2
 };
 
 #define MAX_LAYERS      64
diff --git a/modules/matlab/CMakeLists.txt b/modules/matlab/CMakeLists.txt
index dd5439de0..3a5c6d12f 100644
--- a/modules/matlab/CMakeLists.txt
+++ b/modules/matlab/CMakeLists.txt
@@ -29,7 +29,7 @@
 # arguments to mex. e.g.
 # prepend("-I" OUT /path/to/include/dir) --> -I/path/to/include/dir
 macro(PREPEND TOKEN OUT IN)
-    foreach(VAR ${IN})
+    foreach(VAR ${IN} ${ARGN})
         list(APPEND ${OUT} "${TOKEN}${VAR}")
     endforeach()
 endmacro()
@@ -104,7 +104,7 @@ set(RST_PARSER_PATH ${CMAKE_SOURCE_DIR}/modules/java/generator)
 
 # set mex compiler options
 prepend("-I" MEX_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include)
-prepend("-L" MEX_LIB_DIR  ${LIBRARY_OUTPUT_PATH}/$<CONFIGURATION>)
+prepend("-L" MEX_LIB_DIR  ${LIBRARY_OUTPUT_PATH}/${CMAKE_CFG_INTDIR} )
 set(MEX_OPTS "-largeArrayDims")
 
 if (BUILD_TESTS)
@@ -236,7 +236,7 @@ add_custom_command(
             --jinja2 ${JINJA2_PATH}
             --opts="${MEX_OPTS}"
             --include_dirs="${MEX_INCLUDE_DIRS}"
-            --lib_dir=${MEX_LIB_DIR}
+            --lib_dir="${MEX_LIB_DIR}"
             --libs="${MEX_LIBS}"
             --flags  ${MEX_CXXFLAGS}
             --outdir ${CMAKE_CURRENT_BINARY_DIR}
@@ -256,7 +256,7 @@ add_custom_command(
                              -DMEX_OPTS=${MEX_OPTS}
                              -DMEX_CXXFLAGS=${MEX_CXX_FLAGS}
                              -DMEX_INCLUDE_DIRS="${MEX_INCLUDE_DIRS}"
-                             -DMEX_LIB_DIR=${MEX_LIB_DIR}
+                             -DMEX_LIB_DIR="${MEX_LIB_DIR}"
                              -DCONFIGURATION="$<CONFIGURATION>"
                              -DMEX_LIBS="${MEX_LIBS}"
                              -DMEX_DEBUG_LIBS="${MEX_DEBUG_LIBS}"
diff --git a/modules/ml/doc/neural_networks.rst b/modules/ml/doc/neural_networks.rst
index 0496e2201..776bf243b 100644
--- a/modules/ml/doc/neural_networks.rst
+++ b/modules/ml/doc/neural_networks.rst
@@ -240,6 +240,7 @@ This method applies the specified training algorithm to computing/adjusting the
 
 The RPROP training algorithm is parallelized with the TBB library.
 
+If you are using the default ``cvANN_MLP::SIGMOID_SYM`` activation function then the output should be in the range [-1,1], instead of [0,1], for optimal results.
 
 CvANN_MLP::predict
 ------------------
@@ -257,6 +258,8 @@ Predicts responses for input samples.
 
 The method returns a dummy value which should be ignored.
 
+If you are using the default ``cvANN_MLP::SIGMOID_SYM`` activation function with the default parameter values fparam1=0 and fparam2=0 then the function used is y = 1.7159*tanh(2/3 * x), so the output will range from [-1.7159, 1.7159], instead of [0,1].
+
 CvANN_MLP::get_layer_count
 --------------------------
 Returns the number of layers in the MLP.
diff --git a/modules/ml/doc/normal_bayes_classifier.rst b/modules/ml/doc/normal_bayes_classifier.rst
index a247598fa..dbd6ae229 100644
--- a/modules/ml/doc/normal_bayes_classifier.rst
+++ b/modules/ml/doc/normal_bayes_classifier.rst
@@ -52,12 +52,12 @@ CvNormalBayesClassifier::predict
 --------------------------------
 Predicts the response for sample(s).
 
-.. ocv:function:: float CvNormalBayesClassifier::predict(  const Mat& samples,  Mat* results=0 ) const
+.. ocv:function:: float CvNormalBayesClassifier::predict(  const Mat& samples,  Mat* results=0, Mat* results_prob=0 ) const
 
-.. ocv:function:: float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results=0 ) const
+.. ocv:function:: float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results=0, CvMat* results_prob=0 ) const
 
 .. ocv:pyfunction:: cv2.NormalBayesClassifier.predict(samples) -> retval, results
 
-The method estimates the most probable classes for input vectors. Input vectors (one or more) are stored as rows of the matrix ``samples``. In case of multiple input vectors, there should be one output vector ``results``. The predicted class for a single input vector is returned by the method.
+The method estimates the most probable classes for input vectors. Input vectors (one or more) are stored as rows of the matrix ``samples``. In case of multiple input vectors, there should be one output vector ``results``. The predicted class for a single input vector is returned by the method. The vector ``results_prob`` contains the output probabilities coresponding to each element of ``result``.
 
 The function is parallelized with the TBB library.
diff --git a/modules/ml/doc/support_vector_machines.rst b/modules/ml/doc/support_vector_machines.rst
index 14ee12bb4..9793bd6e3 100644
--- a/modules/ml/doc/support_vector_machines.rst
+++ b/modules/ml/doc/support_vector_machines.rst
@@ -242,7 +242,7 @@ Predicts the response for input sample(s).
 
 .. ocv:function:: float CvSVM::predict( const CvMat* sample, bool returnDFVal=false ) const
 
-.. ocv:function:: float CvSVM::predict( const CvMat* samples, CvMat* results ) const
+.. ocv:function:: float CvSVM::predict( const CvMat* samples, CvMat* results, bool returnDFVal=false ) const
 
 .. ocv:pyfunction:: cv2.SVM.predict(sample[, returnDFVal]) -> retval
 
diff --git a/modules/ml/include/opencv2/ml.hpp b/modules/ml/include/opencv2/ml.hpp
index 7325aa075..f13e192be 100644
--- a/modules/ml/include/opencv2/ml.hpp
+++ b/modules/ml/include/opencv2/ml.hpp
@@ -201,7 +201,7 @@ public:
     virtual bool train( const CvMat* trainData, const CvMat* responses,
         const CvMat* varIdx = 0, const CvMat* sampleIdx=0, bool update=false );
 
-    virtual float predict( const CvMat* samples, CV_OUT CvMat* results=0 ) const;
+    virtual float predict( const CvMat* samples, CV_OUT CvMat* results=0, CV_OUT CvMat* results_prob=0 ) const;
     CV_WRAP virtual void clear();
 
     CV_WRAP CvNormalBayesClassifier( const cv::Mat& trainData, const cv::Mat& responses,
@@ -209,7 +209,7 @@ public:
     CV_WRAP virtual bool train( const cv::Mat& trainData, const cv::Mat& responses,
                        const cv::Mat& varIdx = cv::Mat(), const cv::Mat& sampleIdx=cv::Mat(),
                        bool update=false );
-    CV_WRAP virtual float predict( const cv::Mat& samples, CV_OUT cv::Mat* results=0 ) const;
+    CV_WRAP virtual float predict( const cv::Mat& samples, CV_OUT cv::Mat* results=0, CV_OUT cv::Mat* results_prob=0 ) const;
 
     virtual void write( CvFileStorage* storage, const char* name ) const;
     virtual void read( CvFileStorage* storage, CvFileNode* node );
@@ -490,7 +490,7 @@ public:
         bool balanced=false );
 
     virtual float predict( const CvMat* sample, bool returnDFVal=false ) const;
-    virtual float predict( const CvMat* samples, CV_OUT CvMat* results ) const;
+    virtual float predict( const CvMat* samples, CV_OUT CvMat* results, bool returnDFVal=false ) const;
 
     CV_WRAP CvSVM( const cv::Mat& trainData, const cv::Mat& responses,
           const cv::Mat& varIdx=cv::Mat(), const cv::Mat& sampleIdx=cv::Mat(),
@@ -515,9 +515,11 @@ public:
 
     CV_WRAP virtual int get_support_vector_count() const;
     virtual const float* get_support_vector(int i) const;
-    virtual CvSVMParams get_params() const { return params; };
+    virtual CvSVMParams get_params() const { return params; }
     CV_WRAP virtual void clear();
 
+    virtual const CvSVMDecisionFunc* get_decision_function() const { return decision_func; }
+
     static CvParamGrid get_default_grid( int param_id );
 
     virtual void write( CvFileStorage* storage, const char* name ) const;
@@ -1523,7 +1525,7 @@ public:
     // API
     // virtual bool train( CvMLData* data,
              CvGBTreesParams params=CvGBTreesParams(),
-             bool update=false ) {return false;};
+             bool update=false ) {return false;}
 
     // INPUT
     // data          - training set.
diff --git a/modules/ml/src/ml_init.cpp b/modules/ml/src/ml_init.cpp
index 3ff7d9bae..fcf9e1c89 100644
--- a/modules/ml/src/ml_init.cpp
+++ b/modules/ml/src/ml_init.cpp
@@ -52,7 +52,7 @@ CV_INIT_ALGORITHM(EM, "StatModel.EM",
                   obj.info()->addParam(obj, "epsilon", obj.epsilon);
                   obj.info()->addParam(obj, "weights", obj.weights, true);
                   obj.info()->addParam(obj, "means", obj.means, true);
-                  obj.info()->addParam(obj, "covs", obj.covs, true));
+                  obj.info()->addParam(obj, "covs", obj.covs, true))
 
 bool initModule_ml(void)
 {
diff --git a/modules/ml/src/nbayes.cpp b/modules/ml/src/nbayes.cpp
index 5ad1b134d..938f3fbd8 100644
--- a/modules/ml/src/nbayes.cpp
+++ b/modules/ml/src/nbayes.cpp
@@ -282,7 +282,7 @@ bool CvNormalBayesClassifier::train( const CvMat* _train_data, const CvMat* _res
 struct predict_body : cv::ParallelLoopBody {
   predict_body(CvMat* _c, CvMat** _cov_rotate_mats, CvMat** _inv_eigen_values, CvMat** _avg,
      const CvMat* _samples, const int* _vidx, CvMat* _cls_labels,
-     CvMat* _results, float* _value, int _var_count1
+     CvMat* _results, float* _value, int _var_count1, CvMat* _results_prob
   )
   {
     c = _c;
@@ -295,6 +295,7 @@ struct predict_body : cv::ParallelLoopBody {
     results = _results;
     value = _value;
     var_count1 = _var_count1;
+    results_prob = _results_prob;
   }
 
   CvMat* c;
@@ -305,6 +306,7 @@ struct predict_body : cv::ParallelLoopBody {
   const int* vidx;
   CvMat* cls_labels;
 
+  CvMat* results_prob;
   CvMat* results;
   float* value;
   int var_count1;
@@ -313,15 +315,21 @@ struct predict_body : cv::ParallelLoopBody {
   {
 
     int cls = -1;
-    int rtype = 0, rstep = 0;
+    int rtype = 0, rstep = 0, rptype = 0, rpstep = 0;
     int nclasses = cls_labels->cols;
     int _var_count = avg[0]->cols;
+    double probability = 0;
 
     if (results)
     {
         rtype = CV_MAT_TYPE(results->type);
         rstep = CV_IS_MAT_CONT(results->type) ? 1 : results->step/CV_ELEM_SIZE(rtype);
     }
+    if (results_prob)
+    {
+        rptype = CV_MAT_TYPE(results_prob->type);
+        rpstep = CV_IS_MAT_CONT(results_prob->type) ? 1 : results_prob->step/CV_ELEM_SIZE(rptype);
+    }
     // allocate memory and initializing headers for calculating
     cv::AutoBuffer<double> buffer(nclasses + var_count1);
     CvMat diff = cvMat( 1, var_count1, CV_64FC1, &buffer[0] );
@@ -333,7 +341,6 @@ struct predict_body : cv::ParallelLoopBody {
 
         for(int i = 0; i < nclasses; i++ )
         {
-
             double cur = c->data.db[i];
             CvMat* u = cov_rotate_mats[i];
             CvMat* w = inv_eigen_values[i];
@@ -358,6 +365,7 @@ struct predict_body : cv::ParallelLoopBody {
                 opt = cur;
             }
             /* probability = exp( -0.5 * cur ) */
+            probability = exp( -0.5 * cur );
         }
 
         ival = cls_labels->data.i[cls];
@@ -368,6 +376,13 @@ struct predict_body : cv::ParallelLoopBody {
             else
                 results->data.fl[k*rstep] = (float)ival;
         }
+        if ( results_prob )
+        {
+            if ( rptype == CV_32FC1 )
+                results_prob->data.fl[k*rpstep] = (float)probability;
+            else
+                results_prob->data.db[k*rpstep] = probability;
+        }
         if( k == 0 )
             *value = (float)ival;
     }
@@ -375,7 +390,7 @@ struct predict_body : cv::ParallelLoopBody {
 };
 
 
-float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results ) const
+float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results, CvMat* results_prob ) const
 {
     float value = 0;
 
@@ -390,18 +405,28 @@ float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results ) c
     if( results )
     {
         if( !CV_IS_MAT(results) || (CV_MAT_TYPE(results->type) != CV_32FC1 &&
-        CV_MAT_TYPE(results->type) != CV_32SC1) ||
-        (results->cols != 1 && results->rows != 1) ||
-        results->cols + results->rows - 1 != samples->rows )
+                                    CV_MAT_TYPE(results->type) != CV_32SC1) ||
+          (results->cols != 1 && results->rows != 1) ||
+           results->cols + results->rows - 1 != samples->rows )
         CV_Error( CV_StsBadArg, "The output array must be integer or floating-point vector "
-        "with the number of elements = number of rows in the input matrix" );
+                 "with the number of elements = number of rows in the input matrix" );
+    }
+
+    if( results_prob )
+    {
+        if( !CV_IS_MAT(results_prob) || (CV_MAT_TYPE(results_prob->type) != CV_32FC1 &&
+                                         CV_MAT_TYPE(results_prob->type) != CV_64FC1) ||
+          (results_prob->cols != 1 && results_prob->rows != 1) ||
+           results_prob->cols + results_prob->rows - 1 != samples->rows )
+        CV_Error( CV_StsBadArg, "The output array must be double or float vector "
+                 "with the number of elements = number of rows in the input matrix" );
     }
 
     const int* vidx = var_idx ? var_idx->data.i : 0;
 
     cv::parallel_for_(cv::Range(0, samples->rows),
                       predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
-                                   vidx, cls_labels, results, &value, var_count));
+                                   vidx, cls_labels, results, &value, var_count, results_prob));
 
     return value;
 }
@@ -608,9 +633,9 @@ bool CvNormalBayesClassifier::train( const Mat& _train_data, const Mat& _respons
                  sidx.data.ptr ? &sidx : 0, update);
 }
 
-float CvNormalBayesClassifier::predict( const Mat& _samples, Mat* _results ) const
+float CvNormalBayesClassifier::predict( const Mat& _samples, Mat* _results, Mat* _results_prob ) const
 {
-    CvMat samples = _samples, results, *presults = 0;
+    CvMat samples = _samples, results, *presults = 0, results_prob, *presults_prob = 0;
 
     if( _results )
     {
@@ -621,7 +646,16 @@ float CvNormalBayesClassifier::predict( const Mat& _samples, Mat* _results ) con
         presults = &(results = *_results);
     }
 
-    return predict(&samples, presults);
+    if( _results_prob )
+    {
+        if( !(_results_prob->data && _results_prob->type() == CV_64F &&
+              (_results_prob->cols == 1 || _results_prob->rows == 1) &&
+              _results_prob->cols + _results_prob->rows - 1 == _samples.rows) )
+            _results_prob->create(_samples.rows, 1, CV_64F);
+        presults_prob = &(results_prob = *_results_prob);
+    }
+
+    return predict(&samples, presults, presults_prob);
 }
 
 /* End of file. */
diff --git a/modules/ml/src/precomp.hpp b/modules/ml/src/precomp.hpp
index 06b8f4f35..551ff8179 100644
--- a/modules/ml/src/precomp.hpp
+++ b/modules/ml/src/precomp.hpp
@@ -351,7 +351,7 @@ namespace cv
 {
     struct DTreeBestSplitFinder
     {
-        DTreeBestSplitFinder(){ tree = 0; node = 0; }
+        DTreeBestSplitFinder(){ splitSize = 0, tree = 0; node = 0; }
         DTreeBestSplitFinder( CvDTree* _tree, CvDTreeNode* _node);
         DTreeBestSplitFinder( const DTreeBestSplitFinder& finder, Split );
         virtual ~DTreeBestSplitFinder() {}
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 9f531ac4e..341a817c9 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -1245,7 +1245,6 @@ const float* CvSVM::get_support_vector(int i) const
     return sv && (unsigned)i < (unsigned)sv_total ? sv[i] : 0;
 }
 
-
 bool CvSVM::set_params( const CvSVMParams& _params )
 {
     bool ok = false;
@@ -2195,18 +2194,20 @@ float CvSVM::predict( const CvMat* sample, bool returnDFVal ) const
 }
 
 struct predict_body_svm : ParallelLoopBody {
-    predict_body_svm(const CvSVM* _pointer, float* _result, const CvMat* _samples, CvMat* _results)
+    predict_body_svm(const CvSVM* _pointer, float* _result, const CvMat* _samples, CvMat* _results, bool _returnDFVal)
     {
         pointer = _pointer;
         result = _result;
         samples = _samples;
         results = _results;
+        returnDFVal = _returnDFVal;
     }
 
     const CvSVM* pointer;
     float* result;
     const CvMat* samples;
     CvMat* results;
+    bool returnDFVal;
 
     void operator()( const cv::Range& range ) const
     {
@@ -2214,7 +2215,7 @@ struct predict_body_svm : ParallelLoopBody {
         {
             CvMat sample;
             cvGetRow( samples, &sample, i );
-            int r = (int)pointer->predict(&sample);
+            int r = (int)pointer->predict(&sample, returnDFVal);
             if (results)
                 results->data.fl[i] = (float)r;
             if (i == 0)
@@ -2223,11 +2224,11 @@ struct predict_body_svm : ParallelLoopBody {
     }
 };
 
-float CvSVM::predict(const CvMat* samples, CV_OUT CvMat* results) const
+float CvSVM::predict(const CvMat* samples, CV_OUT CvMat* results, bool returnDFVal) const
 {
     float result = 0;
     cv::parallel_for_(cv::Range(0, samples->rows),
-             predict_body_svm(this, &result, samples, results)
+             predict_body_svm(this, &result, samples, results, returnDFVal)
     );
     return result;
 }
@@ -2347,14 +2348,24 @@ void CvSVM::write_params( CvFileStorage* fs ) const
 }
 
 
+static bool isSvmModelApplicable(int sv_total, int var_all, int var_count, int class_count)
+{
+    return (sv_total > 0 && var_count > 0 && var_count <= var_all && class_count >= 0);
+}
+
+
 void CvSVM::write( CvFileStorage* fs, const char* name ) const
 {
     CV_FUNCNAME( "CvSVM::write" );
 
     __BEGIN__;
 
-    int i, var_count = get_var_count(), df_count, class_count;
+    int i, var_count = get_var_count(), df_count;
+    int class_count = class_labels ? class_labels->cols :
+                      params.svm_type == CvSVM::ONE_CLASS ? 1 : 0;
     const CvSVMDecisionFunc* df = decision_func;
+    if( !isSvmModelApplicable(sv_total, var_all, var_count, class_count) )
+        CV_ERROR( CV_StsParseError, "SVM model data is invalid, check sv_count, var_* and class_count tags" );
 
     cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_SVM );
 
@@ -2363,9 +2374,6 @@ void CvSVM::write( CvFileStorage* fs, const char* name ) const
     cvWriteInt( fs, "var_all", var_all );
     cvWriteInt( fs, "var_count", var_count );
 
-    class_count = class_labels ? class_labels->cols :
-                  params.svm_type == CvSVM::ONE_CLASS ? 1 : 0;
-
     if( class_count )
     {
         cvWriteInt( fs, "class_count", class_count );
@@ -2503,7 +2511,6 @@ void CvSVM::read_params( CvFileStorage* fs, CvFileNode* svm_node )
     __END__;
 }
 
-
 void CvSVM::read( CvFileStorage* fs, CvFileNode* svm_node )
 {
     const double not_found_dbl = DBL_MAX;
@@ -2532,7 +2539,7 @@ void CvSVM::read( CvFileStorage* fs, CvFileNode* svm_node )
     var_count = cvReadIntByName( fs, svm_node, "var_count", var_all );
     class_count = cvReadIntByName( fs, svm_node, "class_count", 0 );
 
-    if( sv_total <= 0 || var_all <= 0 || var_count <= 0 || var_count > var_all || class_count < 0 )
+    if( !isSvmModelApplicable(sv_total, var_all, var_count, class_count) )
         CV_ERROR( CV_StsParseError, "SVM model data is invalid, check sv_count, var_* and class_count tags" );
 
     CV_CALL( class_labels = (CvMat*)cvReadByName( fs, svm_node, "class_labels" ));
diff --git a/modules/ml/test/test_save_load.cpp b/modules/ml/test/test_save_load.cpp
index 9fd31b9f2..7300185b4 100644
--- a/modules/ml/test/test_save_load.cpp
+++ b/modules/ml/test/test_save_load.cpp
@@ -155,6 +155,14 @@ TEST(ML_RTrees, save_load) { CV_SLMLTest test( CV_RTREES ); test.safe_run(); }
 TEST(ML_ERTrees, save_load) { CV_SLMLTest test( CV_ERTREES ); test.safe_run(); }
 
 
+TEST(ML_SVM, throw_exception_when_save_untrained_model)
+{
+    SVM svm;
+    string filename = tempfile("svm.xml");
+    ASSERT_THROW(svm.save(filename.c_str()), Exception);
+    remove(filename.c_str());
+}
+
 TEST(DISABLED_ML_SVM, linear_save_load)
 {
     CvSVM svm1, svm2, svm3;
diff --git a/modules/nonfree/CMakeLists.txt b/modules/nonfree/CMakeLists.txt
index eec28e975..851646f25 100644
--- a/modules/nonfree/CMakeLists.txt
+++ b/modules/nonfree/CMakeLists.txt
@@ -3,5 +3,5 @@ if(BUILD_ANDROID_PACKAGE)
 endif()
 
 set(the_description "Functionality with possible limitations on the use")
-ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
-ocv_define_module(nonfree opencv_imgproc opencv_features2d opencv_calib3d OPTIONAL opencv_cudaarithm opencv_ocl)
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wshadow)
+ocv_define_module(nonfree opencv_imgproc opencv_features2d opencv_calib3d OPTIONAL opencv_cudaarithm)
diff --git a/modules/nonfree/doc/feature_detection.rst b/modules/nonfree/doc/feature_detection.rst
index dc43c8b07..22c1d97a9 100644
--- a/modules/nonfree/doc/feature_detection.rst
+++ b/modules/nonfree/doc/feature_detection.rst
@@ -246,105 +246,3 @@ The class ``SURF_CUDA`` uses some buffers and provides access to it. All buffers
 .. note::
 
    * An example for using the SURF keypoint matcher on GPU can be found at opencv_source_code/samples/gpu/surf_keypoint_matcher.cpp
-
-ocl::SURF_OCL
--------------
-.. ocv:class:: ocl::SURF_OCL
-
-Class used for extracting Speeded Up Robust Features (SURF) from an image. ::
-
-    class SURF_OCL
-    {
-    public:
-        enum KeypointLayout
-        {
-            X_ROW = 0,
-            Y_ROW,
-            LAPLACIAN_ROW,
-            OCTAVE_ROW,
-            SIZE_ROW,
-            ANGLE_ROW,
-            HESSIAN_ROW,
-            ROWS_COUNT
-        };
-
-        //! the default constructor
-        SURF_OCL();
-        //! the full constructor taking all the necessary parameters
-        explicit SURF_OCL(double _hessianThreshold, int _nOctaves=4,
-             int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
-
-        //! returns the descriptor size in float's (64 or 128)
-        int descriptorSize() const;
-
-        //! upload host keypoints to device memory
-        void uploadKeypoints(const vector<KeyPoint>& keypoints,
-            oclMat& keypointsocl);
-        //! download keypoints from device to host memory
-        void downloadKeypoints(const oclMat& keypointsocl,
-            vector<KeyPoint>& keypoints);
-
-        //! download descriptors from device to host memory
-        void downloadDescriptors(const oclMat& descriptorsocl,
-            vector<float>& descriptors);
-
-        void operator()(const oclMat& img, const oclMat& mask,
-            oclMat& keypoints);
-
-        void operator()(const oclMat& img, const oclMat& mask,
-            oclMat& keypoints, oclMat& descriptors,
-            bool useProvidedKeypoints = false);
-
-        void operator()(const oclMat& img, const oclMat& mask,
-            std::vector<KeyPoint>& keypoints);
-
-        void operator()(const oclMat& img, const oclMat& mask,
-            std::vector<KeyPoint>& keypoints, oclMat& descriptors,
-            bool useProvidedKeypoints = false);
-
-        void operator()(const oclMat& img, const oclMat& mask,
-            std::vector<KeyPoint>& keypoints,
-            std::vector<float>& descriptors,
-            bool useProvidedKeypoints = false);
-
-        void releaseMemory();
-
-        // SURF parameters
-        double hessianThreshold;
-        int nOctaves;
-        int nOctaveLayers;
-        bool extended;
-        bool upright;
-
-        //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
-        float keypointsRatio;
-
-        oclMat sum, mask1, maskSum, intBuffer;
-
-        oclMat det, trace;
-
-        oclMat maxPosBuffer;
-    };
-
-
-The class ``SURF_OCL`` implements Speeded Up Robust Features descriptor. There is a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option). But the descriptors can also be computed for the user-specified keypoints. Only 8-bit grayscale images are supported.
-
-The class ``SURF_OCL`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``oclMat``. The ``keypoints`` matrix is :math:`\texttt{nFeatures} \times 7` matrix with the ``CV_32FC1`` type.
-
-* ``keypoints.ptr<float>(X_ROW)[i]`` contains x coordinate of the i-th feature.
-* ``keypoints.ptr<float>(Y_ROW)[i]`` contains y coordinate of the i-th feature.
-* ``keypoints.ptr<float>(LAPLACIAN_ROW)[i]``  contains the laplacian sign of the i-th feature.
-* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i-th feature.
-* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i-th feature.
-* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contain orientation of the i-th feature.
-* ``keypoints.ptr<float>(HESSIAN_ROW)[i]`` contains the response of the i-th feature.
-
-The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type.
-
-The class ``SURF_OCL`` uses some buffers and provides access to it. All buffers can be safely released between function calls.
-
-.. seealso:: :ocv:class:`SURF`
-
-.. note::
-
-   * OCL : An example of the SURF detector can be found at opencv_source_code/samples/ocl/surf_matcher.cpp
diff --git a/modules/nonfree/include/opencv2/nonfree/features2d.hpp b/modules/nonfree/include/opencv2/nonfree/features2d.hpp
index 88a173115..6a75e99b2 100644
--- a/modules/nonfree/include/opencv2/nonfree/features2d.hpp
+++ b/modules/nonfree/include/opencv2/nonfree/features2d.hpp
@@ -87,8 +87,8 @@ public:
                                 std::vector<KeyPoint>& keypoints ) const;
 
 protected:
-    void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask = Mat() ) const;
-    void computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const;
+    void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask = noArray() ) const;
+    void computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
 
     CV_PROP_RW int nfeatures;
     CV_PROP_RW int nOctaveLayers;
@@ -142,9 +142,8 @@ public:
     CV_PROP_RW bool upright;
 
 protected:
-
-    void detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask = Mat() ) const;
-    void computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors ) const;
+    void detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask = noArray() ) const;
+    void computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors ) const;
 };
 
 typedef SURF SurfFeatureDetector;
diff --git a/modules/nonfree/include/opencv2/nonfree/ocl.hpp b/modules/nonfree/include/opencv2/nonfree/ocl.hpp
deleted file mode 100644
index b06fa39af..000000000
--- a/modules/nonfree/include/opencv2/nonfree/ocl.hpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_NONFREE_OCL_HPP__
-#define __OPENCV_NONFREE_OCL_HPP__
-
-#include "opencv2/ocl.hpp"
-
-namespace cv
-{
-    namespace ocl
-    {
-        //! Speeded up robust features, port from CUDA module.
-        ////////////////////////////////// SURF //////////////////////////////////////////
-
-        class CV_EXPORTS SURF_OCL
-        {
-        public:
-            enum KeypointLayout
-            {
-                X_ROW = 0,
-                Y_ROW,
-                LAPLACIAN_ROW,
-                OCTAVE_ROW,
-                SIZE_ROW,
-                ANGLE_ROW,
-                HESSIAN_ROW,
-                ROWS_COUNT
-            };
-
-            //! the default constructor
-            SURF_OCL();
-            //! the full constructor taking all the necessary parameters
-            explicit SURF_OCL(double _hessianThreshold, int _nOctaves = 4,
-                              int _nOctaveLayers = 2, bool _extended = false, float _keypointsRatio = 0.01f, bool _upright = false);
-
-            //! returns the descriptor size in float's (64 or 128)
-            int descriptorSize() const;
-            //! returns the default norm type
-            int defaultNorm() const;
-            //! upload host keypoints to device memory
-            void uploadKeypoints(const std::vector<cv::KeyPoint> &keypoints, oclMat &keypointsocl);
-            //! download keypoints from device to host memory
-            void downloadKeypoints(const oclMat &keypointsocl, std::vector<KeyPoint> &keypoints);
-            //! download descriptors from device to host memory
-            void downloadDescriptors(const oclMat &descriptorsocl, std::vector<float> &descriptors);
-            //! finds the keypoints using fast hessian detector used in SURF
-            //! supports CV_8UC1 images
-            //! keypoints will have nFeature cols and 6 rows
-            //! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
-            //! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
-            //! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
-            //! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
-            //! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
-            //! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
-            //! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
-            void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints);
-            //! finds the keypoints and computes their descriptors.
-            //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
-            void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
-                            bool useProvidedKeypoints = false);
-            void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints);
-            void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, oclMat &descriptors,
-                            bool useProvidedKeypoints = false);
-            void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, std::vector<float> &descriptors,
-                            bool useProvidedKeypoints = false);
-
-            void releaseMemory();
-
-            // SURF parameters
-            float hessianThreshold;
-            int nOctaves;
-            int nOctaveLayers;
-            bool extended;
-            bool upright;
-            //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
-            float keypointsRatio;
-            oclMat sum, mask1, maskSum, intBuffer;
-            oclMat det, trace;
-            oclMat maxPosBuffer;
-
-        };
-    }
-}
-
-#endif //__OPENCV_NONFREE_OCL_HPP__
diff --git a/modules/nonfree/src/nonfree_init.cpp b/modules/nonfree/src/nonfree_init.cpp
index ac804dd9a..c59e73548 100644
--- a/modules/nonfree/src/nonfree_init.cpp
+++ b/modules/nonfree/src/nonfree_init.cpp
@@ -52,7 +52,7 @@ CV_INIT_ALGORITHM(SURF, "Feature2D.SURF",
                   obj.info()->addParam(obj, "nOctaves", obj.nOctaves);
                   obj.info()->addParam(obj, "nOctaveLayers", obj.nOctaveLayers);
                   obj.info()->addParam(obj, "extended", obj.extended);
-                  obj.info()->addParam(obj, "upright", obj.upright));
+                  obj.info()->addParam(obj, "upright", obj.upright))
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -61,7 +61,7 @@ CV_INIT_ALGORITHM(SIFT, "Feature2D.SIFT",
                   obj.info()->addParam(obj, "nOctaveLayers", obj.nOctaveLayers);
                   obj.info()->addParam(obj, "contrastThreshold", obj.contrastThreshold);
                   obj.info()->addParam(obj, "edgeThreshold", obj.edgeThreshold);
-                  obj.info()->addParam(obj, "sigma", obj.sigma));
+                  obj.info()->addParam(obj, "sigma", obj.sigma))
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/modules/nonfree/src/opencl/surf.cl b/modules/nonfree/src/opencl/surf.cl
index 02f77c224..608a677ce 100644
--- a/modules/nonfree/src/opencl/surf.cl
+++ b/modules/nonfree/src/opencl/surf.cl
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, Intel Corporation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
@@ -44,36 +45,59 @@
 //
 //M*/
 
-// specialized for non-image2d_t supported platform, intel HD4000, for example
-#ifdef DISABLE_IMAGE2D
-#define IMAGE_INT32 __global uint  *
-#define IMAGE_INT8  __global uchar *
-#else
-#define IMAGE_INT32 image2d_t
-#define IMAGE_INT8  image2d_t
-#endif
+// The number of degrees between orientation samples in calcOrientation
+#define ORI_SEARCH_INC  5
 
-uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, int elemPerRow)
+// The local size of the calcOrientation kernel
+#define ORI_LOCAL_SIZE  (360 / ORI_SEARCH_INC)
+
+// specialized for non-image2d_t supported platform, intel HD4000, for example
+#ifndef HAVE_IMAGE2D
+__inline uint read_sumTex_(__global uint* sumTex, int sum_step, int img_rows, int img_cols, int2 coord)
 {
-#ifdef DISABLE_IMAGE2D
-    int x = clamp(coord.x, 0, cols);
-    int y = clamp(coord.y, 0, rows);
-    return img[elemPerRow * y + x];
-#else
-    return read_imageui(img, sam, coord).x;
-#endif
+    int x = clamp(coord.x, 0, img_cols);
+    int y = clamp(coord.y, 0, img_rows);
+    return sumTex[sum_step * y + x];
 }
-uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
+
+__inline uchar read_imgTex_(__global uchar* imgTex, int img_step, int img_rows, int img_cols, float2 coord)
 {
-#ifdef DISABLE_IMAGE2D
-    int x = clamp(convert_int_rte(coord.x), 0, cols - 1);
-    int y = clamp(convert_int_rte(coord.y), 0, rows - 1);
-    return img[elemPerRow * y + x];
-#else
-    return (uchar)read_imageui(img, sam, coord).x;
-#endif
+    int x = clamp(convert_int_rte(coord.x), 0, img_cols-1);
+    int y = clamp(convert_int_rte(coord.y), 0, img_rows-1);
+    return imgTex[img_step * y + x];
 }
 
+#define read_sumTex(coord) read_sumTex_(sumTex, sum_step, img_rows, img_cols, coord)
+#define read_imgTex(coord) read_imgTex_(imgTex, img_step, img_rows, img_cols, coord)
+
+#define __PARAM_sumTex__ __global uint* sumTex, int sum_step, int sum_offset
+#define __PARAM_imgTex__ __global uchar* imgTex, int img_step, int img_offset
+
+#define __PASS_sumTex__ sumTex, sum_step, sum_offset
+#define __PASS_imgTex__ imgTex, img_step, img_offset
+
+#else
+__inline uint read_sumTex_(image2d_t sumTex, sampler_t sam, int2 coord)
+{
+    return read_imageui(sumTex, sam, coord).x;
+}
+
+__inline uchar read_imgTex_(image2d_t imgTex, sampler_t sam, float2 coord)
+{
+    return (uchar)read_imageui(imgTex, sam, coord).x;
+}
+
+#define read_sumTex(coord) read_sumTex_(sumTex, sampler, coord)
+#define read_imgTex(coord) read_imgTex_(imgTex, sampler, coord)
+
+#define __PARAM_sumTex__ image2d_t sumTex
+#define __PARAM_imgTex__ image2d_t imgTex
+
+#define __PASS_sumTex__ sumTex
+#define __PASS_imgTex__ imgTex
+
+#endif
+
 // dynamically change the precision used for floating type
 
 #if defined (DOUBLE_SUPPORT)
@@ -88,7 +112,7 @@ uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int col
 #endif
 
 // Image read mode
-__constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
 
 #ifndef FLT_EPSILON
 #define FLT_EPSILON (1e-15)
@@ -98,144 +122,9 @@ __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
 #define CV_PI_F 3.14159265f
 #endif
 
-// Use integral image to calculate haar wavelets.
-// N = 2
-// for simple haar paatern
-float icvCalcHaarPatternSum_2(
-    IMAGE_INT32 sumTex,
-    __constant float2 *src,
-    int oldSize,
-    int newSize,
-    int y, int x,
-    int rows, int cols, int elemPerRow)
-{
-
-    float ratio = (float)newSize / oldSize;
-
-    F d = 0;
-
-    int2 dx1 = convert_int2_rte(ratio * src[0]);
-    int2 dy1 = convert_int2_rte(ratio * src[1]);
-    int2 dx2 = convert_int2_rte(ratio * src[2]);
-    int2 dy2 = convert_int2_rte(ratio * src[3]);
-
-    F t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
-    d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
-
-    t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
-    d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
-
-    return (float)d;
-}
-
-// N = 3
-float icvCalcHaarPatternSum_3(
-    IMAGE_INT32 sumTex,
-    __constant float4 *src,
-    int oldSize,
-    int newSize,
-    int y, int x,
-    int rows, int cols, int elemPerRow)
-{
-
-    float ratio = (float)newSize / oldSize;
-
-    F d = 0;
-
-    int4 dx1 = convert_int4_rte(ratio * src[0]);
-    int4 dy1 = convert_int4_rte(ratio * src[1]);
-    int4 dx2 = convert_int4_rte(ratio * src[2]);
-    int4 dy2 = convert_int4_rte(ratio * src[3]);
-
-    F t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
-    d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
-
-    t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
-    d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
-
-    t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
-    d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
-
-    return (float)d;
-}
-
-// N = 4
-float icvCalcHaarPatternSum_4(
-    IMAGE_INT32 sumTex,
-    __constant float4 *src,
-    int oldSize,
-    int newSize,
-    int y, int x,
-    int rows, int cols, int elemPerRow)
-{
-
-    float ratio = (float)newSize / oldSize;
-
-    F d = 0;
-
-    int4 dx1 = convert_int4_rte(ratio * src[0]);
-    int4 dy1 = convert_int4_rte(ratio * src[1]);
-    int4 dx2 = convert_int4_rte(ratio * src[2]);
-    int4 dy2 = convert_int4_rte(ratio * src[3]);
-
-    F t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
-    d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
-
-    t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
-    d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
-
-    t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
-    d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
-
-    t = 0;
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy1.w), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy2.w), rows, cols, elemPerRow );
-    t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy1.w), rows, cols, elemPerRow );
-    t += read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy2.w), rows, cols, elemPerRow );
-    d += t * src[4].w / ((dx2.w - dx1.w) * (dy2.w - dy1.w));
-
-    return (float)d;
-}
-
 ////////////////////////////////////////////////////////////////////////
 // Hessian
 
-__constant float4 c_DX[5] = { (float4)(0, 3, 6, 0), (float4)(2, 2, 2, 0), (float4)(3, 6, 9, 0), (float4)(7, 7, 7, 0), (float4)(1, -2, 1, 0) };
-__constant float4 c_DY[5] = { (float4)(2, 2, 2, 0), (float4)(0, 3, 6, 0), (float4)(7, 7, 7, 0), (float4)(3, 6, 9, 0), (float4)(1, -2, 1, 0) };
-__constant float4 c_DXY[5] = { (float4)(1, 5, 1, 5), (float4)(1, 1, 5, 5), (float4)(4, 8, 4, 8), (float4)(4, 4, 8, 8), (float4)(1, -1, -1, 1) };// Use integral image to calculate haar wavelets.
-
 __inline int calcSize(int octave, int layer)
 {
     /* Wavelet size at first layer of first octave. */
@@ -250,25 +139,41 @@ __inline int calcSize(int octave, int layer)
     return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
 }
 
+// Calculate a derivative in an axis-aligned direction (x or y).  The "plus1"
+// boxes contribute 1 * (area), and the "minus2" box contributes -2 * (area).
+// So the final computation is plus1a + plus1b - 2 * minus2.  The corners are
+// labeled A, B, C, and D, with A being the top left, B being top right, C
+// being bottom left, and D being bottom right.
+F calcAxisAlignedDerivative(
+        int plus1a_A, int plus1a_B, int plus1a_C, int plus1a_D, F plus1a_scale,
+        int plus1b_A, int plus1b_B, int plus1b_C, int plus1b_D, F plus1b_scale,
+        int minus2_A, int minus2_B, int minus2_C, int minus2_D, F minus2_scale)
+{
+    F plus1a = plus1a_A - plus1a_B - plus1a_C + plus1a_D;
+    F plus1b = plus1b_A - plus1b_B - plus1b_C + plus1b_D;
+    F minus2 = minus2_A - minus2_B - minus2_C + minus2_D;
+
+    return (plus1a / plus1a_scale -
+            2.0f * minus2 / minus2_scale +
+            plus1b / plus1b_scale);
+}
 
 //calculate targeted layer per-pixel determinant and trace with an integral image
-__kernel void icvCalcLayerDetAndTrace(
-    IMAGE_INT32 sumTex, // input integral image
-    __global float * det,      // output Determinant
+__kernel void SURF_calcLayerDetAndTrace(
+    __PARAM_sumTex__, // input integral image
+    int img_rows, int img_cols,
+    int c_nOctaveLayers, int c_octave, int c_layer_rows,
+
+    __global float * det,      // output determinant
+    int det_step, int det_offset,
     __global float * trace,    // output trace
-    int det_step,     // the step of det in bytes
-    int trace_step,   // the step of trace in bytes
-    int c_img_rows,
-    int c_img_cols,
-    int c_nOctaveLayers,
-    int c_octave,
-    int c_layer_rows,
-    int sumTex_step
-)
+    int trace_step, int trace_offset)
 {
     det_step   /= sizeof(*det);
     trace_step /= sizeof(*trace);
-    sumTex_step/= sizeof(uint);
+    #ifndef HAVE_IMAGE2D
+    sum_step/= sizeof(uint);
+    #endif
     // Determine the indices
     const int gridDim_y  = get_num_groups(1) / (c_nOctaveLayers + 2);
     const int blockIdx_y = get_group_id(1) % gridDim_y;
@@ -280,192 +185,125 @@ __kernel void icvCalcLayerDetAndTrace(
 
     const int size = calcSize(c_octave, layer);
 
-    const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
-    const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
+    const int samples_i = 1 + ((img_rows - size) >> c_octave);
+    const int samples_j = 1 + ((img_cols - size) >> c_octave);
 
     // Ignore pixels where some of the kernel is outside the image
     const int margin = (size >> 1) >> c_octave;
 
-    if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
+    if (size <= img_rows && size <= img_cols && i < samples_i && j < samples_j)
     {
-        const float dx  = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
-        const float dy  = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
-        const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
+        int x = j << c_octave;
+        int y = i << c_octave;
+
+        float ratio = (float)size / 9;
+
+        // Precompute some commonly used values, which are used to offset
+        // texture coordinates in the integral image.
+        int r1 = round(ratio);
+        int r2 = round(ratio * 2.0f);
+        int r3 = round(ratio * 3.0f);
+        int r4 = round(ratio * 4.0f);
+        int r5 = round(ratio * 5.0f);
+        int r6 = round(ratio * 6.0f);
+        int r7 = round(ratio * 7.0f);
+        int r8 = round(ratio * 8.0f);
+        int r9 = round(ratio * 9.0f);
+
+        // Calculate the approximated derivative in the x-direction
+        F d = 0;
+        {
+            // Some of the pixels needed to compute the derivative are
+            // repeated, so we only don't duplicate the fetch here.
+            int t02 = read_sumTex( (int2)(x, y + r2));
+            int t07 = read_sumTex( (int2)(x, y + r7));
+            int t32 = read_sumTex( (int2)(x + r3, y + r2));
+            int t37 = read_sumTex( (int2)(x + r3, y + r7));
+            int t62 = read_sumTex( (int2)(x + r6, y + r2));
+            int t67 = read_sumTex( (int2)(x + r6, y + r7));
+            int t92 = read_sumTex( (int2)(x + r9, y + r2));
+            int t97 = read_sumTex( (int2)(x + r9, y + r7));
+
+            d = calcAxisAlignedDerivative(t02, t07, t32, t37, (r3) * (r7 - r2),
+                                          t62, t67, t92, t97, (r9 - r6) * (r7 - r2),
+                                          t32, t37, t62, t67, (r6 - r3) * (r7 - r2));
+        }
+        const float dx  = (float)d;
+
+        // Calculate the approximated derivative in the y-direction
+        d = 0;
+        {
+            // Some of the pixels needed to compute the derivative are
+            // repeated, so we only don't duplicate the fetch here.
+            int t20 = read_sumTex( (int2)(x + r2, y) );
+            int t23 = read_sumTex( (int2)(x + r2, y + r3) );
+            int t70 = read_sumTex( (int2)(x + r7, y) );
+            int t73 = read_sumTex( (int2)(x + r7, y + r3) );
+            int t26 = read_sumTex( (int2)(x + r2, y + r6) );
+            int t76 = read_sumTex( (int2)(x + r7, y + r6) );
+            int t29 = read_sumTex( (int2)(x + r2, y + r9) );
+            int t79 = read_sumTex( (int2)(x + r7, y + r9) );
+
+            d = calcAxisAlignedDerivative(t20, t23, t70, t73, (r7 - r2) * (r3),
+                                          t26, t29, t76, t79, (r7 - r2) * (r9 - r6),
+                                          t23, t26, t73, t76, (r7 - r2) * (r6 - r3));
+        }
+        const float dy  = (float)d;
+
+        // Calculate the approximated derivative in the xy-direction
+        d = 0;
+        {
+            // There's no saving us here, we just have to get all of the pixels in
+            // separate fetches
+            F t = 0;
+            t += read_sumTex( (int2)(x + r1, y + r1) );
+            t -= read_sumTex( (int2)(x + r1, y + r4) );
+            t -= read_sumTex( (int2)(x + r4, y + r1) );
+            t += read_sumTex( (int2)(x + r4, y + r4) );
+            d += t / ((r4 - r1) * (r4 - r1));
+
+            t = 0;
+            t += read_sumTex( (int2)(x + r5, y + r1) );
+            t -= read_sumTex( (int2)(x + r5, y + r4) );
+            t -= read_sumTex( (int2)(x + r8, y + r1) );
+            t += read_sumTex( (int2)(x + r8, y + r4) );
+            d -= t / ((r8 - r5) * (r4 - r1));
+
+            t = 0;
+            t += read_sumTex( (int2)(x + r1, y + r5) );
+            t -= read_sumTex( (int2)(x + r1, y + r8) );
+            t -= read_sumTex( (int2)(x + r4, y + r5) );
+            t += read_sumTex( (int2)(x + r4, y + r8) );
+            d -= t / ((r4 - r1) * (r8 - r5));
+
+            t = 0;
+            t += read_sumTex( (int2)(x + r5, y + r5) );
+            t -= read_sumTex( (int2)(x + r5, y + r8) );
+            t -= read_sumTex( (int2)(x + r8, y + r5) );
+            t += read_sumTex( (int2)(x + r8, y + r8) );
+            d += t / ((r8 - r5) * (r8 - r5));
+        }
+        const float dxy = (float)d;
 
         det  [j + margin + det_step   * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
         trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
     }
 }
 
-
 ////////////////////////////////////////////////////////////////////////
 // NONMAX
 
-__constant float c_DM[5] = {0, 0, 9, 9, 1};
-
-bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int rows, int cols, int step)
-{
-    float ratio = (float)size / 9.0f;
-
-    float d = 0;
-
-    int dx1 = convert_int_rte(ratio * c_DM[0]);
-    int dy1 = convert_int_rte(ratio * c_DM[1]);
-    int dx2 = convert_int_rte(ratio * c_DM[2]);
-    int dy2 = convert_int_rte(ratio * c_DM[3]);
-
-    float t = 0;
-
-    t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy1), rows, cols, step);
-    t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx1, sum_i + dy2), rows, cols, step);
-    t -= read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy1), rows, cols, step);
-    t += read_sumTex(maskSumTex, sampler, (int2)(sum_j + dx2, sum_i + dy2), rows, cols, step);
-
-    d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
-
-    return (d >= 0.5f);
-}
-
-// Non-maximal suppression to further filtering the candidates from previous step
 __kernel
-void icvFindMaximaInLayer_withmask(
-    __global const float * det,
-    __global const float * trace,
-    __global int4 * maxPosBuffer,
-    volatile __global int* maxCounter,
-    int counter_offset,
-    int det_step,     // the step of det in bytes
-    int trace_step,   // the step of trace in bytes
-    int c_img_rows,
-    int c_img_cols,
-    int c_nOctaveLayers,
-    int c_octave,
-    int c_layer_rows,
-    int c_layer_cols,
-    int c_max_candidates,
-    float c_hessianThreshold,
-    IMAGE_INT32 maskSumTex,
-    int mask_step
-)
-{
-    volatile __local  float N9[768]; // threads.x * threads.y * 3
-
-    det_step   /= sizeof(*det);
-    trace_step /= sizeof(*trace);
-    maxCounter += counter_offset;
-    mask_step  /= sizeof(uint);
-
-    // Determine the indices
-    const int gridDim_y  = get_num_groups(1) / c_nOctaveLayers;
-    const int blockIdx_y = get_group_id(1)   % gridDim_y;
-    const int blockIdx_z = get_group_id(1)   / gridDim_y;
-
-    const int layer = blockIdx_z + 1;
-
-    const int size = calcSize(c_octave, layer);
-
-    // Ignore pixels without a 3x3x3 neighbourhood in the layer above
-    const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
-
-    const int j = get_local_id(0) + get_group_id(0) * (get_local_size(0) - 2) + margin - 1;
-    const int i = get_local_id(1) + blockIdx_y * (get_local_size(1) - 2) + margin - 1;
-
-    // Is this thread within the hessian buffer?
-    const int zoff = get_local_size(0) * get_local_size(1);
-    const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
-    N9[localLin - zoff] =
-        det[det_step *
-            (c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
-            + min(max(j, 0), c_img_cols - 1)];                            // x
-    N9[localLin       ] =
-        det[det_step *
-            (c_layer_rows * (layer    ) + min(max(i, 0), c_img_rows - 1)) // y
-            + min(max(j, 0), c_img_cols - 1)];                            // x
-    N9[localLin + zoff] =
-        det[det_step *
-            (c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
-            + min(max(j, 0), c_img_cols - 1)];                            // x
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (i < c_layer_rows - margin
-            && j < c_layer_cols - margin
-            && get_local_id(0) > 0
-            && get_local_id(0) < get_local_size(0) - 1
-            && get_local_id(1) > 0
-            && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
-       )
-    {
-        float val0 = N9[localLin];
-
-        if (val0 > c_hessianThreshold)
-        {
-            // Coordinates for the start of the wavelet in the sum image. There
-            // is some integer division involved, so don't try to simplify this
-            // (cancel out sampleStep) without checking the result is the same
-            const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
-            const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
-
-            if (within_check(maskSumTex, sum_i, sum_j, size, c_img_rows, c_img_cols, mask_step))
-            {
-                // Check to see if we have a max (in its 26 neighbours)
-                const bool condmax = val0 > N9[localLin - 1 - get_local_size(0) - zoff]
-                                     &&                   val0 > N9[localLin     - get_local_size(0) - zoff]
-                                     &&                   val0 > N9[localLin + 1 - get_local_size(0) - zoff]
-                                     &&                   val0 > N9[localLin - 1                     - zoff]
-                                     &&                   val0 > N9[localLin                         - zoff]
-                                     &&                   val0 > N9[localLin + 1                     - zoff]
-                                     &&                   val0 > N9[localLin - 1 + get_local_size(0) - zoff]
-                                     &&                   val0 > N9[localLin     + get_local_size(0) - zoff]
-                                     &&                   val0 > N9[localLin + 1 + get_local_size(0) - zoff]
-
-                                     &&                   val0 > N9[localLin - 1 - get_local_size(0)]
-                                     &&                   val0 > N9[localLin     - get_local_size(0)]
-                                     &&                   val0 > N9[localLin + 1 - get_local_size(0)]
-                                     &&                   val0 > N9[localLin - 1                    ]
-                                     &&                   val0 > N9[localLin + 1                    ]
-                                     &&                   val0 > N9[localLin - 1 + get_local_size(0)]
-                                     &&                   val0 > N9[localLin     + get_local_size(0)]
-                                     &&                   val0 > N9[localLin + 1 + get_local_size(0)]
-
-                                     &&                   val0 > N9[localLin - 1 - get_local_size(0) + zoff]
-                                     &&                   val0 > N9[localLin     - get_local_size(0) + zoff]
-                                     &&                   val0 > N9[localLin + 1 - get_local_size(0) + zoff]
-                                     &&                   val0 > N9[localLin - 1                     + zoff]
-                                     &&                   val0 > N9[localLin                         + zoff]
-                                     &&                   val0 > N9[localLin + 1                     + zoff]
-                                     &&                   val0 > N9[localLin - 1 + get_local_size(0) + zoff]
-                                     &&                   val0 > N9[localLin     + get_local_size(0) + zoff]
-                                     &&                   val0 > N9[localLin + 1 + get_local_size(0) + zoff]
-                                     ;
-
-                if(condmax)
-                {
-                    int ind = atomic_inc(maxCounter);
-
-                    if (ind < c_max_candidates)
-                    {
-                        const int laplacian = (int) copysign(1.0f, trace[trace_step* (layer * c_layer_rows + i) + j]);
-
-                        maxPosBuffer[ind] = (int4)(j, i, layer, laplacian);
-                    }
-                }
-            }
-        }
-    }
-}
-
-__kernel
-void icvFindMaximaInLayer(
+void SURF_findMaximaInLayer(
     __global float * det,
+    int det_step, int det_offset,
     __global float * trace,
+    int trace_step, int trace_offset,
     __global int4 * maxPosBuffer,
     volatile __global  int* maxCounter,
     int counter_offset,
-    int det_step,     // the step of det in bytes
-    int trace_step,   // the step of trace in bytes
-    int c_img_rows,
-    int c_img_cols,
+    int img_rows,
+    int img_cols,
     int c_nOctaveLayers,
     int c_octave,
     int c_layer_rows,
@@ -499,8 +337,8 @@ void icvFindMaximaInLayer(
     const int zoff     = get_local_size(0) * get_local_size(1);
     const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
 
-    int l_x = min(max(j, 0), c_img_cols - 1);
-    int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
+    int l_x = min(max(j, 0), img_cols - 1);
+    int l_y = c_layer_rows * layer + min(max(i, 0), img_rows - 1);
 
     N9[localLin - zoff] =
         det[det_step * (l_y - c_layer_rows) + l_x];
@@ -572,7 +410,7 @@ void icvFindMaximaInLayer(
 }
 
 // solve 3x3 linear system Ax=b for floating point input
-inline bool solve3x3_float(volatile __local  const float4 *A, volatile __local  const float *b, volatile __local  float *x)
+inline bool solve3x3_float(const float4 *A, const float *b, float *x)
 {
     float det = A[0].x * (A[1].y * A[2].z - A[1].z * A[2].y)
                 - A[0].y * (A[1].x * A[2].z - A[1].z * A[2].x)
@@ -580,7 +418,7 @@ inline bool solve3x3_float(volatile __local  const float4 *A, volatile __local
 
     if (det != 0)
     {
-        F invdet = 1.0 / det;
+        F invdet = 1.0f / det;
 
         x[0] = invdet *
                (b[0]    * (A[1].y * A[2].z - A[1].z * A[2].y) -
@@ -614,15 +452,15 @@ inline bool solve3x3_float(volatile __local  const float4 *A, volatile __local
 ////////////////////////////////////////////////////////////////////////
 // INTERPOLATION
 __kernel
-void icvInterpolateKeypoint(
+void SURF_interpolateKeypoint(
     __global const float * det,
+    int det_step, int det_offset,
     __global const int4 * maxPosBuffer,
     __global float * keypoints,
-    volatile __global  int * featureCounter,
-    int det_step,
-    int keypoints_step,
-    int c_img_rows,
-    int c_img_cols,
+    int keypoints_step, int keypoints_offset,
+    volatile __global int* featureCounter,
+    int img_rows,
+    int img_cols,
     int c_octave,
     int c_layer_rows,
     int c_max_features
@@ -651,7 +489,7 @@ void icvInterpolateKeypoint(
 
     if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0)
     {
-        volatile __local  float dD[3];
+        float dD[3];
 
         //dx
         dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
@@ -660,7 +498,7 @@ void icvInterpolateKeypoint(
         //ds
         dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
 
-        volatile __local  float4 H[3];
+        float4 H[3];
 
         //dxx
         H[0].x = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
@@ -681,7 +519,7 @@ void icvInterpolateKeypoint(
         //dss
         H[2].z = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
 
-        volatile __local  float x[3];
+        float x[3];
 
         if (solve3x3_float(H, dD, x))
         {
@@ -711,10 +549,10 @@ void icvInterpolateKeypoint(
                 sampled in a circle of radius 6s using wavelets of size 4s.
                 We ensure the gradient wavelet size is even to ensure the
                 wavelet pattern is balanced and symmetric around its center */
-                const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
+                const int grad_wav_size = 2 * round(2.0f * s);
 
                 // check when grad_wav_size is too big
-                if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
+                if ((img_rows + 1) >= grad_wav_size && (img_cols + 1) >= grad_wav_size)
                 {
                     // Get a new feature index.
                     int ind = atomic_inc(featureCounter);
@@ -737,9 +575,12 @@ void icvInterpolateKeypoint(
 ////////////////////////////////////////////////////////////////////////
 // Orientation
 
-#define ORI_SEARCH_INC 5
-#define ORI_WIN        60
-#define ORI_SAMPLES    113
+#define ORI_WIN			 60
+#define ORI_SAMPLES		 113
+
+// The distance between samples in the beginning of the the reduction
+#define ORI_RESPONSE_REDUCTION_WIDTH		 48
+#define ORI_RESPONSE_ARRAY_SIZE			     (ORI_RESPONSE_REDUCTION_WIDTH * 2)
 
 __constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
 __constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
@@ -816,61 +657,90 @@ void reduce_32_sum(volatile __local  float * data, volatile float* partial_reduc
 }
 
 __kernel
-void icvCalcOrientation(
-    IMAGE_INT32 sumTex,
-    __global float * keypoints,
-    int keypoints_step,
-    int c_img_rows,
-    int c_img_cols,
-    int sum_step
-)
+void SURF_calcOrientation(
+    __PARAM_sumTex__, int img_rows, int img_cols,
+    __global float * keypoints, int keypoints_step, int keypoints_offset )
 {
     keypoints_step /= sizeof(*keypoints);
+    #ifndef HAVE_IMAGE2D
     sum_step       /= sizeof(uint);
+    #endif
     __global float* featureX    = keypoints + X_ROW * keypoints_step;
     __global float* featureY    = keypoints + Y_ROW * keypoints_step;
     __global float* featureSize = keypoints + SIZE_ROW * keypoints_step;
     __global float* featureDir  = keypoints + ANGLE_ROW * keypoints_step;
 
+    __local  float s_X[ORI_SAMPLES];
+    __local  float s_Y[ORI_SAMPLES];
+    __local  float s_angle[ORI_SAMPLES];
 
-    volatile __local  float s_X[128];
-    volatile __local  float s_Y[128];
-    volatile __local  float s_angle[128];
-
-    volatile __local  float s_sumx[32 * 4];
-    volatile __local  float s_sumy[32 * 4];
+    // Need to allocate enough to make the reduction work without accessing
+    // past the end of the array.
+    __local  float s_sumx[ORI_RESPONSE_ARRAY_SIZE];
+    __local  float s_sumy[ORI_RESPONSE_ARRAY_SIZE];
+    __local  float s_mod[ORI_RESPONSE_ARRAY_SIZE];
 
     /* The sampling intervals and wavelet sized for selecting an orientation
     and building the keypoint descriptor are defined relative to 's' */
     const float s = featureSize[get_group_id(0)] * 1.2f / 9.0f;
 
-
     /* To find the dominant orientation, the gradients in x and y are
     sampled in a circle of radius 6s using wavelets of size 4s.
     We ensure the gradient wavelet size is even to ensure the
     wavelet pattern is balanced and symmetric around its center */
-    const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
+    const int grad_wav_size = 2 * round(2.0f * s);
 
     // check when grad_wav_size is too big
-    if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
+    if ((img_rows + 1) < grad_wav_size || (img_cols + 1) < grad_wav_size)
         return;
 
     // Calc X, Y, angle and store it to shared memory
-    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+    const int tid = get_local_id(0);
+    // Initialize values that are only used as part of the reduction later.
+    if (tid < ORI_RESPONSE_ARRAY_SIZE - ORI_LOCAL_SIZE) {
+        s_mod[tid + ORI_LOCAL_SIZE] = 0.0f;
+    }
 
-    float X = 0.0f, Y = 0.0f, angle = 0.0f;
+    float ratio = (float)grad_wav_size / 4;
 
-    if (tid < ORI_SAMPLES)
+    int r2 = round(ratio * 2.0f);
+    int r4 = round(ratio * 4.0f);
+    for (int i = tid; i < ORI_SAMPLES; i += ORI_LOCAL_SIZE )
     {
+        float X = 0.0f, Y = 0.0f, angle = 0.0f;
         const float margin = (float)(grad_wav_size - 1) / 2.0f;
-        const int x = convert_int_rte(featureX[get_group_id(0)] + c_aptX[tid] * s - margin);
-        const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin);
+        const int x = round(featureX[get_group_id(0)] + c_aptX[i] * s - margin);
+        const int y = round(featureY[get_group_id(0)] + c_aptY[i] * s - margin);
 
-        if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
-                x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
+        if (y >= 0 && y < (img_rows + 1) - grad_wav_size &&
+            x >= 0 && x < (img_cols + 1) - grad_wav_size)
         {
-            X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
-            Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
+            float apt = c_aptW[i];
+
+            // Compute the haar sum without fetching duplicate pixels.
+            float t00 = read_sumTex( (int2)(x, y));
+            float t02 = read_sumTex( (int2)(x, y + r2));
+            float t04 = read_sumTex( (int2)(x, y + r4));
+            float t20 = read_sumTex( (int2)(x + r2, y));
+            float t24 = read_sumTex( (int2)(x + r2, y + r4));
+            float t40 = read_sumTex( (int2)(x + r4, y));
+            float t42 = read_sumTex( (int2)(x + r4, y + r2));
+            float t44 = read_sumTex( (int2)(x + r4, y + r4));
+
+            F t = t00 - t04 - t20 + t24;
+            X -= t / ((r2) * (r4));
+
+            t = t20 - t24 - t40 + t44;
+            X += t / ((r4 - r2) * (r4));
+
+            t = t00 - t02 - t40 + t42;
+            Y += t / ((r2) * (r4));
+
+            t = t02 - t04 - t42 + t44;
+            Y -= t  / ((r4) * (r4 - r2));
+
+            X = apt*X;
+            Y = apt*Y;
 
             angle = atan2(Y, X);
 
@@ -879,76 +749,61 @@ void icvCalcOrientation(
             angle *= 180.0f / CV_PI_F;
 
         }
+
+        s_X[i] = X;
+        s_Y[i] = Y;
+        s_angle[i] = angle;
     }
-    s_X[tid] = X;
-    s_Y[tid] = Y;
-    s_angle[tid] = angle;
     barrier(CLK_LOCAL_MEM_FENCE);
 
     float bestx = 0, besty = 0, best_mod = 0;
+    float sumx = 0.0f, sumy = 0.0f;
+    const int dir = tid * ORI_SEARCH_INC;
+    #pragma unroll
+    for (int i = 0; i < ORI_SAMPLES; ++i) {
+        int angle = round(s_angle[i]);
 
-#pragma unroll
-    for (int i = 0; i < 18; ++i)
-    {
-        const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC;
+        int d = abs(angle - dir);
+        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
+        {
+            sumx += s_X[i];
+            sumy += s_Y[i];
+        }
+    }
+    s_sumx[tid] = sumx;
+    s_sumy[tid] = sumy;
+    s_mod[tid] = sumx*sumx + sumy*sumy;
+    barrier(CLK_LOCAL_MEM_FENCE);
 
-        volatile float sumx = 0.0f, sumy = 0.0f;
-        int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir);
-        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-        {
-            sumx = s_X[get_local_id(0)];
-            sumy = s_Y[get_local_id(0)];
-        }
-        d = abs(convert_int_rte(s_angle[get_local_id(0) + 32]) - dir);
-        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-        {
-            sumx += s_X[get_local_id(0) + 32];
-            sumy += s_Y[get_local_id(0) + 32];
-        }
-        d = abs(convert_int_rte(s_angle[get_local_id(0) + 64]) - dir);
-        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-        {
-            sumx += s_X[get_local_id(0) + 64];
-            sumy += s_Y[get_local_id(0) + 64];
-        }
-        d = abs(convert_int_rte(s_angle[get_local_id(0) + 96]) - dir);
-        if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
-        {
-            sumx += s_X[get_local_id(0) + 96];
-            sumy += s_Y[get_local_id(0) + 96];
-        }
-        reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0));
-        reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0));
-
-        const float temp_mod = sumx * sumx + sumy * sumy;
-        if (temp_mod > best_mod)
-        {
-            best_mod = temp_mod;
-            bestx = sumx;
-            besty = sumy;
+    // This reduction searches for the longest wavelet response vector.  The first
+    // step uses all of the work items in the workgroup to narrow the search
+    // down to the three candidates.  It requires s_mod to have a few more
+    // elements allocated past the work-group size, which are pre-initialized to
+    // 0.0f above.
+    for(int t = ORI_RESPONSE_REDUCTION_WIDTH; t >= 3; t /= 2) {
+        if (tid < t) {
+            if (s_mod[tid] < s_mod[tid + t]) {
+                s_mod[tid] = s_mod[tid + t];
+                s_sumx[tid] = s_sumx[tid + t];
+                s_sumy[tid] = s_sumy[tid + t];
+            }
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
-    if (get_local_id(0) == 0)
-    {
-        s_X[get_local_id(1)] = bestx;
-        s_Y[get_local_id(1)] = besty;
-        s_angle[get_local_id(1)] = best_mod;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (get_local_id(1) == 0 && get_local_id(0) == 0)
+    // Do the final reduction and write out the result.
+    if (tid == 0)
     {
         int bestIdx = 0;
 
-        if (s_angle[1] > s_angle[bestIdx])
+        // The loop above narrowed the search of the longest vector to three
+        // possibilities.  Pick the best here.
+        if (s_mod[1] > s_mod[bestIdx])
             bestIdx = 1;
-        if (s_angle[2] > s_angle[bestIdx])
+        if (s_mod[2] > s_mod[bestIdx])
             bestIdx = 2;
-        if (s_angle[3] > s_angle[bestIdx])
-            bestIdx = 3;
 
-        float kp_dir = atan2(s_Y[bestIdx], s_X[bestIdx]);
+        float kp_dir = atan2(s_sumy[bestIdx], s_sumx[bestIdx]);
         if (kp_dir < 0)
             kp_dir += 2.0f * CV_PI_F;
         kp_dir *= 180.0f / CV_PI_F;
@@ -961,20 +816,18 @@ void icvCalcOrientation(
     }
 }
 
-
 __kernel
-void icvSetUpright(
+void SURF_setUpRight(
     __global float * keypoints,
-    int keypoints_step,
-    int nFeatures
-)
+    int keypoints_step, int keypoints_offset,
+    int rows, int cols )
 {
+    int i = get_global_id(0);
     keypoints_step /= sizeof(*keypoints);
-    __global float* featureDir  = keypoints + ANGLE_ROW * keypoints_step;
 
-    if(get_global_id(0) <= nFeatures)
+    if(i < cols)
     {
-        featureDir[get_global_id(0)] = 270.0f;
+        keypoints[mad24(keypoints_step, ANGLE_ROW, i)] = 270.f;
     }
 }
 
@@ -1013,61 +866,50 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
 };
 
 // utility for linear filter
-inline uchar readerGet(
-    IMAGE_INT8 src,
-    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
-    int i, int j, int rows, int cols, int elemPerRow
-)
-{
-    float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
-    float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
-    return read_imgTex(src, sampler, (float2)(pixel_x, pixel_y), rows, cols, elemPerRow);
-}
+#define readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, i, j) \
+    read_imgTex((float2)(centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir, \
+                         centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir))
 
 inline float linearFilter(
-    IMAGE_INT8 src,
-    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
-    float y, float x, int rows, int cols, int elemPerRow
-)
+    __PARAM_imgTex__, int img_rows, int img_cols,
+    float centerX, float centerY, float win_offset,
+    float cos_dir, float sin_dir, float y, float x )
 {
     x -= 0.5f;
     y -= 0.5f;
 
     float out = 0.0f;
 
-    const int x1 = convert_int_rtn(x);
-    const int y1 = convert_int_rtn(y);
+    const int x1 = round(x);
+    const int y1 = round(y);
     const int x2 = x1 + 1;
     const int y2 = y1 + 1;
 
-    uchar src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1, rows, cols, elemPerRow);
+    uchar src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y1, x1);
     out = out + src_reg * ((x2 - x) * (y2 - y));
 
-    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2, rows, cols, elemPerRow);
+    src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y1, x2);
     out = out + src_reg * ((x - x1) * (y2 - y));
 
-    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1, rows, cols, elemPerRow);
+    src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y2, x1);
     out = out + src_reg * ((x2 - x) * (y - y1));
 
-    src_reg = readerGet(src, centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2, rows, cols, elemPerRow);
+    src_reg = readerGet(centerX, centerY, win_offset, cos_dir, sin_dir, y2, x2);
     out = out + src_reg * ((x - x1) * (y - y1));
 
     return out;
 }
 
 void calc_dx_dy(
-    IMAGE_INT8 imgTex,
+    __PARAM_imgTex__,
+    int img_rows, int img_cols,
     volatile __local  float *s_dx_bin,
     volatile __local  float *s_dy_bin,
     volatile __local  float *s_PATCH,
     __global const float* featureX,
     __global const float* featureY,
     __global const float* featureSize,
-    __global const float* featureDir,
-    int rows,
-    int cols,
-    int elemPerRow
-)
+    __global const float* featureDir )
 {
     const float centerX = featureX[get_group_id(0)];
     const float centerY = featureY[get_group_id(0)];
@@ -1104,7 +946,9 @@ void calc_dx_dy(
     const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
     const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
 
-    s_PATCH[get_local_id(1) * 6 + get_local_id(0)] = linearFilter(imgTex, centerX, centerY, win_offset, cos_dir, sin_dir, icoo, jcoo, rows, cols, elemPerRow);
+    s_PATCH[get_local_id(1) * 6 + get_local_id(0)] =
+        linearFilter(__PASS_imgTex__, img_rows, img_cols, centerX, centerY,
+                     win_offset, cos_dir, sin_dir, icoo, jcoo);
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -1130,6 +974,7 @@ void calc_dx_dy(
         s_dy_bin[tid] = vy;
     }
 }
+
 void reduce_sum25(
     volatile __local  float* sdata1,
     volatile __local  float* sdata2,
@@ -1193,16 +1038,13 @@ void reduce_sum25(
 }
 
 __kernel
-void compute_descriptors64(
-    IMAGE_INT8 imgTex,
+void SURF_computeDescriptors64(
+    __PARAM_imgTex__,
+    int img_rows, int img_cols,
+    __global const float* keypoints,
+    int keypoints_step, int keypoints_offset,
     __global float * descriptors,
-    __global const float * keypoints,
-    int descriptors_step,
-    int keypoints_step,
-    int rows,
-    int cols,
-    int img_step
-)
+    int descriptors_step, int descriptors_offset)
 {
     descriptors_step /= sizeof(float);
     keypoints_step   /= sizeof(float);
@@ -1218,7 +1060,7 @@ void compute_descriptors64(
     volatile __local  float sdyabs[25];
     volatile __local  float s_PATCH[6*6];
 
-    calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
+    calc_dx_dy(__PASS_imgTex__, img_rows, img_cols, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
     barrier(CLK_LOCAL_MEM_FENCE);
 
     const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@@ -1247,17 +1089,15 @@ void compute_descriptors64(
         }
     }
 }
+
 __kernel
-void compute_descriptors128(
-    IMAGE_INT8 imgTex,
-    __global float * descriptors,
-    __global float * keypoints,
-    int descriptors_step,
-    int keypoints_step,
-    int rows,
-    int cols,
-    int img_step
-)
+void SURF_computeDescriptors128(
+    __PARAM_imgTex__,
+    int img_rows, int img_cols,
+    __global const float* keypoints,
+    int keypoints_step, int keypoints_offset,
+    __global float* descriptors,
+    int descriptors_step, int descriptors_offset)
 {
     descriptors_step /= sizeof(*descriptors);
     keypoints_step   /= sizeof(*keypoints);
@@ -1278,7 +1118,7 @@ void compute_descriptors128(
     volatile __local  float sdabs2[25];
     volatile __local  float s_PATCH[6*6];
 
-    calc_dx_dy(imgTex, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir, rows, cols, img_step);
+    calc_dx_dy(__PASS_imgTex__, img_rows, img_cols, sdx, sdy, s_PATCH, featureX, featureY, featureSize, featureDir);
     barrier(CLK_LOCAL_MEM_FENCE);
 
     const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
@@ -1451,7 +1291,7 @@ void reduce_sum64(volatile __local  float* smem, int tid)
 }
 
 __kernel
-void normalize_descriptors128(__global float * descriptors, int descriptors_step)
+void SURF_normalizeDescriptors128(__global float * descriptors, int descriptors_step, int descriptors_offset)
 {
     descriptors_step /= sizeof(*descriptors);
     // no need for thread ID
@@ -1477,8 +1317,9 @@ void normalize_descriptors128(__global float * descriptors, int descriptors_step
     // normalize and store in output
     descriptor_base[get_local_id(0)] = lookup / len;
 }
+
 __kernel
-void normalize_descriptors64(__global float * descriptors, int descriptors_step)
+void SURF_normalizeDescriptors64(__global float * descriptors, int descriptors_step, int descriptors_offset)
 {
     descriptors_step /= sizeof(*descriptors);
     // no need for thread ID
diff --git a/modules/nonfree/src/precomp.hpp b/modules/nonfree/src/precomp.hpp
index 204feaf71..001b5003c 100644
--- a/modules/nonfree/src/precomp.hpp
+++ b/modules/nonfree/src/precomp.hpp
@@ -60,11 +60,6 @@
 #  include "opencv2/cudaarithm.hpp"
 #endif
 
-#ifdef HAVE_OPENCV_OCL
-#  include "opencv2/nonfree/ocl.hpp"
-#  include "opencv2/ocl/private/util.hpp"
-#endif
-
 #include "opencv2/core/private.hpp"
 
 #endif
diff --git a/modules/nonfree/src/sift.cpp b/modules/nonfree/src/sift.cpp
index 4a36c2d9f..259e934ed 100644
--- a/modules/nonfree/src/sift.cpp
+++ b/modules/nonfree/src/sift.cpp
@@ -818,12 +818,12 @@ void SIFT::operator()(InputArray _image, InputArray _mask,
     }
 }
 
-void SIFT::detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask) const
+void SIFT::detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask) const
 {
-    (*this)(image, mask, keypoints, noArray());
+    (*this)(image.getMat(), mask.getMat(), keypoints, noArray());
 }
 
-void SIFT::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors) const
+void SIFT::computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors) const
 {
     (*this)(image, Mat(), keypoints, descriptors, true);
 }
diff --git a/modules/nonfree/src/surf.cpp b/modules/nonfree/src/surf.cpp
index be8d14d4c..05978e495 100644
--- a/modules/nonfree/src/surf.cpp
+++ b/modules/nonfree/src/surf.cpp
@@ -108,6 +108,7 @@ Modifications by Ian Mahon
 
 */
 #include "precomp.hpp"
+#include "surf.hpp"
 
 namespace cv
 {
@@ -897,11 +898,42 @@ void SURF::operator()(InputArray _img, InputArray _mask,
                       OutputArray _descriptors,
                       bool useProvidedKeypoints) const
 {
-    Mat img = _img.getMat(), mask = _mask.getMat(), mask1, sum, msum;
+    int imgtype = _img.type(), imgcn = CV_MAT_CN(imgtype);
     bool doDescriptors = _descriptors.needed();
 
-    CV_Assert(!img.empty() && img.depth() == CV_8U);
-    if( img.channels() > 1 )
+    CV_Assert(!_img.empty() && CV_MAT_DEPTH(imgtype) == CV_8U && (imgcn == 1 || imgcn == 3 || imgcn == 4));
+    CV_Assert(_descriptors.needed() || !useProvidedKeypoints);
+
+    if( ocl::useOpenCL() )
+    {
+        SURF_OCL ocl_surf;
+        UMat gpu_kpt;
+        bool ok = ocl_surf.init(this);
+
+        if( ok )
+        {
+            if( !_descriptors.needed() )
+            {
+                ok = ocl_surf.detect(_img, _mask, gpu_kpt);
+            }
+            else
+            {
+                if(useProvidedKeypoints)
+                    ocl_surf.uploadKeypoints(keypoints, gpu_kpt);
+                ok = ocl_surf.detectAndCompute(_img, _mask, gpu_kpt, _descriptors, useProvidedKeypoints);
+            }
+        }
+        if( ok )
+        {
+            if(!useProvidedKeypoints)
+                ocl_surf.downloadKeypoints(gpu_kpt, keypoints);
+            return;
+        }
+    }
+
+    Mat img = _img.getMat(), mask = _mask.getMat(), mask1, sum, msum;
+
+    if( imgcn > 1 )
         cvtColor(img, img, COLOR_BGR2GRAY);
 
     CV_Assert(mask.empty() || (mask.type() == CV_8U && mask.size() == img.size()));
@@ -979,12 +1011,12 @@ void SURF::operator()(InputArray _img, InputArray _mask,
 }
 
 
-void SURF::detectImpl( const Mat& image, std::vector<KeyPoint>& keypoints, const Mat& mask) const
+void SURF::detectImpl( InputArray image, std::vector<KeyPoint>& keypoints, InputArray mask) const
 {
-    (*this)(image, mask, keypoints, noArray(), false);
+    (*this)(image.getMat(), mask.getMat(), keypoints, noArray(), false);
 }
 
-void SURF::computeImpl( const Mat& image, std::vector<KeyPoint>& keypoints, Mat& descriptors) const
+void SURF::computeImpl( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray descriptors) const
 {
     (*this)(image, Mat(), keypoints, descriptors, true);
 }
diff --git a/modules/nonfree/src/surf.hpp b/modules/nonfree/src/surf.hpp
new file mode 100644
index 000000000..ee56fb66c
--- /dev/null
+++ b/modules/nonfree/src/surf.hpp
@@ -0,0 +1,118 @@
+///////////// see LICENSE.txt in the OpenCV root directory //////////////
+
+#ifndef __OPENCV_NONFREE_SURF_HPP__
+#define __OPENCV_NONFREE_SURF_HPP__
+
+namespace cv
+{
+//! Speeded up robust features, port from CUDA module.
+////////////////////////////////// SURF //////////////////////////////////////////
+
+class SURF_OCL
+{
+public:
+    enum KeypointLayout
+    {
+        X_ROW = 0,
+        Y_ROW,
+        LAPLACIAN_ROW,
+        OCTAVE_ROW,
+        SIZE_ROW,
+        ANGLE_ROW,
+        HESSIAN_ROW,
+        ROWS_COUNT
+    };
+
+    //! the full constructor taking all the necessary parameters
+    SURF_OCL();
+
+    bool init(const SURF* params);
+
+    //! returns the descriptor size in float's (64 or 128)
+    int descriptorSize() const { return params->extended ? 128 : 64; }
+
+    void uploadKeypoints(const std::vector<KeyPoint> &keypoints, UMat &keypointsGPU);
+    void downloadKeypoints(const UMat &keypointsGPU, std::vector<KeyPoint> &keypoints);
+
+    //! finds the keypoints using fast hessian detector used in SURF
+    //! supports CV_8UC1 images
+    //! keypoints will have nFeature cols and 6 rows
+    //! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
+    //! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
+    //! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
+    //! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
+    //! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
+    //! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
+    //! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
+    bool detect(InputArray img, InputArray mask, UMat& keypoints);
+    //! finds the keypoints and computes their descriptors.
+    //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
+    bool detectAndCompute(InputArray img, InputArray mask, UMat& keypoints,
+                          OutputArray descriptors, bool useProvidedKeypoints = false);
+
+protected:
+    bool setImage(InputArray img, InputArray mask);
+
+    // kernel callers declarations
+    bool calcLayerDetAndTrace(int octave, int layer_rows);
+
+    bool findMaximaInLayer(int counterOffset, int octave, int layer_rows, int layer_cols);
+
+    bool interpolateKeypoint(int maxCounter, UMat &keypoints, int octave, int layer_rows, int maxFeatures);
+
+    bool calcOrientation(UMat &keypoints);
+
+    bool setUpRight(UMat &keypoints);
+
+    bool computeDescriptors(const UMat &keypoints, OutputArray descriptors);
+
+    bool detectKeypoints(UMat &keypoints);
+
+    const SURF* params;
+
+    //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
+    UMat sum, intBuffer;
+    UMat det, trace;
+    UMat maxPosBuffer;
+
+    int img_cols, img_rows;
+
+    int maxCandidates;
+    int maxFeatures;
+
+    UMat img, counters;
+
+    // texture buffers
+    ocl::Image2D imgTex, sumTex;
+    bool haveImageSupport;
+    String kerOpts;
+
+    int status;
+};
+
+/*
+template<typename _Tp> void copyVectorToUMat(const std::vector<_Tp>& v, UMat& um)
+{
+    if(v.empty())
+        um.release();
+    else
+        Mat(1, (int)(v.size()*sizeof(v[0])), CV_8U, (void*)&v[0]).copyTo(um);
+}
+
+template<typename _Tp> void copyUMatToVector(const UMat& um, std::vector<_Tp>& v)
+{
+    if(um.empty())
+        v.clear();
+    else
+    {
+        size_t sz = um.total()*um.elemSize();
+        CV_Assert(um.isContinuous() && (sz % sizeof(_Tp) == 0));
+        v.resize(sz/sizeof(_Tp));
+        Mat m(um.size(), um.type(), &v[0]);
+        um.copyTo(m);
+    }
+}*/
+
+}
+
+#endif
diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp
index 5ade5e517..e6fa7d444 100644
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@@ -43,34 +43,16 @@
 //
 //M*/
 #include "precomp.hpp"
+#include "surf.hpp"
 
-#ifdef HAVE_OPENCV_OCL
 #include <cstdio>
+#include <sstream>
 #include "opencl_kernels.hpp"
 
-using namespace cv;
-using namespace cv::ocl;
-
-static ProgramEntry surfprog = cv::ocl::nonfree::surf;
-
 namespace cv
 {
-    namespace ocl
-    {
-        static void openCLExecuteKernelSURF(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
-            size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
-        {
-            char optBuf [100] = {0};
-            char * optBufPtr = optBuf;
-            cl_kernel kernel;
-            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optBufPtr);
-            size_t wave_size = queryWaveFrontSize(kernel);
-            CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS);
-            sprintf(optBufPtr, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
-            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optBufPtr);
-        }
-    }
-}
+
+enum { ORI_SEARCH_INC=5, ORI_LOCAL_SIZE=(360 / ORI_SEARCH_INC) };
 
 static inline int calcSize(int octave, int layer)
 {
@@ -88,223 +70,208 @@ static inline int calcSize(int octave, int layer)
 }
 
 
-class SURF_OCL_Invoker
+SURF_OCL::SURF_OCL()
 {
-public:
-    // facilities
-    void bindImgTex(const oclMat &img, cl_mem &texture);
-
-    //void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
-    //void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
-
-    // kernel callers declarations
-    void icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int layer_rows);
-
-    void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
-                                  int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols);
-
-    void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
-                                    oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures);
-
-    void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);
-
-    void icvSetUpright_gpu(const oclMat &keypoints, int nFeatures);
-
-    void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
-    // end of kernel callers declarations
-
-    SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
-        surf_(surf),
-        img_cols(img.cols), img_rows(img.rows),
-        use_mask(!mask.empty()), counters(oclMat()),
-        imgTex(NULL), sumTex(NULL), maskSumTex(NULL), _img(img)
-    {
-        CV_Assert(!img.empty() && img.type() == CV_8UC1);
-        CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
-        CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
-
-        const int min_size = calcSize(surf_.nOctaves - 1, 0);
-        CV_Assert(img_rows - min_size >= 0);
-        CV_Assert(img_cols - min_size >= 0);
-
-        const int layer_rows = img_rows >> (surf_.nOctaves - 1);
-        const int layer_cols = img_cols >> (surf_.nOctaves - 1);
-        const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
-        CV_Assert(layer_rows - 2 * min_margin > 0);
-        CV_Assert(layer_cols - 2 * min_margin > 0);
-
-        maxFeatures   = std::min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
-        maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535);
-
-        CV_Assert(maxFeatures > 0);
-
-        counters.create(1, surf_.nOctaves + 1, CV_32SC1);
-        counters.setTo(Scalar::all(0));
-
-        integral(img, surf_.sum);
-
-        bindImgTex(img, imgTex);
-        bindImgTex(surf_.sum, sumTex);
-        finish();
-
-        maskSumTex = 0;
-
-        if (use_mask)
-        {
-            CV_Error(Error::StsBadFunc, "Masked SURF detector is not implemented yet");
-            //!FIXME
-            // temp fix for missing min overload
-            //oclMat temp(mask.size(), mask.type());
-            //temp.setTo(Scalar::all(1.0));
-            ////cv::ocl::min(mask, temp, surf_.mask1);           ///////// disable this
-            //integral(surf_.mask1, surf_.maskSum);
-            //bindImgTex(surf_.maskSum, maskSumTex);
-        }
-    }
-
-    void detectKeypoints(oclMat &keypoints)
-    {
-        // create image pyramid buffers
-        // different layers have same sized buffers, but they are sampled from Gaussian kernel.
-        ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
-        ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
-
-        ensureSizeIsEnough(1, maxCandidates, CV_32SC4, surf_.maxPosBuffer);
-        ensureSizeIsEnough(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32FC1, keypoints);
-        keypoints.setTo(Scalar::all(0));
-
-        for (int octave = 0; octave < surf_.nOctaves; ++octave)
-        {
-            const int layer_rows = img_rows >> octave;
-            const int layer_cols = img_cols >> octave;
-
-            //loadOctaveConstants(octave, layer_rows, layer_cols);
-
-            icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, octave, surf_.nOctaveLayers, layer_rows);
-
-            icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave,
-                                     octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols);
-
-            int maxCounter = ((Mat)counters).at<int>(1 + octave);
-            maxCounter = std::min(maxCounter, static_cast<int>(maxCandidates));
-
-            if (maxCounter > 0)
-            {
-                icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer, maxCounter,
-                                           keypoints, counters, octave, layer_rows, maxFeatures);
-            }
-        }
-        int featureCounter = Mat(counters).at<int>(0);
-        featureCounter = std::min(featureCounter, static_cast<int>(maxFeatures));
-
-        keypoints.cols = featureCounter;
-
-        if (surf_.upright)
-        {
-            //keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0));
-            setUpright(keypoints);
-        }
-        else
-        {
-            findOrientation(keypoints);
-        }
-    }
-
-    void setUpright(oclMat &keypoints)
-    {
-        const int nFeatures = keypoints.cols;
-        if(nFeatures > 0)
-        {
-            icvSetUpright_gpu(keypoints, keypoints.cols);
-        }
-    }
-
-    void findOrientation(oclMat &keypoints)
-    {
-        const int nFeatures = keypoints.cols;
-        if (nFeatures > 0)
-        {
-            icvCalcOrientation_gpu(keypoints, nFeatures);
-        }
-    }
-
-    void computeDescriptors(const oclMat &keypoints, oclMat &descriptors, int descriptorSize)
-    {
-        const int nFeatures = keypoints.cols;
-        if (nFeatures > 0)
-        {
-            ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
-            compute_descriptors_gpu(descriptors, keypoints, nFeatures);
-        }
-    }
-
-    ~SURF_OCL_Invoker()
-    {
-        if(imgTex)
-            openCLFree(imgTex);
-        if(sumTex)
-            openCLFree(sumTex);
-        if(maskSumTex)
-            openCLFree(maskSumTex);
-    }
-
-private:
-    SURF_OCL &surf_;
-
-    int img_cols, img_rows;
-
-    bool use_mask;
-
-    int maxCandidates;
-    int maxFeatures;
-
-    oclMat counters;
-
-    // texture buffers
-    cl_mem imgTex;
-    cl_mem sumTex;
-    cl_mem maskSumTex;
-
-    const oclMat _img; // make a copy for non-image2d_t supported platform
-
-    SURF_OCL_Invoker &operator= (const SURF_OCL_Invoker &right)
-    {
-        (*this) = right;
-        return *this;
-    } // remove warning C4512
-};
-
-cv::ocl::SURF_OCL::SURF_OCL()
-{
-    hessianThreshold = 100.0f;
-    extended = true;
-    nOctaves = 4;
-    nOctaveLayers = 2;
-    keypointsRatio = 0.01f;
-    upright = false;
+    img_cols = img_rows = maxCandidates = maxFeatures = 0;
+    haveImageSupport = false;
+    status = -1;
 }
 
-cv::ocl::SURF_OCL::SURF_OCL(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright)
+bool SURF_OCL::init(const SURF* p)
 {
-    hessianThreshold = saturate_cast<float>(_threshold);
-    extended = _extended;
-    nOctaves = _nOctaves;
-    nOctaveLayers = _nOctaveLayers;
-    keypointsRatio = _keypointsRatio;
-    upright = _upright;
+    params = p;
+    if(status < 0)
+    {
+        status = 0;
+        if(ocl::haveOpenCL())
+        {
+            const ocl::Device& dev = ocl::Device::getDefault();
+            if( dev.type() == ocl::Device::TYPE_CPU || dev.doubleFPConfig() == 0 )
+                return false;
+            haveImageSupport = false;//dev.imageSupport();
+            kerOpts = haveImageSupport ? "-D HAVE_IMAGE2D -D DOUBLE_SUPPORT" : "";
+            status = 1;
+        }
+    }
+    return status > 0;
 }
 
-int cv::ocl::SURF_OCL::descriptorSize() const
+
+bool SURF_OCL::setImage(InputArray _img, InputArray _mask)
 {
-    return extended ? 128 : 64;
+    if( status <= 0 )
+        return false;
+    if( !_mask.empty())
+        return false;
+    int imgtype = _img.type();
+    CV_Assert(!_img.empty());
+    CV_Assert(params && params->nOctaves > 0 && params->nOctaveLayers > 0);
+
+    int min_size = calcSize(params->nOctaves - 1, 0);
+    Size sz = _img.size();
+    img_cols = sz.width;
+    img_rows = sz.height;
+    CV_Assert(img_rows >= min_size && img_cols >= min_size);
+
+    const int layer_rows = img_rows >> (params->nOctaves - 1);
+    const int layer_cols = img_cols >> (params->nOctaves - 1);
+    const int min_margin = ((calcSize((params->nOctaves - 1), 2) >> 1) >> (params->nOctaves - 1)) + 1;
+    CV_Assert(layer_rows - 2 * min_margin > 0);
+    CV_Assert(layer_cols - 2 * min_margin > 0);
+
+    maxFeatures   = std::min(static_cast<int>(img_cols*img_rows * 0.01f), 65535);
+    maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535);
+
+    CV_Assert(maxFeatures > 0);
+
+    counters.create(1, params->nOctaves + 1, CV_32SC1);
+    counters.setTo(Scalar::all(0));
+
+    img.release();
+    if(_img.isUMat() && imgtype == CV_8UC1)
+        img = _img.getUMat();
+    else if( imgtype == CV_8UC1 )
+        _img.copyTo(img);
+    else
+        cvtColor(_img, img, COLOR_BGR2GRAY);
+
+    integral(img, sum);
+
+    if(haveImageSupport)
+    {
+        imgTex = ocl::Image2D(img);
+        sumTex = ocl::Image2D(sum);
+    }
+
+    return true;
 }
 
-int cv::ocl::SURF_OCL::defaultNorm() const
+
+bool SURF_OCL::detectKeypoints(UMat &keypoints)
 {
-    return NORM_L2;
+    // create image pyramid buffers
+    // different layers have same sized buffers, but they are sampled from Gaussian kernel.
+    det.create(img_rows * (params->nOctaveLayers + 2), img_cols, CV_32F);
+    trace.create(img_rows * (params->nOctaveLayers + 2), img_cols, CV_32FC1);
+
+    maxPosBuffer.create(1, maxCandidates, CV_32SC4);
+    keypoints.create(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32F);
+    keypoints.setTo(Scalar::all(0));
+    Mat cpuCounters;
+
+    for (int octave = 0; octave < params->nOctaves; ++octave)
+    {
+        const int layer_rows = img_rows >> octave;
+        const int layer_cols = img_cols >> octave;
+
+        if(!calcLayerDetAndTrace(octave, layer_rows))
+            return false;
+
+        if(!findMaximaInLayer(1 + octave, octave, layer_rows, layer_cols))
+            return false;
+
+        cpuCounters = counters.getMat(ACCESS_READ);
+        int maxCounter = cpuCounters.at<int>(1 + octave);
+        maxCounter = std::min(maxCounter, maxCandidates);
+        cpuCounters.release();
+
+        if (maxCounter > 0)
+        {
+            if(!interpolateKeypoint(maxCounter, keypoints, octave, layer_rows, maxFeatures))
+                return false;
+        }
+    }
+
+    cpuCounters = counters.getMat(ACCESS_READ);
+    int featureCounter = cpuCounters.at<int>(0);
+    featureCounter = std::min(featureCounter, maxFeatures);
+    cpuCounters.release();
+
+    keypoints = UMat(keypoints, Rect(0, 0, featureCounter, keypoints.rows));
+
+    if (params->upright)
+        return setUpRight(keypoints);
+    else
+        return calcOrientation(keypoints);
 }
 
-void cv::ocl::SURF_OCL::uploadKeypoints(const std::vector<KeyPoint> &keypoints, oclMat &keypointsGPU)
+
+bool SURF_OCL::setUpRight(UMat &keypoints)
+{
+    int nFeatures = keypoints.cols;
+    if( nFeatures == 0 )
+        return true;
+
+    size_t globalThreads[3] = {nFeatures, 1};
+    ocl::Kernel kerUpRight("SURF_setUpRight", ocl::nonfree::surf_oclsrc, kerOpts);
+    return kerUpRight.args(ocl::KernelArg::ReadWrite(keypoints)).run(2, globalThreads, 0, true);
+}
+
+bool SURF_OCL::computeDescriptors(const UMat &keypoints, OutputArray _descriptors)
+{
+    int dsize = params->descriptorSize();
+    int nFeatures = keypoints.cols;
+    if (nFeatures == 0)
+    {
+        _descriptors.release();
+        return true;
+    }
+    _descriptors.create(nFeatures, dsize, CV_32F);
+    UMat descriptors;
+    if( _descriptors.isUMat() )
+        descriptors = _descriptors.getUMat();
+    else
+        descriptors.create(nFeatures, dsize, CV_32F);
+
+    ocl::Kernel kerCalcDesc, kerNormDesc;
+
+    if( dsize == 64 )
+    {
+        kerCalcDesc.create("SURF_computeDescriptors64", ocl::nonfree::surf_oclsrc, kerOpts);
+        kerNormDesc.create("SURF_normalizeDescriptors64", ocl::nonfree::surf_oclsrc, kerOpts);
+    }
+    else
+    {
+        CV_Assert(dsize == 128);
+        kerCalcDesc.create("SURF_computeDescriptors128", ocl::nonfree::surf_oclsrc, kerOpts);
+        kerNormDesc.create("SURF_normalizeDescriptors128", ocl::nonfree::surf_oclsrc, kerOpts);
+    }
+
+    size_t localThreads[] = {6, 6};
+    size_t globalThreads[] = {nFeatures*localThreads[0], localThreads[1]};
+
+    if(haveImageSupport)
+    {
+        kerCalcDesc.args(imgTex,
+                         img_rows, img_cols,
+                         ocl::KernelArg::ReadOnlyNoSize(keypoints),
+                         ocl::KernelArg::WriteOnlyNoSize(descriptors));
+    }
+    else
+    {
+        kerCalcDesc.args(ocl::KernelArg::ReadOnlyNoSize(img),
+                         img_rows, img_cols,
+                         ocl::KernelArg::ReadOnlyNoSize(keypoints),
+                         ocl::KernelArg::WriteOnlyNoSize(descriptors));
+    }
+
+    if(!kerCalcDesc.run(2, globalThreads, localThreads, true))
+        return false;
+
+    size_t localThreads_n[] = {dsize, 1};
+    size_t globalThreads_n[] = {nFeatures*localThreads_n[0], localThreads_n[1]};
+
+    globalThreads[0] = nFeatures * localThreads[0];
+    globalThreads[1] = localThreads[1];
+    bool ok = kerNormDesc.args(ocl::KernelArg::ReadWriteNoSize(descriptors)).
+                        run(2, globalThreads_n, localThreads_n, true);
+    if(ok && !_descriptors.isUMat())
+        descriptors.copyTo(_descriptors);
+    return ok;
+}
+
+
+void SURF_OCL::uploadKeypoints(const std::vector<KeyPoint> &keypoints, UMat &keypointsGPU)
 {
     if (keypoints.empty())
         keypointsGPU.release();
@@ -332,11 +299,11 @@ void cv::ocl::SURF_OCL::uploadKeypoints(const std::vector<KeyPoint> &keypoints,
             kp_laplacian[i] = 1;
         }
 
-        keypointsGPU.upload(keypointsCPU);
+        keypointsCPU.copyTo(keypointsGPU);
     }
 }
 
-void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, std::vector<KeyPoint> &keypoints)
+void SURF_OCL::downloadKeypoints(const UMat &keypointsGPU, std::vector<KeyPoint> &keypoints)
 {
     const int nFeatures = keypointsGPU.cols;
 
@@ -346,8 +313,7 @@ void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, std::vecto
     {
         CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT);
 
-        Mat keypointsCPU(keypointsGPU);
-
+        Mat keypointsCPU = keypointsGPU.getMat(ACCESS_READ);
         keypoints.resize(nFeatures);
 
         float *kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW);
@@ -372,354 +338,122 @@ void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat &keypointsGPU, std::vecto
     }
 }
 
-void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat &descriptorsGPU, std::vector<float> &descriptors)
+bool SURF_OCL::detect(InputArray _img, InputArray _mask, UMat& keypoints)
 {
-    if (descriptorsGPU.empty())
-        descriptors.clear();
-    else
-    {
-        CV_Assert(descriptorsGPU.type() == CV_32F);
+    if( !setImage(_img, _mask) )
+        return false;
 
-        descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
-        Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
-        descriptorsGPU.download(descriptorsCPU);
-    }
-}
-
-void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints)
-{
-    if (!img.empty())
-    {
-        SURF_OCL_Invoker surf(*this, img, mask);
-
-        surf.detectKeypoints(keypoints);
-    }
-}
-
-void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
-                                   bool useProvidedKeypoints)
-{
-    if (!img.empty())
-    {
-        SURF_OCL_Invoker surf(*this, img, mask);
-
-        if (!useProvidedKeypoints)
-            surf.detectKeypoints(keypoints);
-        else if (!upright)
-        {
-            surf.findOrientation(keypoints);
-        }
-
-        surf.computeDescriptors(keypoints, descriptors, descriptorSize());
-    }
-}
-
-void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints)
-{
-    oclMat keypointsGPU;
-
-    (*this)(img, mask, keypointsGPU);
-
-    downloadKeypoints(keypointsGPU, keypoints);
-}
-
-void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints,
-                                   oclMat &descriptors, bool useProvidedKeypoints)
-{
-    oclMat keypointsGPU;
-
-    if (useProvidedKeypoints)
-        uploadKeypoints(keypoints, keypointsGPU);
-
-    (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);
-
-    downloadKeypoints(keypointsGPU, keypoints);
-}
-
-void cv::ocl::SURF_OCL::operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints,
-                                   std::vector<float> &descriptors, bool useProvidedKeypoints)
-{
-    oclMat descriptorsGPU;
-
-    (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);
-
-    downloadDescriptors(descriptorsGPU, descriptors);
-}
-
-void cv::ocl::SURF_OCL::releaseMemory()
-{
-    sum.release();
-    mask1.release();
-    maskSum.release();
-    intBuffer.release();
-    det.release();
-    trace.release();
-    maxPosBuffer.release();
+    return detectKeypoints(keypoints);
 }
 
 
-// bind source buffer to image oject.
-void SURF_OCL_Invoker::bindImgTex(const oclMat &img, cl_mem &texture)
+bool SURF_OCL::detectAndCompute(InputArray _img, InputArray _mask, UMat& keypoints,
+                                OutputArray _descriptors, bool useProvidedKeypoints )
 {
-    if(texture)
-    {
-        openCLFree(texture);
-    }
-    texture = bindTexture(img);
+    if( !setImage(_img, _mask) )
+        return false;
+
+    if( !useProvidedKeypoints && !detectKeypoints(keypoints) )
+        return false;
+
+    return computeDescriptors(keypoints, _descriptors);
 }
 
+inline int divUp(int a, int b) { return (a + b-1)/b; }
+
 ////////////////////////////
 // kernel caller definitions
-void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, int octave, int nOctaveLayers, int c_layer_rows)
+bool SURF_OCL::calcLayerDetAndTrace(int octave, int c_layer_rows)
 {
+    int nOctaveLayers = params->nOctaveLayers;
     const int min_size = calcSize(octave, 0);
     const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
     const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
 
-    Context *clCxt = det.clCxt;
-    String kernelName = "icvCalcLayerDetAndTrace";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    if(sumTex)
+    size_t localThreads[]  = {16, 16};
+    size_t globalThreads[] =
     {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
+        divUp(max_samples_j, (int)localThreads[0]) * localThreads[0],
+        divUp(max_samples_i, (int)localThreads[1]) * localThreads[1] * (nOctaveLayers + 2)
+    };
+    ocl::Kernel kerCalcDetTrace("SURF_calcLayerDetAndTrace", ocl::nonfree::surf_oclsrc, kerOpts);
+    if(haveImageSupport)
+    {
+        kerCalcDetTrace.args(sumTex,
+                             img_rows, img_cols, nOctaveLayers,
+                             octave, c_layer_rows,
+                             ocl::KernelArg::WriteOnlyNoSize(det),
+                             ocl::KernelArg::WriteOnlyNoSize(trace));
     }
     else
     {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
+        kerCalcDetTrace.args(ocl::KernelArg::ReadOnlyNoSize(sum),
+                             img_rows, img_cols, nOctaveLayers,
+                             octave, c_layer_rows,
+                             ocl::KernelArg::WriteOnlyNoSize(det),
+                             ocl::KernelArg::WriteOnlyNoSize(trace));
     }
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&trace.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&c_layer_rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
-
-    size_t localThreads[3]  = {16, 16, 1};
-    size_t globalThreads[3] =
-    {
-        divUp(max_samples_j, localThreads[0]) *localThreads[0],
-        divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
-        1
-    };
-    openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
+    return kerCalcDetTrace.run(2, globalThreads, localThreads, true);
 }
 
-void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
-        int octave, bool useMask, int nLayers, int layer_rows, int layer_cols)
+bool SURF_OCL::findMaximaInLayer(int counterOffset, int octave,
+                                 int layer_rows, int layer_cols)
 {
     const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
+    int nOctaveLayers = params->nOctaveLayers;
 
-    Context *clCxt = det.clCxt;
-    String kernelName = use_mask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trace.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxCounter.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&counterOffset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&trace.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nLayers));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&maxCandidates));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));
-
-    if(useMask)
+    size_t localThreads[3]  = {16, 16};
+    size_t globalThreads[3] =
     {
-        if(maskSumTex)
-        {
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maskSumTex));
-        }
-        else
-        {
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.data));
-        }
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.maskSum.step));
-    }
-    size_t localThreads[3]  = {16, 16, 1};
-    size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) *localThreads[0],
-                               divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) *nLayers *localThreads[1],
-                               1
-                              };
+        divUp(layer_cols - 2 * min_margin, (int)localThreads[0] - 2) * localThreads[0],
+        divUp(layer_rows - 2 * min_margin, (int)localThreads[1] - 2) * nOctaveLayers * localThreads[1]
+    };
 
-    openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
+    ocl::Kernel kerFindMaxima("SURF_findMaximaInLayer", ocl::nonfree::surf_oclsrc, kerOpts);
+    return kerFindMaxima.args(ocl::KernelArg::ReadOnlyNoSize(det),
+                              ocl::KernelArg::ReadOnlyNoSize(trace),
+                              ocl::KernelArg::PtrReadWrite(maxPosBuffer),
+                              ocl::KernelArg::PtrReadWrite(counters),
+                              counterOffset, img_rows, img_cols,
+                              octave, nOctaveLayers,
+                              layer_rows, layer_cols,
+                              maxCandidates,
+                              (float)params->hessianThreshold).run(2, globalThreads, localThreads, true);
 }
 
-void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
-        oclMat &keypoints, oclMat &counters_, int octave, int layer_rows, int max_features)
+bool SURF_OCL::interpolateKeypoint(int maxCounter, UMat &keypoints, int octave, int layer_rows, int max_features)
 {
-    Context *clCxt = det.clCxt;
-    String kernelName = "icvInterpolateKeypoint";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&det.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counters_.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&det.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&octave));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&layer_rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&max_features));
-
     size_t localThreads[3]  = {3, 3, 3};
-    size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
+    size_t globalThreads[3] = {maxCounter*localThreads[0], localThreads[1], 3};
 
-    openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
+    ocl::Kernel kerInterp("SURF_interpolateKeypoint", ocl::nonfree::surf_oclsrc, kerOpts);
+
+    return kerInterp.args(ocl::KernelArg::ReadOnlyNoSize(det),
+                   ocl::KernelArg::PtrReadOnly(maxPosBuffer),
+                   ocl::KernelArg::ReadWriteNoSize(keypoints),
+                   ocl::KernelArg::PtrReadWrite(counters),
+                   img_rows, img_cols, octave, layer_rows, max_features).
+        run(3, globalThreads, localThreads, true);
 }
 
-void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
+bool SURF_OCL::calcOrientation(UMat &keypoints)
 {
-    Context *clCxt = counters.clCxt;
-    String kernelName = "icvCalcOrientation";
+    int nFeatures = keypoints.cols;
+    if( nFeatures == 0 )
+        return true;
+    ocl::Kernel kerOri("SURF_calcOrientation", ocl::nonfree::surf_oclsrc, kerOpts);
 
-    std::vector< std::pair<size_t, const void *> > args;
-
-    if(sumTex)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sumTex));
-    }
+    if( haveImageSupport )
+        kerOri.args(sumTex, img_rows, img_cols,
+                    ocl::KernelArg::ReadWriteNoSize(keypoints));
     else
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&surf_.sum.data)); // if image2d is not supported
-    }
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
+        kerOri.args(ocl::KernelArg::ReadOnlyNoSize(sum),
+                    img_rows, img_cols,
+                    ocl::KernelArg::ReadWriteNoSize(keypoints));
 
-    size_t localThreads[3]  = {32, 4, 1};
-    size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
-
-    openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
+    size_t localThreads[3]  = {ORI_LOCAL_SIZE, 1};
+    size_t globalThreads[3] = {nFeatures * localThreads[0], 1};
+    return kerOri.run(2, globalThreads, localThreads, true);
 }
 
-void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
-{
-    Context *clCxt = counters.clCxt;
-    String kernelName = "icvSetUpright";
-
-    std::vector< std::pair<size_t, const void *> > args;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nFeatures));
-
-    size_t localThreads[3]  = {256, 1, 1};
-    size_t globalThreads[3] = {saturate_cast<size_t>(nFeatures), 1, 1};
-
-    openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
 }
-
-
-void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
-{
-    // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
-    Context *clCxt = descriptors.clCxt;
-    String kernelName;
-    std::vector< std::pair<size_t, const void *> > args;
-    size_t localThreads[3]  = {1, 1, 1};
-    size_t globalThreads[3] = {1, 1, 1};
-
-    if(descriptors.cols == 64)
-    {
-        kernelName = "compute_descriptors64";
-
-        localThreads[0] = 6;
-        localThreads[1] = 6;
-
-        globalThreads[0] = nFeatures * localThreads[0];
-        globalThreads[1] = 16 * localThreads[1];
-
-        args.clear();
-        if(imgTex)
-        {
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
-        }
-        else
-        {
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
-        }
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
-
-        openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
-
-        kernelName = "normalize_descriptors64";
-
-        localThreads[0] = 64;
-        localThreads[1] = 1;
-
-        globalThreads[0] = nFeatures * localThreads[0];
-        globalThreads[1] = localThreads[1];
-
-        args.clear();
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
-
-        openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
-    }
-    else
-    {
-        kernelName = "compute_descriptors128";
-
-        localThreads[0] = 6;
-        localThreads[1] = 6;
-
-        globalThreads[0] = nFeatures * localThreads[0];
-        globalThreads[1] = 16 * localThreads[1];
-
-        args.clear();
-        if(imgTex)
-        {
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&imgTex));
-        }
-        else
-        {
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_img.data));
-        }
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
-
-        openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
-
-        kernelName = "normalize_descriptors128";
-
-        localThreads[0] = 128;
-        localThreads[1] = 1;
-
-        globalThreads[0] = nFeatures * localThreads[0];
-        globalThreads[1] = localThreads[1];
-
-        args.clear();
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
-
-        openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
-    }
-}
-
-#endif //HAVE_OPENCV_OCL
diff --git a/modules/objdetect/doc/cascade_classification.rst b/modules/objdetect/doc/cascade_classification.rst
index b10887b35..11c990673 100644
--- a/modules/objdetect/doc/cascade_classification.rst
+++ b/modules/objdetect/doc/cascade_classification.rst
@@ -32,112 +32,6 @@ The following reference is for the detection part only. There is a separate appl
 .. [Lienhart02] Rainer Lienhart and Jochen Maydt. An Extended Set of Haar-like Features for Rapid Object Detection. IEEE ICIP 2002, Vol. 1, pp. 900-903, Sep. 2002. This paper, as well as the extended technical report, can be retrieved at http://www.multimedia-computing.de/mediawiki//images/5/52/MRL-TR-May02-revised-Dec02.pdf
 
 
-FeatureEvaluator
-----------------
-.. ocv:class:: FeatureEvaluator
-
-Base class for computing feature values in cascade classifiers. ::
-
-    class CV_EXPORTS FeatureEvaluator
-    {
-    public:
-        enum { HAAR = 0, LBP = 1 }; // supported feature types
-        virtual ~FeatureEvaluator(); // destructor
-        virtual bool read(const FileNode& node);
-        virtual Ptr<FeatureEvaluator> clone() const;
-        virtual int getFeatureType() const;
-
-        virtual bool setImage(const Mat& img, Size origWinSize);
-        virtual bool setWindow(Point p);
-
-        virtual double calcOrd(int featureIdx) const;
-        virtual int calcCat(int featureIdx) const;
-
-        static Ptr<FeatureEvaluator> create(int type);
-    };
-
-
-FeatureEvaluator::read
---------------------------
-Reads parameters of features from the ``FileStorage`` node.
-
-.. ocv:function:: bool FeatureEvaluator::read(const FileNode& node)
-
-    :param node: File node from which the feature parameters are read.
-
-
-
-FeatureEvaluator::clone
----------------------------
-Returns a full copy of the feature evaluator.
-
-.. ocv:function:: Ptr<FeatureEvaluator> FeatureEvaluator::clone() const
-
-
-
-FeatureEvaluator::getFeatureType
-------------------------------------
-Returns the feature type (``HAAR`` or ``LBP`` for now).
-
-.. ocv:function:: int FeatureEvaluator::getFeatureType() const
-
-
-FeatureEvaluator::setImage
-------------------------------
-Assigns an image to feature evaluator.
-
-.. ocv:function:: bool FeatureEvaluator::setImage(InputArray img, Size origWinSize, Size sumSize)
-
-    :param img: Matrix of the type ``CV_8UC1`` containing an image where the features are computed.
-
-    :param origWinSize: Size of training images.
-
-    :param sumSize: The requested size of integral images (so if the integral image is smaller, it resides in the top-left corner of the larger image of requested size). Because the features are represented using offsets from the image origin, using the same sumSize for all scales helps to avoid constant readjustments of the features to different scales.
-
-The method assigns an image, where the features will be computed, to the feature evaluator.
-
-
-
-FeatureEvaluator::setWindow
--------------------------------
-Assigns a window in the current image where the features will be computed.
-
-.. ocv:function:: bool FeatureEvaluator::setWindow(Point p)
-
-    :param p: Upper left point of the window where the features are computed. Size of the window is equal to the size of training images.
-
-FeatureEvaluator::calcOrd
------------------------------
-Computes the value of an ordered (numerical) feature.
-
-.. ocv:function:: double FeatureEvaluator::calcOrd(int featureIdx) const
-
-    :param featureIdx: Index of the feature whose value is computed.
-
-The function returns the computed value of an ordered feature.
-
-
-
-FeatureEvaluator::calcCat
------------------------------
-Computes the value of a categorical feature.
-
-.. ocv:function:: int FeatureEvaluator::calcCat(int featureIdx) const
-
-    :param featureIdx: Index of the feature whose value is computed.
-
-The function returns the computed label of a categorical feature, which is the value from [0,... (number of categories - 1)].
-
-
-FeatureEvaluator::create
-----------------------------
-Constructs the feature evaluator.
-
-.. ocv:function:: Ptr<FeatureEvaluator> FeatureEvaluator::create(int type)
-
-    :param type: Type of features evaluated by cascade (``HAAR`` or ``LBP`` for now).
-
-
 CascadeClassifier
 -----------------
 .. ocv:class:: CascadeClassifier
diff --git a/modules/objdetect/doc/erfilter.rst b/modules/objdetect/doc/erfilter.rst
index a8976fbcd..85d6bcc7f 100644
--- a/modules/objdetect/doc/erfilter.rst
+++ b/modules/objdetect/doc/erfilter.rst
@@ -46,7 +46,7 @@ An ER is a 4-connected set of pixels with all its grey-level values smaller than
         //! Constructor
         explicit ERStat(int level = 256, int pixel = 0, int x = 0, int y = 0);
         //! Destructor
-        ~ERStat(){};
+        ~ERStat() { }
 
         //! seed point and threshold (max grey-level value)
         int pixel;
@@ -105,7 +105,7 @@ Base class for 1st and 2nd stages of Neumann and Matas scene text detection algo
         class CV_EXPORTS Callback
         {
         public:
-            virtual ~Callback(){};
+            virtual ~Callback() { }
             //! The classifier must return probability measure for the region.
             virtual double eval(const ERStat& stat) = 0;
         };
diff --git a/modules/objdetect/include/opencv2/objdetect.hpp b/modules/objdetect/include/opencv2/objdetect.hpp
index 0d5792124..5f2a62772 100644
--- a/modules/objdetect/include/opencv2/objdetect.hpp
+++ b/modules/objdetect/include/opencv2/objdetect.hpp
@@ -121,29 +121,6 @@ CV_EXPORTS   void groupRectangles_meanshift(std::vector<Rect>& rectList, std::ve
                                             std::vector<double>& foundScales,
                                             double detectThreshold = 0.0, Size winDetSize = Size(64, 128));
 
-class CV_EXPORTS FeatureEvaluator
-{
-public:
-    enum { HAAR = 0,
-           LBP  = 1,
-           HOG  = 2
-         };
-
-    virtual ~FeatureEvaluator();
-
-    virtual bool read(const FileNode& node);
-    virtual Ptr<FeatureEvaluator> clone() const;
-    virtual int getFeatureType() const;
-
-    virtual bool setImage(InputArray img, Size origWinSize, Size sumSize);
-    virtual bool setWindow(Point p);
-
-    virtual double calcOrd(int featureIdx) const;
-    virtual int calcCat(int featureIdx) const;
-
-    static Ptr<FeatureEvaluator> create(int type);
-};
-
 template<> CV_EXPORTS void DefaultDeleter<CvHaarClassifierCascade>::operator ()(CvHaarClassifierCascade* obj) const;
 
 enum { CASCADE_DO_CANNY_PRUNING    = 1,
@@ -190,7 +167,7 @@ public:
     public:
         virtual ~MaskGenerator() {}
         virtual Mat generateMask(const Mat& src)=0;
-        virtual void initializeMask(const Mat& /*src*/) {};
+        virtual void initializeMask(const Mat& /*src*/) { }
     };
     virtual void setMaskGenerator(const Ptr<MaskGenerator>& maskGenerator) = 0;
     virtual Ptr<MaskGenerator> getMaskGenerator() = 0;
@@ -269,7 +246,7 @@ public:
     CV_WRAP HOGDescriptor() : winSize(64,128), blockSize(16,16), blockStride(8,8),
         cellSize(8,8), nbins(9), derivAperture(1), winSigma(-1),
         histogramNormType(HOGDescriptor::L2Hys), L2HysThreshold(0.2), gammaCorrection(true),
-        nlevels(HOGDescriptor::DEFAULT_NLEVELS)
+        free_coef(-1.f), nlevels(HOGDescriptor::DEFAULT_NLEVELS)
     {}
 
     CV_WRAP HOGDescriptor(Size _winSize, Size _blockSize, Size _blockStride,
@@ -280,7 +257,7 @@ public:
     : winSize(_winSize), blockSize(_blockSize), blockStride(_blockStride), cellSize(_cellSize),
     nbins(_nbins), derivAperture(_derivAperture), winSigma(_winSigma),
     histogramNormType(_histogramNormType), L2HysThreshold(_L2HysThreshold),
-    gammaCorrection(_gammaCorrection), nlevels(_nlevels)
+    gammaCorrection(_gammaCorrection), free_coef(-1.f), nlevels(_nlevels)
     {}
 
     CV_WRAP HOGDescriptor(const String& filename)
@@ -308,10 +285,11 @@ public:
     CV_WRAP virtual void save(const String& filename, const String& objname = String()) const;
     virtual void copyTo(HOGDescriptor& c) const;
 
-    CV_WRAP virtual void compute(const Mat& img,
+    CV_WRAP virtual void compute(InputArray img,
                          CV_OUT std::vector<float>& descriptors,
                          Size winStride = Size(), Size padding = Size(),
                          const std::vector<Point>& locations = std::vector<Point>()) const;
+
     //with found weights output
     CV_WRAP virtual void detect(const Mat& img, CV_OUT std::vector<Point>& foundLocations,
                         CV_OUT std::vector<double>& weights,
@@ -323,13 +301,14 @@ public:
                         double hitThreshold = 0, Size winStride = Size(),
                         Size padding = Size(),
                         const std::vector<Point>& searchLocations=std::vector<Point>()) const;
+
     //with result weights output
-    CV_WRAP virtual void detectMultiScale(const Mat& img, CV_OUT std::vector<Rect>& foundLocations,
+    CV_WRAP virtual void detectMultiScale(InputArray img, CV_OUT std::vector<Rect>& foundLocations,
                                   CV_OUT std::vector<double>& foundWeights, double hitThreshold = 0,
                                   Size winStride = Size(), Size padding = Size(), double scale = 1.05,
                                   double finalThreshold = 2.0,bool useMeanshiftGrouping = false) const;
     //without found weights output
-    virtual void detectMultiScale(const Mat& img, CV_OUT std::vector<Rect>& foundLocations,
+    virtual void detectMultiScale(InputArray img, CV_OUT std::vector<Rect>& foundLocations,
                                   double hitThreshold = 0, Size winStride = Size(),
                                   Size padding = Size(), double scale = 1.05,
                                   double finalThreshold = 2.0, bool useMeanshiftGrouping = false) const;
@@ -351,25 +330,27 @@ public:
     CV_PROP double L2HysThreshold;
     CV_PROP bool gammaCorrection;
     CV_PROP std::vector<float> svmDetector;
+    UMat oclSvmDetector;
+    float free_coef;
     CV_PROP int nlevels;
 
 
-   // evaluate specified ROI and return confidence value for each location
-   virtual void detectROI(const cv::Mat& img, const std::vector<cv::Point> &locations,
+    // evaluate specified ROI and return confidence value for each location
+    virtual void detectROI(const cv::Mat& img, const std::vector<cv::Point> &locations,
                                    CV_OUT std::vector<cv::Point>& foundLocations, CV_OUT std::vector<double>& confidences,
                                    double hitThreshold = 0, cv::Size winStride = Size(),
                                    cv::Size padding = Size()) const;
 
-   // evaluate specified ROI and return confidence value for each location in multiple scales
-   virtual void detectMultiScaleROI(const cv::Mat& img,
+    // evaluate specified ROI and return confidence value for each location in multiple scales
+    virtual void detectMultiScaleROI(const cv::Mat& img,
                                                        CV_OUT std::vector<cv::Rect>& foundLocations,
                                                        std::vector<DetectionROI>& locations,
                                                        double hitThreshold = 0,
                                                        int groupThreshold = 0) const;
 
-   // read/parse Dalal's alt model file
-   void readALTModel(String modelfile);
-   void groupRectangles(std::vector<cv::Rect>& rectList, std::vector<double>& weights, int groupThreshold, double eps) const;
+    // read/parse Dalal's alt model file
+    void readALTModel(String modelfile);
+    void groupRectangles(std::vector<cv::Rect>& rectList, std::vector<double>& weights, int groupThreshold, double eps) const;
 };
 
 
diff --git a/modules/objdetect/include/opencv2/objdetect/erfilter.hpp b/modules/objdetect/include/opencv2/objdetect/erfilter.hpp
index 9dc919a41..d7e07d80d 100644
--- a/modules/objdetect/include/opencv2/objdetect/erfilter.hpp
+++ b/modules/objdetect/include/opencv2/objdetect/erfilter.hpp
@@ -67,7 +67,7 @@ public:
     //! Constructor
     explicit ERStat(int level = 256, int pixel = 0, int x = 0, int y = 0);
     //! Destructor
-    ~ERStat(){};
+    ~ERStat() { }
 
     //! seed point and the threshold (max grey-level value)
     int pixel;
@@ -123,7 +123,7 @@ public:
     class CV_EXPORTS Callback
     {
     public:
-        virtual ~Callback(){};
+        virtual ~Callback() { }
         //! The classifier must return probability measure for the region.
         virtual double eval(const ERStat& stat) = 0; //const = 0; //TODO why cannot use const = 0 here?
     };
diff --git a/modules/objdetect/perf/opencl/perf_cascades.cpp b/modules/objdetect/perf/opencl/perf_cascades.cpp
new file mode 100644
index 000000000..b660f5911
--- /dev/null
+++ b/modules/objdetect/perf/opencl/perf_cascades.cpp
@@ -0,0 +1,63 @@
+#include "perf_precomp.hpp"
+#include <opencv2/imgproc.hpp>
+
+#include "opencv2/ts/ocl_perf.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace perf;
+using std::tr1::make_tuple;
+using std::tr1::get;
+
+typedef std::tr1::tuple<std::string, std::string, int> Cascade_Image_MinSize_t;
+typedef perf::TestBaseWithParam<Cascade_Image_MinSize_t> Cascade_Image_MinSize;
+
+#ifdef HAVE_OPENCL
+
+OCL_PERF_TEST_P(Cascade_Image_MinSize, CascadeClassifier,
+                 testing::Combine(
+                    testing::Values( string("cv/cascadeandhog/cascades/haarcascade_frontalface_alt.xml"),
+                                     string("cv/cascadeandhog/cascades/haarcascade_frontalface_alt2.xml"),
+                                     string("cv/cascadeandhog/cascades/haarcascade_frontalface_alt_old.xml"),
+                                     string("cv/cascadeandhog/cascades/haarcascade_frontalface_alt2_old.xml"),
+                                     string("cv/cascadeandhog/cascades/lbpcascade_frontalface.xml") ),
+                    testing::Values( string("cv/shared/lena.png"),
+                                     string("cv/cascadeandhog/images/bttf301.png"),
+                                     string("cv/cascadeandhog/images/class57.png") ),
+                    testing::Values(30, 64, 90) ) )
+{
+    const string cascadePath = get<0>(GetParam());
+    const string imagePath   = get<1>(GetParam());
+    int min_size = get<2>(GetParam());
+    Size minSize(min_size, min_size);
+
+    CascadeClassifier cc( getDataPath(cascadePath) );
+    if (cc.empty())
+        FAIL() << "Can't load cascade file: " << getDataPath(cascadePath);
+
+    Mat img = imread(getDataPath(imagePath), IMREAD_GRAYSCALE);
+    if (img.empty())
+        FAIL() << "Can't load source image: " << getDataPath(imagePath);
+
+    vector<Rect> faces;
+
+    equalizeHist(img, img);
+    declare.in(img).time(60);
+
+    UMat uimg = img.getUMat(ACCESS_READ);
+
+    while(next())
+    {
+        faces.clear();
+        cvtest::ocl::perf::safeFinish();
+
+        startTimer();
+        cc.detectMultiScale(uimg, faces, 1.1, 3, 0, minSize);
+        stopTimer();
+    }
+
+    sort(faces.begin(), faces.end(), comparators::RectLess());
+    SANITY_CHECK(faces, min_size/5);
+}
+
+#endif //HAVE_OPENCL
diff --git a/modules/ocl/perf/perf_hog.cpp b/modules/objdetect/perf/opencl/perf_hogdetect.cpp
similarity index 74%
rename from modules/ocl/perf/perf_hog.cpp
rename to modules/objdetect/perf/opencl/perf_hogdetect.cpp
index 2a6731117..1d107151a 100644
--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/objdetect/perf/opencl/perf_hogdetect.cpp
@@ -43,10 +43,14 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
+
 #include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
 
-using namespace perf;
+#ifdef HAVE_OPENCL
 
+namespace cvtest {
+namespace ocl {
 ///////////// HOG////////////////////////
 
 struct RectLess :
@@ -66,35 +70,25 @@ struct RectLess :
     }
 };
 
-PERF_TEST(HOGFixture, HOG)
+OCL_PERF_TEST(HOGFixture, HOG)
 {
-    Mat src = imread(getDataPath("gpu/hog/road.png"), cv::IMREAD_GRAYSCALE);
-    ASSERT_TRUE(!src.empty()) << "can't open input image road.png";
+    UMat src;
+    imread(getDataPath("gpu/hog/road.png"), cv::IMREAD_GRAYSCALE).copyTo(src);
+    ASSERT_FALSE(src.empty());
 
     vector<cv::Rect> found_locations;
-    declare.in(src).time(5);
+    declare.in(src);
 
-    if (RUN_PLAIN_IMPL)
-    {
-        HOGDescriptor hog;
-        hog.setSVMDetector(hog.getDefaultPeopleDetector());
+    HOGDescriptor hog;
+    hog.setSVMDetector(hog.getDefaultPeopleDetector());
 
-        TEST_CYCLE() hog.detectMultiScale(src, found_locations);
+    OCL_TEST_CYCLE() hog.detectMultiScale(src, found_locations);
 
-        std::sort(found_locations.begin(), found_locations.end(), RectLess());
-        SANITY_CHECK(found_locations, 1 + DBL_EPSILON);
-    }
-    else if (RUN_OCL_IMPL)
-    {
-        ocl::HOGDescriptor ocl_hog;
-        ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector());
-        ocl::oclMat oclSrc(src);
-
-        OCL_TEST_CYCLE() ocl_hog.detectMultiScale(oclSrc, found_locations);
-
-        std::sort(found_locations.begin(), found_locations.end(), RectLess());
-        SANITY_CHECK(found_locations, 1 + DBL_EPSILON);
-    }
-    else
-        OCL_PERF_ELSE
+    std::sort(found_locations.begin(), found_locations.end(), RectLess());
+    SANITY_CHECK(found_locations, 1 + DBL_EPSILON);
 }
+
+}
+}
+
+#endif
diff --git a/modules/objdetect/perf/perf_cascadeclassifier.cpp b/modules/objdetect/perf/perf_cascadeclassifier.cpp
deleted file mode 100644
index 1d5bff11f..000000000
--- a/modules/objdetect/perf/perf_cascadeclassifier.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#include "perf_precomp.hpp"
-#include <opencv2/imgproc.hpp>
-
-using namespace std;
-using namespace cv;
-using namespace perf;
-using std::tr1::make_tuple;
-using std::tr1::get;
-
-typedef std::tr1::tuple<std::string, int> ImageName_MinSize_t;
-typedef perf::TestBaseWithParam<ImageName_MinSize_t> ImageName_MinSize;
-
-PERF_TEST_P(ImageName_MinSize, CascadeClassifierLBPFrontalFace,
-            testing::Combine(testing::Values( std::string("cv/shared/lena.png"),
-                                              std::string("cv/shared/1_itseez-0000289.png"),
-                                              std::string("cv/shared/1_itseez-0000492.png"),
-                                              std::string("cv/shared/1_itseez-0000573.png")),
-                             testing::Values(24, 30, 40, 50, 60, 70, 80, 90)
-                             )
-            )
-{
-    const string filename = get<0>(GetParam());
-    int min_size = get<1>(GetParam());
-    Size minSize(min_size, min_size);
-
-    CascadeClassifier cc(getDataPath("cv/cascadeandhog/cascades/lbpcascade_frontalface.xml"));
-    if (cc.empty())
-        FAIL() << "Can't load cascade file";
-
-    Mat img = imread(getDataPath(filename), 0);
-    if (img.empty())
-        FAIL() << "Can't load source image";
-
-    vector<Rect> faces;
-
-    equalizeHist(img, img);
-    declare.in(img);
-
-    while(next())
-    {
-        faces.clear();
-
-        startTimer();
-        cc.detectMultiScale(img, faces, 1.1, 3, 0, minSize);
-        stopTimer();
-    }
-
-    std::sort(faces.begin(), faces.end(), comparators::RectLess());
-    SANITY_CHECK(faces, 3.001 * faces.size());
-}
diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp
index 17776013c..bb187cd61 100644
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -46,71 +46,6 @@
 #include "opencv2/objdetect/objdetect_c.h"
 #include "opencl_kernels.hpp"
 
-#if defined (LOG_CASCADE_STATISTIC)
-struct Logger
-{
-    enum { STADIES_NUM = 20 };
-
-    int gid;
-    cv::Mat mask;
-    cv::Size sz0;
-    int step;
-
-
-    Logger() : gid (0), step(2) {}
-    void setImage(const cv::Mat& image)
-    {
-     if (gid == 0)
-         sz0 = image.size();
-
-      mask.create(image.rows, image.cols * (STADIES_NUM + 1) + STADIES_NUM, CV_8UC1);
-      mask = cv::Scalar(0);
-      cv::Mat roi = mask(cv::Rect(cv::Point(0,0), image.size()));
-      image.copyTo(roi);
-
-      printf("%d) Size = (%d, %d)\n", gid, image.cols, image.rows);
-
-      for(int i = 0; i < STADIES_NUM; ++i)
-      {
-          int x = image.cols + i * (image.cols + 1);
-          cv::line(mask, cv::Point(x, 0), cv::Point(x, mask.rows-1), cv::Scalar(255));
-      }
-
-      if (sz0.width/image.cols > 2 && sz0.height/image.rows > 2)
-          step = 1;
-    }
-
-    void setPoint(const cv::Point& p, int passed_stadies)
-    {
-        int cols = mask.cols / (STADIES_NUM + 1);
-
-        passed_stadies = -passed_stadies;
-        passed_stadies = (passed_stadies == -1) ? STADIES_NUM : passed_stadies;
-
-        unsigned char* ptr = mask.ptr<unsigned char>(p.y) + cols + 1 + p.x;
-        for(int i = 0; i < passed_stadies; ++i, ptr += cols + 1)
-        {
-            *ptr = 255;
-
-            if (step == 2)
-            {
-                ptr[1] = 255;
-                ptr[mask.step] = 255;
-                ptr[mask.step + 1] = 255;
-            }
-        }
-    };
-
-    void write()
-    {
-        char buf[4096];
-        sprintf(buf, "%04d.png", gid++);
-        cv::imwrite(buf, mask);
-    }
-
-} logger;
-#endif
-
 namespace cv
 {
 
@@ -121,7 +56,8 @@ template<typename _Tp> void copyVectorToUMat(const std::vector<_Tp>& v, UMat& um
     Mat(1, (int)(v.size()*sizeof(v[0])), CV_8U, (void*)&v[0]).copyTo(um);
 }
 
-void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps, std::vector<int>* weights, std::vector<double>* levelWeights)
+void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps,
+                     std::vector<int>* weights, std::vector<double>* levelWeights)
 {
     if( groupThreshold <= 0 || rectList.empty() )
     {
@@ -152,6 +88,9 @@ void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps
         rrects[cls].height += rectList[i].height;
         rweights[cls]++;
     }
+
+    bool useDefaultWeights = false;
+
     if ( levelWeights && weights && !weights->empty() && !levelWeights->empty() )
     {
         for( i = 0; i < nlabels; i++ )
@@ -166,6 +105,8 @@ void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps
                 rejectWeights[cls] = (*levelWeights)[i];
         }
     }
+    else
+        useDefaultWeights = true;
 
     for( i = 0; i < nclasses; i++ )
     {
@@ -218,7 +159,7 @@ void groupRectangles(std::vector<Rect>& rectList, int groupThreshold, double eps
         {
             rectList.push_back(r1);
             if( weights )
-                weights->push_back(l1);
+                weights->push_back(useDefaultWeights ? n1 : l1);
             if( levelWeights )
                 levelWeights->push_back(w1);
         }
@@ -426,7 +367,8 @@ void groupRectangles(std::vector<Rect>& rectList, std::vector<int>& weights, int
     groupRectangles(rectList, groupThreshold, eps, &weights, 0);
 }
 //used for cascade detection algorithm for ROC-curve calculating
-void groupRectangles(std::vector<Rect>& rectList, std::vector<int>& rejectLevels, std::vector<double>& levelWeights, int groupThreshold, double eps)
+void groupRectangles(std::vector<Rect>& rectList, std::vector<int>& rejectLevels,
+                     std::vector<double>& levelWeights, int groupThreshold, double eps)
 {
     groupRectangles(rectList, groupThreshold, eps, &rejectLevels, &levelWeights);
 }
@@ -439,14 +381,138 @@ void groupRectangles_meanshift(std::vector<Rect>& rectList, std::vector<double>&
 
 
 FeatureEvaluator::~FeatureEvaluator() {}
-bool FeatureEvaluator::read(const FileNode&) {return true;}
+
+bool FeatureEvaluator::read(const FileNode&, Size _origWinSize)
+{
+    origWinSize = _origWinSize;
+    localSize = lbufSize = Size(0, 0);
+    if (scaleData.empty())
+        scaleData = makePtr<std::vector<ScaleData> >();
+    else
+        scaleData->clear();
+    return true;
+}
+
 Ptr<FeatureEvaluator> FeatureEvaluator::clone() const { return Ptr<FeatureEvaluator>(); }
 int FeatureEvaluator::getFeatureType() const {return -1;}
-bool FeatureEvaluator::setImage(InputArray, Size, Size) {return true;}
-bool FeatureEvaluator::setWindow(Point) { return true; }
-double FeatureEvaluator::calcOrd(int) const { return 0.; }
+bool FeatureEvaluator::setWindow(Point, int) { return true; }
+void FeatureEvaluator::getUMats(std::vector<UMat>& bufs)
+{
+    if (!(sbufFlag & USBUF_VALID))
+    {
+        sbuf.copyTo(usbuf);
+        sbufFlag |= USBUF_VALID;
+    }
+
+    bufs.clear();
+    bufs.push_back(uscaleData);
+    bufs.push_back(usbuf);
+    bufs.push_back(ufbuf);
+}
+
+void FeatureEvaluator::getMats()
+{
+    if (!(sbufFlag & SBUF_VALID))
+    {
+        usbuf.copyTo(sbuf);
+        sbufFlag |= SBUF_VALID;
+    }
+}
+
+float FeatureEvaluator::calcOrd(int) const { return 0.; }
 int FeatureEvaluator::calcCat(int) const { return 0; }
 
+bool FeatureEvaluator::updateScaleData( Size imgsz, const std::vector<float>& _scales )
+{
+    if( scaleData.empty() )
+        scaleData = makePtr<std::vector<ScaleData> >();
+
+    size_t i, nscales = _scales.size();
+    bool recalcOptFeatures = nscales != scaleData->size();
+    scaleData->resize(nscales);
+
+    int layer_dy = 0;
+    Point layer_ofs(0,0);
+    Size prevBufSize = sbufSize;
+    sbufSize.width = std::max(sbufSize.width, (int)alignSize(cvRound(imgsz.width/_scales[0]) + 31, 32));
+    recalcOptFeatures = recalcOptFeatures || sbufSize.width != prevBufSize.width;
+
+    for( i = 0; i < nscales; i++ )
+    {
+        FeatureEvaluator::ScaleData& s = scaleData->at(i);
+        if( !recalcOptFeatures && fabs(s.scale - _scales[i]) > FLT_EPSILON*100*_scales[i] )
+            recalcOptFeatures = true;
+        float sc = _scales[i];
+        Size sz;
+        sz.width = cvRound(imgsz.width/sc);
+        sz.height = cvRound(imgsz.height/sc);
+        s.ystep = sc >= 2 ? 1 : 2;
+        s.scale = sc;
+        s.szi = Size(sz.width+1, sz.height+1);
+        if( layer_ofs.x + s.szi.width > sbufSize.width )
+        {
+            layer_ofs = Point(0, layer_ofs.y + layer_dy);
+            layer_dy = s.szi.height;
+        }
+        s.layer_ofs = layer_ofs.y*sbufSize.width + layer_ofs.x;
+        layer_ofs.x += s.szi.width;
+    }
+
+    layer_ofs.y += layer_dy;
+    sbufSize.height = std::max(sbufSize.height, layer_ofs.y);
+    recalcOptFeatures = recalcOptFeatures || sbufSize.height != prevBufSize.height;
+    return recalcOptFeatures;
+}
+
+
+bool FeatureEvaluator::setImage( InputArray _image, const std::vector<float>& _scales )
+{
+    Size imgsz = _image.size();
+    bool recalcOptFeatures = updateScaleData(imgsz, _scales);
+
+    size_t i, nscales = scaleData->size();
+    Size sz0 = scaleData->at(0).szi;
+    sz0 = Size(std::max(rbuf.cols, (int)alignSize(sz0.width, 16)), std::max(rbuf.rows, sz0.height));
+
+    if (recalcOptFeatures)
+    {
+        computeOptFeatures();
+        copyVectorToUMat(*scaleData, uscaleData);
+    }
+
+    if (_image.isUMat() && localSize.area() > 0)
+    {
+        usbuf.create(sbufSize.height*nchannels, sbufSize.width, CV_32S);
+        urbuf.create(sz0, CV_8U);
+
+        for (i = 0; i < nscales; i++)
+        {
+            const ScaleData& s = scaleData->at(i);
+            UMat dst(urbuf, Rect(0, 0, s.szi.width - 1, s.szi.height - 1));
+            resize(_image, dst, dst.size(), 1. / s.scale, 1. / s.scale, INTER_LINEAR);
+            computeChannels((int)i, dst);
+        }
+        sbufFlag = USBUF_VALID;
+    }
+    else
+    {
+        Mat image = _image.getMat();
+        sbuf.create(sbufSize.height*nchannels, sbufSize.width, CV_32S);
+        rbuf.create(sz0, CV_8U);
+
+        for (i = 0; i < nscales; i++)
+        {
+            const ScaleData& s = scaleData->at(i);
+            Mat dst(s.szi.height - 1, s.szi.width - 1, CV_8U, rbuf.data);
+            resize(image, dst, dst.size(), 1. / s.scale, 1. / s.scale, INTER_LINEAR);
+            computeChannels((int)i, dst);
+        }
+        sbufFlag = SBUF_VALID;
+    }
+
+    return true;
+}
+
 //----------------------------------------------  HaarEvaluator ---------------------------------------
 
 bool HaarEvaluator::Feature :: read( const FileNode& node )
@@ -476,24 +542,32 @@ HaarEvaluator::HaarEvaluator()
 {
     optfeaturesPtr = 0;
     pwin = 0;
+    localSize = Size(4, 2);
+    lbufSize = Size(0, 0);
+    nchannels = 0;
 }
+
 HaarEvaluator::~HaarEvaluator()
 {
 }
 
-bool HaarEvaluator::read(const FileNode& node)
+bool HaarEvaluator::read(const FileNode& node, Size _origWinSize)
 {
+    if (!FeatureEvaluator::read(node, _origWinSize))
+        return false;
     size_t i, n = node.size();
     CV_Assert(n > 0);
     if(features.empty())
         features = makePtr<std::vector<Feature> >();
     if(optfeatures.empty())
         optfeatures = makePtr<std::vector<OptFeature> >();
+    if (optfeatures_lbuf.empty())
+        optfeatures_lbuf = makePtr<std::vector<OptFeature> >();
     features->resize(n);
     FileNodeIterator it = node.begin();
     hasTiltedFeatures = false;
     std::vector<Feature>& ff = *features;
-    sumSize0 = Size();
+    sbufSize = Size();
     ufbuf.release();
 
     for(i = 0; i < n; i++, ++it)
@@ -503,143 +577,151 @@ bool HaarEvaluator::read(const FileNode& node)
         if( ff[i].tilted )
             hasTiltedFeatures = true;
     }
+    nchannels = hasTiltedFeatures ? 3 : 2;
+    normrect = Rect(1, 1, origWinSize.width - 2, origWinSize.height - 2);
+
+    localSize = lbufSize = Size(0, 0);
+    if (ocl::haveOpenCL())
+    {
+        String vname = ocl::Device::getDefault().vendor();
+        if (vname == "Advanced Micro Devices, Inc." ||
+            vname == "AMD")
+        {
+            localSize = Size(8, 8);
+            lbufSize = Size(origWinSize.width + localSize.width,
+                            origWinSize.height + localSize.height);
+            if (lbufSize.area() > 1024)
+                lbufSize = Size(0, 0);
+        }
+    }
+
     return true;
 }
 
 Ptr<FeatureEvaluator> HaarEvaluator::clone() const
 {
     Ptr<HaarEvaluator> ret = makePtr<HaarEvaluator>();
-    ret->origWinSize = origWinSize;
-    ret->features = features;
-    ret->optfeatures = optfeatures;
-    ret->optfeaturesPtr = optfeatures->empty() ? 0 : &(*(ret->optfeatures))[0];
-    ret->hasTiltedFeatures = hasTiltedFeatures;
-    ret->sum0 = sum0; ret->sqsum0 = sqsum0;
-    ret->sum = sum; ret->sqsum = sqsum;
-    ret->usum0 = usum0; ret->usqsum0 = usqsum0; ret->ufbuf = ufbuf;
-    ret->normrect = normrect;
-    memcpy( ret->nofs, nofs, 4*sizeof(nofs[0]) );
-    ret->pwin = pwin;
-    ret->varianceNormFactor = varianceNormFactor;
+    *ret = *this;
     return ret;
 }
 
-bool HaarEvaluator::setImage( InputArray _image, Size _origWinSize, Size _sumSize )
+
+void HaarEvaluator::computeChannels(int scaleIdx, InputArray img)
 {
-    Size imgsz = _image.size();
-    int cols = imgsz.width, rows = imgsz.height;
+    const ScaleData& s = scaleData->at(scaleIdx);
+    tofs = (int)sbufSize.area();
+    sqofs = hasTiltedFeatures ? tofs*2 : tofs;
 
-    if (imgsz.width < origWinSize.width || imgsz.height < origWinSize.height)
-        return false;
-
-    origWinSize = _origWinSize;
-    normrect = Rect(1, 1, origWinSize.width-2, origWinSize.height-2);
-
-    int rn = _sumSize.height, cn = _sumSize.width, rn_scale = hasTiltedFeatures ? 2 : 1;
-    int sumStep, tofs = 0;
-    CV_Assert(rn >= rows+1 && cn >= cols+1);
-
-    if( _image.isUMat() )
+    if (img.isUMat())
     {
-        usum0.create(rn*rn_scale, cn, CV_32S);
-        usqsum0.create(rn, cn, CV_32S);
-        usum = UMat(usum0, Rect(0, 0, cols+1, rows+1));
-        usqsum = UMat(usqsum0, Rect(0, 0, cols, rows));
+        int sx = s.layer_ofs % sbufSize.width;
+        int sy = s.layer_ofs / sbufSize.width;
+        int sqy = sy + (sqofs / sbufSize.width);
+        UMat sum(usbuf, Rect(sx, sy, s.szi.width, s.szi.height));
+        UMat sqsum(usbuf, Rect(sx, sqy, s.szi.width, s.szi.height));
+        sqsum.flags = (sqsum.flags & ~UMat::DEPTH_MASK) | CV_32F;
 
-        if( hasTiltedFeatures )
+        if (hasTiltedFeatures)
         {
-            UMat utilted(usum0, Rect(0, _sumSize.height, cols+1, rows+1));
-            integral(_image, usum, noArray(), utilted, CV_32S);
-            tofs = (int)((utilted.offset - usum.offset)/sizeof(int));
+            int sty = sy + (tofs / sbufSize.width);
+            UMat tilted(usbuf, Rect(sx, sty, s.szi.width, s.szi.height));
+            integral(img, sum, sqsum, tilted, CV_32S, CV_32F);
         }
         else
         {
-            integral(_image, usum, noArray(), noArray(), CV_32S);
+            UMatData* u = sqsum.u;
+            integral(img, sum, sqsum, noArray(), CV_32S, CV_32F);
+            CV_Assert(sqsum.u == u && sqsum.size() == s.szi && sqsum.type()==CV_32F);
         }
-
-        sqrBoxFilter(_image, usqsum, CV_32S,
-                     Size(normrect.width, normrect.height),
-                     Point(0, 0), false);
-        /*sqrBoxFilter(_image.getMat(), sqsum, CV_32S,
-                     Size(normrect.width, normrect.height),
-                     Point(0, 0), false);
-        sqsum.copyTo(usqsum);*/
-        sumStep = (int)(usum.step/usum.elemSize());
     }
     else
     {
-        sum0.create(rn*rn_scale, cn, CV_32S);
-        sqsum0.create(rn, cn, CV_32S);
-        sum = sum0(Rect(0, 0, cols+1, rows+1));
-        sqsum = sqsum0(Rect(0, 0, cols, rows));
+        Mat sum(s.szi, CV_32S, sbuf.ptr<int>() + s.layer_ofs, sbuf.step);
+        Mat sqsum(s.szi, CV_32F, sum.ptr<int>() + sqofs, sbuf.step);
 
-        if( hasTiltedFeatures )
+        if (hasTiltedFeatures)
         {
-            Mat tilted = sum0(Rect(0, _sumSize.height, cols+1, rows+1));
-            integral(_image, sum, noArray(), tilted, CV_32S);
-            tofs = (int)((tilted.data - sum.data)/sizeof(int));
+            Mat tilted(s.szi, CV_32S, sum.ptr<int>() + tofs, sbuf.step);
+            integral(img, sum, sqsum, tilted, CV_32S, CV_32F);
         }
         else
-            integral(_image, sum, noArray(), noArray(), CV_32S);
-        sqrBoxFilter(_image, sqsum, CV_32S,
-                     Size(normrect.width, normrect.height),
-                     Point(0, 0), false);
-        sumStep = (int)(sum.step/sum.elemSize());
+            integral(img, sum, sqsum, noArray(), CV_32S, CV_32F);
     }
+}
 
-    CV_SUM_OFS( nofs[0], nofs[1], nofs[2], nofs[3], 0, normrect, sumStep );
+void HaarEvaluator::computeOptFeatures()
+{
+    int sstep = sbufSize.width;
+    CV_SUM_OFS( nofs[0], nofs[1], nofs[2], nofs[3], 0, normrect, sstep );
 
     size_t fi, nfeatures = features->size();
     const std::vector<Feature>& ff = *features;
+    optfeatures->resize(nfeatures);
+    optfeaturesPtr = &(*optfeatures)[0];
+    for( fi = 0; fi < nfeatures; fi++ )
+        optfeaturesPtr[fi].setOffsets( ff[fi], sstep, tofs );
+    optfeatures_lbuf->resize(nfeatures);
 
-    if( sumSize0 != _sumSize )
-    {
-        optfeatures->resize(nfeatures);
-        optfeaturesPtr = &(*optfeatures)[0];
-        for( fi = 0; fi < nfeatures; fi++ )
-            optfeaturesPtr[fi].setOffsets( ff[fi], sumStep, tofs );
-    }
-    if( _image.isUMat() && (sumSize0 != _sumSize || ufbuf.empty()) )
-        copyVectorToUMat(*optfeatures, ufbuf);
-    sumSize0 = _sumSize;
+    for( fi = 0; fi < nfeatures; fi++ )
+        optfeatures_lbuf->at(fi).setOffsets(ff[fi], lbufSize.width > 0 ? lbufSize.width : sstep, tofs);
 
-    return true;
+    copyVectorToUMat(*optfeatures_lbuf, ufbuf);
 }
 
 
-bool  HaarEvaluator::setWindow( Point pt )
+bool HaarEvaluator::setWindow( Point pt, int scaleIdx )
 {
+    const ScaleData& s = getScaleData(scaleIdx);
+
     if( pt.x < 0 || pt.y < 0 ||
-        pt.x + origWinSize.width >= sum.cols ||
-        pt.y + origWinSize.height >= sum.rows )
+        pt.x + origWinSize.width >= s.szi.width ||
+        pt.y + origWinSize.height >= s.szi.height )
         return false;
 
-    const int* p = &sum.at<int>(pt);
-    int valsum = CALC_SUM_OFS(nofs, p);
-    double valsqsum = sqsum.at<int>(pt.y + normrect.y, pt.x + normrect.x);
+    pwin = &sbuf.at<int>(pt) + s.layer_ofs;
+    const float* pq = (const float*)(pwin + sqofs);
+    int valsum = CALC_SUM_OFS(nofs, pwin);
+    float valsqsum = CALC_SUM_OFS(nofs, pq);
 
     double nf = (double)normrect.area() * valsqsum - (double)valsum * valsum;
     if( nf > 0. )
         nf = std::sqrt(nf);
     else
         nf = 1.;
-    varianceNormFactor = 1./nf;
-    pwin = p;
+    varianceNormFactor = (float)(1./nf);
 
     return true;
 }
 
+
+void HaarEvaluator::OptFeature::setOffsets( const Feature& _f, int step, int _tofs )
+{
+    weight[0] = _f.rect[0].weight;
+    weight[1] = _f.rect[1].weight;
+    weight[2] = _f.rect[2].weight;
+
+    if( _f.tilted )
+    {
+        CV_TILTED_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], _tofs, _f.rect[0].r, step );
+        CV_TILTED_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], _tofs, _f.rect[1].r, step );
+        CV_TILTED_OFS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], _tofs, _f.rect[2].r, step );
+    }
+    else
+    {
+        CV_SUM_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], 0, _f.rect[0].r, step );
+        CV_SUM_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], 0, _f.rect[1].r, step );
+        CV_SUM_OFS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], 0, _f.rect[2].r, step );
+    }
+}
+
 Rect HaarEvaluator::getNormRect() const
 {
     return normrect;
 }
 
-void HaarEvaluator::getUMats(std::vector<UMat>& bufs)
+int HaarEvaluator::getSquaresOffset() const
 {
-    bufs.clear();
-    bufs.push_back(usum);
-    bufs.push_back(usqsum);
-    bufs.push_back(ufbuf);
+    return sqofs;
 }
 
 //----------------------------------------------  LBPEvaluator -------------------------------------
@@ -654,254 +736,121 @@ bool LBPEvaluator::Feature :: read(const FileNode& node )
 LBPEvaluator::LBPEvaluator()
 {
     features = makePtr<std::vector<Feature> >();
+    optfeatures = makePtr<std::vector<OptFeature> >();
+    scaleData = makePtr<std::vector<ScaleData> >();
 }
+
 LBPEvaluator::~LBPEvaluator()
 {
 }
 
-bool LBPEvaluator::read( const FileNode& node )
+bool LBPEvaluator::read( const FileNode& node, Size _origWinSize )
 {
+    if (!FeatureEvaluator::read(node, _origWinSize))
+        return false;
+    if(features.empty())
+        features = makePtr<std::vector<Feature> >();
+    if(optfeatures.empty())
+        optfeatures = makePtr<std::vector<OptFeature> >();
+    if (optfeatures_lbuf.empty())
+        optfeatures_lbuf = makePtr<std::vector<OptFeature> >();
+
     features->resize(node.size());
-    featuresPtr = &(*features)[0];
+    optfeaturesPtr = 0;
     FileNodeIterator it = node.begin(), it_end = node.end();
+    std::vector<Feature>& ff = *features;
     for(int i = 0; it != it_end; ++it, i++)
     {
-        if(!featuresPtr[i].read(*it))
+        if(!ff[i].read(*it))
             return false;
     }
+    nchannels = 1;
+    localSize = lbufSize = Size(0, 0);
+    if (ocl::haveOpenCL())
+    {
+        const ocl::Device& device = ocl::Device::getDefault();
+        String vname = device.vendor();
+        if ((vname == "Advanced Micro Devices, Inc." ||
+            vname == "AMD") && !device.hostUnifiedMemory())
+            localSize = Size(8, 8);
+    }
     return true;
 }
 
 Ptr<FeatureEvaluator> LBPEvaluator::clone() const
 {
     Ptr<LBPEvaluator> ret = makePtr<LBPEvaluator>();
-    ret->origWinSize = origWinSize;
-    ret->features = features;
-    ret->featuresPtr = &(*ret->features)[0];
-    ret->sum0 = sum0, ret->sum = sum;
-    ret->normrect = normrect;
-    ret->offset = offset;
+    *ret = *this;
     return ret;
 }
 
-bool LBPEvaluator::setImage( InputArray _image, Size _origWinSize, Size )
+void LBPEvaluator::computeChannels(int scaleIdx, InputArray _img)
 {
-    Mat image = _image.getMat();
-    int rn = image.rows+1, cn = image.cols+1;
-    origWinSize = _origWinSize;
+    const ScaleData& s = scaleData->at(scaleIdx);
 
-    if( image.cols < origWinSize.width || image.rows < origWinSize.height )
-        return false;
+    if (_img.isUMat())
+    {
+        int sx = s.layer_ofs % sbufSize.width;
+        int sy = s.layer_ofs / sbufSize.width;
+        UMat sum(usbuf, Rect(sx, sy, s.szi.width, s.szi.height));
+        integral(_img, sum, noArray(), noArray(), CV_32S);
+    }
+    else
+    {
+        Mat sum(s.szi, CV_32S, sbuf.ptr<int>() + s.layer_ofs, sbuf.step);
+        integral(_img, sum, noArray(), noArray(), CV_32S);
+    }
+}
 
-    if( sum0.rows < rn || sum0.cols < cn )
-        sum0.create(rn, cn, CV_32S);
-    sum = Mat(rn, cn, CV_32S, sum0.data);
-    integral(image, sum);
+void LBPEvaluator::computeOptFeatures()
+{
+    int sstep = sbufSize.width;
 
     size_t fi, nfeatures = features->size();
-
+    const std::vector<Feature>& ff = *features;
+    optfeatures->resize(nfeatures);
+    optfeaturesPtr = &(*optfeatures)[0];
     for( fi = 0; fi < nfeatures; fi++ )
-        featuresPtr[fi].updatePtrs( sum );
-    return true;
+        optfeaturesPtr[fi].setOffsets( ff[fi], sstep );
+    copyVectorToUMat(*optfeatures, ufbuf);
 }
 
-bool LBPEvaluator::setWindow( Point pt )
+
+void LBPEvaluator::OptFeature::setOffsets( const Feature& _f, int step )
 {
+    Rect tr = _f.rect;
+    int w0 = tr.width;
+    int h0 = tr.height;
+
+    CV_SUM_OFS( ofs[0], ofs[1], ofs[4], ofs[5], 0, tr, step );
+    tr.x += 2*w0;
+    CV_SUM_OFS( ofs[2], ofs[3], ofs[6], ofs[7], 0, tr, step );
+    tr.y += 2*h0;
+    CV_SUM_OFS( ofs[10], ofs[11], ofs[14], ofs[15], 0, tr, step );
+    tr.x -= 2*w0;
+    CV_SUM_OFS( ofs[8], ofs[9], ofs[12], ofs[13], 0, tr, step );
+}
+
+
+bool LBPEvaluator::setWindow( Point pt, int scaleIdx )
+{
+    CV_Assert(0 <= scaleIdx && scaleIdx < (int)scaleData->size());
+    const ScaleData& s = scaleData->at(scaleIdx);
+
     if( pt.x < 0 || pt.y < 0 ||
-        pt.x + origWinSize.width >= sum.cols ||
-        pt.y + origWinSize.height >= sum.rows )
+        pt.x + origWinSize.width >= s.szi.width ||
+        pt.y + origWinSize.height >= s.szi.height )
         return false;
-    offset = pt.y * ((int)sum.step/sizeof(int)) + pt.x;
+
+    pwin = &sbuf.at<int>(pt) + s.layer_ofs;
     return true;
 }
 
-//----------------------------------------------  HOGEvaluator ---------------------------------------
-bool HOGEvaluator::Feature :: read( const FileNode& node )
-{
-    FileNode rnode = node[CC_RECT];
-    FileNodeIterator it = rnode.begin();
-    it >> rect[0].x >> rect[0].y >> rect[0].width >> rect[0].height >> featComponent;
-    rect[1].x = rect[0].x + rect[0].width;
-    rect[1].y = rect[0].y;
-    rect[2].x = rect[0].x;
-    rect[2].y = rect[0].y + rect[0].height;
-    rect[3].x = rect[0].x + rect[0].width;
-    rect[3].y = rect[0].y + rect[0].height;
-    rect[1].width = rect[2].width = rect[3].width = rect[0].width;
-    rect[1].height = rect[2].height = rect[3].height = rect[0].height;
-    return true;
-}
-
-HOGEvaluator::HOGEvaluator()
-{
-    features = makePtr<std::vector<Feature> >();
-}
-
-HOGEvaluator::~HOGEvaluator()
-{
-}
-
-bool HOGEvaluator::read( const FileNode& node )
-{
-    features->resize(node.size());
-    featuresPtr = &(*features)[0];
-    FileNodeIterator it = node.begin(), it_end = node.end();
-    for(int i = 0; it != it_end; ++it, i++)
-    {
-        if(!featuresPtr[i].read(*it))
-            return false;
-    }
-    return true;
-}
-
-Ptr<FeatureEvaluator> HOGEvaluator::clone() const
-{
-    Ptr<HOGEvaluator> ret = makePtr<HOGEvaluator>();
-    ret->origWinSize = origWinSize;
-    ret->features = features;
-    ret->featuresPtr = &(*ret->features)[0];
-    ret->offset = offset;
-    ret->hist = hist;
-    ret->normSum = normSum;
-    return ret;
-}
-
-bool HOGEvaluator::setImage( InputArray _image, Size winSize, Size )
-{
-    Mat image = _image.getMat();
-    int rows = image.rows + 1;
-    int cols = image.cols + 1;
-    origWinSize = winSize;
-    if( image.cols < origWinSize.width || image.rows < origWinSize.height )
-        return false;
-    hist.clear();
-    for( int bin = 0; bin < Feature::BIN_NUM; bin++ )
-    {
-        hist.push_back( Mat(rows, cols, CV_32FC1) );
-    }
-    normSum.create( rows, cols, CV_32FC1 );
-
-    integralHistogram( image, hist, normSum, Feature::BIN_NUM );
-
-    size_t featIdx, featCount = features->size();
-
-    for( featIdx = 0; featIdx < featCount; featIdx++ )
-    {
-        featuresPtr[featIdx].updatePtrs( hist, normSum );
-    }
-    return true;
-}
-
-bool HOGEvaluator::setWindow(Point pt)
-{
-    if( pt.x < 0 || pt.y < 0 ||
-        pt.x + origWinSize.width >= hist[0].cols-2 ||
-        pt.y + origWinSize.height >= hist[0].rows-2 )
-        return false;
-    offset = pt.y * ((int)hist[0].step/sizeof(float)) + pt.x;
-    return true;
-}
-
-void HOGEvaluator::integralHistogram(const Mat &img, std::vector<Mat> &histogram, Mat &norm, int nbins) const
-{
-    CV_Assert( img.type() == CV_8U || img.type() == CV_8UC3 );
-    int x, y, binIdx;
-
-    Size gradSize(img.size());
-    Size histSize(histogram[0].size());
-    Mat grad(gradSize, CV_32F);
-    Mat qangle(gradSize, CV_8U);
-
-    AutoBuffer<int> mapbuf(gradSize.width + gradSize.height + 4);
-    int* xmap = (int*)mapbuf + 1;
-    int* ymap = xmap + gradSize.width + 2;
-
-    const int borderType = (int)BORDER_REPLICATE;
-
-    for( x = -1; x < gradSize.width + 1; x++ )
-        xmap[x] = borderInterpolate(x, gradSize.width, borderType);
-    for( y = -1; y < gradSize.height + 1; y++ )
-        ymap[y] = borderInterpolate(y, gradSize.height, borderType);
-
-    int width = gradSize.width;
-    AutoBuffer<float> _dbuf(width*4);
-    float* dbuf = _dbuf;
-    Mat Dx(1, width, CV_32F, dbuf);
-    Mat Dy(1, width, CV_32F, dbuf + width);
-    Mat Mag(1, width, CV_32F, dbuf + width*2);
-    Mat Angle(1, width, CV_32F, dbuf + width*3);
-
-    float angleScale = (float)(nbins/CV_PI);
-
-    for( y = 0; y < gradSize.height; y++ )
-    {
-        const uchar* currPtr = img.data + img.step*ymap[y];
-        const uchar* prevPtr = img.data + img.step*ymap[y-1];
-        const uchar* nextPtr = img.data + img.step*ymap[y+1];
-        float* gradPtr = (float*)grad.ptr(y);
-        uchar* qanglePtr = (uchar*)qangle.ptr(y);
-
-        for( x = 0; x < width; x++ )
-        {
-            dbuf[x] = (float)(currPtr[xmap[x+1]] - currPtr[xmap[x-1]]);
-            dbuf[width + x] = (float)(nextPtr[xmap[x]] - prevPtr[xmap[x]]);
-        }
-        cartToPolar( Dx, Dy, Mag, Angle, false );
-        for( x = 0; x < width; x++ )
-        {
-            float mag = dbuf[x+width*2];
-            float angle = dbuf[x+width*3];
-            angle = angle*angleScale - 0.5f;
-            int bidx = cvFloor(angle);
-            angle -= bidx;
-            if( bidx < 0 )
-                bidx += nbins;
-            else if( bidx >= nbins )
-                bidx -= nbins;
-
-            qanglePtr[x] = (uchar)bidx;
-            gradPtr[x] = mag;
-        }
-    }
-    integral(grad, norm, grad.depth());
-
-    float* histBuf;
-    const float* magBuf;
-    const uchar* binsBuf;
-
-    int binsStep = (int)( qangle.step / sizeof(uchar) );
-    int histStep = (int)( histogram[0].step / sizeof(float) );
-    int magStep = (int)( grad.step / sizeof(float) );
-    for( binIdx = 0; binIdx < nbins; binIdx++ )
-    {
-        histBuf = (float*)histogram[binIdx].data;
-        magBuf = (const float*)grad.data;
-        binsBuf = (const uchar*)qangle.data;
-
-        memset( histBuf, 0, histSize.width * sizeof(histBuf[0]) );
-        histBuf += histStep + 1;
-        for( y = 0; y < qangle.rows; y++ )
-        {
-            histBuf[-1] = 0.f;
-            float strSum = 0.f;
-            for( x = 0; x < qangle.cols; x++ )
-            {
-                if( binsBuf[x] == binIdx )
-                    strSum += magBuf[x];
-                histBuf[x] = histBuf[-histStep + x] + strSum;
-            }
-            histBuf += histStep;
-            binsBuf += binsStep;
-            magBuf += magStep;
-        }
-    }
-}
 
 Ptr<FeatureEvaluator> FeatureEvaluator::create( int featureType )
 {
     return featureType == HAAR ? Ptr<FeatureEvaluator>(new HaarEvaluator) :
         featureType == LBP ? Ptr<FeatureEvaluator>(new LBPEvaluator) :
-        featureType == HOG ? Ptr<FeatureEvaluator>(new HOGEvaluator) :
         Ptr<FeatureEvaluator>();
 }
 
@@ -944,24 +893,21 @@ void CascadeClassifierImpl::read(const FileNode& node)
     read_(node);
 }
 
-int CascadeClassifierImpl::runAt( Ptr<FeatureEvaluator>& evaluator, Point pt, double& weight )
+int CascadeClassifierImpl::runAt( Ptr<FeatureEvaluator>& evaluator, Point pt, int scaleIdx, double& weight )
 {
-    CV_Assert( !oldCascade );
-
-    assert( data.featureType == FeatureEvaluator::HAAR ||
+    assert( !oldCascade &&
+           (data.featureType == FeatureEvaluator::HAAR ||
             data.featureType == FeatureEvaluator::LBP ||
-            data.featureType == FeatureEvaluator::HOG );
+            data.featureType == FeatureEvaluator::HOG) );
 
-    if( !evaluator->setWindow(pt) )
+    if( !evaluator->setWindow(pt, scaleIdx) )
         return -1;
-    if( data.isStumpBased() )
+    if( data.maxNodesPerTree == 1 )
     {
         if( data.featureType == FeatureEvaluator::HAAR )
             return predictOrderedStump<HaarEvaluator>( *this, evaluator, weight );
         else if( data.featureType == FeatureEvaluator::LBP )
             return predictCategoricalStump<LBPEvaluator>( *this, evaluator, weight );
-        else if( data.featureType == FeatureEvaluator::HOG )
-            return predictOrderedStump<HOGEvaluator>( *this, evaluator, weight );
         else
             return -2;
     }
@@ -971,8 +917,6 @@ int CascadeClassifierImpl::runAt( Ptr<FeatureEvaluator>& evaluator, Point pt, do
             return predictOrdered<HaarEvaluator>( *this, evaluator, weight );
         else if( data.featureType == FeatureEvaluator::LBP )
             return predictCategorical<LBPEvaluator>( *this, evaluator, weight );
-        else if( data.featureType == FeatureEvaluator::HOG )
-            return predictOrdered<HOGEvaluator>( *this, evaluator, weight );
         else
             return -2;
     }
@@ -999,14 +943,17 @@ Ptr<BaseCascadeClassifier::MaskGenerator> createFaceDetectionMaskGenerator()
 class CascadeClassifierInvoker : public ParallelLoopBody
 {
 public:
-    CascadeClassifierInvoker( CascadeClassifierImpl& _cc, Size _sz1, int _stripSize, int _yStep, double _factor,
-        std::vector<Rect>& _vec, std::vector<int>& _levels, std::vector<double>& _weights, bool outputLevels, const Mat& _mask, Mutex* _mtx)
+    CascadeClassifierInvoker( CascadeClassifierImpl& _cc, int _nscales, int _nstripes,
+                              const FeatureEvaluator::ScaleData* _scaleData,
+                              const int* _stripeSizes, std::vector<Rect>& _vec,
+                              std::vector<int>& _levels, std::vector<double>& _weights,
+                              bool outputLevels, const Mat& _mask, Mutex* _mtx)
     {
         classifier = &_cc;
-        processingRectSize = _sz1;
-        stripSize = _stripSize;
-        yStep = _yStep;
-        scalingFactor = _factor;
+        nscales = _nscales;
+        nstripes = _nstripes;
+        scaleData = _scaleData;
+        stripeSizes = _stripeSizes;
         rectangles = &_vec;
         rejectLevels = outputLevels ? &_levels : 0;
         levelWeights = outputLevels ? &_weights : 0;
@@ -1017,167 +964,203 @@ public:
     void operator()(const Range& range) const
     {
         Ptr<FeatureEvaluator> evaluator = classifier->featureEvaluator->clone();
+        double gypWeight = 0.;
+        Size origWinSize = classifier->data.origWinSize;
 
-        Size winSize(cvRound(classifier->data.origWinSize.width * scalingFactor),
-                     cvRound(classifier->data.origWinSize.height * scalingFactor));
-
-        int y1 = range.start * stripSize;
-        int y2 = std::min(range.end * stripSize, processingRectSize.height);
-        for( int y = y1; y < y2; y += yStep )
+        for( int scaleIdx = 0; scaleIdx < nscales; scaleIdx++ )
         {
-            for( int x = 0; x < processingRectSize.width; x += yStep )
+            const FeatureEvaluator::ScaleData& s = scaleData[scaleIdx];
+            float scalingFactor = s.scale;
+            int yStep = s.ystep;
+            int stripeSize = stripeSizes[scaleIdx];
+            int y0 = range.start*stripeSize;
+            Size szw = s.getWorkingSize(origWinSize);
+            int y1 = std::min(range.end*stripeSize, szw.height);
+            Size winSize(cvRound(origWinSize.width * scalingFactor),
+                         cvRound(origWinSize.height * scalingFactor));
+
+            for( int y = y0; y < y1; y += yStep )
             {
-                if ( (!mask.empty()) && (mask.at<uchar>(Point(x,y))==0)) {
-                    continue;
-                }
-
-                double gypWeight;
-                int result = classifier->runAt(evaluator, Point(x, y), gypWeight);
-
-#if defined (LOG_CASCADE_STATISTIC)
-
-                logger.setPoint(Point(x, y), result);
-#endif
-                if( rejectLevels )
+                for( int x = 0; x < szw.width; x += yStep )
                 {
-                    if( result == 1 )
-                        result =  -(int)classifier->data.stages.size();
-                    if( classifier->data.stages.size() + result == 0 )
+                    int result = classifier->runAt(evaluator, Point(x, y), scaleIdx, gypWeight);
+                    if( rejectLevels )
+                    {
+                        if( result == 1 )
+                            result = -(int)classifier->data.stages.size();
+                        if( classifier->data.stages.size() + result == 0 )
+                        {
+                            mtx->lock();
+                            rectangles->push_back(Rect(cvRound(x*scalingFactor),
+                                                       cvRound(y*scalingFactor),
+                                                       winSize.width, winSize.height));
+                            rejectLevels->push_back(-result);
+                            levelWeights->push_back(gypWeight);
+                            mtx->unlock();
+                        }
+                    }
+                    else if( result > 0 )
                     {
                         mtx->lock();
-                        rectangles->push_back(Rect(cvRound(x*scalingFactor), cvRound(y*scalingFactor), winSize.width, winSize.height));
-                        rejectLevels->push_back(-result);
-                        levelWeights->push_back(gypWeight);
+                        rectangles->push_back(Rect(cvRound(x*scalingFactor),
+                                                   cvRound(y*scalingFactor),
+                                                   winSize.width, winSize.height));
                         mtx->unlock();
                     }
+                    if( result == 0 )
+                        x += yStep;
                 }
-                else if( result > 0 )
-                {
-                    mtx->lock();
-                    rectangles->push_back(Rect(cvRound(x*scalingFactor), cvRound(y*scalingFactor),
-                                               winSize.width, winSize.height));
-                    mtx->unlock();
-                }
-                if( result == 0 )
-                    x += yStep;
             }
         }
     }
 
     CascadeClassifierImpl* classifier;
     std::vector<Rect>* rectangles;
-    Size processingRectSize;
-    int stripSize, yStep;
-    double scalingFactor;
+    int nscales, nstripes;
+    const FeatureEvaluator::ScaleData* scaleData;
+    const int* stripeSizes;
     std::vector<int> *rejectLevels;
     std::vector<double> *levelWeights;
+    std::vector<float> scales;
     Mat mask;
     Mutex* mtx;
 };
 
+
 struct getRect { Rect operator ()(const CvAvgComp& e) const { return e.rect; } };
 struct getNeighbors { int operator ()(const CvAvgComp& e) const { return e.neighbors; } };
 
 
-bool CascadeClassifierImpl::detectSingleScale( InputArray _image, Size processingRectSize,
-                                           int yStep, double factor, std::vector<Rect>& candidates,
-                                           std::vector<int>& levels, std::vector<double>& weights,
-                                           Size sumSize0, bool outputRejectLevels )
+bool CascadeClassifierImpl::ocl_detectMultiScaleNoGrouping( const std::vector<float>& scales,
+                                                            std::vector<Rect>& candidates )
 {
-    if( !featureEvaluator->setImage(_image, data.origWinSize, sumSize0) )
+    int featureType = getFeatureType();
+    std::vector<UMat> bufs;
+    featureEvaluator->getUMats(bufs);
+    Size localsz = featureEvaluator->getLocalSize();
+    if( localsz.area() == 0 )
         return false;
+    Size lbufSize = featureEvaluator->getLocalBufSize();
+    size_t localsize[] = { localsz.width, localsz.height };
+    const int grp_per_CU = 12;
+    size_t globalsize[] = { grp_per_CU*ocl::Device::getDefault().maxComputeUnits()*localsize[0], localsize[1] };
+    bool ok = false;
 
-#if defined (LOG_CASCADE_STATISTIC)
-    logger.setImage(image);
-#endif
-
-    Mat currentMask;
-    if (maskGenerator) {
-        Mat image = _image.getMat();
-        currentMask=maskGenerator->generateMask(image);
-    }
-
-    std::vector<Rect> candidatesVector;
-    std::vector<int> rejectLevels;
-    std::vector<double> levelWeights;
-
-    int stripCount, stripSize;
-
-    const int PTS_PER_THREAD = 1000;
-    stripCount = ((processingRectSize.width/yStep)*(processingRectSize.height + yStep-1)/yStep + PTS_PER_THREAD/2)/PTS_PER_THREAD;
-    stripCount = std::min(std::max(stripCount, 1), 100);
-    stripSize = (((processingRectSize.height + stripCount - 1)/stripCount + yStep-1)/yStep)*yStep;
-
-    if( outputRejectLevels )
-    {
-        parallel_for_(Range(0, stripCount), CascadeClassifierInvoker( *this, processingRectSize, stripSize, yStep, factor,
-            candidatesVector, rejectLevels, levelWeights, true, currentMask, &mtx));
-        levels.insert( levels.end(), rejectLevels.begin(), rejectLevels.end() );
-        weights.insert( weights.end(), levelWeights.begin(), levelWeights.end() );
-    }
-    else
-    {
-         parallel_for_(Range(0, stripCount), CascadeClassifierInvoker( *this, processingRectSize, stripSize, yStep, factor,
-            candidatesVector, rejectLevels, levelWeights, false, currentMask, &mtx));
-    }
-    candidates.insert( candidates.end(), candidatesVector.begin(), candidatesVector.end() );
-
-#if defined (LOG_CASCADE_STATISTIC)
-    logger.write();
-#endif
-
-    return true;
-}
-
-
-bool CascadeClassifierImpl::ocl_detectSingleScale( InputArray _image, Size processingRectSize,
-                                                   int yStep, double factor, Size sumSize0 )
-{
-    const int VECTOR_SIZE = 1;
-    Ptr<HaarEvaluator> haar = featureEvaluator.dynamicCast<HaarEvaluator>();
-    if( haar.empty() )
-        return false;
-
-    haar->setImage(_image, data.origWinSize, sumSize0);
-
-    if( cascadeKernel.empty() )
-    {
-        cascadeKernel.create("runHaarClassifierStump", ocl::objdetect::cascadedetect_oclsrc,
-                             format("-D VECTOR_SIZE=%d", VECTOR_SIZE));
-        if( cascadeKernel.empty() )
-            return false;
-    }
+    ufacepos.create(1, MAX_FACES*3+1, CV_32S);
+    UMat ufacepos_count(ufacepos, Rect(0, 0, 1, 1));
+    ufacepos_count.setTo(Scalar::all(0));
 
     if( ustages.empty() )
     {
         copyVectorToUMat(data.stages, ustages);
-        copyVectorToUMat(data.stumps, ustumps);
+        if (!data.stumps.empty())
+            copyVectorToUMat(data.stumps, unodes);
+        else
+            copyVectorToUMat(data.nodes, unodes);
+        copyVectorToUMat(data.leaves, uleaves);
+        if( !data.subsets.empty() )
+            copyVectorToUMat(data.subsets, usubsets);
     }
 
-    std::vector<UMat> bufs;
-    haar->getUMats(bufs);
-    CV_Assert(bufs.size() == 3);
+    int nstages = (int)data.stages.size();
 
-    Rect normrect = haar->getNormRect();
+    if( featureType == FeatureEvaluator::HAAR )
+    {
+        Ptr<HaarEvaluator> haar = featureEvaluator.dynamicCast<HaarEvaluator>();
+        if( haar.empty() )
+            return false;
 
-    //processingRectSize = Size(yStep, yStep);
-    size_t globalsize[] = { (processingRectSize.width/yStep + VECTOR_SIZE-1)/VECTOR_SIZE, processingRectSize.height/yStep };
+        if( haarKernel.empty() )
+        {
+            String opts;
+            if (lbufSize.area())
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d -D NODE_COUNT=%d",
+                              localsz.width, localsz.height, lbufSize.area(), lbufSize.width, data.maxNodesPerTree);
+            else
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D NODE_COUNT=%d",
+                              localsz.width, localsz.height, data.maxNodesPerTree);
+            haarKernel.create("runHaarClassifier", ocl::objdetect::cascadedetect_oclsrc, opts);
+            if( haarKernel.empty() )
+                return false;
+        }
 
-    cascadeKernel.args(ocl::KernelArg::ReadOnlyNoSize(bufs[0]), // sum
-                       ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sqsum
+        Rect normrect = haar->getNormRect();
+        int sqofs = haar->getSquaresOffset();
+        int splitstage_ocl = 1;
+
+        haarKernel.args((int)scales.size(),
+                        ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
+                        ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sum
+                        ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
+
+                        // cascade classifier
+                        splitstage_ocl, nstages,
+                        ocl::KernelArg::PtrReadOnly(ustages),
+                        ocl::KernelArg::PtrReadOnly(unodes),
+                        ocl::KernelArg::PtrReadOnly(uleaves),
+
+                        ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
+                        normrect, sqofs, data.origWinSize, (int)MAX_FACES);
+        ok = haarKernel.run(2, globalsize, localsize, true);
+    }
+    else if( featureType == FeatureEvaluator::LBP )
+    {
+        if (data.maxNodesPerTree > 1)
+            return false;
+
+        Ptr<LBPEvaluator> lbp = featureEvaluator.dynamicCast<LBPEvaluator>();
+        if( lbp.empty() )
+            return false;
+
+        if( lbpKernel.empty() )
+        {
+            String opts;
+            if (lbufSize.area())
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d -D SUM_BUF_SIZE=%d -D SUM_BUF_STEP=%d",
+                              localsz.width, localsz.height, lbufSize.area(), lbufSize.width);
+            else
+                opts = format("-D LOCAL_SIZE_X=%d -D LOCAL_SIZE_Y=%d", localsz.width, localsz.height);
+            lbpKernel.create("runLBPClassifierStumpSimple", ocl::objdetect::cascadedetect_oclsrc, opts);
+            if( lbpKernel.empty() )
+                return false;
+        }
+
+        int splitstage_ocl = 1;
+        int subsetSize = (data.ncategories + 31)/32;
+        lbpKernel.args((int)scales.size(),
+                       ocl::KernelArg::PtrReadOnly(bufs[0]), // scaleData
+                       ocl::KernelArg::ReadOnlyNoSize(bufs[1]), // sum
                        ocl::KernelArg::PtrReadOnly(bufs[2]), // optfeatures
 
                        // cascade classifier
-                       (int)data.stages.size(),
+                       splitstage_ocl, nstages,
                        ocl::KernelArg::PtrReadOnly(ustages),
-                       ocl::KernelArg::PtrReadOnly(ustumps),
+                       ocl::KernelArg::PtrReadOnly(unodes),
+                       ocl::KernelArg::PtrReadOnly(usubsets),
+                       subsetSize,
 
                        ocl::KernelArg::PtrWriteOnly(ufacepos), // positions
-                       processingRectSize,
-                       yStep, (float)factor,
-                       normrect, data.origWinSize, MAX_FACES);
-    bool ok = cascadeKernel.run(2, globalsize, 0, true);
-    //CV_Assert(ok);
+                       data.origWinSize, (int)MAX_FACES);
+
+        ok = lbpKernel.run(2, globalsize, localsize, true);
+    }
+
+    if( ok )
+    {
+        Mat facepos = ufacepos.getMat(ACCESS_READ);
+        const int* fptr = facepos.ptr<int>();
+        int nfaces = fptr[0];
+        nfaces = std::min(nfaces, (int)MAX_FACES);
+
+        for( int i = 0; i < nfaces; i++ )
+        {
+            const FeatureEvaluator::ScaleData& s = featureEvaluator->getScaleData(fptr[i*3 + 1]);
+            candidates.push_back(Rect(cvRound(fptr[i*3 + 2]*s.scale),
+                                      cvRound(fptr[i*3 + 3]*s.scale),
+                                      cvRound(data.origWinSize.width*s.scale),
+                                      cvRound(data.origWinSize.height*s.scale)));
+        }
+    }
     return ok;
 }
 
@@ -1226,9 +1209,9 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
                                                     bool outputRejectLevels )
 {
     Size imgsz = _image.size();
-    int imgtype = _image.type();
 
-    Mat grayImage, imageBuffer;
+    Mat grayImage;
+    _InputArray gray;
 
     candidates.clear();
     rejectLevels.clear();
@@ -1237,118 +1220,86 @@ void CascadeClassifierImpl::detectMultiScaleNoGrouping( InputArray _image, std::
     if( maxObjectSize.height == 0 || maxObjectSize.width == 0 )
         maxObjectSize = imgsz;
 
-    bool use_ocl = ocl::useOpenCL() &&
-        getFeatureType() == FeatureEvaluator::HAAR &&
-        !isOldFormatCascade() &&
-        data.isStumpBased() &&
-        maskGenerator.empty() &&
-        !outputRejectLevels &&
-        tryOpenCL;
+    bool use_ocl = tryOpenCL && ocl::useOpenCL() &&
+         featureEvaluator->getLocalSize().area() > 0 &&
+         ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU &&
+         (data.minNodesPerTree == data.maxNodesPerTree) &&
+         !isOldFormatCascade() &&
+         maskGenerator.empty() &&
+         !outputRejectLevels;
 
-    if( !use_ocl )
+    /*if( use_ocl )
     {
-        Mat image = _image.getMat();
-        if (maskGenerator)
-            maskGenerator->initializeMask(image);
-
-        grayImage = image;
-        if( CV_MAT_CN(imgtype) > 1 )
-        {
-            Mat temp;
-            cvtColor(grayImage, temp, COLOR_BGR2GRAY);
-            grayImage = temp;
-        }
-
-        imageBuffer.create(imgsz.height + 1, imgsz.width + 1, CV_8U);
-    }
-    else
-    {
-        UMat uimage = _image.getUMat();
-        if( CV_MAT_CN(imgtype) > 1 )
-            cvtColor(uimage, ugrayImage, COLOR_BGR2GRAY);
+        if (_image.channels() > 1)
+            cvtColor(_image, ugrayImage, COLOR_BGR2GRAY);
+        else if (_image.isUMat())
+            ugrayImage = _image.getUMat();
         else
-            uimage.copyTo(ugrayImage);
-        uimageBuffer.create(imgsz.height + 1, imgsz.width + 1, CV_8U);
+            _image.copyTo(ugrayImage);
+        gray = ugrayImage;
     }
-
-    Size sumSize0((imgsz.width + SUM_ALIGN) & -SUM_ALIGN, imgsz.height+1);
-
-    if( use_ocl )
+    else*/
     {
-        ufacepos.create(1, MAX_FACES*4 + 1, CV_32S);
-        UMat ufacecount(ufacepos, Rect(0,0,1,1));
-        ufacecount.setTo(Scalar::all(0));
+        if (_image.channels() > 1)
+            cvtColor(_image, grayImage, COLOR_BGR2GRAY);
+        else if (_image.isMat())
+            grayImage = _image.getMat();
+        else
+            _image.copyTo(grayImage);
+        gray = grayImage;
     }
 
+    std::vector<float> scales;
+    scales.reserve(1024);
+
     for( double factor = 1; ; factor *= scaleFactor )
     {
         Size originalWindowSize = getOriginalWindowSize();
 
         Size windowSize( cvRound(originalWindowSize.width*factor), cvRound(originalWindowSize.height*factor) );
-        Size scaledImageSize( cvRound( imgsz.width/factor ), cvRound( imgsz.height/factor ) );
-        Size processingRectSize( scaledImageSize.width - originalWindowSize.width,
-                                 scaledImageSize.height - originalWindowSize.height );
-
-        if( processingRectSize.width <= 0 || processingRectSize.height <= 0 )
-            break;
-        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height )
+        if( windowSize.width > maxObjectSize.width || windowSize.height > maxObjectSize.height ||
+            windowSize.width > imgsz.width || windowSize.height > imgsz.height )
             break;
         if( windowSize.width < minObjectSize.width || windowSize.height < minObjectSize.height )
             continue;
-
-        int yStep;
-        if( getFeatureType() == cv::FeatureEvaluator::HOG )
-        {
-            yStep = 4;
-        }
-        else
-        {
-            yStep = factor > 2. ? 1 : 2;
-        }
-
-        if( use_ocl )
-        {
-            UMat uscaledImage(uimageBuffer, Rect(0, 0, scaledImageSize.width, scaledImageSize.height));
-            resize( ugrayImage, uscaledImage, scaledImageSize, 0, 0, INTER_LINEAR );
-
-            if( ocl_detectSingleScale( uscaledImage, processingRectSize, yStep, factor, sumSize0 ) )
-                continue;
-
-            /////// if the OpenCL branch has been executed but failed, fall back to CPU: /////
-
-            tryOpenCL = false; // for this cascade do not try OpenCL anymore
-
-            // since we may already have some partial results from OpenCL code (unlikely, but still),
-            // we just recursively call the function again, but with tryOpenCL==false it will
-            // go with CPU route, so there is no infinite recursion
-            detectMultiScaleNoGrouping( _image, candidates, rejectLevels, levelWeights,
-                                       scaleFactor, minObjectSize, maxObjectSize,
-                                       outputRejectLevels);
-            return;
-        }
-        else
-        {
-            Mat scaledImage( scaledImageSize, CV_8U, imageBuffer.data );
-            resize( grayImage, scaledImage, scaledImageSize, 0, 0, INTER_LINEAR );
-
-            if( !detectSingleScale( scaledImage, processingRectSize, yStep, factor, candidates,
-                                    rejectLevels, levelWeights, sumSize0, outputRejectLevels ) )
-                break;
-        }
+        scales.push_back((float)factor);
     }
 
-    if( use_ocl && tryOpenCL )
+    if( !featureEvaluator->setImage(gray, scales) )
+        return;
+
+    // OpenCL code
+    if( use_ocl && ocl_detectMultiScaleNoGrouping( scales, candidates ))
+        return;
+    tryOpenCL = false;
+
+    // CPU code
+    featureEvaluator->getMats();
     {
-        Mat facepos = ufacepos.getMat(ACCESS_READ);
-        const int* fptr = facepos.ptr<int>();
-        int i, nfaces = fptr[0];
-        for( i = 0; i < nfaces; i++ )
+        Mat currentMask;
+        if (maskGenerator)
+            currentMask = maskGenerator->generateMask(gray.getMat());
+
+        size_t i, nscales = scales.size();
+        cv::AutoBuffer<int> stripeSizeBuf(nscales);
+        int* stripeSizes = stripeSizeBuf;
+        const FeatureEvaluator::ScaleData* s = &featureEvaluator->getScaleData(0);
+        Size szw = s->getWorkingSize(data.origWinSize);
+        int nstripes = cvCeil(szw.width/32.);
+        for( i = 0; i < nscales; i++ )
         {
-            candidates.push_back(Rect(fptr[i*4+1], fptr[i*4+2], fptr[i*4+3], fptr[i*4+4]));
+            szw = s[i].getWorkingSize(data.origWinSize);
+            stripeSizes[i] = std::max((szw.height/s[i].ystep + nstripes-1)/nstripes, 1)*s[i].ystep;
         }
+
+        CascadeClassifierInvoker invoker(*this, (int)nscales, nstripes, s, stripeSizes,
+                                         candidates, rejectLevels, levelWeights,
+                                         outputRejectLevels, currentMask, &mtx);
+        parallel_for_(Range(0, nstripes), invoker);
     }
 }
 
+
 void CascadeClassifierImpl::detectMultiScale( InputArray _image, std::vector<Rect>& objects,
                                           std::vector<int>& rejectLevels,
                                           std::vector<double>& levelWeights,
@@ -1388,10 +1339,9 @@ void CascadeClassifierImpl::detectMultiScale( InputArray _image, std::vector<Rec
                                           double scaleFactor, int minNeighbors,
                                           int flags, Size minObjectSize, Size maxObjectSize)
 {
-    Mat image = _image.getMat();
     std::vector<int> fakeLevels;
     std::vector<double> fakeWeights;
-    detectMultiScale( image, objects, fakeLevels, fakeWeights, scaleFactor,
+    detectMultiScale( _image, objects, fakeLevels, fakeWeights, scaleFactor,
         minNeighbors, flags, minObjectSize, maxObjectSize );
 }
 
@@ -1476,6 +1426,7 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
     stumps.clear();
 
     FileNodeIterator it = fn.begin(), it_end = fn.end();
+    minNodesPerTree = INT_MAX;
     maxNodesPerTree = 0;
 
     for( int si = 0; it != it_end; si++, ++it )
@@ -1502,6 +1453,7 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
 
             DTree tree;
             tree.nodeCount = (int)internalNodes.size()/nodeStep;
+            minNodesPerTree = std::min(minNodesPerTree, tree.nodeCount);
             maxNodesPerTree = std::max(maxNodesPerTree, tree.nodeCount);
 
             classifiers.push_back(tree);
@@ -1539,7 +1491,7 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
         }
     }
 
-    if( isStumpBased() )
+    if( maxNodesPerTree == 1 )
     {
         int nodeOfs = 0, leafOfs = 0;
         size_t nstages = stages.size();
@@ -1564,9 +1516,11 @@ bool CascadeClassifierImpl::Data::read(const FileNode &root)
 bool CascadeClassifierImpl::read_(const FileNode& root)
 {
     tryOpenCL = true;
-    cascadeKernel = ocl::Kernel();
+    haarKernel = ocl::Kernel();
+    lbpKernel = ocl::Kernel();
     ustages.release();
-    ustumps.release();
+    unodes.release();
+    uleaves.release();
     if( !data.read(root) )
         return false;
 
@@ -1576,7 +1530,7 @@ bool CascadeClassifierImpl::read_(const FileNode& root)
     if( fn.empty() )
         return false;
 
-    return featureEvaluator->read(fn);
+    return featureEvaluator->read(fn, data.origWinSize);
 }
 
 template<> void DefaultDeleter<CvHaarClassifierCascade>::operator ()(CvHaarClassifierCascade* obj) const
@@ -1612,7 +1566,7 @@ bool CascadeClassifier::load( const String& filename )
 
 bool CascadeClassifier::read(const FileNode &root)
 {
-    Ptr<CascadeClassifierImpl> ccimpl;
+    Ptr<CascadeClassifierImpl> ccimpl = makePtr<CascadeClassifierImpl>();
     bool ok = ccimpl->read_(root);
     if( ok )
         cc = ccimpl.staticCast<BaseCascadeClassifier>();
diff --git a/modules/objdetect/src/cascadedetect.hpp b/modules/objdetect/src/cascadedetect.hpp
index c2add08cf..17eeccd53 100644
--- a/modules/objdetect/src/cascadedetect.hpp
+++ b/modules/objdetect/src/cascadedetect.hpp
@@ -3,6 +3,72 @@
 namespace cv
 {
 
+class FeatureEvaluator
+{
+public:
+    enum
+    {
+        HAAR = 0,
+        LBP  = 1,
+        HOG  = 2
+    };
+
+    struct ScaleData
+    {
+        ScaleData() { scale = 0.f; layer_ofs = ystep = 0; }
+        Size getWorkingSize(Size winSize) const
+        {
+            return Size(std::max(szi.width - winSize.width, 0),
+                        std::max(szi.height - winSize.height, 0));
+        }
+
+        float scale;
+        Size szi;
+        int layer_ofs, ystep;
+    };
+
+    virtual ~FeatureEvaluator();
+
+    virtual bool read(const FileNode& node, Size origWinSize);
+    virtual Ptr<FeatureEvaluator> clone() const;
+    virtual int getFeatureType() const;
+    int getNumChannels() const { return nchannels; }
+
+    virtual bool setImage(InputArray img, const std::vector<float>& scales);
+    virtual bool setWindow(Point p, int scaleIdx);
+    const ScaleData& getScaleData(int scaleIdx) const
+    {
+        CV_Assert( 0 <= scaleIdx && scaleIdx < (int)scaleData->size());
+        return scaleData->at(scaleIdx);
+    }
+    virtual void getUMats(std::vector<UMat>& bufs);
+    virtual void getMats();
+
+    Size getLocalSize() const { return localSize; }
+    Size getLocalBufSize() const { return lbufSize; }
+
+    virtual float calcOrd(int featureIdx) const;
+    virtual int calcCat(int featureIdx) const;
+
+    static Ptr<FeatureEvaluator> create(int type);
+
+protected:
+    enum { SBUF_VALID=1, USBUF_VALID=2 };
+    int sbufFlag;
+
+    bool updateScaleData( Size imgsz, const std::vector<float>& _scales );
+    virtual void computeChannels( int, InputArray ) {}
+    virtual void computeOptFeatures() {}
+
+    Size origWinSize, sbufSize, localSize, lbufSize;
+    int nchannels;
+    Mat sbuf, rbuf;
+    UMat urbuf, usbuf, ufbuf, uscaleData;
+
+    Ptr<std::vector<ScaleData> > scaleData;
+};
+
+
 class CascadeClassifierImpl : public BaseCascadeClassifier
 {
 public:
@@ -54,9 +120,8 @@ protected:
                             int yStep, double factor, std::vector<Rect>& candidates,
                             std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
                             Size sumSize0, bool outputRejectLevels = false );
-    bool ocl_detectSingleScale( InputArray image, Size processingRectSize,
-                                int yStep, double factor, Size sumSize0 );
-
+    bool ocl_detectMultiScaleNoGrouping( const std::vector<float>& scales,
+                                         std::vector<Rect>& candidates );
 
     void detectMultiScaleNoGrouping( InputArray image, std::vector<Rect>& candidates,
                                     std::vector<int>& rejectLevels, std::vector<double>& levelWeights,
@@ -72,6 +137,7 @@ protected:
     };
 
     friend class CascadeClassifierInvoker;
+    friend class SparseCascadeClassifierInvoker;
 
     template<class FEval>
     friend int predictOrdered( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight);
@@ -85,7 +151,7 @@ protected:
     template<class FEval>
     friend int predictCategoricalStump( CascadeClassifierImpl& cascade, Ptr<FeatureEvaluator> &featureEvaluator, double& weight);
 
-    int runAt( Ptr<FeatureEvaluator>& feval, Point pt, double& weight );
+    int runAt( Ptr<FeatureEvaluator>& feval, Point pt, int scaleIdx, double& weight );
 
     class Data
     {
@@ -112,7 +178,7 @@ protected:
 
         struct Stump
         {
-            Stump() {};
+            Stump() { }
             Stump(int _featureIdx, float _threshold, float _left, float _right)
             : featureIdx(_featureIdx), threshold(_threshold), left(_left), right(_right) {}
 
@@ -126,12 +192,10 @@ protected:
 
         bool read(const FileNode &node);
 
-        bool isStumpBased() const { return maxNodesPerTree == 1; }
-
         int stageType;
         int featureType;
         int ncategories;
-        int maxNodesPerTree;
+        int minNodesPerTree, maxNodesPerTree;
         Size origWinSize;
 
         std::vector<Stage> stages;
@@ -147,9 +211,9 @@ protected:
     Ptr<CvHaarClassifierCascade> oldCascade;
 
     Ptr<MaskGenerator> maskGenerator;
-    UMat ugrayImage, uimageBuffer;
-    UMat ufacepos, ustages, ustumps, usubsets;
-    ocl::Kernel cascadeKernel;
+    UMat ugrayImage;
+    UMat ufacepos, ustages, unodes, uleaves, usubsets;
+    ocl::Kernel haarKernel, lbpKernel;
     bool tryOpenCL;
 
     Mutex mtx;
@@ -250,13 +314,11 @@ public:
     struct Feature
     {
         Feature();
-
         bool read( const FileNode& node );
 
         bool tilted;
 
         enum { RECT_NUM = 3 };
-
         struct
         {
             Rect r;
@@ -270,7 +332,6 @@ public:
 
         enum { RECT_NUM = Feature::RECT_NUM };
         float calc( const int* pwin ) const;
-
         void setOffsets( const Feature& _f, int step, int tofs );
 
         int ofs[RECT_NUM][4];
@@ -280,35 +341,34 @@ public:
     HaarEvaluator();
     virtual ~HaarEvaluator();
 
-    virtual bool read( const FileNode& node );
+    virtual bool read( const FileNode& node, Size origWinSize);
     virtual Ptr<FeatureEvaluator> clone() const;
     virtual int getFeatureType() const { return FeatureEvaluator::HAAR; }
 
-    virtual bool setImage(InputArray, Size origWinSize, Size sumSize);
-    virtual bool setWindow(Point pt);
-    virtual Rect getNormRect() const;
-    virtual void getUMats(std::vector<UMat>& bufs);
+    virtual bool setWindow(Point p, int scaleIdx);
+    Rect getNormRect() const;
+    int getSquaresOffset() const;
 
-    double operator()(int featureIdx) const
+    float operator()(int featureIdx) const
     { return optfeaturesPtr[featureIdx].calc(pwin) * varianceNormFactor; }
-    virtual double calcOrd(int featureIdx) const
+    virtual float calcOrd(int featureIdx) const
     { return (*this)(featureIdx); }
 
 protected:
-    Size origWinSize, sumSize0;
+    virtual void computeChannels( int i, InputArray img );
+    virtual void computeOptFeatures();
+
     Ptr<std::vector<Feature> > features;
     Ptr<std::vector<OptFeature> > optfeatures;
-    OptFeature* optfeaturesPtr; // optimization
+    Ptr<std::vector<OptFeature> > optfeatures_lbuf;
     bool hasTiltedFeatures;
 
-    Mat sum0, sum, sqsum0, sqsum;
-    UMat usum0, usum, usqsum0, usqsum, ufbuf;
-
+    int tofs, sqofs;
+    Vec4i nofs;
     Rect normrect;
-    int nofs[4];
-
     const int* pwin;
-    double varianceNormFactor;
+    OptFeature* optfeaturesPtr; // optimization
+    float varianceNormFactor;
 };
 
 inline HaarEvaluator::Feature :: Feature()
@@ -338,28 +398,6 @@ inline float HaarEvaluator::OptFeature :: calc( const int* ptr ) const
     return ret;
 }
 
-inline void HaarEvaluator::OptFeature :: setOffsets( const Feature& _f, int step, int tofs )
-{
-    weight[0] = _f.rect[0].weight;
-    weight[1] = _f.rect[1].weight;
-    weight[2] = _f.rect[2].weight;
-
-    Rect r2 = weight[2] > 0 ? _f.rect[2].r : Rect(0,0,0,0);
-    if (_f.tilted)
-    {
-        CV_TILTED_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], tofs, _f.rect[0].r, step );
-        CV_TILTED_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], tofs, _f.rect[1].r, step );
-        CV_TILTED_PTRS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], tofs, r2, step );
-    }
-    else
-    {
-        CV_SUM_OFS( ofs[0][0], ofs[0][1], ofs[0][2], ofs[0][3], 0, _f.rect[0].r, step );
-        CV_SUM_OFS( ofs[1][0], ofs[1][1], ofs[1][2], ofs[1][3], 0, _f.rect[1].r, step );
-        CV_SUM_OFS( ofs[2][0], ofs[2][1], ofs[2][2], ofs[2][3], 0, r2, step );
-    }
-}
-
-
 //----------------------------------------------  LBPEvaluator -------------------------------------
 
 class LBPEvaluator : public FeatureEvaluator
@@ -369,156 +407,73 @@ public:
     {
         Feature();
         Feature( int x, int y, int _block_w, int _block_h  ) :
-        rect(x, y, _block_w, _block_h) {}
+                 rect(x, y, _block_w, _block_h) {}
 
-        int calc( int offset ) const;
-        void updatePtrs( const Mat& sum );
         bool read(const FileNode& node );
 
         Rect rect; // weight and height for block
-        const int* p[16]; // fast
+    };
+
+    struct OptFeature
+    {
+        OptFeature();
+
+        int calc( const int* pwin ) const;
+        void setOffsets( const Feature& _f, int step );
+        int ofs[16];
     };
 
     LBPEvaluator();
     virtual ~LBPEvaluator();
 
-    virtual bool read( const FileNode& node );
+    virtual bool read( const FileNode& node, Size origWinSize );
     virtual Ptr<FeatureEvaluator> clone() const;
     virtual int getFeatureType() const { return FeatureEvaluator::LBP; }
 
-    virtual bool setImage(InputArray image, Size _origWinSize, Size);
-    virtual bool setWindow(Point pt);
+    virtual bool setWindow(Point p, int scaleIdx);
 
     int operator()(int featureIdx) const
-    { return featuresPtr[featureIdx].calc(offset); }
+    { return optfeaturesPtr[featureIdx].calc(pwin); }
     virtual int calcCat(int featureIdx) const
     { return (*this)(featureIdx); }
 protected:
-    Size origWinSize;
-    Ptr<std::vector<Feature> > features;
-    Feature* featuresPtr; // optimization
-    Mat sum0, sum;
-    Rect normrect;
+    virtual void computeChannels( int i, InputArray img );
+    virtual void computeOptFeatures();
 
-    int offset;
+    Ptr<std::vector<Feature> > features;
+    Ptr<std::vector<OptFeature> > optfeatures;
+    Ptr<std::vector<OptFeature> > optfeatures_lbuf;
+    OptFeature* optfeaturesPtr; // optimization
+
+    const int* pwin;
 };
 
 
 inline LBPEvaluator::Feature :: Feature()
 {
     rect = Rect();
+}
+
+inline LBPEvaluator::OptFeature :: OptFeature()
+{
     for( int i = 0; i < 16; i++ )
-        p[i] = 0;
+        ofs[i] = 0;
 }
 
-inline int LBPEvaluator::Feature :: calc( int _offset ) const
+inline int LBPEvaluator::OptFeature :: calc( const int* p ) const
 {
-    int cval = CALC_SUM_( p[5], p[6], p[9], p[10], _offset );
+    int cval = CALC_SUM_OFS_( ofs[5], ofs[6], ofs[9], ofs[10], p );
 
-    return (CALC_SUM_( p[0], p[1], p[4], p[5], _offset ) >= cval ? 128 : 0) |   // 0
-           (CALC_SUM_( p[1], p[2], p[5], p[6], _offset ) >= cval ? 64 : 0) |    // 1
-           (CALC_SUM_( p[2], p[3], p[6], p[7], _offset ) >= cval ? 32 : 0) |    // 2
-           (CALC_SUM_( p[6], p[7], p[10], p[11], _offset ) >= cval ? 16 : 0) |  // 5
-           (CALC_SUM_( p[10], p[11], p[14], p[15], _offset ) >= cval ? 8 : 0)|  // 8
-           (CALC_SUM_( p[9], p[10], p[13], p[14], _offset ) >= cval ? 4 : 0)|   // 7
-           (CALC_SUM_( p[8], p[9], p[12], p[13], _offset ) >= cval ? 2 : 0)|    // 6
-           (CALC_SUM_( p[4], p[5], p[8], p[9], _offset ) >= cval ? 1 : 0);
+    return (CALC_SUM_OFS_( ofs[0], ofs[1], ofs[4], ofs[5], p ) >= cval ? 128 : 0) |   // 0
+           (CALC_SUM_OFS_( ofs[1], ofs[2], ofs[5], ofs[6], p ) >= cval ? 64 : 0) |    // 1
+           (CALC_SUM_OFS_( ofs[2], ofs[3], ofs[6], ofs[7], p ) >= cval ? 32 : 0) |    // 2
+           (CALC_SUM_OFS_( ofs[6], ofs[7], ofs[10], ofs[11], p ) >= cval ? 16 : 0) |  // 5
+           (CALC_SUM_OFS_( ofs[10], ofs[11], ofs[14], ofs[15], p ) >= cval ? 8 : 0)|  // 8
+           (CALC_SUM_OFS_( ofs[9], ofs[10], ofs[13], ofs[14], p ) >= cval ? 4 : 0)|   // 7
+           (CALC_SUM_OFS_( ofs[8], ofs[9], ofs[12], ofs[13], p ) >= cval ? 2 : 0)|    // 6
+           (CALC_SUM_OFS_( ofs[4], ofs[5], ofs[8], ofs[9], p ) >= cval ? 1 : 0);
 }
 
-inline void LBPEvaluator::Feature :: updatePtrs( const Mat& _sum )
-{
-    const int* ptr = (const int*)_sum.data;
-    size_t step = _sum.step/sizeof(ptr[0]);
-    Rect tr = rect;
-    CV_SUM_PTRS( p[0], p[1], p[4], p[5], ptr, tr, step );
-    tr.x += 2*rect.width;
-    CV_SUM_PTRS( p[2], p[3], p[6], p[7], ptr, tr, step );
-    tr.y += 2*rect.height;
-    CV_SUM_PTRS( p[10], p[11], p[14], p[15], ptr, tr, step );
-    tr.x -= 2*rect.width;
-    CV_SUM_PTRS( p[8], p[9], p[12], p[13], ptr, tr, step );
-}
-
-//---------------------------------------------- HOGEvaluator -------------------------------------------
-
-class HOGEvaluator : public FeatureEvaluator
-{
-public:
-    struct Feature
-    {
-        Feature();
-        float calc( int offset ) const;
-        void updatePtrs( const std::vector<Mat>& _hist, const Mat &_normSum );
-        bool read( const FileNode& node );
-
-        enum { CELL_NUM = 4, BIN_NUM = 9 };
-
-        Rect rect[CELL_NUM];
-        int featComponent; //component index from 0 to 35
-        const float* pF[4]; //for feature calculation
-        const float* pN[4]; //for normalization calculation
-    };
-    HOGEvaluator();
-    virtual ~HOGEvaluator();
-    virtual bool read( const FileNode& node );
-    virtual Ptr<FeatureEvaluator> clone() const;
-    virtual int getFeatureType() const { return FeatureEvaluator::HOG; }
-    virtual bool setImage( InputArray image, Size winSize, Size );
-    virtual bool setWindow( Point pt );
-    double operator()(int featureIdx) const
-    {
-        return featuresPtr[featureIdx].calc(offset);
-    }
-    virtual double calcOrd( int featureIdx ) const
-    {
-        return (*this)(featureIdx);
-    }
-
-private:
-    virtual void integralHistogram( const Mat& srcImage, std::vector<Mat> &histogram, Mat &norm, int nbins ) const;
-
-    Size origWinSize;
-    Ptr<std::vector<Feature> > features;
-    Feature* featuresPtr;
-    std::vector<Mat> hist;
-    Mat normSum;
-    int offset;
-};
-
-inline HOGEvaluator::Feature :: Feature()
-{
-    rect[0] = rect[1] = rect[2] = rect[3] = Rect();
-    pF[0] = pF[1] = pF[2] = pF[3] = 0;
-    pN[0] = pN[1] = pN[2] = pN[3] = 0;
-    featComponent = 0;
-}
-
-inline float HOGEvaluator::Feature :: calc( int _offset ) const
-{
-    float res = CALC_SUM(pF, _offset);
-    float normFactor = CALC_SUM(pN, _offset);
-    res = (res > 0.001f) ? (res / ( normFactor + 0.001f) ) : 0.f;
-    return res;
-}
-
-inline void HOGEvaluator::Feature :: updatePtrs( const std::vector<Mat> &_hist, const Mat &_normSum )
-{
-    int binIdx = featComponent % BIN_NUM;
-    int cellIdx = featComponent / BIN_NUM;
-    Rect normRect = Rect( rect[0].x, rect[0].y, 2*rect[0].width, 2*rect[0].height );
-
-    const float* featBuf = (const float*)_hist[binIdx].data;
-    size_t featStep = _hist[0].step / sizeof(featBuf[0]);
-
-    const float* normBuf = (const float*)_normSum.data;
-    size_t normStep = _normSum.step / sizeof(normBuf[0]);
-
-    CV_SUM_PTRS( pF[0], pF[1], pF[2], pF[3], featBuf, rect[cellIdx], featStep );
-    CV_SUM_PTRS( pN[0], pN[1], pN[2], pN[3], normBuf, normRect, normStep );
-}
-
-
-
 
 //----------------------------------------------  predictor functions -------------------------------------
 
@@ -653,11 +608,7 @@ inline int predictCategoricalStump( CascadeClassifierImpl& cascade,
     const CascadeClassifierImpl::Data::Stump* cascadeStumps = &cascade.data.stumps[0];
     const CascadeClassifierImpl::Data::Stage* cascadeStages = &cascade.data.stages[0];
 
-#ifdef HAVE_TEGRA_OPTIMIZATION
-    float tmp = 0; // float accumulator -- float operations are quicker
-#else
     double tmp = 0;
-#endif
     for( int si = 0; si < nstages; si++ )
     {
         const CascadeClassifierImpl::Data::Stage& stage = cascadeStages[si];
@@ -674,7 +625,7 @@ inline int predictCategoricalStump( CascadeClassifierImpl& cascade,
 
         if( tmp < stage.threshold )
         {
-            sum = (double)tmp;
+            sum = tmp;
             return -si;
         }
 
diff --git a/modules/objdetect/src/erfilter.cpp b/modules/objdetect/src/erfilter.cpp
index 52743afe9..dc8f18caf 100644
--- a/modules/objdetect/src/erfilter.cpp
+++ b/modules/objdetect/src/erfilter.cpp
@@ -83,7 +83,7 @@ public:
     //Constructor
     ERFilterNM();
     //Destructor
-    ~ERFilterNM() {};
+    ~ERFilterNM() {}
 
     float minProbability;
     bool  nonMaxSuppression;
@@ -146,7 +146,7 @@ public:
     //Constructor
     ERClassifierNM1(const std::string& filename);
     // Destructor
-    ~ERClassifierNM1() {};
+    ~ERClassifierNM1() {}
 
     // The classifier must return probability measure for the region.
     double eval(const ERStat& stat);
@@ -162,7 +162,7 @@ public:
     //constructor
     ERClassifierNM2(const std::string& filename);
     // Destructor
-    ~ERClassifierNM2() {};
+    ~ERClassifierNM2() {}
 
     // The classifier must return probability measure for the region.
     double eval(const ERStat& stat);
@@ -294,7 +294,7 @@ void ERFilterNM::er_tree_extract( InputArray image )
         push_new_component = false;
 
         // explore the (remaining) edges to the neighbors to the current pixel
-        for (current_edge = current_edge; current_edge < 4; current_edge++)
+        for ( ; current_edge < 4; current_edge++)
         {
 
             int neighbour_pixel = current_pixel;
@@ -933,14 +933,14 @@ ERStat* ERFilterNM::er_tree_nonmax_suppression ( ERStat * stat, ERStat *parent,
 void ERFilterNM::setCallback(const Ptr<ERFilter::Callback>& cb)
 {
     classifier = cb;
-};
+}
 
 void ERFilterNM::setMinArea(float _minArea)
 {
     CV_Assert( (_minArea >= 0) && (_minArea < maxArea) );
     minArea = _minArea;
     return;
-};
+}
 
 void ERFilterNM::setMaxArea(float _maxArea)
 {
@@ -948,39 +948,39 @@ void ERFilterNM::setMaxArea(float _maxArea)
     CV_Assert(minArea < _maxArea);
     maxArea = _maxArea;
     return;
-};
+}
 
 void ERFilterNM::setThresholdDelta(int _thresholdDelta)
 {
     CV_Assert( (_thresholdDelta > 0) && (_thresholdDelta <= 128) );
     thresholdDelta = _thresholdDelta;
     return;
-};
+}
 
 void ERFilterNM::setMinProbability(float _minProbability)
 {
     CV_Assert( (_minProbability >= 0.0) && (_minProbability <= 1.0) );
     minProbability = _minProbability;
     return;
-};
+}
 
 void ERFilterNM::setMinProbabilityDiff(float _minProbabilityDiff)
 {
     CV_Assert( (_minProbabilityDiff >= 0.0) && (_minProbabilityDiff <= 1.0) );
     minProbabilityDiff = _minProbabilityDiff;
     return;
-};
+}
 
 void ERFilterNM::setNonMaxSuppression(bool _nonMaxSuppression)
 {
     nonMaxSuppression = _nonMaxSuppression;
     return;
-};
+}
 
 int ERFilterNM::getNumRejected()
 {
     return num_rejected_regions;
-};
+}
 
 
 
@@ -993,7 +993,7 @@ ERClassifierNM1::ERClassifierNM1(const std::string& filename)
         boost.load( filename.c_str(), "boost" );
     else
         CV_Error(CV_StsBadArg, "Default classifier file not found!");
-};
+}
 
 double ERClassifierNM1::eval(const ERStat& stat)
 {
@@ -1009,7 +1009,7 @@ double ERClassifierNM1::eval(const ERStat& stat)
 
     // Logistic Correction returns a probability value (in the range(0,1))
     return (double)1-(double)1/(1+exp(-2*votes));
-};
+}
 
 
 // load default 2nd stage classifier if found
@@ -1019,7 +1019,7 @@ ERClassifierNM2::ERClassifierNM2(const std::string& filename)
         boost.load( filename.c_str(), "boost" );
     else
         CV_Error(CV_StsBadArg, "Default classifier file not found!");
-};
+}
 
 double ERClassifierNM2::eval(const ERStat& stat)
 {
@@ -1036,7 +1036,7 @@ double ERClassifierNM2::eval(const ERStat& stat)
 
     // Logistic Correction returns a probability value (in the range(0,1))
     return (double)1-(double)1/(1+exp(-2*votes));
-};
+}
 
 
 /*!
@@ -1949,7 +1949,6 @@ private:
     double (dissimilarity::*distfn) (const int_fast32_t, const int_fast32_t) const;
 
     auto_array_ptr<double> precomputed;
-    double * precomputed2;
 
     double * V;
     const double * V_data;
@@ -2164,7 +2163,7 @@ public:
     unsigned char metric_;
 
     /// Constructor.
-    MaxMeaningfulClustering(unsigned char method, unsigned char metric){ method_=method; metric_=metric; };
+    MaxMeaningfulClustering(unsigned char method, unsigned char metric){ method_=method; metric_=metric; }
 
     void operator()(double *data, unsigned int num, int dim, unsigned char method,
                     unsigned char metric, vector< vector<int> > *meaningful_clusters);
diff --git a/modules/objdetect/src/haar.cpp b/modules/objdetect/src/haar.cpp
index a83dfa93e..2f864797f 100644
--- a/modules/objdetect/src/haar.cpp
+++ b/modules/objdetect/src/haar.cpp
@@ -336,7 +336,7 @@ icvCreateHidHaarClassifierCascade( CvHaarClassifierCascade* cascade )
             out->isStumpBased &= node_count == 1;
         }
     }
-
+/*
 #ifdef HAVE_IPP
     int can_use_ipp = !out->has_tilted_features && !out->is_tree && out->isStumpBased;
 
@@ -392,7 +392,7 @@ icvCreateHidHaarClassifierCascade( CvHaarClassifierCascade* cascade )
         }
     }
 #endif
-
+*/
     cascade->hid_cascade = out;
     assert( (char*)haar_node_ptr - (char*)out <= datasize );
 
diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp
index cef5355c5..18bb7afc2 100644
--- a/modules/objdetect/src/hog.cpp
+++ b/modules/objdetect/src/hog.cpp
@@ -42,6 +42,7 @@
 
 #include "precomp.hpp"
 #include "opencv2/core/core_c.h"
+#include "opencl_kernels.hpp"
 
 #include <cstdio>
 #include <iterator>
@@ -58,6 +59,29 @@
 namespace cv
 {
 
+#define NTHREADS 256
+
+enum {DESCR_FORMAT_COL_BY_COL, DESCR_FORMAT_ROW_BY_ROW};
+
+static int numPartsWithin(int size, int part_size, int stride)
+{
+    return (size - part_size + stride) / stride;
+}
+
+static Size numPartsWithin(cv::Size size, cv::Size part_size,
+                                                cv::Size stride)
+{
+    return Size(numPartsWithin(size.width, part_size.width, stride.width),
+        numPartsWithin(size.height, part_size.height, stride.height));
+}
+
+static size_t getBlockHistogramSize(Size block_size, Size cell_size, int nbins)
+{
+    Size cells_per_block = Size(block_size.width / cell_size.width,
+        block_size.height / cell_size.height);
+    return (size_t)(nbins * cells_per_block.area());
+}
+
 size_t HOGDescriptor::getDescriptorSize() const
 {
     CV_Assert(blockSize.width % cellSize.width == 0 &&
@@ -88,7 +112,24 @@ bool HOGDescriptor::checkDetectorSize() const
 void HOGDescriptor::setSVMDetector(InputArray _svmDetector)
 {
     _svmDetector.getMat().convertTo(svmDetector, CV_32F);
-    CV_Assert( checkDetectorSize() );
+    CV_Assert(checkDetectorSize());
+
+    Mat detector_reordered(1, (int)svmDetector.size(), CV_32FC1);
+
+    size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
+    cv::Size blocks_per_img = numPartsWithin(winSize, blockSize, blockStride);
+
+    for (int i = 0; i < blocks_per_img.height; ++i)
+        for (int j = 0; j < blocks_per_img.width; ++j)
+        {
+            const float *src = &svmDetector[0] + (j * blocks_per_img.height + i) * block_hist_size;
+            float *dst = (float*)detector_reordered.data + (i * blocks_per_img.width + j) * block_hist_size;
+            for (size_t k = 0; k < block_hist_size; ++k)
+                dst[k] = src[k];
+        }
+    size_t descriptor_size = getDescriptorSize();
+    free_coef = svmDetector.size() > descriptor_size ? svmDetector[descriptor_size] : 0;
+    detector_reordered.copyTo(oclSvmDetector);
 }
 
 #define CV_TYPE_NAME_HOG_DESCRIPTOR "opencv-object-detector-hog"
@@ -1029,7 +1070,318 @@ static inline int gcd(int a, int b)
     return a;
 }
 
-void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
+#ifdef HAVE_OPENCL
+
+static bool ocl_compute_gradients_8UC1(int height, int width, InputArray _img, float angle_scale,
+                                       UMat grad, UMat qangle, bool correct_gamma, int nbins)
+{
+    ocl::Kernel k("compute_gradients_8UC1_kernel", ocl::objdetect::objdetect_hog_oclsrc);
+    if(k.empty())
+        return false;
+
+    UMat img = _img.getUMat();
+
+    size_t localThreads[3] = { NTHREADS, 1, 1 };
+    size_t globalThreads[3] = { width, height, 1 };
+    char correctGamma = (correct_gamma) ? 1 : 0;
+    int grad_quadstep = (int)grad.step >> 3;
+    int qangle_step_shift = 0;
+    int qangle_step = (int)qangle.step >> (1 + qangle_step_shift);
+
+    int idx = 0;
+    idx = k.set(idx, height);
+    idx = k.set(idx, width);
+    idx = k.set(idx, (int)img.step1());
+    idx = k.set(idx, grad_quadstep);
+    idx = k.set(idx, qangle_step);
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(img));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(grad));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(qangle));
+    idx = k.set(idx, angle_scale);
+    idx = k.set(idx, correctGamma);
+    idx = k.set(idx, nbins);
+
+    return k.run(2, globalThreads, localThreads, false);
+}
+
+static bool ocl_computeGradient(InputArray img, UMat grad, UMat qangle, int nbins, Size effect_size, bool gamma_correction)
+{
+    float angleScale = (float)(nbins / CV_PI);
+
+    return ocl_compute_gradients_8UC1(effect_size.height, effect_size.width, img,
+         angleScale, grad, qangle, gamma_correction, nbins);
+}
+
+#define CELL_WIDTH 8
+#define CELL_HEIGHT 8
+#define CELLS_PER_BLOCK_X 2
+#define CELLS_PER_BLOCK_Y 2
+
+static bool ocl_compute_hists(int nbins, int block_stride_x, int block_stride_y, int height, int width,
+                              UMat grad, UMat qangle, UMat gauss_w_lut, UMat block_hists, size_t block_hist_size)
+{
+    ocl::Kernel k("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc);
+    if(k.empty())
+        return false;
+    bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
+    cv::String opts;
+    if(is_cpu)
+       opts = "-D CPU ";
+    else
+        opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
+    k.create("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+    if(k.empty())
+        return false;
+
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)/block_stride_x;
+    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)/block_stride_y;
+    int blocks_total = img_block_width * img_block_height;
+
+    int qangle_step_shift = 0;
+    int grad_quadstep = (int)grad.step >> 2;
+    int qangle_step = (int)qangle.step >> qangle_step_shift;
+
+    int blocks_in_group = 4;
+    size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
+    size_t globalThreads[3] = {((img_block_width * img_block_height + blocks_in_group - 1)/blocks_in_group) * localThreads[0], 2, 1 };
+
+    int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float);
+    int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float);
+
+    int smem = (hists_size + final_hists_size) * blocks_in_group;
+
+    int idx = 0;
+    idx = k.set(idx, block_stride_x);
+    idx = k.set(idx, block_stride_y);
+    idx = k.set(idx, nbins);
+    idx = k.set(idx, (int)block_hist_size);
+    idx = k.set(idx, img_block_width);
+    idx = k.set(idx, blocks_in_group);
+    idx = k.set(idx, blocks_total);
+    idx = k.set(idx, grad_quadstep);
+    idx = k.set(idx, qangle_step);
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(grad));
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(qangle));
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(gauss_w_lut));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(block_hists));
+    idx = k.set(idx, (void*)NULL, (size_t)smem);
+
+    return k.run(2, globalThreads, localThreads, false);
+}
+
+static int power_2up(unsigned int n)
+{
+    for(unsigned int i = 1; i<=1024; i<<=1)
+        if(n < i)
+            return i;
+    return -1; // Input is too big
+}
+
+static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_y,
+                                int height, int width, UMat block_hists, float threshold)
+{
+    int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
+        / block_stride_x;
+    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
+        / block_stride_y;
+    int nthreads;
+    size_t globalThreads[3] = { 1, 1, 1  };
+    size_t localThreads[3] = { 1, 1, 1  };
+
+    int idx = 0;
+    bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
+    cv::String opts;
+    ocl::Kernel k;
+    if ( nbins == 9 )
+    {
+        k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
+        if(k.empty())
+            return false;
+        if(is_cpu)
+           opts = "-D CPU ";
+        else
+            opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
+        k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        if(k.empty())
+            return false;
+
+        int blocks_in_group = NTHREADS / block_hist_size;
+        nthreads = blocks_in_group * block_hist_size;
+        int num_groups = (img_block_width * img_block_height + blocks_in_group - 1)/blocks_in_group;
+        globalThreads[0] = nthreads * num_groups;
+        localThreads[0] = nthreads;
+    }
+    else
+    {
+        k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
+        if(k.empty())
+            return false;
+        if(is_cpu)
+           opts = "-D CPU ";
+        else
+            opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
+        k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        if(k.empty())
+            return false;
+
+        nthreads = power_2up(block_hist_size);
+        globalThreads[0] = img_block_width * nthreads;
+        globalThreads[1] = img_block_height;
+        localThreads[0] = nthreads;
+
+        if ((nthreads < 32) || (nthreads > 512) )
+            return false;
+
+        idx = k.set(idx, nthreads);
+        idx = k.set(idx, block_hist_size);
+        idx = k.set(idx, img_block_width);
+    }
+    idx = k.set(idx, ocl::KernelArg::PtrReadWrite(block_hists));
+    idx = k.set(idx, threshold);
+    idx = k.set(idx, (void*)NULL,  nthreads * sizeof(float));
+
+    return k.run(2, globalThreads, localThreads, false);
+}
+
+static bool ocl_extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
+                                       int height, int width, UMat block_hists, UMat descriptors,
+                                       int block_hist_size, int descr_size, int descr_width)
+{
+    ocl::Kernel k("extract_descrs_by_rows_kernel", ocl::objdetect::objdetect_hog_oclsrc);
+    if(k.empty())
+        return false;
+
+    int win_block_stride_x = win_stride_x / block_stride_x;
+    int win_block_stride_y = win_stride_y / block_stride_y;
+    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
+    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+        block_stride_x;
+
+    int descriptors_quadstep = (int)descriptors.step >> 2;
+
+    size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
+    size_t localThreads[3] = { NTHREADS, 1, 1 };
+
+    int idx = 0;
+    idx = k.set(idx, block_hist_size);
+    idx = k.set(idx, descriptors_quadstep);
+    idx = k.set(idx, descr_size);
+    idx = k.set(idx, descr_width);
+    idx = k.set(idx, img_block_width);
+    idx = k.set(idx, win_block_stride_x);
+    idx = k.set(idx, win_block_stride_y);
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(descriptors));
+
+    return k.run(2, globalThreads, localThreads, false);
+}
+
+static bool ocl_extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
+                                       int height, int width, UMat block_hists, UMat descriptors,
+                                       int block_hist_size, int descr_size, int nblocks_win_x, int nblocks_win_y)
+{
+    ocl::Kernel k("extract_descrs_by_cols_kernel", ocl::objdetect::objdetect_hog_oclsrc);
+    if(k.empty())
+        return false;
+
+    int win_block_stride_x = win_stride_x / block_stride_x;
+    int win_block_stride_y = win_stride_y / block_stride_y;
+    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
+    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+        block_stride_x;
+
+    int descriptors_quadstep = (int)descriptors.step >> 2;
+
+    size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
+    size_t localThreads[3] = { NTHREADS, 1, 1 };
+
+    int idx = 0;
+    idx = k.set(idx, block_hist_size);
+    idx = k.set(idx, descriptors_quadstep);
+    idx = k.set(idx, descr_size);
+    idx = k.set(idx, nblocks_win_x);
+    idx = k.set(idx, nblocks_win_y);
+    idx = k.set(idx, img_block_width);
+    idx = k.set(idx, win_block_stride_x);
+    idx = k.set(idx, win_block_stride_y);
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(descriptors));
+
+    return k.run(2, globalThreads, localThreads, false);
+}
+
+static bool ocl_compute(InputArray _img, Size win_stride, std::vector<float>& _descriptors, int descr_format, Size blockSize,
+                        Size cellSize, int nbins, Size blockStride, Size winSize, float sigma, bool gammaCorrection, double L2HysThreshold)
+{
+     Size imgSize = _img.size();
+    Size effect_size = imgSize;
+
+    UMat grad(imgSize, CV_32FC2);
+    UMat qangle(imgSize, CV_8UC2);
+
+    const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
+    const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
+    UMat block_hists(1, static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
+
+    Size wins_per_img = numPartsWithin(imgSize, winSize, win_stride);
+    UMat labels(1, wins_per_img.area(), CV_8U);
+
+    float scale = 1.f / (2.f * sigma * sigma);
+    Mat gaussian_lut(1, 512, CV_32FC1);
+    int idx = 0;
+    for(int i=-8; i<8; i++)
+        for(int j=-8; j<8; j++)
+            gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
+    for(int i=-8; i<8; i++)
+        for(int j=-8; j<8; j++)
+            gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
+
+    if(!ocl_computeGradient(_img, grad, qangle, nbins, effect_size, gammaCorrection))
+        return false;
+
+    UMat gauss_w_lut;
+    gaussian_lut.copyTo(gauss_w_lut);
+    if(!ocl_compute_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
+        effect_size.width, grad, qangle, gauss_w_lut, block_hists, block_hist_size))
+        return false;
+
+    if(!ocl_normalize_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
+        effect_size.width, block_hists, (float)L2HysThreshold))
+        return false;
+
+    Size blocks_per_win = numPartsWithin(winSize, blockSize, blockStride);
+    wins_per_img = numPartsWithin(effect_size, winSize, win_stride);
+
+    int descr_size = blocks_per_win.area()*(int)block_hist_size;
+    int descr_width = (int)block_hist_size*blocks_per_win.width;
+
+    UMat descriptors(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
+    switch (descr_format)
+    {
+    case DESCR_FORMAT_ROW_BY_ROW:
+        if(!ocl_extract_descrs_by_rows(winSize.height, winSize.width,
+            blockStride.height, blockStride.width, win_stride.height, win_stride.width, effect_size.height,
+            effect_size.width, block_hists, descriptors, (int)block_hist_size, descr_size, descr_width))
+            return false;
+        break;
+    case DESCR_FORMAT_COL_BY_COL:
+        if(!ocl_extract_descrs_by_cols(winSize.height, winSize.width,
+            blockStride.height, blockStride.width, win_stride.height, win_stride.width, effect_size.height, effect_size.width,
+            block_hists, descriptors, (int)block_hist_size, descr_size, blocks_per_win.width, blocks_per_win.height))
+            return false;
+        break;
+    default:
+        return false;
+    }
+    descriptors.reshape(1, (int)descriptors.total()).getMat(ACCESS_READ).copyTo(_descriptors);
+    return true;
+}
+#endif //HAVE_OPENCL
+
+void HOGDescriptor::compute(InputArray _img, std::vector<float>& descriptors,
     Size winStride, Size padding, const std::vector<Point>& locations) const
 {
     if( winStride == Size() )
@@ -1037,11 +1389,18 @@ void HOGDescriptor::compute(const Mat& img, std::vector<float>& descriptors,
     Size cacheStride(gcd(winStride.width, blockStride.width),
                      gcd(winStride.height, blockStride.height));
 
+    Size imgSize = _img.size();
+
     size_t nwindows = locations.size();
     padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
     padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
-    Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2);
+    Size paddedImgSize(imgSize.width + padding.width*2, imgSize.height + padding.height*2);
 
+    CV_OCL_RUN(_img.dims() <= 2 && _img.type() == CV_8UC1 && _img.isUMat(),
+        ocl_compute(_img, winStride, descriptors, DESCR_FORMAT_COL_BY_COL, blockSize,
+        cellSize, nbins, blockStride, winSize, (float)getWinSigma(), gammaCorrection, L2HysThreshold))
+
+    Mat img = _img.getMat();
     HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride);
 
     if( !nwindows )
@@ -1263,20 +1622,215 @@ private:
     Mutex* mtx;
 };
 
+#ifdef HAVE_OPENCL
+
+static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
+                               int win_stride_y, int win_stride_x, int height, int width,
+                               const UMat& block_hists, UMat detector,
+                               float free_coef, float threshold, UMat& labels, Size descr_size, int block_hist_size)
+{
+    int nthreads;
+    bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
+    cv::String opts;
+
+    ocl::Kernel k;
+    int idx = 0;
+    switch (descr_size.width)
+    {
+    case 180:
+        nthreads = 180;
+        k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
+        if(k.empty())
+            return false;
+        if(is_cpu)
+           opts = "-D CPU ";
+        else
+            opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
+        k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        if(k.empty())
+            return false;
+        idx = k.set(idx, descr_size.width);
+        idx = k.set(idx, descr_size.height);
+        break;
+
+    case 252:
+        nthreads = 256;
+        k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
+        if(k.empty())
+            return false;
+        if(is_cpu)
+           opts = "-D CPU ";
+        else
+            opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
+        k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        if(k.empty())
+            return false;
+        idx = k.set(idx, descr_size.width);
+        idx = k.set(idx, descr_size.height);
+        break;
+
+    default:
+        nthreads = 256;
+        k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
+        if(k.empty())
+            return false;
+        if(is_cpu)
+           opts = "-D CPU ";
+        else
+            opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
+        k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
+        if(k.empty())
+            return false;
+        idx = k.set(idx, descr_size.area());
+        idx = k.set(idx, descr_size.height);
+    }
+
+    int win_block_stride_x = win_stride_x / block_stride_x;
+    int win_block_stride_y = win_stride_y / block_stride_y;
+    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
+    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
+        block_stride_x;
+
+    size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 };
+    size_t localThreads[3] = { nthreads, 1, 1 };
+
+    idx = k.set(idx, block_hist_size);
+    idx = k.set(idx, img_win_width);
+    idx = k.set(idx, img_block_width);
+    idx = k.set(idx, win_block_stride_x);
+    idx = k.set(idx, win_block_stride_y);
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(block_hists));
+    idx = k.set(idx, ocl::KernelArg::PtrReadOnly(detector));
+    idx = k.set(idx, free_coef);
+    idx = k.set(idx, threshold);
+    idx = k.set(idx, ocl::KernelArg::PtrWriteOnly(labels));
+
+    return k.run(2, globalThreads, localThreads, false);
+}
+
+static bool ocl_detect(InputArray img, std::vector<Point> &hits, double hit_threshold, Size win_stride,
+                       const UMat& oclSvmDetector, Size blockSize, Size cellSize, int nbins, Size blockStride, Size winSize,
+                       bool gammaCorrection, double L2HysThreshold, float sigma, float free_coef)
+{
+    hits.clear();
+    if (oclSvmDetector.empty())
+        return false;
+
+    Size imgSize = img.size();
+    Size effect_size = imgSize;
+    UMat grad(imgSize, CV_32FC2);
+    UMat qangle(imgSize, CV_8UC2);
+
+    const size_t block_hist_size = getBlockHistogramSize(blockSize, cellSize, nbins);
+    const Size blocks_per_img = numPartsWithin(imgSize, blockSize, blockStride);
+    UMat block_hists(1, static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
+
+    Size wins_per_img = numPartsWithin(imgSize, winSize, win_stride);
+    UMat labels(1, wins_per_img.area(), CV_8U);
+
+    float scale = 1.f / (2.f * sigma * sigma);
+    Mat gaussian_lut(1, 512, CV_32FC1);
+    int idx = 0;
+    for(int i=-8; i<8; i++)
+        for(int j=-8; j<8; j++)
+            gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
+    for(int i=-8; i<8; i++)
+        for(int j=-8; j<8; j++)
+            gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
+
+    if(!ocl_computeGradient(img, grad, qangle, nbins, effect_size, gammaCorrection))
+        return false;
+
+    UMat gauss_w_lut;
+    gaussian_lut.copyTo(gauss_w_lut);
+    if(!ocl_compute_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
+        effect_size.width, grad, qangle, gauss_w_lut, block_hists, block_hist_size))
+        return false;
+
+    if(!ocl_normalize_hists(nbins, blockStride.width, blockStride.height, effect_size.height,
+        effect_size.width, block_hists, (float)L2HysThreshold))
+        return false;
+
+    Size blocks_per_win = numPartsWithin(winSize, blockSize, blockStride);
+
+    Size descr_size((int)block_hist_size*blocks_per_win.width, blocks_per_win.height);
+
+    if(!ocl_classify_hists(winSize.height, winSize.width, blockStride.height,
+        blockStride.width, win_stride.height, win_stride.width,
+        effect_size.height, effect_size.width, block_hists, oclSvmDetector,
+        free_coef, (float)hit_threshold, labels, descr_size, (int)block_hist_size))
+        return false;
+
+    Mat labels_host = labels.getMat(ACCESS_READ);
+    unsigned char *vec = labels_host.ptr();
+    for (int i = 0; i < wins_per_img.area(); i++)
+    {
+        int y = i / wins_per_img.width;
+        int x = i - wins_per_img.width * y;
+        if (vec[i])
+        {
+            hits.push_back(Point(x * win_stride.width, y * win_stride.height));
+        }
+    }
+    return true;
+}
+
+static bool ocl_detectMultiScale(InputArray _img, std::vector<Rect> &found_locations, std::vector<double>& level_scale,
+                                              double hit_threshold, Size win_stride, double group_threshold,
+                                              const UMat& oclSvmDetector, Size blockSize, Size cellSize,
+                                              int nbins, Size blockStride, Size winSize, bool gammaCorrection,
+                                              double L2HysThreshold, float sigma, float free_coef)
+{
+    std::vector<Rect> all_candidates;
+    std::vector<Point> locations;
+    UMat image_scale;
+    Size imgSize = _img.size();
+    image_scale.create(imgSize, _img.type());
+
+    for (size_t i = 0; i<level_scale.size() ; i++)
+    {
+        double scale = level_scale[i];
+        Size effect_size = Size(cvRound(imgSize.width / scale), cvRound(imgSize.height / scale));
+        if (effect_size == imgSize)
+        {
+            if(!ocl_detect(_img, locations, hit_threshold, win_stride, oclSvmDetector, blockSize, cellSize, nbins,
+                blockStride, winSize, gammaCorrection, L2HysThreshold, sigma, free_coef))
+                return false;
+        }
+        else
+        {
+            resize(_img, image_scale, effect_size);
+            if(!ocl_detect(image_scale, locations, hit_threshold, win_stride, oclSvmDetector, blockSize, cellSize, nbins,
+                blockStride, winSize, gammaCorrection, L2HysThreshold, sigma, free_coef))
+                return false;
+        }
+        Size scaled_win_size(cvRound(winSize.width * scale),
+            cvRound(winSize.height * scale));
+        for (size_t j = 0; j < locations.size(); j++)
+            all_candidates.push_back(Rect(Point2d(locations[j]) * scale, scaled_win_size));
+    }
+    found_locations.assign(all_candidates.begin(), all_candidates.end());
+    cv::groupRectangles(found_locations, (int)group_threshold, 0.2);
+    return true;
+}
+#endif //HAVE_OPENCL
+
 void HOGDescriptor::detectMultiScale(
-    const Mat& img, std::vector<Rect>& foundLocations, std::vector<double>& foundWeights,
+    InputArray _img, std::vector<Rect>& foundLocations, std::vector<double>& foundWeights,
     double hitThreshold, Size winStride, Size padding,
     double scale0, double finalThreshold, bool useMeanshiftGrouping) const
 {
     double scale = 1.;
     int levels = 0;
 
+    Size imgSize = _img.size();
     std::vector<double> levelScale;
     for( levels = 0; levels < nlevels; levels++ )
     {
         levelScale.push_back(scale);
-        if( cvRound(img.cols/scale) < winSize.width ||
-                cvRound(img.rows/scale) < winSize.height ||
+        if( cvRound(imgSize.width/scale) < winSize.width ||
+            cvRound(imgSize.height/scale) < winSize.height ||
                 scale0 <= 1 )
             break;
         scale *= scale0;
@@ -1284,12 +1838,21 @@ void HOGDescriptor::detectMultiScale(
     levels = std::max(levels, 1);
     levelScale.resize(levels);
 
+    if(winStride == Size())
+        winStride = blockStride;
+
+    CV_OCL_RUN(_img.dims() <= 2 && _img.type() == CV_8UC1 && scale0 > 1 && winStride.width % blockStride.width == 0 &&
+        winStride.height % blockStride.height == 0 && padding == Size(0,0) && _img.isUMat(),
+        ocl_detectMultiScale(_img, foundLocations, levelScale, hitThreshold, winStride, finalThreshold, oclSvmDetector,
+        blockSize, cellSize, nbins, blockStride, winSize, gammaCorrection, L2HysThreshold, (float)getWinSigma(), free_coef));
+
     std::vector<Rect> allCandidates;
     std::vector<double> tempScales;
     std::vector<double> tempWeights;
     std::vector<double> foundScales;
-    Mutex mtx;
 
+    Mutex mtx;
+    Mat img = _img.getMat();
     Range range(0, (int)levelScale.size());
     HOGInvoker invoker(this, img, hitThreshold, winStride, padding, &levelScale[0], &allCandidates, &mtx, &tempWeights, &tempScales);
     parallel_for_(range, invoker);
@@ -1306,7 +1869,7 @@ void HOGDescriptor::detectMultiScale(
         groupRectangles(foundLocations, foundWeights, (int)finalThreshold, 0.2);
 }
 
-void HOGDescriptor::detectMultiScale(const Mat& img, std::vector<Rect>& foundLocations,
+void HOGDescriptor::detectMultiScale(InputArray img, std::vector<Rect>& foundLocations,
     double hitThreshold, Size winStride, Size padding,
     double scale0, double finalThreshold, bool useMeanshiftGrouping) const
 {
diff --git a/modules/objdetect/src/opencl/cascadedetect.cl b/modules/objdetect/src/opencl/cascadedetect.cl
index b36895805..09842ba4f 100644
--- a/modules/objdetect/src/opencl/cascadedetect.cl
+++ b/modules/objdetect/src/opencl/cascadedetect.cl
@@ -1,22 +1,43 @@
 ///////////////////////////// OpenCL kernels for face detection //////////////////////////////
 ////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////
 
-typedef struct __attribute__((aligned(4))) OptFeature
+//
+// the code has been derived from the OpenCL Haar cascade kernel by
+//
+//    Niko Li, newlife20080214@gmail.com
+//    Wang Weiyan, wangweiyanster@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Nathan, liujun@multicorewareinc.com
+//    Peng Xiao, pengxiao@outlook.com
+//    Erping Pang, erping@multicorewareinc.com
+//
+
+
+typedef struct __attribute__((aligned(4))) OptHaarFeature
 {
     int4 ofs[3] __attribute__((aligned (4)));
     float4 weight __attribute__((aligned (4)));
 }
-OptFeature;
+OptHaarFeature;
+
+typedef struct __attribute__((aligned(4))) OptLBPFeature
+{
+    int16 ofs __attribute__((aligned (4)));
+}
+OptLBPFeature;
 
 typedef struct __attribute__((aligned(4))) Stump
 {
-    int featureIdx __attribute__((aligned (4)));
-    float threshold __attribute__((aligned (4))); // for ordered features only
-    float left __attribute__((aligned (4)));
-    float right __attribute__((aligned (4)));
+    float4 st __attribute__((aligned (4)));
 }
 Stump;
 
+typedef struct __attribute__((aligned(4))) Node
+{
+    int4 n __attribute__((aligned (4)));
+}
+Node;
+
 typedef struct __attribute__((aligned (4))) Stage
 {
     int first __attribute__((aligned (4)));
@@ -25,161 +46,615 @@ typedef struct __attribute__((aligned (4))) Stage
 }
 Stage;
 
-__kernel void runHaarClassifierStump(
-    __global const int* sum,
-    int sumstep, int sumoffset,
-    __global const int* sqsum,
-    int sqsumstep, int sqsumoffset,
-    __global const OptFeature* optfeatures,
+typedef struct __attribute__((aligned (4))) ScaleData
+{
+    float scale __attribute__((aligned (4)));
+    int szi_width __attribute__((aligned (4)));
+    int szi_height __attribute__((aligned (4)));
+    int layer_ofs __attribute__((aligned (4)));
+    int ystep __attribute__((aligned (4)));
+}
+ScaleData;
 
-    int nstages,
+#ifndef SUM_BUF_SIZE
+#define SUM_BUF_SIZE 0
+#endif
+
+#ifndef NODE_COUNT
+#define NODE_COUNT 1
+#endif
+
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
+void runHaarClassifier(
+    int nscales, __global const ScaleData* scaleData,
+    __global const int* sum,
+    int _sumstep, int sumoffset,
+    __global const OptHaarFeature* optfeatures,
+
+    int splitstage, int nstages,
     __global const Stage* stages,
-    __global const Stump* stumps,
+    __global const Node* nodes,
+    __global const float* leaves0,
 
     volatile __global int* facepos,
-    int2 imgsize, int xyscale, float factor,
-    int4 normrect, int2 windowsize, int maxFaces)
+    int4 normrect, int sqofs, int2 windowsize, int maxFaces)
 {
-    int ix = get_global_id(0)*xyscale;
-    int iy = get_global_id(1)*xyscale;
-    sumstep /= sizeof(int);
-    sqsumstep /= sizeof(int);
+    int lx = get_local_id(0);
+    int ly = get_local_id(1);
+    int groupIdx = get_group_id(0);
+    int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
+    int scaleIdx, tileIdx, stageIdx;
+    int sumstep = (int)(_sumstep/sizeof(int));
+    int4 nofs0 = (int4)(mad24(normrect.y, sumstep, normrect.x),
+                        mad24(normrect.y, sumstep, normrect.x + normrect.z),
+                        mad24(normrect.y + normrect.w, sumstep, normrect.x),
+                        mad24(normrect.y + normrect.w, sumstep, normrect.x + normrect.z));
+    int normarea = normrect.z * normrect.w;
+    float invarea = 1.f/normarea;
+    int lidx = ly*LOCAL_SIZE_X + lx;
 
-    if( ix < imgsize.x && iy < imgsize.y )
+    #if SUM_BUF_SIZE > 0
+    int4 nofs = (int4)(mad24(normrect.y, SUM_BUF_STEP, normrect.x),
+                       mad24(normrect.y, SUM_BUF_STEP, normrect.x + normrect.z),
+                       mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x),
+                       mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z));
+    #else
+    int4 nofs = nofs0;
+    #endif
+    #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
+    __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1];
+    #if SUM_BUF_SIZE > 0
+    __local int* ibuf = lstore;
+    __local int* lcount = ibuf + SUM_BUF_SIZE;
+    #else
+    __local int* lcount = lstore;
+    #endif
+    __local float* lnf = (__local float*)(lcount + 1);
+    __local float* lpartsum = lnf + LOCAL_SIZE;
+    __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
+
+    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
     {
-        int ntrees;
-        int stageIdx, i;
-        float s = 0.f;
-        __global const Stump* stump = stumps;
-        __global const OptFeature* f;
+        __global const ScaleData* s = scaleData + scaleIdx;
+        int ystep = s->ystep;
+        int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
+        int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
+                             (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
+        int totalTiles = ntiles.x*ntiles.y;
 
-        __global const int* psum = sum + mad24(iy, sumstep, ix);
-        __global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x);
-        int normarea = normrect.z * normrect.w;
-        float invarea = 1.f/normarea;
-        float sval = (pnsum[0] - pnsum[normrect.z] - pnsum[mul24(normrect.w, sumstep)] +
-                      pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
-        float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
-        float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
-        float4 weight, vsval;
-        int4 ofs, ofs0, ofs1, ofs2;
-        nf = nf > 0 ? nf : 1.f;
-
-        for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
+        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
         {
-            ntrees = stages[stageIdx].ntrees;
-            s = 0.f;
-            for( i = 0; i < ntrees; i++, stump++ )
-            {
-                f = optfeatures + stump->featureIdx;
-                weight = f->weight;
+            int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
+            int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
+            int ix = lx, iy = ly;
+            __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
+            __global const int* psum1 = psum0 + mad24(iy, sumstep, ix);
 
-                ofs = f->ofs[0];
-                sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
-                ofs = f->ofs[1];
-                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
-                if( weight.z > 0 )
+            if( ix0 >= worksize.x || iy0 >= worksize.y )
+                continue;
+            #if SUM_BUF_SIZE > 0
+            for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
+            {
+                int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
+                vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            #endif
+
+            if( lidx == 0 )
+                lcount[0] = 0;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
+            {
+                #if NODE_COUNT==1
+                __global const Stump* stump = (__global const Stump*)nodes;
+                #else
+                __global const Node* node = nodes;
+                __global const float* leaves = leaves0;
+                #endif
+                #if SUM_BUF_SIZE > 0
+                __local const int* psum = ibuf + mad24(iy, SUM_BUF_STEP, ix);
+                #else
+                __global const int* psum = psum1;
+                #endif
+
+                __global const float* psqsum = (__global const float*)(psum1 + sqofs);
+                float sval = (psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w])*invarea;
+                float sqval = (psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w])*invarea;
+                float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
+                nf = nf > 0 ? nf : 1.f;
+
+                for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
                 {
-                    ofs = f->ofs[2];
-                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                    int ntrees = stages[stageIdx].ntrees;
+                    float s = 0.f;
+                    #if NODE_COUNT==1
+                    for( i = 0; i < ntrees; i++ )
+                    {
+                        float4 st = stump[i].st;
+                        __global const OptHaarFeature* f = optfeatures + as_int(st.x);
+                        float4 weight = f->weight;
+
+                        int4 ofs = f->ofs[0];
+                        sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                        ofs = f->ofs[1];
+                        sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                        if( weight.z > 0 )
+                        {
+                            ofs = f->ofs[2];
+                            sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                        }
+
+                        s += (sval < st.y*nf) ? st.z : st.w;
+                    }
+                    stump += ntrees;
+                    #else
+                    for( i = 0; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 )
+                    {
+                        int idx = 0;
+                        do
+                        {
+                            int4 n = node[idx].n;
+                            __global const OptHaarFeature* f = optfeatures + n.x;
+                            float4 weight = f->weight;
+
+                            int4 ofs = f->ofs[0];
+
+                            sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                            ofs = f->ofs[1];
+                            sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                            if( weight.z > 0 )
+                            {
+                                ofs = f->ofs[2];
+                                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                            }
+
+                            idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
+                        }
+                        while(idx > 0);
+                        s += leaves[-idx];
+                    }
+                    #endif
+
+                    if( s < stages[stageIdx].threshold )
+                        break;
                 }
 
-                s += (sval < stump->threshold*nf) ? stump->left : stump->right;
+                if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
+                {
+                    int count = atomic_inc(lcount);
+                    lbuf[count] = (int)(ix | (iy << 8));
+                    lnf[count] = nf;
+                }
             }
 
-            if( s < stages[stageIdx].threshold )
-                break;
-        }
-
-        if( stageIdx == nstages )
-        {
-            int nfaces = atomic_inc(facepos);
-            if( nfaces < maxFaces )
+            for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
             {
-                volatile __global int* face = facepos + 1 + nfaces*4;
-                face[0] = convert_int_rte(ix*factor);
-                face[1] = convert_int_rte(iy*factor);
-                face[2] = convert_int_rte(windowsize.x*factor);
-                face[3] = convert_int_rte(windowsize.y*factor);
+                int nrects = lcount[0];
+
+                barrier(CLK_LOCAL_MEM_FENCE);
+                if( nrects == 0 )
+                    break;
+                if( lidx == 0 )
+                    lcount[0] = 0;
+
+                {
+                    #if NODE_COUNT == 1
+                    __global const Stump* stump = (__global const Stump*)nodes + stages[stageIdx].first;
+                    #else
+                    __global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT;
+                    __global const float* leaves = leaves0 + stages[stageIdx].first*(NODE_COUNT+1);
+                    #endif
+                    int nparts = LOCAL_SIZE / nrects;
+                    int ntrees = stages[stageIdx].ntrees;
+                    int ntrees_p = (ntrees + nparts - 1)/nparts;
+                    int nr = lidx / nparts;
+                    int partidx = -1, idxval = 0;
+                    float partsum = 0.f, nf = 0.f;
+
+                    if( nr < nrects )
+                    {
+                        partidx = lidx % nparts;
+                        idxval = lbuf[nr];
+                        nf = lnf[nr];
+
+                        {
+                        int ntrees0 = ntrees_p*partidx;
+                        int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
+                        int ix1 = idxval & 255, iy1 = idxval >> 8;
+                        #if SUM_BUF_SIZE > 0
+                        __local const int* psum = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
+                        #else
+                        __global const int* psum = psum0 + mad24(iy1, sumstep, ix1);
+                        #endif
+
+                        #if NODE_COUNT == 1
+                        for( i = ntrees0; i < ntrees1; i++ )
+                        {
+                            float4 st = stump[i].st;
+                            __global const OptHaarFeature* f = optfeatures + as_int(st.x);
+                            float4 weight = f->weight;
+
+                            int4 ofs = f->ofs[0];
+                            float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                            ofs = f->ofs[1];
+                            sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                            //if( weight.z > 0 )
+                            {
+                                ofs = f->ofs[2];
+                                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                            }
+
+                            partsum += (sval < st.y*nf) ? st.z : st.w;
+                        }
+                        #else
+                        for( i = ntrees0; i < ntrees1; i++ )
+                        {
+                            int idx = 0;
+                            do
+                            {
+                                int4 n = node[i*2 + idx].n;
+                                __global const OptHaarFeature* f = optfeatures + n.x;
+                                float4 weight = f->weight;
+                                int4 ofs = f->ofs[0];
+
+                                float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
+                                ofs = f->ofs[1];
+                                sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.y;
+                                if( weight.z > 0 )
+                                {
+                                    ofs = f->ofs[2];
+                                    sval += (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.z;
+                                }
+
+                                idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
+                            }
+                            while(idx > 0);
+                            partsum += leaves[i*3-idx];
+                        }
+                        #endif
+                        }
+                    }
+                    lpartsum[lidx] = partsum;
+                    barrier(CLK_LOCAL_MEM_FENCE);
+
+                    if( partidx == 0 )
+                    {
+                        float s = lpartsum[nr*nparts];
+                        for( i = 1; i < nparts; i++ )
+                            s += lpartsum[i + nr*nparts];
+                        if( s >= stages[stageIdx].threshold )
+                        {
+                            int count = atomic_inc(lcount);
+                            lbuf[count] = idxval;
+                            lnf[count] = nf;
+                        }
+                    }
+                }
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( stageIdx == nstages )
+            {
+                int nrects = lcount[0];
+                if( lidx < nrects )
+                {
+                    int nfaces = atomic_inc(facepos);
+                    if( nfaces < maxFaces )
+                    {
+                        volatile __global int* face = facepos + 1 + nfaces*3;
+                        int val = lbuf[lidx];
+                        face[0] = scaleIdx;
+                        face[1] = ix0 + (val & 255);
+                        face[2] = iy0 + (val >> 8);
+                    }
+                }
             }
         }
     }
 }
 
-#if 0
-__kernel void runLBPClassifierStump(
-    __global const int* sum,
-    int sumstep, int sumoffset,
-    __global const int* sqsum,
-    int sqsumstep, int sqsumoffset,
-    __global const OptFeature* optfeatures,
+#undef CALC_SUM_OFS_
+#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
+    ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
 
-    int nstages,
+__kernel void runLBPClassifierStumpSimple(
+    int nscales, __global const ScaleData* scaleData,
+    __global const int* sum,
+    int _sumstep, int sumoffset,
+    __global const OptLBPFeature* optfeatures,
+
+    int splitstage, int nstages,
     __global const Stage* stages,
     __global const Stump* stumps,
     __global const int* bitsets,
     int bitsetSize,
 
     volatile __global int* facepos,
-    int2 imgsize, int xyscale, float factor,
-    int4 normrect, int2 windowsize, int maxFaces)
+    int2 windowsize, int maxFaces)
 {
-    int ix = get_global_id(0)*xyscale*VECTOR_SIZE;
-    int iy = get_global_id(1)*xyscale;
-    sumstep /= sizeof(int);
-    sqsumstep /= sizeof(int);
+    int lx = get_local_id(0);
+    int ly = get_local_id(1);
+    int local_size_x = get_local_size(0);
+    int local_size_y = get_local_size(1);
+    int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
+    int ngroups = get_num_groups(0)*get_num_groups(1);
+    int scaleIdx, tileIdx, stageIdx;
+    int startStage = 0, endStage = nstages;
+    int sumstep = (int)(_sumstep/sizeof(int));
 
-    if( ix < imgsize.x && iy < imgsize.y )
+    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
     {
-        int ntrees;
-        int stageIdx, i;
-        float s = 0.f;
-        __global const Stump* stump = stumps;
-        __global const int* bitset = bitsets;
-        __global const OptFeature* f;
+        __global const ScaleData* s = scaleData + scaleIdx;
+        int ystep = s->ystep;
+        int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
+        int2 ntiles = (int2)((worksize.x/ystep + local_size_x-1)/local_size_x,
+                             (worksize.y/ystep + local_size_y-1)/local_size_y);
+        int totalTiles = ntiles.x*ntiles.y;
 
-        __global const int* psum = sum + mad24(iy, sumstep, ix);
-        __global const int* pnsum = psum + mad24(normrect.y, sumstep, normrect.x);
-        int normarea = normrect.z * normrect.w;
-        float invarea = 1.f/normarea;
-        float sval = (pnsum[0] - pnsum[normrect.z] - pnsum[mul24(normrect.w, sumstep)] +
-        pnsum[mad24(normrect.w, sumstep, normrect.z)])*invarea;
-        float sqval = (sqsum[mad24(iy + normrect.y, sqsumstep, ix + normrect.x)])*invarea;
-        float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
-        float4 weight;
-        int4 ofs;
-        nf = nf > 0 ? nf : 1.f;
-
-        for( stageIdx = 0; stageIdx < nstages; stageIdx++ )
+        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
         {
-            ntrees = stages[stageIdx].ntrees;
-            s = 0.f;
-            for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
+            int iy = ((tileIdx / ntiles.x)*local_size_y + ly)*ystep;
+            int ix = ((tileIdx % ntiles.x)*local_size_x + lx)*ystep;
+
+            if( ix < worksize.x && iy < worksize.y )
             {
-                f = optfeatures + stump->featureIdx;
+                __global const int* p = sum + mad24(iy, sumstep, ix) + s->layer_ofs;
+                __global const Stump* stump = stumps;
+                __global const int* bitset = bitsets;
 
-                weight = f->weight;
+                for( stageIdx = 0; stageIdx < endStage; stageIdx++ )
+                {
+                    int i, ntrees = stages[stageIdx].ntrees;
+                    float s = 0.f;
+                    for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
+                    {
+                        float4 st = stump->st;
+                        __global const OptLBPFeature* f = optfeatures + as_int(st.x);
+                        int16 ofs = f->ofs;
 
-                // compute LBP feature to val
-                s += (bitset[val >> 5] & (1 << (val & 31))) ? stump->left : stump->right;
-            }
+                        int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
 
-            if( s < stages[stageIdx].threshold )
-            break;
-        }
+                        int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
+                        idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
+                        idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
 
-        if( stageIdx == nstages )
-        {
-            int nfaces = atomic_inc(facepos);
-            if( nfaces < maxFaces )
-            {
-                volatile __global int* face = facepos + 1 + nfaces*4;
-                face[0] = convert_int_rte(ix*factor);
-                face[1] = convert_int_rte(iy*factor);
-                face[2] = convert_int_rte(windowsize.x*factor);
-                face[3] = convert_int_rte(windowsize.y*factor);
+                        mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
+                        mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
+                        mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
+                        mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
+                        mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 7
+
+                        s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
+                    }
+
+                    if( s < stages[stageIdx].threshold )
+                        break;
+                }
+
+                if( stageIdx == nstages )
+                {
+                    int nfaces = atomic_inc(facepos);
+                    if( nfaces < maxFaces )
+                    {
+                        volatile __global int* face = facepos + 1 + nfaces*3;
+                        face[0] = scaleIdx;
+                        face[1] = ix;
+                        face[2] = iy;
+                    }
+                }
+            }
+        }
+    }
+}
+
+__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
+void runLBPClassifierStump(
+    int nscales, __global const ScaleData* scaleData,
+    __global const int* sum,
+    int _sumstep, int sumoffset,
+    __global const OptLBPFeature* optfeatures,
+
+    int splitstage, int nstages,
+    __global const Stage* stages,
+    __global const Stump* stumps,
+    __global const int* bitsets,
+    int bitsetSize,
+
+    volatile __global int* facepos,
+    int2 windowsize, int maxFaces)
+{
+    int lx = get_local_id(0);
+    int ly = get_local_id(1);
+    int groupIdx = get_group_id(0);
+    int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
+    int scaleIdx, tileIdx, stageIdx;
+    int sumstep = (int)(_sumstep/sizeof(int));
+    int lidx = ly*LOCAL_SIZE_X + lx;
+
+    #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
+    __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1];
+    #if SUM_BUF_SIZE > 0
+    __local int* ibuf = lstore;
+    __local int* lcount = ibuf + SUM_BUF_SIZE;
+    #else
+    __local int* lcount = lstore;
+    #endif
+    __local float* lpartsum = (__local float*)(lcount + 1);
+    __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
+
+    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
+    {
+        __global const ScaleData* s = scaleData + scaleIdx;
+        int ystep = s->ystep;
+        int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
+        int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
+                             (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
+        int totalTiles = ntiles.x*ntiles.y;
+
+        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
+        {
+            int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
+            int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
+            int ix = lx, iy = ly;
+            __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
+
+            if( ix0 >= worksize.x || iy0 >= worksize.y )
+                continue;
+            #if SUM_BUF_SIZE > 0
+            for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
+            {
+                int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
+                vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            #endif
+
+            if( lidx == 0 )
+                lcount[0] = 0;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
+            {
+                __global const Stump* stump = stumps;
+                __global const int* bitset = bitsets;
+                #if SUM_BUF_SIZE > 0
+                __local const int* p = ibuf + mad24(iy, SUM_BUF_STEP, ix);
+                #else
+                __global const int* p = psum0 + mad24(iy, sumstep, ix);
+                #endif
+
+                for( stageIdx = 0; stageIdx < splitstage; stageIdx++ )
+                {
+                    int ntrees = stages[stageIdx].ntrees;
+                    float s = 0.f;
+                    for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
+                    {
+                        float4 st = stump->st;
+                        __global const OptLBPFeature* f = optfeatures + as_int(st.x);
+                        int16 ofs = f->ofs;
+
+                        int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
+
+                        int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
+                        idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
+                        idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
+
+                        mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
+                        mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
+                        mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
+                        mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
+                        mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 7
+
+                        s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
+                    }
+
+                    if( s < stages[stageIdx].threshold )
+                        break;
+                }
+
+                if( stageIdx == splitstage && (ystep == 1 || ((ix | iy) & 1) == 0) )
+                {
+                    int count = atomic_inc(lcount);
+                    lbuf[count] = (int)(ix | (iy << 8));
+                }
+            }
+
+            for( stageIdx = splitstage; stageIdx < nstages; stageIdx++ )
+            {
+                int nrects = lcount[0];
+
+                barrier(CLK_LOCAL_MEM_FENCE);
+                if( nrects == 0 )
+                    break;
+                if( lidx == 0 )
+                    lcount[0] = 0;
+
+                {
+                    __global const Stump* stump = stumps + stages[stageIdx].first;
+                    __global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize;
+                    int nparts = LOCAL_SIZE / nrects;
+                    int ntrees = stages[stageIdx].ntrees;
+                    int ntrees_p = (ntrees + nparts - 1)/nparts;
+                    int nr = lidx / nparts;
+                    int partidx = -1, idxval = 0;
+                    float partsum = 0.f, nf = 0.f;
+
+                    if( nr < nrects )
+                    {
+                        partidx = lidx % nparts;
+                        idxval = lbuf[nr];
+
+                        {
+                            int ntrees0 = ntrees_p*partidx;
+                            int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
+                            int ix1 = idxval & 255, iy1 = idxval >> 8;
+                            #if SUM_BUF_SIZE > 0
+                            __local const int* p = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
+                            #else
+                            __global const int* p = psum0 + mad24(iy1, sumstep, ix1);
+                            #endif
+
+                            for( i = ntrees0; i < ntrees1; i++ )
+                            {
+                                float4 st = stump[i].st;
+                                __global const OptLBPFeature* f = optfeatures + as_int(st.x);
+                                int16 ofs = f->ofs;
+
+                                #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
+                                    ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
+
+                                int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
+
+                                int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
+                                idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
+                                idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
+
+                                mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
+                                mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
+                                mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
+                                mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
+                                mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 7
+
+                                partsum += (bitset[i*bitsetSize + idx] & (1 << mask)) ? st.z : st.w;
+                            }
+                        }
+                    }
+                    lpartsum[lidx] = partsum;
+                    barrier(CLK_LOCAL_MEM_FENCE);
+
+                    if( partidx == 0 )
+                    {
+                        float s = lpartsum[nr*nparts];
+                        for( i = 1; i < nparts; i++ )
+                            s += lpartsum[i + nr*nparts];
+                        if( s >= stages[stageIdx].threshold )
+                        {
+                            int count = atomic_inc(lcount);
+                            lbuf[count] = idxval;
+                        }
+                    }
+                }
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if( stageIdx == nstages )
+            {
+                int nrects = lcount[0];
+                if( lidx < nrects )
+                {
+                    int nfaces = atomic_inc(facepos);
+                    if( nfaces < maxFaces )
+                    {
+                        volatile __global int* face = facepos + 1 + nfaces*3;
+                        int val = lbuf[lidx];
+                        face[0] = scaleIdx;
+                        face[1] = ix0 + (val & 255);
+                        face[2] = iy0 + (val >> 8);
+                    }
+                }
             }
         }
     }
 }
-#endif
diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/objdetect/src/opencl/objdetect_hog.cl
similarity index 98%
rename from modules/ocl/src/opencl/objdetect_hog.cl
rename to modules/objdetect/src/opencl/objdetect_hog.cl
index 0d2f26f96..e931e82b5 100644
--- a/modules/ocl/src/opencl/objdetect_hog.cl
+++ b/modules/objdetect/src/opencl/objdetect_hog.cl
@@ -50,6 +50,14 @@
 #define NTHREADS 256
 #define CV_PI_F 3.1415926535897932384626433832795f
 
+#ifdef INTEL_DEVICE
+#define QANGLE_TYPE		int
+#define QANGLE_TYPE2	int2
+#else
+#define QANGLE_TYPE		uchar
+#define QANGLE_TYPE2	uchar2
+#endif
+
 //----------------------------------------------------------------------------
 // Histogram computation
 // 12 threads for a cell, 12x4 threads per block
@@ -59,7 +67,7 @@ __kernel void compute_hists_lut_kernel(
     const int cnbins, const int cblock_hist_size, const int img_block_width,
     const int blocks_in_group, const int blocks_total,
     const int grad_quadstep, const int qangle_step,
-    __global const float* grad, __global const uchar* qangle,
+    __global const float* grad, __global const QANGLE_TYPE* qangle,
     __global const float* gauss_w_lut,
     __global float* block_hists, __local float* smem)
 {
@@ -86,7 +94,7 @@ __kernel void compute_hists_lut_kernel(
 
     __global const float* grad_ptr = (gid < blocks_total) ?
         grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
-    __global const uchar* qangle_ptr = (gid < blocks_total) ?
+    __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?
         qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
 
     __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
@@ -101,7 +109,7 @@ __kernel void compute_hists_lut_kernel(
     for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
     {
         float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
-        uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
+        QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);
 
         grad_ptr += grad_quadstep;
         qangle_ptr += qangle_step;
@@ -200,7 +208,7 @@ __kernel void normalize_hists_36_kernel(__global float* block_hists,
 //-------------------------------------------------------------
 //  Normalization of histograms via L2Hys_norm
 //
-static float reduce_smem(volatile __local float* smem, int size)
+inline float reduce_smem(volatile __local float* smem, int size)
 {
     unsigned int tid = get_local_id(0);
     float sum = smem[tid];
@@ -558,7 +566,7 @@ __kernel void extract_descrs_by_cols_kernel(
 __kernel void compute_gradients_8UC4_kernel(
     const int height, const int width,
     const int img_step, const int grad_quadstep, const int qangle_step,
-    const __global uchar4 * img, __global float * grad, __global uchar * qangle,
+    const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,
     const float angle_scale, const char correct_gamma, const int cnbins)
 {
     const int x = get_global_id(0);
@@ -660,7 +668,7 @@ __kernel void compute_gradients_8UC4_kernel(
 __kernel void compute_gradients_8UC1_kernel(
     const int height, const int width,
     const int img_step, const int grad_quadstep, const int qangle_step,
-    __global const uchar * img, __global float * grad, __global uchar * qangle,
+    __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,
     const float angle_scale, const char correct_gamma, const int cnbins)
 {
     const int x = get_global_id(0);
diff --git a/modules/ocl/test/test_pyramids.cpp b/modules/objdetect/test/opencl/test_hogdetector.cpp
similarity index 53%
rename from modules/ocl/test/test_pyramids.cpp
rename to modules/objdetect/test/opencl/test_hogdetector.cpp
index 2d861b627..8568352b6 100644
--- a/modules/ocl/test/test_pyramids.cpp
+++ b/modules/objdetect/test/opencl/test_hogdetector.cpp
@@ -12,10 +12,17 @@
 //
 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
-//    Yao Wang yao@multicorewareinc.com
+//    Niko Li, newlife20080214@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Shengen Yan, yanshengen@gmail.com
+//    Jiang Liyuan,jlyuan001.good@163.com
+//    Rock Li, Rock.Li@amd.com
+//    Zailong Wu, bullet@yeah.net
+//    Yao Wang, bitwangyaoyao@gmail.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -43,75 +50,72 @@
 //
 //M*/
 
-
 #include "test_precomp.hpp"
-#include <iomanip>
+#include "opencv2/ts/ocl_test.hpp"
 
 #ifdef HAVE_OPENCL
 
-using namespace cv;
-using namespace testing;
-using namespace std;
+namespace cvtest {
+namespace ocl {
 
-PARAM_TEST_CASE(PyrBase, MatDepth, Channels)
+///////////////////// HOG /////////////////////////////
+PARAM_TEST_CASE(HOG, Size, MatType)
 {
-    int depth;
-    int channels;
-
-    Mat dst_cpu;
-    ocl::oclMat gdst;
-
+    Size winSize;
+    int type;
+    Mat img;
+    UMat uimg;
     virtual void SetUp()
     {
-        depth = GET_PARAM(0);
-        channels = GET_PARAM(1);
+        winSize = GET_PARAM(0);
+        type = GET_PARAM(1);
+        img = readImage("cascadeandhog/images/image_00000000_0.png", IMREAD_GRAYSCALE);
+        ASSERT_FALSE(img.empty());
+        img.copyTo(uimg);
     }
 };
 
-/////////////////////// PyrDown //////////////////////////
-
-typedef PyrBase PyrDown;
-
-OCL_TEST_P(PyrDown, Mat)
+OCL_TEST_P(HOG, GetDescriptors)
 {
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        Size size(MWIDTH, MHEIGHT);
-        Mat src = randomMat(size, CV_MAKETYPE(depth, channels), 0, 255);
-        ocl::oclMat gsrc(src);
+    HOGDescriptor hog;
+    hog.gammaCorrection = true;
 
-        pyrDown(src, dst_cpu);
-        ocl::pyrDown(gsrc, gdst);
+    hog.setSVMDetector(hog.getDefaultPeopleDetector());
 
-        EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), depth == CV_32F ? 1e-4f : 1.0f);
-    }
+    std::vector<float> cpu_descriptors;
+    std::vector<float> gpu_descriptors;
+
+    OCL_OFF(hog.compute(img, cpu_descriptors, hog.winSize));
+    OCL_ON(hog.compute(uimg, gpu_descriptors, hog.winSize));
+
+    Mat cpu_desc(cpu_descriptors), gpu_desc(gpu_descriptors);
+
+    EXPECT_MAT_SIMILAR(cpu_desc, gpu_desc, 1e-1);
 }
 
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrDown, Combine(
-                            Values(CV_8U, CV_16U, CV_16S, CV_32F),
-                            Values(1, 3, 4)));
-
-/////////////////////// PyrUp //////////////////////////
-
-typedef PyrBase PyrUp;
-
-OCL_TEST_P(PyrUp, Accuracy)
+OCL_TEST_P(HOG, Detect)
 {
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        Size size(MWIDTH, MHEIGHT);
-        Mat src = randomMat(size, CV_MAKETYPE(depth, channels), 0, 255);
-        ocl::oclMat gsrc(src);
+    HOGDescriptor hog;
+    hog.winSize = winSize;
+    hog.gammaCorrection = true;
 
-        pyrUp(src, dst_cpu);
-        ocl::pyrUp(gsrc, gdst);
+    if (winSize.width == 48 && winSize.height == 96)
+        hog.setSVMDetector(hog.getDaimlerPeopleDetector());
+    else
+        hog.setSVMDetector(hog.getDefaultPeopleDetector());
 
-        EXPECT_MAT_NEAR(dst_cpu, Mat(gdst), (depth == CV_32F ? 1e-4f : 1.0));
-    }
+    std::vector<Rect> cpu_found;
+    std::vector<Rect> gpu_found;
+
+    OCL_OFF(hog.detectMultiScale(img, cpu_found, 0, Size(8, 8), Size(0, 0), 1.05, 6));
+    OCL_ON(hog.detectMultiScale(uimg, gpu_found, 0, Size(8, 8), Size(0, 0), 1.05, 6));
+
+    EXPECT_LT(checkRectSimilarity(img.size(), cpu_found, gpu_found), 1.0);
 }
 
+INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
+                            testing::Values(Size(64, 128), Size(48, 96)),
+                            testing::Values( MatType(CV_8UC1) ) ) );
 
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, PyrUp, Combine(
-                            Values(CV_8U, CV_16U, CV_16S, CV_32F),
-                            Values(1, 3, 4)));
-#endif // HAVE_OPENCL
+}}
+#endif
diff --git a/modules/objdetect/test/test_cascadeandhog.cpp b/modules/objdetect/test/test_cascadeandhog.cpp
index a30109905..746a48ca9 100644
--- a/modules/objdetect/test/test_cascadeandhog.cpp
+++ b/modules/objdetect/test/test_cascadeandhog.cpp
@@ -257,6 +257,7 @@ int CV_DetectorTest::runTestCase( int detectorIdx, vector<vector<Rect> >& object
     string dataPath = ts->get_data_path(), detectorFilename;
     if( !detectorFilenames[detectorIdx].empty() )
         detectorFilename = dataPath + detectorFilenames[detectorIdx];
+    printf("detector %s\n", detectorFilename.c_str());
 
     for( int ii = 0; ii < (int)imageFilenames.size(); ++ii )
     {
@@ -573,7 +574,7 @@ public:
         Size winStride = Size(), Size padding = Size(),
         const vector<Point>& locations = vector<Point>()) const;
 
-    virtual void compute(const Mat& img, vector<float>& descriptors,
+    virtual void compute(InputArray img, vector<float>& descriptors,
         Size winStride = Size(), Size padding = Size(),
         const vector<Point>& locations = vector<Point>()) const;
 
@@ -1106,9 +1107,11 @@ void HOGDescriptorTester::detect(const Mat& img, vector<Point>& hits, double hit
     detect(img, hits, weightsV, hitThreshold, winStride, padding, locations);
 }
 
-void HOGDescriptorTester::compute(const Mat& img, vector<float>& descriptors,
+void HOGDescriptorTester::compute(InputArray _img, vector<float>& descriptors,
     Size winStride, Size padding, const vector<Point>& locations) const
 {
+    Mat img = _img.getMat();
+
     if( winStride == Size() )
         winStride = cellSize;
     Size cacheStride(gcd(winStride.width, blockStride.width),
diff --git a/modules/ocl/CMakeLists.txt b/modules/ocl/CMakeLists.txt
deleted file mode 100644
index db9d64e1e..000000000
--- a/modules/ocl/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-if(NOT HAVE_OPENCL)
-  ocv_module_disable(ocl)
-  return()
-endif()
-
-set(the_description "OpenCL-accelerated Computer Vision")
-ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_calib3d opencv_ml "${OPENCL_LIBRARIES}")
-if(TARGET opencv_test_ocl)
-  target_link_libraries(opencv_test_ocl "${OPENCL_LIBRARIES}")
-endif()
-ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
diff --git a/modules/ocl/doc/camera_calibration_and_3D_reconstruction.rst b/modules/ocl/doc/camera_calibration_and_3D_reconstruction.rst
deleted file mode 100644
index 824366927..000000000
--- a/modules/ocl/doc/camera_calibration_and_3D_reconstruction.rst
+++ /dev/null
@@ -1,328 +0,0 @@
-Camera Calibration and 3D Reconstruction
-========================================
-
-.. highlight:: cpp
-
-
-
-ocl::StereoBM_OCL
----------------------
-.. ocv:class:: ocl::StereoBM_OCL
-
-Class computing stereo correspondence (disparity map) using the block matching algorithm. ::
-
-    class CV_EXPORTS StereoBM_OCL
-    {
-    public:
-        enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
-
-        enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
-
-        //! the default constructor
-        StereoBM_OCL();
-        //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be multiple of 8.
-        StereoBM_OCL(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);
-
-        //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair
-        //! Output disparity has CV_8U type.
-        void operator() ( const oclMat &left, const oclMat &right, oclMat &disparity);
-
-        //! Some heuristics that tries to estmate
-        // if current GPU will be faster then CPU in this algorithm.
-        // It queries current active device.
-        static bool checkIfGpuCallReasonable();
-
-        int preset;
-        int ndisp;
-        int winSize;
-
-        // If avergeTexThreshold  == 0 => post procesing is disabled
-        // If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image
-        // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold
-        // i.e. input left image is low textured.
-        float avergeTexThreshold;
-    private:
-        /* hidden */
-    };
-
-
-The class also performs pre- and post-filtering steps: Sobel pre-filtering (if ``PREFILTER_XSOBEL`` flag is set) and low textureness filtering (if ``averageTexThreshols > 0`` ). If ``avergeTexThreshold = 0`` , low textureness filtering is disabled. Otherwise, the disparity is set to 0 in each point ``(x, y)`` , where for the left image
-
-.. math::
-    \sum HorizontalGradiensInWindow(x, y, winSize) < (winSize \cdot winSize) \cdot avergeTexThreshold
-
-This means that the input left image is low textured.
-
-
-ocl::StereoBM_OCL::StereoBM_OCL
------------------------------------
-Enables :ocv:class:`ocl::StereoBM_OCL` constructors.
-
-.. ocv:function:: ocl::StereoBM_OCL::StereoBM_OCL()
-
-.. ocv:function:: ocl::StereoBM_OCL::StereoBM_OCL(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ)
-
-    :param preset: Parameter presetting:
-
-        * **BASIC_PRESET** Basic mode without pre-processing.
-
-        * **PREFILTER_XSOBEL** Sobel pre-filtering mode.
-
-    :param ndisparities: Number of disparities. It must be a multiple of 8 and less or equal to 256.
-
-    :param winSize: Block size.
-
-
-
-ocl::StereoBM_OCL::operator ()
-----------------------------------
-Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
-
-.. ocv:function:: void ocl::StereoBM_OCL::operator ()(const oclMat& left, const oclMat& right, oclMat& disparity)
-
-    :param left: Left image. Only  ``CV_8UC1``  type is supported.
-
-    :param right: Right image with the same size and the same type as the left one.
-
-    :param disparity: Output disparity map. It is a  ``CV_8UC1``  image with the same size as the input images.
-
-
-ocl::StereoBM_OCL::checkIfGpuCallReasonable
------------------------------------------------
-Uses a heuristic method to estimate whether the current GPU is faster than the CPU in this algorithm. It queries the currently active device.
-
-.. ocv:function:: bool ocl::StereoBM_OCL::checkIfGpuCallReasonable()
-
-ocl::StereoBeliefPropagation
---------------------------------
-.. ocv:class:: ocl::StereoBeliefPropagation
-
-Class computing stereo correspondence using the belief propagation algorithm. ::
-
-    class CV_EXPORTS StereoBeliefPropagation
-    {
-    public:
-        enum { DEFAULT_NDISP  = 64 };
-        enum { DEFAULT_ITERS  = 5  };
-        enum { DEFAULT_LEVELS = 5  };
-        static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels);
-        explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,
-                                         int iters  = DEFAULT_ITERS,
-                                         int levels = DEFAULT_LEVELS,
-                                         int msg_type = CV_16S);
-        StereoBeliefPropagation(int ndisp, int iters, int levels,
-                                float max_data_term, float data_weight,
-                                float max_disc_term, float disc_single_jump,
-                                int msg_type = CV_32F);
-        void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
-        void operator()(const oclMat &data, oclMat &disparity);
-        int ndisp;
-        int iters;
-        int levels;
-        float max_data_term;
-        float data_weight;
-        float max_disc_term;
-        float disc_single_jump;
-        int msg_type;
-    private:
-        /* hidden */
-    };
-
-The class implements algorithm described in [Felzenszwalb2006]_ . It can compute own data cost (using a truncated linear model) or use a user-provided data cost.
-
-.. note::
-
-    ``StereoBeliefPropagation`` requires a lot of memory for message storage:
-
-    .. math::
-
-        width \_ step  \cdot height  \cdot ndisp  \cdot 4  \cdot (1 + 0.25)
-
-    and for data cost storage:
-
-    .. math::
-
-        width\_step \cdot height \cdot ndisp \cdot (1 + 0.25 + 0.0625 +  \dotsm + \frac{1}{4^{levels}})
-
-    ``width_step`` is the number of bytes in a line including padding.
-
-
-
-ocl::StereoBeliefPropagation::StereoBeliefPropagation
----------------------------------------------------------
-Enables the :ocv:class:`ocl::StereoBeliefPropagation` constructors.
-
-.. ocv:function:: ocl::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int msg_type = CV_16S)
-
-.. ocv:function:: ocl::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp, int iters, int levels, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int msg_type = CV_32F)
-
-    :param ndisp: Number of disparities.
-
-    :param iters: Number of BP iterations on each level.
-
-    :param levels: Number of levels.
-
-    :param max_data_term: Threshold for data cost truncation.
-
-    :param data_weight: Data weight.
-
-    :param max_disc_term: Threshold for discontinuity truncation.
-
-    :param disc_single_jump: Discontinuity single jump.
-
-    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
-
-``StereoBeliefPropagation`` uses a truncated linear model for the data cost and discontinuity terms:
-
-.. math::
-
-    DataCost = data \_ weight  \cdot \min ( \lvert Img_Left(x,y)-Img_Right(x-d,y)  \rvert , max \_ data \_ term)
-
-.. math::
-
-    DiscTerm =  \min (disc \_ single \_ jump  \cdot \lvert f_1-f_2  \rvert , max \_ disc \_ term)
-
-For more details, see [Felzenszwalb2006]_.
-
-By default, :ocv:class:`ocl::StereoBeliefPropagation` uses floating-point arithmetics and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetics and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
-
-.. math::
-
-    10  \cdot 2^{levels-1}  \cdot max \_ data \_ term < SHRT \_ MAX
-
-
-
-ocl::StereoBeliefPropagation::estimateRecommendedParams
------------------------------------------------------------
-Uses a heuristic method to compute the recommended parameters ( ``ndisp``, ``iters`` and ``levels`` ) for the specified image size ( ``width`` and ``height`` ).
-
-.. ocv:function:: void ocl::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
-
-
-
-ocl::StereoBeliefPropagation::operator ()
----------------------------------------------
-Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair or data cost.
-
-.. ocv:function:: void ocl::StereoBeliefPropagation::operator ()(const oclMat& left, const oclMat& right, oclMat& disparity)
-
-.. ocv:function:: void ocl::StereoBeliefPropagation::operator ()(const oclMat& data, oclMat& disparity)
-
-    :param left: Left image. ``CV_8UC1`` , ``CV_8UC3``  and  ``CV_8UC4``  types are supported.
-
-    :param right: Right image with the same size and the same type as the left one.
-
-    :param data: User-specified data cost, a matrix of ``msg_type`` type and ``Size(<image columns>*ndisp, <image rows>)`` size.
-
-    :param disparity: Output disparity map. If  ``disparity``  is empty, the output type is  ``CV_16SC1`` . Otherwise, the type is retained.
-
-ocl::StereoConstantSpaceBP
-------------------------------
-.. ocv:class:: ocl::StereoConstantSpaceBP
-
-Class computing stereo correspondence using the constant space belief propagation algorithm. ::
-
-    class CV_EXPORTS StereoConstantSpaceBP
-    {
-    public:
-        enum { DEFAULT_NDISP    = 128 };
-        enum { DEFAULT_ITERS    = 8   };
-        enum { DEFAULT_LEVELS   = 4   };
-        enum { DEFAULT_NR_PLANE = 4   };
-        static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane);
-        explicit StereoConstantSpaceBP(
-            int ndisp    = DEFAULT_NDISP,
-            int iters    = DEFAULT_ITERS,
-            int levels   = DEFAULT_LEVELS,
-            int nr_plane = DEFAULT_NR_PLANE,
-            int msg_type = CV_32F);
-        StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
-            float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
-            int min_disp_th = 0,
-            int msg_type = CV_32F);
-        void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
-        int ndisp;
-        int iters;
-        int levels;
-        int nr_plane;
-        float max_data_term;
-        float data_weight;
-        float max_disc_term;
-        float disc_single_jump;
-        int min_disp_th;
-        int msg_type;
-        bool use_local_init_data_cost;
-    private:
-        /* hidden */
-    };
-
-The class implements algorithm described in [Yang2010]_. ``StereoConstantSpaceBP`` supports both local minimum and global minimum data cost initialization algorithms. For more details, see the paper mentioned above. By default, a local algorithm is used. To enable a global algorithm, set ``use_local_init_data_cost`` to ``false`` .
-
-
-ocl::StereoConstantSpaceBP::StereoConstantSpaceBP
------------------------------------------------------
-Enables the :ocv:class:`ocl::StereoConstantSpaceBP` constructors.
-
-.. ocv:function:: ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int nr_plane = DEFAULT_NR_PLANE, int msg_type = CV_32F)
-
-.. ocv:function:: ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th = 0, int msg_type = CV_32F)
-
-    :param ndisp: Number of disparities.
-
-    :param iters: Number of BP iterations on each level.
-
-    :param levels: Number of levels.
-
-    :param nr_plane: Number of disparity levels on the first level.
-
-    :param max_data_term: Truncation of data cost.
-
-    :param data_weight: Data weight.
-
-    :param max_disc_term: Truncation of discontinuity.
-
-    :param disc_single_jump: Discontinuity single jump.
-
-    :param min_disp_th: Minimal disparity threshold.
-
-    :param msg_type: Type for messages.  ``CV_16SC1``  and  ``CV_32FC1`` types are supported.
-
-``StereoConstantSpaceBP`` uses a truncated linear model for the data cost and discontinuity terms:
-
-.. math::
-
-    DataCost = data \_ weight  \cdot \min ( \lvert I_2-I_1  \rvert , max \_ data \_ term)
-
-.. math::
-
-    DiscTerm =  \min (disc \_ single \_ jump  \cdot \lvert f_1-f_2  \rvert , max \_ disc \_ term)
-
-For more details, see [Yang2010]_.
-
-By default, ``StereoConstantSpaceBP`` uses floating-point arithmetics and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetics and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
-
-.. math::
-
-    10  \cdot 2^{levels-1}  \cdot max \_ data \_ term < SHRT \_ MAX
-
-
-
-ocl::StereoConstantSpaceBP::estimateRecommendedParams
----------------------------------------------------------
-Uses a heuristic method to compute parameters (ndisp, iters, levelsand nrplane) for the specified image size (widthand height).
-
-.. ocv:function:: void ocl::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane)
-
-
-
-ocl::StereoConstantSpaceBP::operator ()
--------------------------------------------
-Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
-
-.. ocv:function:: void ocl::StereoConstantSpaceBP::operator ()(const oclMat& left, const oclMat& right, oclMat& disparity)
-
-    :param left: Left image. ``CV_8UC1`` , ``CV_8UC3``  and  ``CV_8UC4``  types are supported.
-
-    :param right: Right image with the same size and the same type as the left one.
-
-    :param disparity: Output disparity map. If  ``disparity``  is empty, the output type is  ``CV_16SC1`` . Otherwise, the output type is  ``disparity.type()`` .
diff --git a/modules/ocl/doc/data_structures.rst b/modules/ocl/doc/data_structures.rst
deleted file mode 100644
index bde3d14af..000000000
--- a/modules/ocl/doc/data_structures.rst
+++ /dev/null
@@ -1,189 +0,0 @@
-Data Structures
-=============================
-
-.. ocv:class:: ocl::oclMat
-
-OpenCV C++ 1-D or 2-D dense array class ::
-
-    class CV_EXPORTS oclMat
-    {
-    public:
-        //! default constructor
-        oclMat();
-        //! constructs oclMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
-        oclMat(int rows, int cols, int type);
-        oclMat(Size size, int type);
-        //! constucts oclMatrix and fills it with the specified value _s.
-        oclMat(int rows, int cols, int type, const Scalar &s);
-        oclMat(Size size, int type, const Scalar &s);
-        //! copy constructor
-        oclMat(const oclMat &m);
-
-        //! constructor for oclMatrix headers pointing to user-allocated data
-        oclMat(int rows, int cols, int type, void *data, size_t step = Mat::AUTO_STEP);
-        oclMat(Size size, int type, void *data, size_t step = Mat::AUTO_STEP);
-
-        //! creates a matrix header for a part of the bigger matrix
-        oclMat(const oclMat &m, const Range &rowRange, const Range &colRange);
-        oclMat(const oclMat &m, const Rect &roi);
-
-        //! builds oclMat from Mat. Perfom blocking upload to device.
-        explicit oclMat (const Mat &m);
-
-        //! destructor - calls release()
-        ~oclMat();
-
-        //! assignment operators
-        oclMat &operator = (const oclMat &m);
-        //! assignment operator. Perfom blocking upload to device.
-        oclMat &operator = (const Mat &m);
-        oclMat &operator = (const oclMatExpr& expr);
-
-        //! pefroms blocking upload data to oclMat.
-        void upload(const cv::Mat &m);
-
-
-        //! downloads data from device to host memory. Blocking calls.
-        operator Mat() const;
-        void download(cv::Mat &m) const;
-
-        //! convert to _InputArray
-        operator _InputArray();
-
-        //! convert to _OutputArray
-        operator _OutputArray();
-
-        //! returns a new oclMatrix header for the specified row
-        oclMat row(int y) const;
-        //! returns a new oclMatrix header for the specified column
-        oclMat col(int x) const;
-        //! ... for the specified row span
-        oclMat rowRange(int startrow, int endrow) const;
-        oclMat rowRange(const Range &r) const;
-        //! ... for the specified column span
-        oclMat colRange(int startcol, int endcol) const;
-        oclMat colRange(const Range &r) const;
-
-        //! returns deep copy of the oclMatrix, i.e. the data is copied
-        oclMat clone() const;
-
-        //! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
-        // It calls m.create(this->size(), this->type()).
-        // It supports any data type
-        void copyTo( oclMat &m, const oclMat &mask = oclMat()) const;
-
-        //! converts oclMatrix to another datatype with optional scalng. See cvConvertScale.
-        void convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const;
-
-        void assignTo( oclMat &m, int type = -1 ) const;
-
-        //! sets every oclMatrix element to s
-        oclMat& operator = (const Scalar &s);
-        //! sets some of the oclMatrix elements to s, according to the mask
-        oclMat& setTo(const Scalar &s, const oclMat &mask = oclMat());
-        //! creates alternative oclMatrix header for the same data, with different
-        // number of channels and/or different number of rows. see cvReshape.
-        oclMat reshape(int cn, int rows = 0) const;
-
-        //! allocates new oclMatrix data unless the oclMatrix already has specified size and type.
-        // previous data is unreferenced if needed.
-        void create(int rows, int cols, int type);
-        void create(Size size, int type);
-
-        //! allocates new oclMatrix with specified device memory type.
-        void createEx(int rows, int cols, int type, DevMemRW rw_type, DevMemType mem_type);
-        void createEx(Size size, int type, DevMemRW rw_type, DevMemType mem_type);
-
-        //! decreases reference counter;
-        // deallocate the data when reference counter reaches 0.
-        void release();
-
-        //! swaps with other smart pointer
-        void swap(oclMat &mat);
-
-        //! locates oclMatrix header within a parent oclMatrix. See below
-        void locateROI( Size &wholeSize, Point &ofs ) const;
-        //! moves/resizes the current oclMatrix ROI inside the parent oclMatrix.
-        oclMat& adjustROI( int dtop, int dbottom, int dleft, int dright );
-        //! extracts a rectangular sub-oclMatrix
-        // (this is a generalized form of row, rowRange etc.)
-        oclMat operator()( Range rowRange, Range colRange ) const;
-        oclMat operator()( const Rect &roi ) const;
-
-        oclMat& operator+=( const oclMat& m );
-        oclMat& operator-=( const oclMat& m );
-        oclMat& operator*=( const oclMat& m );
-        oclMat& operator/=( const oclMat& m );
-
-        //! returns true if the oclMatrix data is continuous
-        // (i.e. when there are no gaps between successive rows).
-        // similar to CV_IS_oclMat_CONT(cvoclMat->type)
-        bool isContinuous() const;
-        //! returns element size in bytes,
-        // similar to CV_ELEM_SIZE(cvMat->type)
-        size_t elemSize() const;
-        //! returns the size of element channel in bytes.
-        size_t elemSize1() const;
-        //! returns element type, similar to CV_MAT_TYPE(cvMat->type)
-        int type() const;
-        //! returns element type, i.e. 8UC3 returns 8UC4 because in ocl
-        //! 3 channels element actually use 4 channel space
-        int ocltype() const;
-        //! returns element type, similar to CV_MAT_DEPTH(cvMat->type)
-        int depth() const;
-        //! returns element type, similar to CV_MAT_CN(cvMat->type)
-        int channels() const;
-        //! returns element type, return 4 for 3 channels element,
-        //!becuase 3 channels element actually use 4 channel space
-        int oclchannels() const;
-        //! returns step/elemSize1()
-        size_t step1() const;
-        //! returns oclMatrix size:
-        // width == number of columns, height == number of rows
-        Size size() const;
-        //! returns true if oclMatrix data is NULL
-        bool empty() const;
-
-        //! matrix transposition
-        oclMat t() const;
-
-        /*! includes several bit-fields:
-          - the magic signature
-          - continuity flag
-          - depth
-          - number of channels
-          */
-        int flags;
-        //! the number of rows and columns
-        int rows, cols;
-        //! a distance between successive rows in bytes; includes the gap if any
-        size_t step;
-        //! pointer to the data(OCL memory object)
-        uchar *data;
-
-        //! pointer to the reference counter;
-        // when oclMatrix points to user-allocated data, the pointer is NULL
-        int *refcount;
-
-        //! helper fields used in locateROI and adjustROI
-        //datastart and dataend are not used in current version
-        uchar *datastart;
-        uchar *dataend;
-
-        //! OpenCL context associated with the oclMat object.
-        Context *clCxt;
-        //add offset for handle ROI, calculated in byte
-        int offset;
-        //add wholerows and wholecols for the whole matrix, datastart and dataend are no longer used
-        int wholerows;
-        int wholecols;
-    };
-
-Basically speaking, the ``oclMat`` is the mirror of ``Mat`` with the extension of OCL feature, the members have the same meaning and useage of ``Mat`` except following:
-
-* ``datastart`` and ``dataend`` are replaced with ``wholerows`` and ``wholecols``
-
-* Only basic flags are supported in ``oclMat`` (i.e. depth number of channels)
-
-* All the 3-channel matrix (i.e. RGB image) are represented by 4-channel matrix in ``oclMat``. It means 3-channel image have 4-channel space with the last channel unused. We provide a transparent interface to handle the difference between OpenCV ``Mat`` and ``oclMat``.
-    For example: If a ``oclMat`` has 3 channels, ``channels()`` returns 3 and ``oclchannels()`` returns 4
diff --git a/modules/ocl/doc/feature_detection_and_description.rst b/modules/ocl/doc/feature_detection_and_description.rst
deleted file mode 100644
index b93d32f1a..000000000
--- a/modules/ocl/doc/feature_detection_and_description.rst
+++ /dev/null
@@ -1,649 +0,0 @@
-Feature Detection And Description
-=================================
-
-.. highlight:: cpp
-
-ocl::Canny
--------------------
-Finds edges in an image using the [Canny86]_ algorithm.
-
-.. ocv:function:: void ocl::Canny(const oclMat& image, oclMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false)
-
-.. ocv:function:: void ocl::Canny(const oclMat& image, CannyBuf& buf, oclMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false)
-
-.. ocv:function:: void ocl::Canny(const oclMat& dx, const oclMat& dy, oclMat& edges, double low_thresh, double high_thresh, bool L2gradient = false)
-
-.. ocv:function:: void ocl::Canny(const oclMat& dx, const oclMat& dy, CannyBuf& buf, oclMat& edges, double low_thresh, double high_thresh, bool L2gradient = false)
-
-    :param image: Single-channel 8-bit input image.
-
-    :param dx: First derivative of image in the vertical direction. Support only ``CV_32S`` type.
-
-    :param dy: First derivative of image in the horizontal direction. Support only ``CV_32S`` type.
-
-    :param edges: Output edge map. It has the same size and type as  ``image`` .
-
-    :param low_thresh: First threshold for the hysteresis procedure.
-
-    :param high_thresh: Second threshold for the hysteresis procedure.
-
-    :param apperture_size: Aperture size for the  :ocv:func:`Sobel`  operator.
-
-    :param L2gradient: Flag indicating whether a more accurate  :math:`L_2`  norm  :math:`=\sqrt{(dI/dx)^2 + (dI/dy)^2}`  should be used to compute the image gradient magnitude ( ``L2gradient=true`` ), or a faster default  :math:`L_1`  norm  :math:`=|dI/dx|+|dI/dy|`  is enough ( ``L2gradient=false`` ).
-
-    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
-
-.. seealso:: :ocv:func:`Canny`
-
-
-ocl::BruteForceMatcher_OCL_base
------------------------------------
-.. ocv:class:: ocl::BruteForceMatcher_OCL_base
-
-Brute-force descriptor matcher. For each descriptor in the first set, this matcher finds the closest descriptor in the second set by trying each one. This descriptor matcher supports masking permissible matches between descriptor sets. ::
-
-    class BruteForceMatcher_OCL_base
-    {
-    public:
-            enum DistType {L1Dist = 0, L2Dist, HammingDist};
-
-        // Add descriptors to train descriptor collection.
-        void add(const std::vector<oclMat>& descCollection);
-
-        // Get train descriptors collection.
-        const std::vector<oclMat>& getTrainDescriptors() const;
-
-        // Clear train descriptors collection.
-        void clear();
-
-        // Return true if there are no train descriptors in collection.
-        bool empty() const;
-
-        // Return true if the matcher supports mask in match methods.
-        bool isMaskSupported() const;
-
-        void matchSingle(const oclMat& query, const oclMat& train,
-            oclMat& trainIdx, oclMat& distance,
-            const oclMat& mask = oclMat());
-
-        static void matchDownload(const oclMat& trainIdx,
-            const oclMat& distance, std::vector<DMatch>& matches);
-        static void matchConvert(const Mat& trainIdx,
-            const Mat& distance, std::vector<DMatch>& matches);
-
-        void match(const oclMat& query, const oclMat& train,
-            std::vector<DMatch>& matches, const oclMat& mask = oclMat());
-
-        void makeGpuCollection(oclMat& trainCollection, oclMat& maskCollection,
-            const vector<oclMat>& masks = std::vector<oclMat>());
-
-        void matchCollection(const oclMat& query, const oclMat& trainCollection,
-            oclMat& trainIdx, oclMat& imgIdx, oclMat& distance,
-            const oclMat& maskCollection);
-
-        static void matchDownload(const oclMat& trainIdx, oclMat& imgIdx,
-            const oclMat& distance, std::vector<DMatch>& matches);
-        static void matchConvert(const Mat& trainIdx, const Mat& imgIdx,
-            const Mat& distance, std::vector<DMatch>& matches);
-
-        void match(const oclMat& query, std::vector<DMatch>& matches,
-            const std::vector<oclMat>& masks = std::vector<oclMat>());
-
-        void knnMatchSingle(const oclMat& query, const oclMat& train,
-            oclMat& trainIdx, oclMat& distance, oclMat& allDist, int k,
-            const oclMat& mask = oclMat());
-
-        static void knnMatchDownload(const oclMat& trainIdx, const oclMat& distance,
-            std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-        static void knnMatchConvert(const Mat& trainIdx, const Mat& distance,
-            std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-
-        void knnMatch(const oclMat& query, const oclMat& train,
-            std::vector< std::vector<DMatch> >& matches, int k,
-            const oclMat& mask = oclMat(), bool compactResult = false);
-
-        void knnMatch2Collection(const oclMat& query, const oclMat& trainCollection,
-            oclMat& trainIdx, oclMat& imgIdx, oclMat& distance,
-            const oclMat& maskCollection = oclMat());
-
-        static void knnMatch2Download(const oclMat& trainIdx, const oclMat& imgIdx, const oclMat& distance,
-            std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-        static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
-            std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-
-        void knnMatch(const oclMat& query, std::vector< std::vector<DMatch> >& matches, int k,
-            const std::vector<oclMat>& masks = std::vector<oclMat>(),
-            bool compactResult = false);
-
-        void radiusMatchSingle(const oclMat& query, const oclMat& train,
-            oclMat& trainIdx, oclMat& distance, oclMat& nMatches, float maxDistance,
-            const oclMat& mask = oclMat());
-
-        static void radiusMatchDownload(const oclMat& trainIdx, const oclMat& distance, const oclMat& nMatches,
-            std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-        static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
-            std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-
-        void radiusMatch(const oclMat& query, const oclMat& train,
-            std::vector< std::vector<DMatch> >& matches, float maxDistance,
-            const oclMat& mask = oclMat(), bool compactResult = false);
-
-        void radiusMatchCollection(const oclMat& query, oclMat& trainIdx, oclMat& imgIdx, oclMat& distance, oclMat& nMatches, float maxDistance,
-            const std::vector<oclMat>& masks = std::vector<oclMat>());
-
-        static void radiusMatchDownload(const oclMat& trainIdx, const oclMat& imgIdx, const oclMat& distance, const oclMat& nMatches,
-            std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-        static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,
-            std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
-
-        void radiusMatch(const oclMat& query, std::vector< std::vector<DMatch> >& matches, float maxDistance,
-            const std::vector<oclMat>& masks = std::vector<oclMat>(), bool compactResult = false);
-
-                DistType distType;
-
-    private:
-        std::vector<oclMat> trainDescCollection;
-    };
-
-
-The class ``BruteForceMatcher_OCL_base`` has an interface similar to the class :ocv:class:`DescriptorMatcher`. It has two groups of ``match`` methods: for matching descriptors of one image with another image or with an image set. Also, all functions have an alternative to save results either to the GPU memory or to the CPU memory. ``BruteForceMatcher_OCL_base`` supports only the ``L1<float>``, ``L2<float>``, and ``Hamming`` distance types.
-
-.. seealso:: :ocv:class:`DescriptorMatcher`, :ocv:class:`BFMatcher`
-
-
-
-ocl::BruteForceMatcher_OCL_base::match
-------------------------------------------
-Finds the best match for each descriptor from a query set with train descriptors.
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::match(const oclMat& query, const oclMat& train, std::vector<DMatch>& matches, const oclMat& mask = oclMat())
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::matchSingle(const oclMat& query, const oclMat& train, oclMat& trainIdx, oclMat& distance, const oclMat& mask = oclMat())
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::match(const oclMat& query, std::vector<DMatch>& matches, const std::vector<oclMat>& masks = std::vector<oclMat>())
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::matchCollection( const oclMat& query, const oclMat& trainCollection, oclMat& trainIdx, oclMat& imgIdx, oclMat& distance, const oclMat& masks=oclMat() )
-
-.. seealso:: :ocv:func:`DescriptorMatcher::match`
-
-
-
-ocl::BruteForceMatcher_OCL_base::makeGpuCollection
-------------------------------------------------------
-Performs a GPU collection of train descriptors and masks in a suitable format for the :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchCollection` function.
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::makeGpuCollection(oclMat& trainCollection, oclMat& maskCollection, const vector<oclMat>& masks = std::vector<oclMat>())
-
-
-ocl::BruteForceMatcher_OCL_base::matchDownload
---------------------------------------------------
-Downloads matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchCollection` to vector with :ocv:class:`DMatch`.
-
-.. ocv:function:: static void ocl::BruteForceMatcher_OCL_base::matchDownload( const oclMat& trainIdx, const oclMat& distance, std::vector<DMatch>& matches )
-
-.. ocv:function:: static void ocl::BruteForceMatcher_OCL_base::matchDownload( const oclMat& trainIdx, const oclMat& imgIdx, const oclMat& distance, std::vector<DMatch>& matches )
-
-
-ocl::BruteForceMatcher_OCL_base::matchConvert
--------------------------------------------------
-Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::matchCollection` to vector with :ocv:class:`DMatch`.
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>&matches)
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>&matches)
-
-
-
-ocl::BruteForceMatcher_OCL_base::knnMatch
----------------------------------------------
-Finds the ``k`` best matches for each descriptor from a query set with train descriptors.
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat& query, const oclMat& train, std::vector< std::vector<DMatch> >&matches, int k, const oclMat& mask = oclMat(), bool compactResult = false)
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat& query, const oclMat& train, oclMat& trainIdx, oclMat& distance, oclMat& allDist, int k, const oclMat& mask = oclMat())
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat& query, std::vector< std::vector<DMatch> >&matches, int k, const std::vector<oclMat>&masks = std::vector<oclMat>(), bool compactResult = false )
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatch2Collection(const oclMat& query, const oclMat& trainCollection, oclMat& trainIdx, oclMat& imgIdx, oclMat& distance, const oclMat& maskCollection = oclMat())
-
-    :param query: Query set of descriptors.
-
-    :param train: Training set of descriptors. It is not be added to train descriptors collection stored in the class object.
-
-    :param k: Number of the best matches per each query descriptor (or less if it is not possible).
-
-    :param mask: Mask specifying permissible matches between the input query and train matrices of descriptors.
-
-    :param compactResult: If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
-
-
-The function returns detected ``k`` (or less if not possible) matches in the increasing order by distance.
-
-The third variant of the method stores the results in GPU memory.
-
-.. seealso:: :ocv:func:`DescriptorMatcher::knnMatch`
-
-
-
-ocl::BruteForceMatcher_OCL_base::knnMatchDownload
------------------------------------------------------
-Downloads matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::knnMatchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::knnMatch2Collection` to vector with :ocv:class:`DMatch`.
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatchDownload(const oclMat& trainIdx, const oclMat& distance, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatch2Download(const oclMat& trainIdx, const oclMat& imgIdx, const oclMat& distance, std::vector< std::vector<DMatch> >& matches, bool compactResult = false)
-
-If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
-
-
-
-ocl::BruteForceMatcher_OCL_base::knnMatchConvert
-----------------------------------------------------
-Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::knnMatchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::knnMatch2Collection` to CPU vector with :ocv:class:`DMatch`.
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatchConvert(const Mat& trainIdx, const Mat& distance, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector< std::vector<DMatch> >& matches, bool compactResult = false)
-
-If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
-
-
-
-ocl::BruteForceMatcher_OCL_base::radiusMatch
-------------------------------------------------
-For each query descriptor, finds the best matches with a distance less than a given threshold.
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat& query, const oclMat& train, std::vector< std::vector<DMatch> >&matches, float maxDistance, const oclMat& mask = oclMat(), bool compactResult = false)
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat& query, const oclMat& train, oclMat& trainIdx, oclMat& distance, oclMat& nMatches, float maxDistance, const oclMat& mask = oclMat())
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat& query, std::vector< std::vector<DMatch> >&matches, float maxDistance, const std::vector<oclMat>& masks = std::vector<oclMat>(), bool compactResult = false)
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatchCollection(const oclMat& query, oclMat& trainIdx, oclMat& imgIdx, oclMat& distance, oclMat& nMatches, float maxDistance, const std::vector<oclMat>& masks = std::vector<oclMat>())
-
-    :param query: Query set of descriptors.
-
-    :param train: Training set of descriptors. It is not added to train descriptors collection stored in the class object.
-
-    :param maxDistance: Distance threshold.
-
-    :param mask: Mask specifying permissible matches between the input query and train matrices of descriptors.
-
-    :param compactResult: If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
-
-
-The function returns detected matches in the increasing order by distance.
-
-The methods work only on devices with the compute capability  :math:`>=` 1.1.
-
-The third variant of the method stores the results in GPU memory and does not store the points by the distance.
-
-.. seealso:: :ocv:func:`DescriptorMatcher::radiusMatch`
-
-
-
-ocl::BruteForceMatcher_OCL_base::radiusMatchDownload
---------------------------------------------------------
-Downloads matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiusMatchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiusMatchCollection` to vector with :ocv:class:`DMatch`.
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatchDownload(const oclMat& trainIdx, const oclMat& distance, const oclMat& nMatches, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatchDownload(const oclMat& trainIdx, const oclMat& imgIdx, const oclMat& distance, const oclMat& nMatches, std::vector< std::vector<DMatch> >& matches, bool compactResult = false)
-
-If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
-
-
-
-
-ocl::BruteForceMatcher_OCL_base::radiusMatchConvert
--------------------------------------------------------
-Converts matrices obtained via :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiusMatchSingle` or :ocv:func:`ocl::BruteForceMatcher_OCL_base::radiusMatchCollection` to vector with :ocv:class:`DMatch`.
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
-
-.. ocv:function:: void ocl::BruteForceMatcher_OCL_base::radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches, std::vector< std::vector<DMatch> >& matches, bool compactResult = false)
-
-If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
-
-
-ocl::FAST_OCL
-------------------
-.. ocv:class:: ocl::FAST_OCL
-
-Class used for corner detection using the FAST algorithm. ::
-
-        class CV_EXPORTS FAST_OCL
-        {
-        public:
-            enum
-            {
-                X_ROW = 0,
-                Y_ROW,
-                RESPONSE_ROW,
-                ROWS_COUNT
-            };
-
-            // all features have same size
-            static const int FEATURE_SIZE = 7;
-
-            explicit FAST_OCL(int threshold, bool nonmaxSupression = true, double keypointsRatio = 0.05);
-
-            //! finds the keypoints using FAST detector
-            //! supports only CV_8UC1 images
-            void operator ()(const oclMat& image, const oclMat& mask, oclMat& keypoints);
-            void operator ()(const oclMat& image, const oclMat& mask, std::vector<KeyPoint>& keypoints);
-
-            //! download keypoints from device to host memory
-            static void downloadKeypoints(const oclMat& d_keypoints, std::vector<KeyPoint>& keypoints);
-
-            //! convert keypoints to KeyPoint vector
-            static void convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints);
-
-            //! release temporary buffer's memory
-            void release();
-
-            bool nonmaxSupression;
-
-            int threshold;
-
-            //! max keypoints = keypointsRatio * img.size().area()
-            double keypointsRatio;
-
-            //! find keypoints and compute it's response if nonmaxSupression is true
-            //! return count of detected keypoints
-            int calcKeyPointsLocation(const oclMat& image, const oclMat& mask);
-
-            //! get final array of keypoints
-            //! performs nonmax supression if needed
-            //! return final count of keypoints
-            int getKeyPoints(oclMat& keypoints);
-
-        private:
-            // Hidden
-        };
-
-
-The class ``FAST_OCL`` implements FAST corner detection algorithm.
-
-.. seealso:: :ocv:func:`FAST`
-
-
-
-ocl::FAST_OCL::FAST_OCL
---------------------------
-Constructor.
-
-.. ocv:function:: ocl::FAST_OCL::FAST_OCL(int threshold, bool nonmaxSupression = true, double keypointsRatio = 0.05)
-
-    :param threshold: Threshold on difference between intensity of the central pixel and pixels on a circle around this pixel.
-
-    :param nonmaxSupression: If it is true, non-maximum suppression is applied to detected corners (keypoints).
-
-    :param keypointsRatio: Inner buffer size for keypoints store is determined as (keypointsRatio * image_width * image_height).
-
-
-
-ocl::FAST_OCL::operator ()
-----------------------------
-Finds the keypoints using FAST detector.
-
-.. ocv:function:: void ocl::FAST_OCL::operator ()(const oclMat& image, const oclMat& mask, oclMat& keypoints)
-.. ocv:function:: void ocl::FAST_OCL::operator ()(const oclMat& image, const oclMat& mask, std::vector<KeyPoint>& keypoints)
-
-    :param image: Image where keypoints (corners) are detected. Only 8-bit grayscale images are supported.
-
-    :param mask: Optional input mask that marks the regions where we should detect features.
-
-    :param keypoints: The output vector of keypoints. Can be stored both in host or device memory. For device memory:
-
-            * X_ROW of keypoints will contain the horizontal coordinate of the i'th point
-            * Y_ROW of keypoints will contain the vertical coordinate of the i'th point
-            * RESPONSE_ROW will contain response of i'th point (if non-maximum suppression is applied)
-
-
-
-ocl::FAST_OCL::downloadKeypoints
-----------------------------------
-Download keypoints from device to host memory.
-
-.. ocv:function:: void ocl::FAST_OCL::downloadKeypoints(const oclMat& d_keypoints, std::vector<KeyPoint>& keypoints)
-
-
-
-ocl::FAST_OCL::convertKeypoints
----------------------------------
-Converts keypoints from OpenCL representation to vector of ``KeyPoint``.
-
-.. ocv:function:: void ocl::FAST_OCL::convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints)
-
-
-
-ocl::FAST_OCL::release
-------------------------
-Releases inner buffer memory.
-
-.. ocv:function:: void ocl::FAST_OCL::release()
-
-
-
-ocl::FAST_OCL::calcKeyPointsLocation
---------------------------------------
-Find keypoints. If ``nonmaxSupression`` is true, responses are computed and eliminates keypoints with the smaller responses from 9-neighborhood regions.
-
-.. ocv:function:: int ocl::FAST_OCL::calcKeyPointsLocation(const oclMat& image, const oclMat& mask)
-
-    :param image: Image where keypoints (corners) are detected. Only 8-bit grayscale images are supported.
-
-    :param mask: Optional input mask that marks the regions where we should detect features.
-
-The function returns the amount of detected keypoints.
-
-
-
-ocl::FAST_OCL::getKeyPoints
------------------------------
-Gets final array of keypoints.
-
-.. ocv:function:: int ocl::FAST_OCL::getKeyPoints(oclMat& keypoints)
-
-    :param keypoints: The output vector of keypoints.
-
-The function performs non-max suppression if needed and returns the final amount of keypoints.
-
-
-
-ocl::HOGDescriptor
-----------------------
-
-.. ocv:struct:: ocl::HOGDescriptor
-
-The class implements Histogram of Oriented Gradients ([Dalal2005]_) object detector. ::
-
-    struct CV_EXPORTS HOGDescriptor
-    {
-        enum { DEFAULT_WIN_SIGMA = -1 };
-        enum { DEFAULT_NLEVELS = 64 };
-        enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
-
-        HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
-                      Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
-                      int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
-                      double threshold_L2hys=0.2, bool gamma_correction=true,
-                      int nlevels=DEFAULT_NLEVELS);
-
-        size_t getDescriptorSize() const;
-        size_t getBlockHistogramSize() const;
-
-        void setSVMDetector(const vector<float>& detector);
-
-        static vector<float> getDefaultPeopleDetector();
-        static vector<float> getPeopleDetector48x96();
-        static vector<float> getPeopleDetector64x128();
-
-        void detect(const oclMat& img, vector<Point>& found_locations,
-                    double hit_threshold=0, Size win_stride=Size(),
-                    Size padding=Size());
-
-        void detectMultiScale(const oclMat& img, vector<Rect>& found_locations,
-                              double hit_threshold=0, Size win_stride=Size(),
-                              Size padding=Size(), double scale0=1.05,
-                              int group_threshold=2);
-
-        void getDescriptors(const oclMat& img, Size win_stride,
-                            oclMat& descriptors,
-                            int descr_format=DESCR_FORMAT_COL_BY_COL);
-
-        Size win_size;
-        Size block_size;
-        Size block_stride;
-        Size cell_size;
-        int nbins;
-        double win_sigma;
-        double threshold_L2hys;
-        bool gamma_correction;
-        int nlevels;
-
-    private:
-        // Hidden
-    }
-
-
-Interfaces of all methods are kept similar to the ``CPU HOG`` descriptor and detector analogues as much as possible.
-
-.. note::
-
-   (Ocl) An example using the HOG descriptor can be found at opencv_source_code/samples/ocl/hog.cpp
-
-ocl::HOGDescriptor::HOGDescriptor
--------------------------------------
-Creates the ``HOG`` descriptor and detector.
-
-.. ocv:function:: ocl::HOGDescriptor::HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16), Size block_stride=Size(8, 8), Size cell_size=Size(8, 8), int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA, double threshold_L2hys=0.2, bool gamma_correction=true, int nlevels=DEFAULT_NLEVELS)
-
-   :param win_size: Detection window size. Align to block size and block stride.
-
-   :param block_size: Block size in pixels. Align to cell size. Only (16,16) is supported for now.
-
-   :param block_stride: Block stride. It must be a multiple of cell size.
-
-   :param cell_size: Cell size. Only (8, 8) is supported for now.
-
-   :param nbins: Number of bins. Only 9 bins per cell are supported for now.
-
-   :param win_sigma: Gaussian smoothing window parameter.
-
-   :param threshold_L2hys: L2-Hys normalization method shrinkage.
-
-   :param gamma_correction: Flag to specify whether the gamma correction preprocessing is required or not.
-
-   :param nlevels: Maximum number of detection window increases.
-
-
-
-ocl::HOGDescriptor::getDescriptorSize
------------------------------------------
-Returns the number of coefficients required for the classification.
-
-.. ocv:function:: size_t ocl::HOGDescriptor::getDescriptorSize() const
-
-
-
-ocl::HOGDescriptor::getBlockHistogramSize
----------------------------------------------
-Returns the block histogram size.
-
-.. ocv:function:: size_t ocl::HOGDescriptor::getBlockHistogramSize() const
-
-
-
-ocl::HOGDescriptor::setSVMDetector
---------------------------------------
-Sets coefficients for the linear SVM classifier.
-
-.. ocv:function:: void ocl::HOGDescriptor::setSVMDetector(const vector<float>& detector)
-
-
-
-ocl::HOGDescriptor::getDefaultPeopleDetector
-------------------------------------------------
-Returns coefficients of the classifier trained for people detection (for default window size).
-
-.. ocv:function:: static vector<float> ocl::HOGDescriptor::getDefaultPeopleDetector()
-
-
-
-ocl::HOGDescriptor::getPeopleDetector48x96
-----------------------------------------------
-Returns coefficients of the classifier trained for people detection (for 48x96 windows).
-
-.. ocv:function:: static vector<float> ocl::HOGDescriptor::getPeopleDetector48x96()
-
-
-
-ocl::HOGDescriptor::getPeopleDetector64x128
------------------------------------------------
-Returns coefficients of the classifier trained for people detection (for 64x128 windows).
-
-.. ocv:function:: static vector<float> ocl::HOGDescriptor::getPeopleDetector64x128()
-
-
-
-ocl::HOGDescriptor::detect
-------------------------------
-Performs object detection without a multi-scale window.
-
-.. ocv:function:: void ocl::HOGDescriptor::detect(const oclMat& img, vector<Point>& found_locations, double hit_threshold=0, Size win_stride=Size(), Size padding=Size())
-
-   :param img: Source image.  ``CV_8UC1``  and  ``CV_8UC4`` types are supported for now.
-
-   :param found_locations: Left-top corner points of detected objects boundaries.
-
-   :param hit_threshold: Threshold for the distance between features and SVM classifying plane. Usually it is 0 and should be specfied in the detector coefficients (as the last free coefficient). But if the free coefficient is omitted (which is allowed), you can specify it manually here.
-
-   :param win_stride: Window stride. It must be a multiple of block stride.
-
-   :param padding: Mock parameter to keep the CPU interface compatibility. It must be (0,0).
-
-
-
-ocl::HOGDescriptor::detectMultiScale
-----------------------------------------
-Performs object detection with a multi-scale window.
-
-.. ocv:function:: void ocl::HOGDescriptor::detectMultiScale(const oclMat& img, vector<Rect>& found_locations, double hit_threshold=0, Size win_stride=Size(), Size padding=Size(), double scale0=1.05, int group_threshold=2)
-
-   :param img: Source image. See  :ocv:func:`ocl::HOGDescriptor::detect`  for type limitations.
-
-   :param found_locations: Detected objects boundaries.
-
-   :param hit_threshold: Threshold for the distance between features and SVM classifying plane. See  :ocv:func:`ocl::HOGDescriptor::detect`  for details.
-
-   :param win_stride: Window stride. It must be a multiple of block stride.
-
-   :param padding: Mock parameter to keep the CPU interface compatibility. It must be (0,0).
-
-   :param scale0: Coefficient of the detection window increase.
-
-   :param group_threshold: Coefficient to regulate the similarity threshold. When detected, some objects can be covered by many rectangles. 0 means not to perform grouping. See  :ocv:func:`groupRectangles` .
-
-
-
-ocl::HOGDescriptor::getDescriptors
---------------------------------------
-Returns block descriptors computed for the whole image.
-
-.. ocv:function:: void ocl::HOGDescriptor::getDescriptors(const oclMat& img, Size win_stride, oclMat& descriptors, int descr_format=DESCR_FORMAT_COL_BY_COL)
-
-   :param img: Source image. See  :ocv:func:`ocl::HOGDescriptor::detect`  for type limitations.
-
-   :param win_stride: Window stride. It must be a multiple of block stride.
-
-   :param descriptors: 2D array of descriptors.
-
-   :param descr_format: Descriptor storage format:
-
-        * **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
-
-        * **DESCR_FORMAT_COL_BY_COL** - Column-major order.
-
-The function is mainly used to learn the classifier.
diff --git a/modules/ocl/doc/image_filtering.rst b/modules/ocl/doc/image_filtering.rst
deleted file mode 100644
index 33f1b2796..000000000
--- a/modules/ocl/doc/image_filtering.rst
+++ /dev/null
@@ -1,713 +0,0 @@
-Image Filtering
-=============================
-
-.. highlight:: cpp
-
-ocl::BaseRowFilter_GPU
---------------------------
-.. ocv:class:: ocl::BaseRowFilter_GPU
-
-Base class for linear or non-linear filters that processes rows of 2D arrays. Such filters are used for the "horizontal" filtering passes in separable filters. ::
-
-    class CV_EXPORTS BaseRowFilter_GPU
-    {
-    public:
-        BaseRowFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {}
-        virtual ~BaseRowFilter_GPU() {}
-        virtual void operator()(const oclMat &src, oclMat &dst) = 0;
-        int ksize, anchor, bordertype;
-    };
-
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`ocl::FilterEngine_GPU`.
-
-ocl::BaseColumnFilter_GPU
------------------------------
-.. ocv:class:: ocl::BaseColumnFilter_GPU
-
-Base class for linear or non-linear filters that processes columns of 2D arrays. Such filters are used for the "vertical" filtering passes in separable filters. ::
-
-    class CV_EXPORTS BaseColumnFilter_GPU
-    {
-    public:
-        BaseColumnFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {}
-        virtual ~BaseColumnFilter_GPU() {}
-        virtual void operator()(const oclMat &src, oclMat &dst) = 0;
-        int ksize, anchor, bordertype;
-    };
-
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`ocl::FilterEngine_GPU`.
-
-ocl::BaseFilter_GPU
------------------------
-.. ocv:class:: ocl::BaseFilter_GPU
-
-Base class for non-separable 2D filters. ::
-
-    class CV_EXPORTS BaseFilter_GPU
-    {
-    public:
-        BaseFilter_GPU(const Size &ksize_, const Point &anchor_, const int &borderType_)
-            : ksize(ksize_), anchor(anchor_), borderType(borderType_) {}
-        virtual ~BaseFilter_GPU() {}
-        virtual void operator()(const oclMat &src, oclMat &dst) = 0;
-        Size ksize;
-        Point anchor;
-        int borderType;
-    };
-
-.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`ocl::FilterEngine_GPU`
-
-ocl::FilterEngine_GPU
-------------------------
-.. ocv:class:: ocl::FilterEngine_GPU
-
-Base class for the Filter Engine. ::
-
-    class CV_EXPORTS FilterEngine_GPU
-    {
-    public:
-        virtual ~FilterEngine_GPU() {}
-
-        virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) = 0;
-    };
-
-The class can be used to apply an arbitrary filtering operation to an image. It contains all the necessary intermediate buffers. Pointers to the initialized ``FilterEngine_GPU`` instances are returned by various ``create*Filter_GPU`` functions (see below), and they are used inside high-level functions such as :ocv:func:`ocl::filter2D`, :ocv:func:`ocl::erode`, :ocv:func:`ocl::Sobel` , and others.
-
-By using ``FilterEngine_GPU`` instead of functions you can avoid unnecessary memory allocation for intermediate buffers and get better performance: ::
-
-    while (...)
-    {
-        ocl::oclMat src = getImg();
-        ocl::oclMat dst;
-        // Allocate and release buffers at each iterations
-        ocl::GaussianBlur(src, dst, ksize, sigma1);
-    }
-
-    // Allocate buffers only once
-    cv::Ptr<ocl::FilterEngine_GPU> filter =
-        ocl::createGaussianFilter_GPU(CV_8UC4, ksize, sigma1);
-    while (...)
-    {
-        ocl::oclMat src = getImg();
-        ocl::oclMat dst;
-        filter->apply(src, dst, cv::Rect(0, 0, src.cols, src.rows));
-    }
-    // Release buffers only once
-    filter.release();
-
-
-``FilterEngine_GPU`` can process a rectangular sub-region of an image. By default, if ``roi == Rect(0,0,-1,-1)`` , ``FilterEngine_GPU`` processes the inner region of an image ( ``Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height)`` ) because some filters do not check whether indices are outside the image for better performance. See below to understand which filters support processing the whole image and which do not and identify image type limitations.
-
-.. note:: The GPU filters do not support the in-place mode.
-
-.. seealso:: :ocv:class:`ocl::BaseRowFilter_GPU`, :ocv:class:`ocl::BaseColumnFilter_GPU`, :ocv:class:`ocl::BaseFilter_GPU`, :ocv:func:`ocl::createFilter2D_GPU`, :ocv:func:`ocl::createSeparableFilter_GPU`, :ocv:func:`ocl::createBoxFilter_GPU`, :ocv:func:`ocl::createMorphologyFilter_GPU`, :ocv:func:`ocl::createLinearFilter_GPU`, :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`ocl::createDerivFilter_GPU`, :ocv:func:`ocl::createGaussianFilter_GPU`
-
-ocl::createFilter2D_GPU
----------------------------
-Creates a non-separable filter engine with the specified filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createFilter2D_GPU( const Ptr<BaseFilter_GPU> filter2D)
-
-    :param filter2D: Non-separable 2D filter.
-
-Usually this function is used inside such high-level functions as :ocv:func:`ocl::createLinearFilter_GPU`, :ocv:func:`ocl::createBoxFilter_GPU`.
-
-
-ocl::createSeparableFilter_GPU
-----------------------------------
-Creates a separable filter engine with the specified filters.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter, const Ptr<BaseColumnFilter_GPU> &columnFilter)
-
-    :param rowFilter: "Horizontal" 1D filter.
-
-    :param columnFilter: "Vertical" 1D filter.
-
-Usually this function is used inside such high-level functions as :ocv:func:`ocl::createSeparableLinearFilter_GPU`.
-
-ocl::createBoxFilter_GPU
-----------------------------
-Creates a normalized 2D box filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createBoxFilter_GPU(int srcType, int dstType, const Size &ksize, const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
-
-.. ocv:function:: Ptr<BaseFilter_GPU> ocl::getBoxFilter_GPU(int srcType, int dstType, const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
-
-    :param srcType: Input image type.
-
-    :param dstType: Output image type.  It supports only the same values as the source type.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
-
-    :param borderType: Border type.
-
-.. seealso:: :ocv:func:`boxFilter`
-
-ocl::boxFilter
-------------------
-Smooths the image using the normalized box filter.
-
-.. ocv:function:: void ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
-
-    :param src: Input image.
-
-    :param dst: Output image type. The size and type is the same as ``src`` .
-
-    :param ddepth: Desired depth of the destination image. If it is negative, it is the same as  ``src.depth()`` . It supports only the same depth as the source image depth.
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
-
-    :param borderType: Border type.
-
-Smoothes image using box filter.
-
-ocl::blur
--------------
-Acts as a synonym for the normalized box filter.
-
-.. ocv:function:: void ocl::blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_CONSTANT)
-
-    :param src: Input image.
-
-    :param dst: Output image type with the same size and type as  ``src`` .
-
-    :param ksize: Kernel size.
-
-    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
-
-    :param borderType: Border type.
-
-.. seealso:: :ocv:func:`blur`, :ocv:func:`ocl::boxFilter`
-
-ocl::createMorphologyFilter_GPU
------------------------------------
-Creates a 2D morphological filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Point &anchor = Point(-1, -1), int iterations = 1)
-
-.. ocv:function:: Ptr<BaseFilter_GPU> ocl::getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize, Point anchor = Point(-1, -1))
-
-    :param op: Morphology operation id. Only ``MORPH_ERODE`` and ``MORPH_DILATE`` are supported.
-
-    :param type: Input/output image type. Only  ``CV_8UC1``  and  ``CV_8UC4``  are supported.
-
-    :param kernel: 2D 8-bit structuring element for the morphological operation.
-
-    :param ksize: Size of a horizontal or vertical structuring element used for separable morphological operations.
-
-    :param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
-
-.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
-
-.. seealso:: :ocv:func:`createMorphologyFilter`
-
-ocl::createLinearFilter_GPU
--------------------------------
-Creates a non-separable linear filter.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
-
-    :param srcType: Input image type..
-
-    :param dstType: Output image type. The same type as ``src`` is supported.
-
-    :param kernel: 2D array of filter coefficients.
-
-    :param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-.. seealso:: :ocv:func:`createLinearFilter`
-
-
-ocl::filter2D
------------------
-Applies the non-separable 2D linear filter to an image.
-
-.. ocv:function:: void ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor = Point(-1, -1), double delta = 0.0, int borderType = BORDER_DEFAULT)
-
-    :param src: Source image.
-
-    :param dst: Destination image. The size and the number of channels is the same as  ``src`` .
-
-    :param ddepth: Desired depth of the destination image. If it is negative, it is the same as  ``src.depth()`` . It supports only the same depth as the source image depth.
-
-    :param kernel: 2D array of filter coefficients.
-
-    :param anchor: Anchor of the kernel that indicates the relative position of a filtered point within the kernel. The anchor resides within the kernel. The special default value (-1,-1) means that the anchor is at the kernel center.
-
-    :param delta: optional value added to the filtered pixels before storing them in ``dst``. Value '0' is supported only.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
-
-ocl::getLinearRowFilter_GPU
--------------------------------
-Creates a primitive row filter with the specified kernel.
-
-.. ocv:function:: Ptr<BaseRowFilter_GPU> ocl::getLinearRowFilter_GPU(int srcType, int bufType, const Mat &rowKernel, int anchor = -1, int bordertype = BORDER_DEFAULT)
-
-    :param srcType: Source array type. Only  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param bufType: Intermediate buffer type with as many channels as  ``srcType`` .
-
-    :param rowKernel: Filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
-
-    :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`.
-
-.. seealso:: :ocv:func:`createSeparableLinearFilter` .
-
-
-ocl::getLinearColumnFilter_GPU
-----------------------------------
-Creates a primitive column filter with the specified kernel.
-
-.. ocv:function:: Ptr<BaseColumnFilter_GPU> ocl::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat &columnKernel, int anchor = -1, int bordertype = BORDER_DEFAULT, double delta = 0.0)
-
-    :param bufType: Intermediate buffer type with as many channels as  ``dstType`` .
-
-    :param dstType: Destination array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` destination types are supported.
-
-    :param columnKernel: Filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
-
-    :param bordertype: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate` .
-
-    :param delta: default value is 0.0.
-
-.. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
-
-ocl::createSeparableLinearFilter_GPU
-----------------------------------------
-Creates a separable linear filter engine.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT)
-
-    :param srcType: Source array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dstType: Destination array type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  destination types are supported.
-
-    :param rowKernel: Horizontal filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param columnKernel: Vertical filter coefficients. Support kernels with ``size <= 16`` .
-
-    :param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
-
-    :param delta: default value is 0.0.
-
-    :param bordertype: Pixel extrapolation method.
-
-.. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`ocl::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
-
-
-ocl::sepFilter2D
---------------------
-Applies a separable 2D linear filter to an image.
-
-.. ocv:function:: void ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT)
-
-    :param src: Source image.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dst: Destination image with the same size and number of channels as  ``src`` .
-
-    :param ddepth: Destination image depth.  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F`` are supported.
-
-    :param kernelX: Horizontal filter coefficients.
-
-    :param kernelY: Vertical filter coefficients.
-
-    :param anchor: Anchor position within the kernel. The default value ``(-1, 1)`` means that the anchor is at the kernel center.
-
-    :param delta: default value is 0.0.
-
-    :param bordertype: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.
-
-.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`sepFilter2D`
-
-ocl::createDerivFilter_GPU
-------------------------------
-Creates a filter engine for the generalized Sobel operator.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT )
-
-    :param srcType: Source image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1``  source types are supported.
-
-    :param dstType: Destination image type with as many channels as  ``srcType`` ,  ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and  ``CV_32F``  depths are supported.
-
-    :param dx: Derivative order in respect of x.
-
-    :param dy: Derivative order in respect of y.
-
-    :param ksize: Aperture size. See  :ocv:func:`getDerivKernels` for details.
-
-    :param borderType: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.
-
-.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter`
-
-
-ocl::Sobel
-------------------
-Returns void
-
-.. ocv:function:: void ocl::Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT)
-
-    :param src: The source image
-
-    :param dst: The destination image; It will have the same size as src
-
-    :param ddepth: The destination image depth
-
-    :param dx: Order of the derivative x
-
-    :param dy: Order of the derivative y
-
-    :param ksize: Size of the extended Sobel kernel
-
-    :param scale: The optional scale factor for the computed derivative values(by default, no scaling is applied)
-
-    :param delta: The optional delta value, added to the results prior to storing them in dst
-
-    :param bordertype: Pixel extrapolation method.
-
-The function computes the first x- or y- spatial image derivative using Sobel operator. Surpport 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4 data type.
-
-ocl::Scharr
-------------------
-Returns void
-
-.. ocv:function:: void ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT)
-
-    :param src: The source image
-
-    :param dst: The destination image; It will have the same size as src
-
-    :param ddepth: The destination image depth
-
-    :param dx: Order of the derivative x
-
-    :param dy: Order of the derivative y
-
-    :param scale: The optional scale factor for the computed derivative values(by default, no scaling is applied)
-
-    :param delta: The optional delta value, added to the results prior to storing them in dst
-
-    :param bordertype: Pixel extrapolation method.
-
-The function computes the first x- or y- spatial image derivative using Scharr operator. Surpport 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4 data type.
-
-ocl::createGaussianFilter_GPU
----------------------------------
-Creates a Gaussian filter engine.
-
-.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT)
-
-    :param type: Source and destination image type.  ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported.
-
-    :param ksize: Aperture size. See  :ocv:func:`getGaussianKernel` for details.
-
-    :param sigma1: Gaussian sigma in the horizontal direction. See  :ocv:func:`getGaussianKernel` for details.
-
-    :param sigma2: Gaussian sigma in the vertical direction. If 0, then  :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
-
-    :param bordertype: Pixel extrapolation method. For details, see  :ocv:func:`borderInterpolate`.
-
-.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter`
-
-ocl::GaussianBlur
----------------------
-Returns void
-
-.. ocv:function:: void ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT)
-
-    :param src: The source image
-
-    :param dst: The destination image; It will have the same size and the same type as src
-
-    :param ksize: The Gaussian kernel size; ksize.width and ksize.height can differ, but they both must be positive and odd. Or, they can be zero's, then they are computed from sigma
-
-    :param sigma1sigma2: The Gaussian kernel standard deviations in X and Y direction. If sigmaY is zero, it is set to be equal to sigmaX. If they are both zeros, they are computed from ksize.width and ksize.height. To fully control the result regardless of possible future modification of all this semantics, it is recommended to specify all of ksize, sigmaX and sigmaY
-
-    :param bordertype: Pixel extrapolation method.
-
-The function convolves the source image with the specified Gaussian kernel. In-place filtering is supported.  Surpport 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4 data type.
-
-ocl::Laplacian
-------------------
-Returns void
-
-.. ocv:function:: void ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1, double delta = 0, int borderType = BORDER_DEFAULT)
-
-    :param src: The source image
-
-    :param dst: The destination image; It will have the same size and the same type as src
-
-    :param ddepth: The desired depth of the destination image
-
-    :param ksize: The aperture size used to compute the second-derivative filters. It must be positive and odd
-
-    :param scale: The optional scale factor for the computed Laplacian values (by default, no scaling is applied
-
-    :param delta: Optional delta value that is added to the results prior to storing them in  ``dst`` . Supported value is 0 only.
-
-    :param bordertype: Pixel extrapolation method.
-
-The function calculates the Laplacian of the source image by adding up the second x and y derivatives calculated using the Sobel operator.
-
-ocl::ConvolveBuf
-----------------
-.. ocv:struct:: ocl::ConvolveBuf
-
-Class providing a memory buffer for :ocv:func:`ocl::convolve` function, plus it allows to adjust some specific parameters. ::
-
-    struct CV_EXPORTS ConvolveBuf
-    {
-        Size result_size;
-        Size block_size;
-        Size user_block_size;
-        Size dft_size;
-        int spect_len;
-
-        oclMat image_spect, templ_spect, result_spect;
-        oclMat image_block, templ_block, result_data;
-
-        void create(Size image_size, Size templ_size);
-        static Size estimateBlockSize(Size result_size, Size templ_size);
-    };
-
-You can use field `user_block_size` to set specific block size for :ocv:func:`ocl::convolve` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed.
-
-ocl::ConvolveBuf::create
-------------------------
-.. ocv:function:: ocl::ConvolveBuf::create(Size image_size, Size templ_size)
-
-Constructs a buffer for :ocv:func:`ocl::convolve` function with respective arguments.
-
-ocl::convolve
-------------------
-Returns void
-
-.. ocv:function:: void ocl::convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr=false)
-
-.. ocv:function:: void ocl::convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr, ConvolveBuf& buf)
-
-    :param image: The source image. Only  ``CV_32FC1`` images are supported for now.
-
-    :param temp1: Convolution kernel, a single-channel floating point matrix. The size is not greater than the  ``image`` size. The type is the same as  ``image``.
-
-    :param result: The destination image
-
-    :param ccorr: Flags to evaluate cross-correlation instead of convolution.
-
-    :param buf: Optional buffer to avoid extra memory allocations and to adjust some specific parameters. See :ocv:struct:`ocl::ConvolveBuf`.
-
-Convolves an image with the kernel. Supports only CV_32FC1 data types and do not support ROI.
-
-ocl::bilateralFilter
-------------------------
-Returns void
-
-.. ocv:function:: void ocl::bilateralFilter(const oclMat &src, oclMat &dst, int d, double sigmaColor, double sigmaSpace, int borderType=BORDER_DEFAULT)
-
-    :param src: The source image
-
-    :param dst: The destination image; will have the same size and the same type as src
-
-    :param d: The diameter of each pixel neighborhood, that is used during filtering. If it is non-positive, it's computed from sigmaSpace
-
-    :param sigmaColor: Filter sigma in the color space. Larger value of the parameter means that farther colors within the pixel neighborhood (see sigmaSpace) will be mixed together, resulting in larger areas of semi-equal color
-
-    :param sigmaSpave: Filter sigma in the coordinate space. Larger value of the parameter means that farther pixels will influence each other (as long as their colors are close enough; see sigmaColor). Then d>0, it specifies the neighborhood size regardless of sigmaSpace, otherwise d is proportional to sigmaSpace.
-
-    :param borderType: Pixel extrapolation method.
-
-Applies bilateral filter to the image. Supports 8UC1 8UC4 data types.
-
-ocl::adaptiveBilateralFilter
---------------------------------
-Returns void
-
-.. ocv:function:: void ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, double maxSigmaColor = 20.0, Point anchor = Point(-1, -1), int borderType=BORDER_DEFAULT)
-
-    :param src: The source image
-
-    :param dst: The destination image; will have the same size and the same type as src
-
-    :param ksize: The kernel size. This is the neighborhood where the local variance will be calculated, and where pixels will contribute (in a weighted manner).
-
-    :param sigmaSpace: Filter sigma in the coordinate space. Larger value of the parameter means that farther pixels will influence each other (as long as their colors are close enough; see sigmaColor). Then d>0, it specifies the neighborhood size regardless of sigmaSpace, otherwise d is proportional to sigmaSpace.
-
-    :param maxSigmaColor: Maximum allowed sigma color (will clamp the value calculated in the ksize neighborhood. Larger value of the parameter means that more dissimilar pixels will influence each other (as long as their colors are close enough; see sigmaColor). Then d>0, it specifies the neighborhood size regardless of sigmaSpace, otherwise d is proportional to sigmaSpace.
-
-    :param borderType: Pixel extrapolation method.
-
-A main part of our strategy will be to load each raw pixel once, and reuse it to calculate all pixels in the output (filtered) image that need this pixel value. The math of the filter is that of the usual bilateral filter, except that the sigma color is calculated in the neighborhood, and clamped by the optional input value.
-
-Local memory organization
-
-
-.. image:: images/adaptiveBilateralFilter.jpg
-                 :height: 250pt
-                 :width:  350pt
-                 :alt: Introduction Icon
-
-.. note:: We partition the image to non-overlapping blocks of size (Ux, Uy). Each such block will correspond to the pixel locations where we will calculate the filter result in one workgroup. Considering neighbourhoods of sizes (kx, ky), where kx = 2 dx + 1, and ky = 2 dy + 1 (in image ML, dx = dy = 1, and kx = ky = 3), it is clear that we need to load data of size Wx = Ux + 2 dx, Wy = Uy + 2 dy. Furthermore, if (Sx, Sy) is the top left pixel coordinates for a particular block, and (Sx + Ux - 1, Sy + Uy -1) is to botom right coordinate of the block, we need to load data starting at top left coordinate (PSx, PSy) = (Sx - dx, Sy - dy), and ending at bottom right coordinate (Sx + Ux - 1 + dx, Sy + Uy - 1 + dy). The workgroup layout is (Wx,1). However, to take advantage of the natural hardware properties (preferred wavefront sizes), we restrict Wx to be a multiple of that preferred wavefront size (for current AMD hardware this is typically 64). Each thread in the workgroup will load Wy elements (under the constraint that Wx*Wy*pixel width <= max local memory).
-
-Applies bilateral filter to the image. Supports 8UC1 8UC3 data types.
-
-ocl::copyMakeBorder
------------------------
-Returns void
-
-.. ocv:function:: void ocl::copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int boardtype, const Scalar &value = Scalar())
-
-    :param src: The source image
-
-    :param dst: The destination image; will have the same type as src and the size size(src.cols+left+right, src.rows+top+bottom)
-
-    :param topbottomleftright: Specify how much pixels in each direction from the source image rectangle one needs to extrapolate, e.g. top=1, bottom=1, left=1, right=1mean that 1 pixel-wide border needs to be built
-
-    :param bordertype: Pixel extrapolation method.
-
-    :param value: The border value if borderType==BORDER CONSTANT
-
-Forms a border around the image. Supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4 data types.
-
-ocl::dilate
-------------------
-Returns void
-
-.. ocv:function:: void ocl::dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1, int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue())
-
-    :param src: The source image
-
-    :param dst: The destination image; It will have the same size and the same type as src
-
-    :param kernel: The structuring element used for dilation. If element=Mat(), a 3times 3 rectangular structuring element is used
-
-    :param anchor: Position of the anchor within the element. The default value (-1, -1) means that the anchor is at the element center, only default value is supported
-
-    :param iterations: The number of times dilation is applied
-
-    :param bordertype: Pixel extrapolation method.
-
-    :param value: The border value if borderType==BORDER CONSTANT
-
-The function dilates the source image using the specified structuring element that determines the shape of a pixel neighborhood over which the maximum is taken. Supports 8UC1 8UC4 data types.
-
-ocl::erode
-------------------
-Returns void
-
-.. ocv:function:: void ocl::erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1, int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue())
-
-    :param src: The source image
-
-    :param dst: The destination image; It will have the same size and the same type as src
-
-    :param kernel: The structuring element used for dilation. If element=Mat(), a 3times 3 rectangular structuring element is used
-
-    :param anchor: Position of the anchor within the element. The default value (-1, -1) means that the anchor is at the element center, only default value is supported
-
-    :param iterations: The number of times dilation is applied
-
-    :param bordertype: Pixel extrapolation method.
-
-    :param value: The border value if borderType==BORDER CONSTANT
-
-The function erodes the source image using the specified structuring element that determines the shape of a pixel neighborhood over which the minimum is taken. Supports 8UC1 8UC4 data types.
-
-ocl::morphologyEx
----------------------
-Returns void
-
-.. ocv:function:: void ocl::morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1, int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue())
-
-    :param src: The source image
-
-    :param dst: The destination image; It will have the same size and the same type as src
-
-    :param op: Type of morphological operation, one of the following: ERODE DILTATE OPEN CLOSE GRADIENT TOPHAT BLACKHAT
-
-    :param kernel: The structuring element used for dilation. If element=Mat(), a 3times 3 rectangular structuring element is used
-
-    :param anchor: Position of the anchor within the element. The default value (-1, -1) means that the anchor is at the element center, only default value is supported
-
-    :param iterations: The number of times dilation is applied
-
-    :param bordertype: Pixel extrapolation method.
-
-    :param value: The border value if borderType==BORDER CONSTANT
-
-A wrapper for erode and dilate. Supports 8UC1 8UC4 data types.
-
-ocl::pyrDown
--------------------
-Smoothes an image and downsamples it.
-
-.. ocv:function:: void ocl::pyrDown(const oclMat& src, oclMat& dst)
-
-    :param src: Source image.
-
-    :param dst: Destination image. Will have ``Size((src.cols+1)/2, (src.rows+1)/2)`` size and the same type as ``src`` .
-
-.. seealso:: :ocv:func:`pyrDown`
-
-
-ocl::pyrUp
--------------------
-Upsamples an image and then smoothes it.
-
-.. ocv:function:: void ocl::pyrUp(const oclMat& src, oclMat& dst)
-
-    :param src: Source image.
-
-    :param dst: Destination image. Will have ``Size(src.cols*2, src.rows*2)`` size and the same type as ``src`` .
-
-.. seealso:: :ocv:func:`pyrUp`
-
-ocl::columnSum
-------------------
-Computes a vertical (column) sum.
-
-.. ocv:function:: void ocl::columnSum(const oclMat& src, oclMat& sum)
-
-    :param src: Source image. Only  ``CV_32FC1`` images are supported for now.
-
-    :param sum: Destination image of the  ``CV_32FC1`` type.
-
-
-ocl::blendLinear
---------------------
-Performs linear blending of two images.
-
-.. ocv:function:: void ocl::blendLinear(const oclMat& img1, const oclMat& img2, const oclMat& weights1, const oclMat& weights2, oclMat& result)
-
-    :param img1: First image. Supports only ``CV_8U`` and ``CV_32F`` depth.
-
-    :param img2: Second image. Must have the same size and the same type as ``img1`` .
-
-    :param weights1: Weights for first image. Must have tha same size as ``img1`` . Supports only ``CV_32F`` type.
-
-    :param weights2: Weights for second image. Must have tha same size as ``img2`` . Supports only ``CV_32F`` type.
-
-    :param result: Destination image.
-
-ocl::medianFilter
---------------------
-Blurs an image using the median filter.
-
-.. ocv:function:: void ocl::medianFilter(const oclMat &src, oclMat &dst, int m)
-
-    :param src: input ```1-``` or ```4```-channel image; the image depth should be ```CV_8U```, ```CV_32F```.
-
-    :param dst: destination array of the same size and type as ```src```.
-
-    :param m: aperture linear size; it must be odd and greater than ```1```. Currently only ```3```, ```5``` are supported.
-
-The function smoothes an image using the median filter with the \texttt{m} \times \texttt{m} aperture. Each channel of a multi-channel image is processed independently. In-place operation is supported.
diff --git a/modules/ocl/doc/image_processing.rst b/modules/ocl/doc/image_processing.rst
deleted file mode 100644
index 959c97f9e..000000000
--- a/modules/ocl/doc/image_processing.rst
+++ /dev/null
@@ -1,347 +0,0 @@
-Image Processing
-=============================
-
-.. highlight:: cpp
-
-ocl::meanShiftFiltering
----------------------------
-Performs mean-shift filtering for each point of the source image.
-
-.. ocv:function:: void ocl::meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr, TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1))
-
-    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
-
-    :param dst: Destination image containing the color of mapped points. It has the same size and type as  ``src`` .
-
-    :param sp: Spatial window radius.
-
-    :param sr: Color window radius.
-
-    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
-
-It maps each point of the source image into another point. As a result, you have a new color and new position of each point.
-
-
-ocl::meanShiftProc
-----------------------
-Performs a mean-shift procedure and stores information about processed points (their colors and positions) in two images.
-
-.. ocv:function:: void ocl::meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr, TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1))
-
-    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
-
-    :param dstr: Destination image containing the color of mapped points. The size and type is the same as  ``src`` .
-
-    :param dstsp: Destination image containing the position of mapped points. The size is the same as  ``src`` size. The type is  ``CV_16SC2`` .
-
-    :param sp: Spatial window radius.
-
-    :param sr: Color window radius.
-
-    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
-
-.. seealso:: :ocv:func:`ocl::meanShiftFiltering`
-
-
-ocl::meanShiftSegmentation
-------------------------------
-Performs a mean-shift segmentation of the source image and eliminates small segments.
-
-.. ocv:function:: void ocl::meanShiftSegmentation(const oclMat &src, Mat &dst, int sp, int sr, int minsize, TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1))
-
-    :param src: Source image. Only  ``CV_8UC4`` images are supported for now.
-
-    :param dst: Segmented image with the same size and type as  ``src`` .
-
-    :param sp: Spatial window radius.
-
-    :param sr: Color window radius.
-
-    :param minsize: Minimum segment size. Smaller segments are merged.
-
-    :param criteria: Termination criteria. See :ocv:class:`TermCriteria`.
-
-ocl::integral
------------------
-Computes an integral image.
-
-.. ocv:function:: void ocl::integral(const oclMat &src, oclMat &sum, oclMat &sqsum, int sdepth=-1)
-
-.. ocv:function:: void ocl::integral(const oclMat &src, oclMat &sum, int sdepth=-1)
-
-    :param src: Source image. Only  ``CV_8UC1`` images are supported for now.
-
-    :param sum: Integral image containing 32-bit unsigned integer or 32-bit floating-point .
-
-    :param sqsum: Sqsum values is ``CV_32FC1`` or ``CV_64FC1`` type.
-
-.. seealso:: :ocv:func:`integral`
-
-ocl::cornerHarris
----------------------
-Returns void
-
-.. ocv:function:: void ocl::cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT)
-
-    :param src: Source image. Only CV_8UC1 and CV_32FC1 images are supported now.
-
-    :param dst: Destination image containing cornerness values. It has the same size as src and CV_32FC1 type.
-
-    :param blockSize: Neighborhood size
-
-    :param ksize: Aperture parameter for the Sobel operator
-
-    :param k: Harris detector free parameter
-
-    :param bordertype: Pixel extrapolation method. Only BORDER_REFLECT101, BORDER_REFLECT, BORDER_CONSTANT and BORDER_REPLICATE are supported now.
-
-Calculate Harris corner.
-
-ocl::cornerMinEigenVal
---------------------------
-Returns void
-
-.. ocv:function:: void ocl::cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT)
-
-    :param src: Source image. Only CV_8UC1 and CV_32FC1 images are supported now.
-
-    :param dst: Destination image containing cornerness values. It has the same size as src and CV_32FC1 type.
-
-    :param blockSize: Neighborhood size
-
-    :param ksize: Aperture parameter for the Sobel operator
-
-    :param bordertype: Pixel extrapolation method. Only BORDER_REFLECT101, BORDER_REFLECT, BORDER_CONSTANT and BORDER_REPLICATE are supported now.
-
-Calculate MinEigenVal.
-
-ocl::calcHist
-------------------
-Returns void
-
-.. ocv:function:: void ocl::calcHist(const oclMat &mat_src, oclMat &mat_hist)
-
-    :param src: Source arrays. They all should have the same depth, CV 8U, and the same size. Each of them can have an arbitrary number of channels.
-
-    :param dst: The output histogram, a dense or sparse dims-dimensional
-
-Calculates histogram of one or more arrays. Supports only 8UC1 data type.
-
-ocl::equalizeHist
----------------------
-Equalizes the histogram of a grayscale image.
-
-.. ocv:function:: void ocl::equalizeHist(const oclMat &mat_src, oclMat &mat_dst)
-
-    :param mat_src: Source image.
-
-    :param mat_dst: Destination image.
-
-.. seealso:: :ocv:func:`equalizeHist`
-
-
-ocl::remap
-------------------
-Returns void
-
-.. ocv:function:: void ocl::remap(const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int bordertype, const Scalar &value = Scalar())
-
-    :param src: Source image.
-
-    :param dst: Destination image containing cornerness values. It has the same size as src and CV_32FC1 type.
-
-    :param map1: The first map of either (x,y) points or just x values having the type CV_16SC2 , CV_32FC1 , or CV_32FC2 . See covertMaps() for details on converting a floating point representation to fixed-point for speed.
-
-    :param map2: The second map of y values having the type CV_32FC1 , or none (empty map if map1 is (x,y) points), respectively.
-
-    :param interpolation: The interpolation method
-
-    :param bordertype: Pixel extrapolation method.
-
-    :param value: The border value if borderType==BORDER CONSTANT
-
-The function remap transforms the source image using the specified map: dst (x ,y) = src (map1(x , y) , map2(x , y)) where values of pixels with non-integer coordinates are computed using one of available interpolation methods. map1 and map2 can be encoded as separate floating-point maps in map1 and map2 respectively, or interleaved floating-point maps of (x,y) in map1.
-
-ocl::resize
-------------------
-Returns void
-
-.. ocv:function:: void ocl::resize(const oclMat &src, oclMat &dst, Size dsize, double fx = 0, double fy = 0, int interpolation = INTER_LINEAR)
-
-    :param src: Source image.
-
-    :param dst: Destination image.
-
-    :param dsize: he destination image size. If it is zero, then it is computed as: dsize = Size(round(fx*src.cols), round(fy*src.rows)). Either dsize or both fx or fy must be non-zero.
-
-    :param fx: The scale factor along the horizontal axis. When 0, it is computed as (double)dsize.width/src.cols
-
-    :param fy: The scale factor along the vertical axis. When 0, it is computed as (double)dsize.height/src.rows
-
-    :param interpolation: The interpolation method: INTER NEAREST or INTER LINEAR
-
-Resizes an image. Supports CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1 , CV_32FC3 and CV_32FC4 data types.
-
-ocl::warpAffine
--------------------
-Returns void
-
-.. ocv:function:: void ocl::warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR)
-
-    :param src: Source image.
-
-    :param dst: Destination image.
-
-    :param M: 2times 3 transformation matrix
-
-    :param dsize: Size of the destination image
-
-    :param flags: A combination of interpolation methods, see cv::resize, and the optional flag WARP INVERSE MAP that means that M is the inverse transformation (dst to $src)
-
-The function warpAffine transforms the source image using the specified matrix. Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC types.
-
-ocl::warpPerspective
-------------------------
-Returns void
-
-.. ocv:function:: void ocl::warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR)
-
-    :param src: Source image.
-
-    :param dst: Destination image.
-
-    :param M: 2times 3 transformation matrix
-
-    :param dsize: Size of the destination image
-
-    :param flags: A combination of interpolation methods, see cv::resize, and the optional flag WARP INVERSE MAP that means that M is the inverse transformation (dst to $src)
-
-Applies a perspective transformation to an image. Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC types.
-
-ocl::cvtColor
-------------------
-Returns void
-
-.. ocv:function:: void ocl::cvtColor(const oclMat &src, oclMat &dst, int code, int dcn = 0)
-
-    :param src: Source image.
-
-    :param dst: Destination image.
-
-    :param code:The color space conversion code
-
-    :param dcn: The number of channels in the destination image; if the parameter is 0, the number of the channels will be derived automatically from src and the code
-
-Converts image from one color space to another.For now, only RGB2GRAY is supportted. Supports.CV_8UC1,CV_8UC4,CV_32SC1,CV_32SC4,CV_32FC1,CV_32FC4
-
-ocl::threshold
-------------------
-Returns Threshold value
-
-.. ocv:function:: double ocl::threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type = THRESH_TRUNC)
-
-    :param src: The source array
-
-    :param dst: Destination array; will have the same size and the same type as src
-
-    :param thresh: Threshold value
-
-    :param maxVal: Maximum value to use with THRESH BINARY and THRESH BINARY INV thresholding types
-
-    :param type: Thresholding type
-
-The function applies fixed-level thresholding to a single-channel array. The function is typically used to get a bi-level (binary) image out of a grayscale image or for removing a noise, i.e. filtering out pixels with too small or too large values. There are several types of thresholding that the function supports that are determined by thresholdType.
-
-ocl::buildWarpPlaneMaps
------------------------
-Builds plane warping maps.
-
-.. ocv:function:: void ocl::buildWarpPlaneMaps( Size src_size, Rect dst_roi, const Mat& K, const Mat& R, const Mat& T, float scale, oclMat& map_x, oclMat& map_y )
-
-
-
-ocl::buildWarpCylindricalMaps
------------------------------
-Builds cylindrical warping maps.
-
-.. ocv:function:: void ocl::buildWarpCylindricalMaps( Size src_size, Rect dst_roi, const Mat& K, const Mat& R, float scale, oclMat& map_x, oclMat& map_y )
-
-
-
-
-ocl::buildWarpSphericalMaps
----------------------------
-Builds spherical warping maps.
-
-.. ocv:function:: void ocl::buildWarpSphericalMaps( Size src_size, Rect dst_roi, const Mat& K, const Mat& R, float scale, oclMat& map_x, oclMat& map_y )
-
-
-ocl::buildWarpPerspectiveMaps
------------------------------
-Builds transformation maps for perspective transformation.
-
-.. ocv:function:: void ocl::buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, oclMat& xmap, oclMat& ymap)
-
-    :param M: *3x3*  transformation matrix.
-
-    :param inverse: Flag  specifying that  ``M`` is an inverse transformation ( ``dst=>src`` ).
-
-    :param dsize: Size of the destination image.
-
-    :param xmap: X values with  ``CV_32FC1`` type.
-
-    :param ymap: Y values with  ``CV_32FC1`` type.
-
-.. seealso:: :ocv:func:`ocl::warpPerspective` , :ocv:func:`ocl::remap`
-
-
-ocl::buildWarpAffineMaps
-----------------------------
-Builds transformation maps for affine transformation.
-
-.. ocv:function:: void ocl::buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, oclMat& xmap, oclMat& ymap)
-
-    :param M: *2x3*  transformation matrix.
-
-    :param inverse: Flag  specifying that  ``M`` is an inverse transformation ( ``dst=>src`` ).
-
-    :param dsize: Size of the destination image.
-
-    :param xmap: X values with  ``CV_32FC1`` type.
-
-    :param ymap: Y values with  ``CV_32FC1`` type.
-
-.. seealso:: :ocv:func:`ocl::warpAffine` , :ocv:func:`ocl::remap`
-
-ocl::HoughCircles
------------------
-Finds circles in a grayscale image using the Hough transform.
-
-.. ocv:function:: void ocl::HoughCircles(const oclMat& src, oclMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096)
-
-.. ocv:function:: void ocl::HoughCircles(const oclMat& src, oclMat& circles, HoughCirclesBuf& buf, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096)
-
-    :param src: 8-bit, single-channel grayscale input image.
-
-    :param circles: Output vector of found circles. Each vector is encoded as a 3-element floating-point vector  :math:`(x, y, radius)` .
-
-    :param method: Detection method to use. Currently, the only implemented method is  ``CV_HOUGH_GRADIENT`` , which is basically  *21HT* , described in  [Yuen90]_.
-
-    :param dp: Inverse ratio of the accumulator resolution to the image resolution. For example, if  ``dp=1`` , the accumulator has the same resolution as the input image. If  ``dp=2`` , the accumulator has half as big width and height.
-
-    :param minDist: Minimum distance between the centers of the detected circles. If the parameter is too small, multiple neighbor circles may be falsely detected in addition to a true one. If it is too large, some circles may be missed.
-
-    :param cannyThreshold: The higher threshold of the two passed to  the :ocv:func:`ocl::Canny`  edge detector (the lower one is twice smaller).
-
-    :param votesThreshold: The accumulator threshold for the circle centers at the detection stage. The smaller it is, the more false circles may be detected.
-
-    :param minRadius: Minimum circle radius.
-
-    :param maxRadius: Maximum circle radius.
-
-    :param maxCircles: Maximum number of output circles.
-
-    :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes).
-
-.. note:: Currently only non-ROI oclMat is supported for src.
-.. seealso:: :ocv:func:`HoughCircles`
diff --git a/modules/ocl/doc/images/adaptiveBilateralFilter.jpg b/modules/ocl/doc/images/adaptiveBilateralFilter.jpg
deleted file mode 100644
index 6508f693c..000000000
Binary files a/modules/ocl/doc/images/adaptiveBilateralFilter.jpg and /dev/null differ
diff --git a/modules/ocl/doc/introduction.rst b/modules/ocl/doc/introduction.rst
deleted file mode 100644
index 2c050cb27..000000000
--- a/modules/ocl/doc/introduction.rst
+++ /dev/null
@@ -1,73 +0,0 @@
-OpenCL Module Introduction
-==========================
-
-.. highlight:: cpp
-
-General Information
--------------------
-
-The OpenCV OCL module contains a set of classes and functions that implement and accelerate OpenCV functionality on OpenCL compatible devices. OpenCL is a Khronos standard, implemented by a variety of devices (CPUs, GPUs, FPGAs, ARM), abstracting the exact hardware details, while enabling vendors to provide native implementation for maximal acceleration on their hardware. The standard enjoys wide industry support, and the end user of the module will enjoy the data parallelism benefits that the specific platform/hardware may be capable of, in a platform/hardware independent manner.
-
-While in the future we hope to validate (and enable) the OCL module in all OpenCL capable devices, we currently develop and test on GPU devices only. This includes both discrete GPUs (NVidia, AMD), as well as integrated chips (AMD APU and Intel HD devices). Performance of any particular algorithm will depend on the particular platform characteristics and capabilities. However, currently, accuracy and  mathematical correctness has been verified to be identical to that of the pure CPU implementation on all tested GPU devices and platforms (both Windows and Linux).
-
-
-The OpenCV OCL module includes utility functions, low-level vision primitives, and high-level algorithms. The utility functions and low-level primitives provide a powerful infrastructure for developing fast vision algorithms taking advantage of OCL, whereas the high-level functionality (samples) includes some state-of-the-art algorithms (including LK Optical flow, and Face detection) ready to be used by the application developers. The module is also accompanied by an extensive performance and accuracy test suite.
-
-The OpenCV OCL module is designed for ease of use and does not require any knowledge of OpenCL. At a minimum level, it can be viewed as a set of accelerators, that can take advantage of the high compute throughput that GPU/APU devices can provide. However, it can also be viewed as a starting point to really integrate the built-in functionality with your own custom OpenCL kernels, with or without modifying the source of OpenCV-OCL. Of course, knowledge of OpenCL will certainly help, however we hope that OpenCV-OCL module, and the kernels it contains in source code, can be very useful as a means of actually learning openCL. Such a knowledge would be necessary to further fine-tune any of the existing OpenCL kernels, or for extending the framework with new kernels. As of OpenCV 2.4.4, we introduce interoperability with OpenCL, enabling easy use of custom OpenCL kernels within the OpenCV framework.
-
-To correctly run the OCL module, you need to have the OpenCL runtime provided by the device vendor, typically the device driver.
-
-To enable OCL support, configure OpenCV using CMake with ``WITH_OPENCL=ON``. When the flag is set and if OpenCL SDK is installed, the full-featured OpenCV OCL module is built. Otherwise, the module may be not built. If you have AMD'S FFT and BLAS library, you can select it with ``WITH_OPENCLAMDFFT=ON``, ``WITH_OPENCLAMDBLAS=ON``.
-
-The ocl module can be found under the "modules" directory. In "modules/ocl/src" you can find the source code for the cpp class that wrap around the direct kernel invocation. The kernels themselves can be found in "modules/ocl/src/opencl".  Samples can be found under "samples/ocl". Accuracy tests can be found in "modules/ocl/test", and performance tests under "module/ocl/perf".
-
-
-
-Right now, the user can select OpenCL device by specifying the environment variable ``OPENCV_OPENCL_DEVICE``. Variable format:
-
-.. code-block:: cpp
-
-    <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<DeviceName or ID>
-
-**Note:** Device ID range is: 0..9 (only one digit, 10 - it is a part of name)
-
-Samples:
-
-.. code-block:: cpp
-
-    '' = ':' = '::' = ':GPU|CPU:'
-    'AMD:GPU|CPU:'
-    'AMD::Tahiti'
-    ':GPU:1'
-    ':CPU:2'
-
-Also the user can use ``cv::ocl::setDevice`` function (with ``cv::ocl::getOpenCLPlatforms`` and ``cv::ocl::getOpenCLDevices``). This function initializes OpenCL runtime and setup the passed device as computing device.
-
-In the current version, all the thread share the same context and device so the multi-devices are not supported. We will add this feature soon. If a function support 4-channel operator, it should support 3-channel operator as well, because All the 3-channel matrix(i.e. RGB image) are represented by 4-channel matrix in ``oclMat``. It means 3-channel image have 4-channel space with the last channel unused. We provide a transparent interface to handle the difference between OpenCV Mat and ``oclMat``.
-
-Developer Notes
--------------------
-
-In a heterogeneous device environment, there may be cost associated with data transfer. This would be the case, for example, when data needs to be moved from host memory (accessible to the CPU), to device memory (accessible to a discrete GPU). in the case of integrated graphics chips, there may be performance issues, relating to memory coherency between access from the GPU "part" of the integrated device, or the CPU "part." For best performance, in either case, it is recommended that you do not introduce data transfers between CPU and the discrete GPU, except in the beginning and the end of the algorithmic pipeline.
-
-Some tidbits:
-
-1. OpenCL version should be larger than 1.1 with FULL PROFILE.
-
-2. Currently there's only one OpenCL context and command queue. We hope to implement multi device and multi queue support in the future.
-
-3. Many kernels use 256 as its workgroup size if possible, so the max work group size of the device must larger than 256. All GPU devices we are aware of indeed support 256 workitems in a workgroup, however non GPU devices may not. This will be improved in the future.
-
-4. If the device does not support double arithmetic, then functions' implementation generates an error.
-
-5. The ``oclMat`` uses buffer object, not image object.
-
-6. All the 3-channel matrices (i.e. RGB image) are represented by 4-channel matrices in ``oclMat``, with the last channel unused. We provide a transparent interface to handle the difference between OpenCV Mat and ``oclMat``.
-
-7. All the matrix in ``oclMat`` is aligned in column (now the alignment factor for ``step`` is 32+ byte). It means, m.cols * m.elemSize() <= m.step.
-
-8. Data transfer between Mat and ``oclMat``: If the CPU matrix is aligned in column, we will use faster API to transfer between Mat and ``oclMat``, otherwise, we will use clEnqueueRead/WriteBufferRect to transfer data to guarantee the alignment. 3-channel matrix is an exception, it's directly transferred to a temp buffer and then padded to 4-channel matrix(also aligned) when uploading and do the reverse operation when downloading.
-
-9. Data transfer between Mat and ``oclMat``: ROI is a feature of OpenCV, which allow users process a sub rectangle of a matrix. When a CPU matrix which has ROI will be transfered to GPU, the whole matrix will be transfered and set ROI as CPU's. In a word, we always transfer the whole matrix despite whether it has ROI or not.
-
-10. All the kernel file should locate in "modules/ocl/src/opencl/" with the extension ".cl". All the kernel files are transformed to pure characters at compilation time in opencl_kernels.cpp, and the file name without extension is the name of the program sources.
diff --git a/modules/ocl/doc/matrix_reductions.rst b/modules/ocl/doc/matrix_reductions.rst
deleted file mode 100644
index 41161d8aa..000000000
--- a/modules/ocl/doc/matrix_reductions.rst
+++ /dev/null
@@ -1,106 +0,0 @@
-Matrix Reductions
-=============================
-
-.. highlight:: cpp
-
-ocl::absSum
----------------
-Returns the sum of absolute values for matrix elements.
-
-.. ocv:function:: Scalar ocl::absSum(const oclMat &m)
-
-    :param m: The Source image of all depth.
-
-Counts the abs sum of matrix elements for each channel. Supports all data types.
-
-ocl::countNonZero
----------------------
-Returns the number of non-zero elements in src
-
-.. ocv:function:: int ocl::countNonZero(const oclMat &src)
-
-    :param src: Single-channel array
-
-Counts non-zero array elements. Supports all data types.
-
-ocl::min
-------------------
-
-.. ocv:function:: void ocl::min(const oclMat &src1, const oclMat &src2, oclMat &dst)
-
-    :param src1: the first input array.
-
-    :param src2: the second input array, must be the same size and same type as ``src1``.
-
-    :param dst: the destination array, it will have the same size and same type as ``src1``.
-
-Computes element-wise minima of two arrays. Supports all data types.
-
-ocl::max
-------------------
-
-.. ocv:function:: void ocl::max(const oclMat &src1, const oclMat &src2, oclMat &dst)
-
-    :param src1: the first input array.
-
-    :param src2: the second input array, must be the same size and same type as ``src1``.
-
-    :param dst: the destination array, it will have the same size and same type as ``src1``.
-
-Computes element-wise maxima of two arrays. Supports all data types.
-
-ocl::minMax
-------------------
-Returns void
-
-.. ocv:function:: void ocl::minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat())
-
-    :param src: Single-channel array
-
-    :param minVal: Pointer to returned minimum value, should not be NULL
-
-    :param maxVal: Pointer to returned maximum value, should not be NULL
-
-    :param mask: The optional mask used to select a sub-array
-
-Finds global minimum and maximum in a whole array or sub-array. Supports all data types.
-
-ocl::minMaxLoc
-------------------
-Returns void
-
-.. ocv:function:: void ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal = 0, Point *minLoc = 0, Point *maxLoc = 0,const oclMat &mask = oclMat())
-
-    :param src: Single-channel array
-
-    :param minVal: Pointer to returned minimum value, should not be NULL
-
-    :param maxVal: Pointer to returned maximum value, should not be NULL
-
-    :param minLoc: Pointer to returned minimum location (in 2D case), should not be NULL
-
-    :param maxLoc: Pointer to returned maximum location (in 2D case) should not be NULL
-
-    :param mask: The optional mask used to select a sub-array
-
-The functions minMaxLoc find minimum and maximum element values and their positions. The extremums are searched across the whole array, or, if mask is not an empty array, in the specified array region. The functions do not work with multi-channel arrays.
-
-ocl::sqrSum
-------------------
-Returns the squared sum of matrix elements for each channel
-
-.. ocv:function:: Scalar ocl::sqrSum(const oclMat &m)
-
-    :param m: The Source image of all depth.
-
-Counts the squared sum of matrix elements for each channel. Supports all data types.
-
-ocl::sum
-------------------
-Returns the sum of matrix elements for each channel
-
-.. ocv:function:: Scalar ocl::sum(const oclMat &m)
-
-    :param m: The Source image of all depth.
-
-Counts the sum of matrix elements for each channel.
diff --git a/modules/ocl/doc/ml_machine_learning.rst b/modules/ocl/doc/ml_machine_learning.rst
deleted file mode 100644
index ad0e30397..000000000
--- a/modules/ocl/doc/ml_machine_learning.rst
+++ /dev/null
@@ -1,106 +0,0 @@
-ml.Machine Learning
-=============================
-
-.. highlight:: cpp
-
-ocl::KNearestNeighbour
---------------------------
-.. ocv:class:: ocl::KNearestNeighbour : public ocl::CvKNearest
-
-The class implements K-Nearest Neighbors model as described in the beginning of this section.
-
-ocl::KNearestNeighbour
---------------------------
-Computes the weighted sum of two arrays. ::
-
-    class CV_EXPORTS KNearestNeighbour: public CvKNearest
-    {
-    public:
-        KNearestNeighbour();
-        ~KNearestNeighbour();
-
-        bool train(const Mat& trainData, Mat& labels, Mat& sampleIdx = Mat().setTo(Scalar::all(0)),
-            bool isRegression = false, int max_k = 32, bool updateBase = false);
-
-        void clear();
-
-        void find_nearest(const oclMat& samples, int k, oclMat& lables);
-
-    private:
-        /* hidden */
-    };
-
-ocl::KNearestNeighbour::train
----------------------------------
-Trains the model.
-
-.. ocv:function:: bool ocl::KNearestNeighbour::train(const Mat& trainData, Mat& labels, Mat& sampleIdx = Mat().setTo(Scalar::all(0)), bool isRegression = false, int max_k = 32, bool updateBase = false)
-
-    :param isRegression: Type of the problem: ``true`` for regression and ``false`` for classification.
-
-    :param maxK: Number of maximum neighbors that may be passed to the method :ocv:func:`CvKNearest::find_nearest`.
-
-    :param updateBase: Specifies whether the model is trained from scratch (``update_base=false``), or it is updated using the new training data (``update_base=true``). In the latter case, the parameter ``maxK`` must not be larger than the original value.
-
-The method trains the K-Nearest model. It follows the conventions of the generic :ocv:func:`CvStatModel::train` approach with the following limitations:
-
-* Only ``CV_ROW_SAMPLE`` data layout is supported.
-* Input variables are all ordered.
-* Output variables can be either categorical ( ``is_regression=false`` ) or ordered ( ``is_regression=true`` ).
-* Variable subsets (``var_idx``) and missing measurements are not supported.
-
-ocl::KNearestNeighbour::find_nearest
-----------------------------------------
-Finds the neighbors and predicts responses for input vectors.
-
-.. ocv:function:: void ocl::KNearestNeighbour::find_nearest(const oclMat& samples, int k, oclMat& lables )
-
-    :param samples: Input samples stored by rows. It is a single-precision floating-point matrix of :math:`number\_of\_samples \times number\_of\_features` size.
-
-    :param k: Number of used nearest neighbors. It must satisfy constraint: :math:`k \le` :ocv:func:`CvKNearest::get_max_k`.
-
-    :param labels: Vector with results of prediction (regression or classification) for each input sample. It is a single-precision floating-point vector with ``number_of_samples`` elements.
-
-ocl::kmeans
----------------
-Finds centers of clusters and groups input samples around the clusters.
-
-.. ocv:function:: double ocl::kmeans(const oclMat &src, int K, oclMat &bestLabels, TermCriteria criteria, int attemps, int flags, oclMat &centers)
-
-    :param src: Floating-point matrix of input samples, one row per sample.
-
-    :param K: Number of clusters to split the set by.
-
-    :param bestLabels: Input/output integer array that stores the cluster indices for every sample.
-
-    :param criteria: The algorithm termination criteria, that is, the maximum number of iterations and/or the desired accuracy. The accuracy is specified as ``criteria.epsilon``. As soon as each of the cluster centers moves by less than ``criteria.epsilon`` on some iteration, the algorithm stops.
-
-    :param attempts: Flag to specify the number of times the algorithm is executed using different initial labellings. The algorithm returns the labels that yield the best compactness (see the last function parameter).
-
-    :param flags: Flag that can take the following values:
-
-            * **KMEANS_RANDOM_CENTERS** Select random initial centers in each attempt.
-
-            * **KMEANS_PP_CENTERS** Use ``kmeans++`` center initialization by Arthur and Vassilvitskii [Arthur2007].
-
-            * **KMEANS_USE_INITIAL_LABELS** During the first (and possibly the only) attempt, use the user-supplied labels instead of computing them from the initial centers. For the second and further attempts, use the random or semi-random centers. Use one of  ``KMEANS_*_CENTERS``  flag to specify the exact method.
-
-    :param centers: Output matrix of the cluster centers, one row per each cluster center.
-
-ocl::distanceToCenters
-----------------------
-For each samples in ``source``, find its closest neighour in ``centers``.
-
-.. ocv:function:: void ocl::distanceToCenters(const oclMat &src, const oclMat &centers, Mat &dists, Mat &labels, int distType = NORM_L2SQR)
-
-    :param src: Floating-point matrix of input samples. One row per sample.
-
-    :param centers: Floating-point matrix of center candidates. One row per center.
-
-    :param distType: Distance metric to calculate distances. Supports ``NORM_L1`` and ``NORM_L2SQR``.
-
-    :param dists: The output distances calculated from each sample to the best matched center.
-
-    :param labels: The output index of best matched center for each row of sample.
-
-The method is a utility function which maybe used for multiple clustering algorithms such as K-means.
diff --git a/modules/ocl/doc/object_detection.rst b/modules/ocl/doc/object_detection.rst
deleted file mode 100644
index 0539e7738..000000000
--- a/modules/ocl/doc/object_detection.rst
+++ /dev/null
@@ -1,95 +0,0 @@
-Object Detection
-=============================
-
-.. highlight:: cpp
-
-ocl::OclCascadeClassifier
------------------------------
-.. ocv:class:: ocl::OclCascadeClassifier : public CascadeClassifier
-
-Cascade classifier class used for object detection. Supports HAAR cascade classifier  in the form of cross link ::
-
-    class CV_EXPORTS OclCascadeClassifier : public CascadeClassifier
-    {
-    public:
-            void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces,
-                                              double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
-                                              Size minSize = Size(), Size maxSize = Size());
-    };
-
-.. note::
-
-   (Ocl) A face detection example using cascade classifiers can be found at opencv_source_code/samples/ocl/facedetect.cpp
-
-ocl::OclCascadeClassifier::detectMultiScale
-------------------------------------------------------
-Detects objects of different sizes in the input image.
-
-.. ocv:function:: void ocl::OclCascadeClassifier::detectMultiScale(oclMat &image, std::vector<cv::Rect>& faces, double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0, Size minSize = Size(), Size maxSize = Size())
-
-    :param faces: Vector of rectangles where each rectangle contains the detected object.
-
-    :param image:  Matrix of type CV_8U containing an image where objects should be detected.
-
-    :param scaleFactor: Parameter specifying how much the image size is reduced at each image scale.
-
-    :param minNeighbors: Parameter specifying how many neighbors each candidate rectangle should have to retain it.
-
-    :param flags: Parameter with the same meaning for an old cascade as in the function ``cvHaarDetectObjects``. It is not used for a new cascade.
-
-    :param minSize: Minimum possible object size. Objects smaller than that are ignored.
-
-    :param maxSize: Maximum possible object size. Objects larger than that are ignored.
-
-The function provides a very similar interface with that in CascadeClassifier class, except using oclMat as input image.
-
-ocl::MatchTemplateBuf
--------------------------
-.. ocv:struct:: ocl::MatchTemplateBuf
-
-Class providing memory buffers for :ocv:func:`ocl::matchTemplate` function, plus it allows to adjust some specific parameters. ::
-
-    struct CV_EXPORTS MatchTemplateBuf
-    {
-        Size user_block_size;
-        oclMat imagef, templf;
-        std::vector<oclMat> images;
-        std::vector<oclMat> image_sums;
-        std::vector<oclMat> image_sqsums;
-    };
-
-You can use field `user_block_size` to set specific block size for :ocv:func:`ocl::matchTemplate` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed.
-
-ocl::matchTemplate
-----------------------
-Computes a proximity map for a raster template and an image where the template is searched for.
-
-.. ocv:function:: void ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method)
-
-.. ocv:function:: void ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf &buf)
-
-    :param image: Source image.  ``CV_32F`` and  ``CV_8U`` depth images (1..4 channels) are supported for now.
-
-    :param templ: Template image with the size and type the same as  ``image`` .
-
-    :param result: Map containing comparison results ( ``CV_32FC1`` ). If  ``image`` is  *W x H*  and ``templ`` is  *w x h*, then  ``result`` must be *W-w+1 x H-h+1*.
-
-    :param method: Specifies the way to compare the template with the image.
-
-    :param buf: Optional buffer to avoid extra memory allocations and to adjust some specific parameters. See :ocv:struct:`ocl::MatchTemplateBuf`.
-
-    The following methods are supported for the ``CV_8U`` depth images for now:
-
-    * ``CV_TM_SQDIFF``
-    * ``CV_TM_SQDIFF_NORMED``
-    * ``CV_TM_CCORR``
-    * ``CV_TM_CCORR_NORMED``
-    * ``CV_TM_CCOEFF``
-    * ``CV_TM_CCOEFF_NORMED``
-
-    The following methods are supported for the ``CV_32F`` images for now:
-
-    * ``CV_TM_SQDIFF``
-    * ``CV_TM_CCORR``
-
-.. seealso:: :ocv:func:`matchTemplate`
diff --git a/modules/ocl/doc/ocl.rst b/modules/ocl/doc/ocl.rst
deleted file mode 100644
index 76c1f882b..000000000
--- a/modules/ocl/doc/ocl.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-***************************************
-ocl. OpenCL-accelerated Computer Vision
-***************************************
-
-.. toctree::
-    :maxdepth: 1
-
-    introduction
-    structures_and_utility_functions
-    data_structures
-    operations_on_matrices
-    matrix_reductions
-    image_filtering
-    image_processing
-    ml_machine_learning
-    object_detection
-    feature_detection_and_description
-    video_analysis
-    camera_calibration_and_3D_reconstruction
-..    camera_calibration_and_3d_reconstruction
-..    video
diff --git a/modules/ocl/doc/operations_on_matrices.rst b/modules/ocl/doc/operations_on_matrices.rst
deleted file mode 100644
index 1763d33d1..000000000
--- a/modules/ocl/doc/operations_on_matrices.rst
+++ /dev/null
@@ -1,602 +0,0 @@
-Operations on Matrics
-=============================
-
-.. highlight:: cpp
-
-ocl::abs
-------------------
-Returns void
-
-.. ocv:function:: void ocl::abs(const oclMat& src, oclMat& dst)
-
-    :param src: input array.
-
-    :param dst: destination array, it will have the same size and same type as ``src``.
-
-Computes per-element absolute values of the input array. Supports all data types.
-
-ocl::absdiff
-------------------
-Returns void
-
-.. ocv:function:: void ocl::absdiff(const oclMat& src1, const oclMat& src2, oclMat& dst)
-
-.. ocv:function:: void ocl::absdiff(const oclMat& src1, const Scalar& s, oclMat& dst)
-
-    :param src1: the first input array.
-
-    :param src2: the second input array, must be the same size and same type as ``src1``.
-
-    :param s: scalar, the second input parameter.
-
-    :param dst: the destination array, it will have the same size and same type as ``src1``.
-
-Computes per-element absolute difference between two arrays or between array and a scalar. Supports all data types.
-
-ocl::add
-------------------
-Returns void
-
-.. ocv:function:: void ocl::add(const oclMat & src1, const oclMat & src2, oclMat & dst, const oclMat & mask = oclMat())
-
-.. ocv:function:: void ocl::add(const oclMat & src1, const Scalar & s, oclMat & dst, const oclMat & mask = oclMat())
-
-    :param src1: the first input array.
-
-    :param src2: the second input array, must be the same size and same type as ``src1``.
-
-    :param s: scalar, the second input parameter
-
-    :param dst: the destination array, it will have the same size and same type as ``src1``.
-
-    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
-
-Computes per-element additon between two arrays or between array and a scalar. Supports all data types.
-
-ocl::addWeighted
---------------------
-Computes the weighted sum of two arrays.
-
-.. ocv:function:: void ocl::addWeighted(const oclMat& src1, double  alpha, const oclMat& src2, double beta, double gama, oclMat& dst)
-
-    :param src1: First source array.
-
-    :param alpha: Weight for the first array elements.
-
-    :param src2: Second source array of the same size and channel number as  ``src1`` .
-
-    :param beta: Weight for the second array elements.
-
-    :param dst: Destination array that has the same size and number of channels as the input arrays.
-
-    :param gamma: Scalar added to each sum.
-
-The function ``addWeighted`` calculates the weighted sum of two arrays as follows:
-
-.. math::
-
-    \texttt{c} (I)= \texttt{saturate} ( \texttt{a} (I)* \texttt{alpha} +  \texttt{b} (I)* \texttt{beta} +  \texttt{gamma} )
-
-where ``I`` is a multi-dimensional index of array elements. In case of multi-channel arrays, each channel is processed independently.
-
-.. seealso:: :ocv:func:`addWeighted`
-
-ocl::bitwise_and
-------------------
-Returns void
-
-.. ocv:function:: void ocl::bitwise_and(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
-
-.. ocv:function:: void ocl::bitwise_and(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
-
-    :param src1: the first input array.
-
-    :param src2: the second input array, must be the same size and same type as ``src1``.
-
-    :param s: scalar, the second input parameter.
-
-    :param dst: the destination array, it will have the same size and same type as ``src1``.
-
-    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
-
-Computes per-element bitwise_and between two arrays or between array and a scalar. Supports all data types.
-
-ocl::bitwise_not
-------------------
-Returns void
-
-.. ocv:function:: void ocl::bitwise_not(const oclMat &src, oclMat &dst)
-
-    :param src: the input array.
-
-    :param dst: the destination array, it will have the same size and same type as ``src``.
-
-The functions bitwise not compute per-element bit-wise inversion of the source array. Supports all data types.
-
-ocl::bitwise_or
-------------------
-Returns void
-
-.. ocv:function:: void ocl::bitwise_or(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
-
-.. ocv:function:: void ocl::bitwise_or(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
-
-    :param src1: the first input array.
-
-    :param src2: the second input array, must be the same size and same type as ``src1``.
-
-    :param s: scalar, the second input parameter.
-
-    :param dst: the destination array, it will have the same size and same type as ``src1``.
-
-    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
-
-Computes per-element bitwise_or between two arrays or between array and a scalar. Supports all data types.
-
-ocl::bitwise_xor
-------------------
-Returns void
-
-.. ocv:function:: void ocl::bitwise_xor(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
-
-.. ocv:function:: void ocl::bitwise_xor(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
-
-    :param src1: the first input array.
-
-    :param src2: the second input array, must be the same size and same type as ``src1``.
-
-    :param sc: scalar, the second input parameter.
-
-    :param dst: the destination array, it will have the same size and same type as ``src1``.
-
-    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
-
-Computes per-element bitwise_xor between two arrays or between array and a scalar. Supports all data types.
-
-ocl::cartToPolar
-------------------
-Returns void
-
-.. ocv:function:: void ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &magnitude, oclMat &angle, bool angleInDegrees = false)
-
-    :param x: the array of x-coordinates; must be single-precision or double-precision floating-point array.
-
-    :param y: the array of y-coordinates; it must have the same size and same type as ``x``.
-
-    :param magnitude: the destination array of magnitudes of the same size and same type as ``x``.
-
-    :param angle: the destination array of angles of the same size and same type as ``x``. The angles are measured in radians (0 to 2pi) or in degrees (0 to 360 degrees).
-
-    :param angleInDegrees: the flag indicating whether the angles are measured in radians, which is default mode, or in degrees.
-
-Calculates the magnitude and angle of 2D vectors. Supports only ``CV_32F`` and ``CV_64F`` data types.
-
-ocl::compare
-------------------
-Returns void
-
-.. ocv:function:: void ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop)
-
-    :param src1: the first source array.
-
-    :param src2: the second source array; must have the same size and same type as ``src1``.
-
-    :param dst: the destination array; will have the same size as ``src1`` and type ``CV_8UC1``.
-
-    :param cmpop: the flag specifying the relation between the elements to be checked.
-
-Performs per-element comparison of two arrays or an array and scalar value. Supports all data types.
-
-ocl::dft
-------------
-Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix.
-
-.. ocv:function:: void ocl::dft(const oclMat& src, oclMat& dst, Size dft_size = Size(), int flags = 0)
-
-    :param src: source matrix (real or complex).
-
-    :param dst: destination matrix (real or complex).
-
-    :param dft_size: size of original input, which is used for transformation from complex to real.
-
-    :param flags: optional flags:
-
-        * **DFT_ROWS** transforms each individual row of the source matrix.
-
-        * **DFT_COMPLEX_OUTPUT** performs a forward transformation of 1D or 2D real array. The result, though being a complex array, has complex-conjugate symmetry (*CCS*, see the function description below for details). Such an array can be packed into a real array of the same size as input, which is the fastest option and which is what the function does by default. However, you may wish to get a full complex array (for simpler spectrum analysis, and so on). Pass the flag to enable the function to produce a full-size complex output array.
-
-        * **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real cases are always forward and inverse, respectively).
-
-        * **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of real-complex transform, so the destination matrix must be real.
-
-Use to handle real matrices (``CV_32FC1``) and complex matrices in the interleaved format (``CV_32FC2``).
-
-The ``dft_size`` must be powers of ``2``, ``3`` and ``5``. Real to complex dft output is not the same with cpu version. Real to complex and complex to real does not support ``DFT_ROWS``.
-
-.. seealso:: :ocv:func:`dft`
-
-ocl::divide
-------------------
-Returns void
-
-.. ocv:function:: void ocl::divide(const oclMat& src1, const oclMat& src2, oclMat& dst, double scale = 1)
-
-.. ocv:function:: void ocl::divide(double scale, const oclMat& src1, oclMat& dst)
-
-    :param src1: the first input array.
-
-    :param src2: the second input array, must be the same size and same type as ``src1``.
-
-    :param dst: the destination array, it will have the same size and same type as ``src1``.
-
-    :param scale: scalar factor.
-
-Computes per-element divide between two arrays or between array and a scalar. Supports all data types.
-
-ocl::exp
-------------------
-Returns void
-
-.. ocv:function:: void ocl::exp(const oclMat &src, oclMat &dst)
-
-    :param src: the first source array.
-
-    :param dst: the dst array; must have the same size and same type as ``src``.
-
-The function exp calculates the exponent of every element of the input array. Supports only ``CV_32FC1`` and ``CV_64F`` data types.
-
-ocl::flip
-------------------
-Returns void
-
-.. ocv:function:: void ocl::flip(const oclMat& src, oclMat& dst, int flipCode)
-
-    :param src: source image.
-
-    :param dst: destination image.
-
-    :param flipCode: specifies how to flip the array: 0 means flipping around the x-axis, positive (e.g., 1) means flipping around y-axis, and negative (e.g., -1) means flipping around both axes.
-
-The function flip flips the array in one of three different ways (row and column indices are 0-based). Supports all data types.
-
-ocl::gemm
-------------------
-Performs generalized matrix multiplication.
-
-.. ocv:function:: void ocl::gemm(const oclMat& src1, const oclMat& src2, double alpha, const oclMat& src3, double beta, oclMat& dst, int flags = 0)
-
-    :param src1: first multiplied input matrix that should be ``CV_32FC1`` type.
-
-    :param src2: second multiplied input matrix of the same type as ``src1``.
-
-    :param alpha: weight of the matrix product.
-
-    :param src3: third optional delta matrix added to the matrix product. It should have the same type as ``src1`` and ``src2``.
-
-    :param beta: weight of ``src3``.
-
-    :param dst: destination matrix. It has the proper size and the same type as input matrices.
-
-    :param flags: operation flags:
-
-            * **GEMM_1_T** transpose ``src1``.
-            * **GEMM_2_T** transpose ``src2``.
-
-.. seealso:: :ocv:func:`gemm`
-
-ocl::log
-------------------
-Returns void
-
-.. ocv:function:: void ocl::log(const oclMat &src, oclMat &dst)
-
-    :param src: the first source array.
-
-    :param dst: the dst array; must have the same size and same type as ``src``.
-
-The function log calculates the log of every element of the input array. Supports only ``CV_32FC1`` and ``CV_64F`` data types.
-
-ocl::LUT
-------------------
-Returns void
-
-.. ocv:function:: void ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst)
-
-    :param src: source array of 8-bit elements.
-
-    :param lut: look-up table of 256 elements. In the case of multi-channel source array, the table should either have a single channel (in this case the same table is used for all channels) or the same number of channels as in the source array.
-
-    :param dst: destination array; will have the same size and the same number of channels as ``src``, and the same depth as ``lut``.
-
-Performs a look-up table transform of an array.
-
-ocl::magnitude
-------------------
-Returns void
-
-.. ocv:function:: void ocl::magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude)
-
-    :param x: the floating-point array of x-coordinates of the vectors.
-
-    :param y: the floating-point array of y-coordinates of the vectors; must have the same size as ``x``.
-
-    :param magnitude: the destination array; will have the same size and same type as ``x``.
-
-The function magnitude calculates magnitude of 2D vectors formed from the corresponding elements of ``x`` and ``y`` arrays. Supports only ``CV_32F`` and ``CV_64F`` data types.
-
-ocl::meanStdDev
-------------------
-Returns void
-
-.. ocv:function:: void ocl::meanStdDev(const oclMat &mtx, Scalar &mean, Scalar &stddev)
-
-    :param mtx: source image.
-
-    :param mean: the output parameter: computed mean value.
-
-    :param stddev: the output parameter: computed standard deviation.
-
-The functions meanStdDev compute the mean and the standard deviation M of array elements, independently for each channel, and return it via the output parameters. Supports all data types.
-
-ocl::merge
-------------------
-Returns void
-
-.. ocv:function:: void ocl::merge(const vector<oclMat> &src, oclMat &dst)
-
-    :param src: The source array or vector of the single-channel matrices to be merged. All the matrices in src must have the same size and the same type.
-
-    :param dst: The destination array; will have the same size and the same depth as src, the number of channels will match the number of source matrices.
-
-Composes a multi-channel array from several single-channel arrays. Supports all data types.
-
-ocl::multiply
-------------------
-Returns void
-
-.. ocv:function:: void ocl::multiply(const oclMat& src1, const oclMat& src2, oclMat& dst, double scale = 1)
-
-    :param src1: the first input array.
-
-    :param src2: the second input array, must be the same size and same type as ``src1``.
-
-    :param dst: the destination array, it will have the same size and same type as ``src1``.
-
-    :param scale: optional scale factor.
-
-Computes per-element multiply between two arrays or between array and a scalar. Supports all data types.
-
-ocl::norm
-------------------
-Returns the calculated norm
-
-.. ocv:function:: double ocl::norm(const oclMat &src1, int normType = NORM_L2)
-
-.. ocv:function:: double ocl::norm(const oclMat &src1, const oclMat &src2, int normType = NORM_L2)
-
-    :param src1: the first source array.
-
-    :param src2: the second source array of the same size and the same type as ``src1``.
-
-    :param normType: type of the norm.
-
-The functions ``norm`` calculate an absolute norm of ``src1`` (when there is no ``src2`` ):
-
-.. math::
-
-    norm =  \forkthree{\|\texttt{src1}\|_{L_{\infty}} =  \max _I | \texttt{src1} (I)|}{if  $\texttt{normType} = \texttt{NORM\_INF}$ }
-    { \| \texttt{src1} \| _{L_1} =  \sum _I | \texttt{src1} (I)|}{if  $\texttt{normType} = \texttt{NORM\_L1}$ }
-    { \| \texttt{src1} \| _{L_2} =  \sqrt{\sum_I \texttt{src1}(I)^2} }{if  $\texttt{normType} = \texttt{NORM\_L2}$ }
-
-or an absolute or relative difference norm if ``src2`` is there:
-
-.. math::
-
-    norm =  \forkthree{\|\texttt{src1}-\texttt{src2}\|_{L_{\infty}} =  \max _I | \texttt{src1} (I) -  \texttt{src2} (I)|}{if  $\texttt{normType} = \texttt{NORM\_INF}$ }
-    { \| \texttt{src1} - \texttt{src2} \| _{L_1} =  \sum _I | \texttt{src1} (I) -  \texttt{src2} (I)|}{if  $\texttt{normType} = \texttt{NORM\_L1}$ }
-    { \| \texttt{src1} - \texttt{src2} \| _{L_2} =  \sqrt{\sum_I (\texttt{src1}(I) - \texttt{src2}(I))^2} }{if  $\texttt{normType} = \texttt{NORM\_L2}$ }
-
-or
-
-.. math::
-
-    norm =  \forkthree{\frac{\|\texttt{src1}-\texttt{src2}\|_{L_{\infty}}    }{\|\texttt{src2}\|_{L_{\infty}} }}{if  $\texttt{normType} = \texttt{NORM\_RELATIVE\_INF}$ }
-    { \frac{\|\texttt{src1}-\texttt{src2}\|_{L_1} }{\|\texttt{src2}\|_{L_1}} }{if  $\texttt{normType} = \texttt{NORM\_RELATIVE\_L1}$ }
-    { \frac{\|\texttt{src1}-\texttt{src2}\|_{L_2} }{\|\texttt{src2}\|_{L_2}} }{if  $\texttt{normType} = \texttt{NORM\_RELATIVE\_L2}$ }
-
-The functions ``norm`` return the calculated norm.
-
-A multi-channel input arrays are treated as a single-channel, that is, the results for all channels are combined.
-
-ocl::oclMat::convertTo
---------------------------
-Returns void
-
-.. ocv:function:: void ocl::oclMat::convertTo(oclMat &m, int rtype, double alpha = 1, double beta = 0) const
-
-    :param m: the destination matrix. If it does not have a proper size or type before the operation, it will be reallocated.
-
-    :param rtype: the desired destination matrix type, or rather, the depth (since the number of channels will be the same with the source one). If rtype is negative, the destination matrix will have the same type as the source.
-
-    :param alpha: optional scale factor.
-
-    :param beta: optional delta added to the scaled values.
-
-The method converts source pixel values to the target datatype. Saturate cast is applied in the end to avoid possible overflows. Supports all data types.
-
-ocl::oclMat::copyTo
------------------------
-Returns void
-
-.. ocv:function:: void ocl::oclMat::copyTo(oclMat &m, const oclMat &mask = oclMat()) const
-
-    :param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated.
-
-    :param mask: The operation mask. Its non-zero elements indicate, which matrix elements need to be copied.
-
-Copies the matrix to another one. Supports all data types.
-
-ocl::oclMat::setTo
-----------------------
-Returns oclMat
-
-.. ocv:function:: oclMat& ocl::oclMat::setTo(const Scalar &s, const oclMat &mask = oclMat())
-
-    :param s: Assigned scalar, which is converted to the actual array type.
-
-    :param mask: The operation mask of the same size as ``*this`` and type ``CV_8UC1``.
-
-Sets all or some of the array elements to the specified value. This is the advanced variant of Mat::operator=(const Scalar s) operator. Supports all data types.
-
-ocl::phase
-------------------
-Returns void
-
-.. ocv:function:: void ocl::phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false)
-
-    :param x: the source floating-point array of x-coordinates of 2D vectors
-
-    :param y: the source array of y-coordinates of 2D vectors; must have the same size and the same type as ``x``.
-
-    :param angle: the destination array of vector angles; it will have the same size and same type as ``x``.
-
-    :param angleInDegrees: when it is true, the function will compute angle in degrees, otherwise they will be measured in radians.
-
-The function phase computes the rotation angle of each 2D vector that is formed from the corresponding elements of ``x`` and ``y``. Supports only ``CV_32FC1`` and ``CV_64FC1`` data type.
-
-ocl::polarToCart
-------------------
-Returns void
-
-.. ocv:function:: void ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees = false)
-
-    :param magnitude: the source floating-point array of magnitudes of 2D vectors. It can be an empty matrix (=Mat()) - in this case the function assumes that all the magnitudes are = 1. If it's not empty, it must have the same size and same type as ``angle``.
-
-    :param angle: the source floating-point array of angles of the 2D vectors.
-
-    :param x: the destination array of x-coordinates of 2D vectors; will have the same size and the same type as ``angle``.
-
-    :param y: the destination array of y-coordinates of 2D vectors; will have the same size and the same type as ``angle``.
-
-    :param angleInDegrees: the flag indicating whether the angles are measured in radians, which is default mode, or in degrees.
-
-The function polarToCart computes the cartesian coordinates of each 2D vector represented by the corresponding elements of magnitude and angle. Supports only ``CV_32F`` and ``CV_64F`` data types.
-
-ocl::pow
-------------------
-Returns void
-
-.. ocv:function:: void ocl::pow(const oclMat &x, double p, oclMat &y)
-
-    :param x: the source array.
-
-    :param p: the exponent of power; the source floating-point array of angles of the 2D vectors.
-
-    :param y: the destination array, should be the same type as the source.
-
-The function pow raises every element of the input array to ``p``. Supports only ``CV_32FC1`` and ``CV_64FC1`` data types.
-
-ocl::setIdentity
-------------------
-Returns void
-
-.. ocv:function:: void ocl::setIdentity(oclMat& src, const Scalar & val = Scalar(1))
-
-    :param src: matrix to initialize (not necessarily square).
-
-    :param val: value to assign to diagonal elements.
-
-The function initializes a scaled identity matrix.
-
-ocl::sortByKey
-------------------
-Returns void
-
-.. ocv:function:: void ocl::sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false)
-
-    :param keys: the keys to be used as sorting indices.
-
-    :param values: the array of values.
-
-    :param isGreaterThan: determine sorting order.
-
-    :param method: supported sorting methods:
-
-            * **SORT_BITONIC**   bitonic sort, only support power-of-2 buffer size.
-            * **SORT_SELECTION** selection sort, currently cannot sort duplicate keys.
-            * **SORT_MERGE**     merge sort.
-            * **SORT_RADIX**     radix sort, only support signed int/float keys(``CV_32S``/``CV_32F``).
-
-Returns the sorted result of all the elements in values based on equivalent keys.
-
-The element unit in the values to be sorted is determined from the data type, i.e., a ``CV_32FC2`` input ``{a1a2, b1b2}`` will be considered as two elements, regardless its matrix dimension.
-
-Both keys and values will be sorted inplace.
-
-Keys needs to be a **single** channel ``oclMat``.
-
-Example::
-
-    input -
-    keys   = {2,    3,   1}   (CV_8UC1)
-    values = {10,5, 4,3, 6,2} (CV_8UC2)
-    sortByKey(keys, values, SORT_SELECTION, false);
-    output -
-    keys   = {1,    2,   3}   (CV_8UC1)
-    values = {6,2, 10,5, 4,3} (CV_8UC2)
-
-ocl::split
-------------------
-Returns void
-
-.. ocv:function:: void ocl::split(const oclMat &src, vector<oclMat> &dst)
-
-    :param src: The source multi-channel array
-
-    :param dst: The destination array or vector of arrays; The number of arrays must match src.channels(). The arrays themselves will be reallocated if needed.
-
-The functions split split multi-channel array into separate single-channel arrays. Supports all data types.
-
-ocl::sqrt
-------------------
-Returns void
-
-.. ocv:function:: void ocl::sqrt(const oclMat &src, oclMat &dst)
-
-    :param src: the first source array.
-
-    :param dst: the dst array; must have the same size and same type as ``src``.
-
-The function ``sqrt`` calculates the square root of each input array element. Supports only ``CV_32FC1`` and ``CV_64F`` data types.
-
-ocl::subtract
-------------------
-Returns void
-
-.. ocv:function:: void ocl::subtract(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
-
-.. ocv:function:: void ocl::subtract(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
-
-    :param src1: the first input array.
-
-    :param src2: the second input array, must be the same size and same type as ``src1``.
-
-    :param s: scalar, the second input parameter.
-
-    :param dst: the destination array, it will have the same size and same type as ``src1``.
-
-    :param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
-
-Computes per-element subtract between two arrays or between array and a scalar. Supports all data types.
-
-ocl::transpose
-------------------
-Returns void
-
-.. ocv:function:: void ocl::transpose(const oclMat &src, oclMat &dst)
-
-    :param src: the source array.
-
-    :param dst: the destination array of the same type as ``src``.
-
-Transposes a matrix (in case when ``src`` == ``dst`` and matrix is square the operation are performed inplace).
diff --git a/modules/ocl/doc/structures_and_utility_functions.rst b/modules/ocl/doc/structures_and_utility_functions.rst
deleted file mode 100644
index 1d1265c81..000000000
--- a/modules/ocl/doc/structures_and_utility_functions.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-Data Structures and Utility Functions
-========================================
-
-.. highlight:: cpp
-
-ocl::getOpenCLPlatforms
------------------------
-Returns the list of OpenCL platforms
-
-.. ocv:function:: int ocl::getOpenCLPlatforms( PlatformsInfo& platforms )
-
-    :param platforms: Output variable
-
-ocl::getOpenCLDevices
----------------------
-Returns the list of devices
-
-.. ocv:function:: int ocl::getOpenCLDevices( DevicesInfo& devices, int deviceType = CVCL_DEVICE_TYPE_GPU, const PlatformInfo* platform = NULL )
-
-    :param devices: Output variable
-
-    :param deviceType: Bitmask of ``CVCL_DEVICE_TYPE_GPU``, ``CVCL_DEVICE_TYPE_CPU`` or ``CVCL_DEVICE_TYPE_DEFAULT``.
-
-    :param platform: Specifies preferrable platform
-
-ocl::setDevice
---------------
-Initialize OpenCL computation context
-
-.. ocv:function:: void ocl::setDevice( const DeviceInfo* info )
-
-    :param info: device info
-
-ocl::initializeContext
---------------------------------
-Alternative way to initialize OpenCL computation context.
-
-.. ocv:function:: void ocl::initializeContext(void* pClPlatform, void* pClContext, void* pClDevice)
-
-    :param pClPlatform: selected ``platform_id`` (via pointer, parameter type is ``cl_platform_id*``)
-
-    :param pClContext: selected ``cl_context`` (via pointer, parameter type is ``cl_context*``)
-
-    :param pClDevice: selected ``cl_device_id`` (via pointer, parameter type is ``cl_device_id*``)
-
-This function can be used for context initialization with D3D/OpenGL interoperability.
-
-ocl::setBinaryPath
-------------------
-Returns void
-
-.. ocv:function:: void ocl::setBinaryPath(const char *path)
-
-    :param path: the path of OpenCL kernel binaries
-
-If you call this function and set a valid path, the OCL module will save the compiled kernel to the address in the first time and reload the binary since that. It can save compilation time at the runtime.
diff --git a/modules/ocl/doc/video_analysis.rst b/modules/ocl/doc/video_analysis.rst
deleted file mode 100644
index 83ba2008e..000000000
--- a/modules/ocl/doc/video_analysis.rst
+++ /dev/null
@@ -1,561 +0,0 @@
-Video Analysis
-=============================
-
-.. highlight:: cpp
-
-ocl::GoodFeaturesToTrackDetector_OCL
-----------------------------------------
-.. ocv:class:: ocl::GoodFeaturesToTrackDetector_OCL
-
-Class used for strong corners detection on an image. ::
-
-    class GoodFeaturesToTrackDetector_OCL
-    {
-    public:
-        explicit GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
-            int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
-
-        //! return 1 rows matrix with CV_32FC2 type
-        void operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat());
-        //! download points of type Point2f to a vector. the vector's content will be erased
-        void downloadPoints(const oclMat &points, std::vector<Point2f> &points_v);
-
-        int maxCorners;
-        double qualityLevel;
-        double minDistance;
-
-        int blockSize;
-        bool useHarrisDetector;
-        double harrisK;
-        void releaseMemory()
-        {
-            Dx_.release();
-            Dy_.release();
-            eig_.release();
-            minMaxbuf_.release();
-            tmpCorners_.release();
-        }
-    };
-
-The class finds the most prominent corners in the image.
-
-.. seealso:: :ocv:func:`goodFeaturesToTrack()`
-
-ocl::GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL
--------------------------------------------------------------------------
-Constructor.
-
-.. ocv:function:: ocl::GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0, int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04)
-
-    :param maxCorners: Maximum number of corners to return. If there are more corners than are found, the strongest of them is returned.
-
-    :param qualityLevel: Parameter characterizing the minimal accepted quality of image corners. The parameter value is multiplied by the best corner quality measure, which is the minimal eigenvalue (see  :ocv:func:`ocl::cornerMinEigenVal` ) or the Harris function response (see  :ocv:func:`ocl::cornerHarris` ). The corners with the quality measure less than the product are rejected. For example, if the best corner has the quality measure = 1500, and the  ``qualityLevel=0.01`` , then all the corners with the quality measure less than 15 are rejected.
-
-    :param minDistance: Minimum possible Euclidean distance between the returned corners.
-
-    :param blockSize: Size of an average block for computing a derivative covariation matrix over each pixel neighborhood. See  :ocv:func:`cornerEigenValsAndVecs` .
-
-    :param useHarrisDetector: Parameter indicating whether to use a Harris detector (see :ocv:func:`ocl::cornerHarris`) or :ocv:func:`ocl::cornerMinEigenVal`.
-
-    :param harrisK: Free parameter of the Harris detector.
-
-ocl::GoodFeaturesToTrackDetector_OCL::operator ()
--------------------------------------------------------
-Finds the most prominent corners in the image.
-
-.. ocv:function:: void ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat())
-
-    :param image: Input 8-bit, single-channel image.
-
-    :param corners: Output vector of detected corners (it will be one row matrix with CV_32FC2 type).
-
-    :param mask: Optional region of interest. If the image is not empty (it needs to have the type  ``CV_8UC1``  and the same size as  ``image`` ), it  specifies the region in which the corners are detected.
-
-.. seealso:: :ocv:func:`goodFeaturesToTrack`
-
-ocl::GoodFeaturesToTrackDetector_OCL::releaseMemory
---------------------------------------------------------
-Releases inner buffers memory.
-
-.. ocv:function:: void ocl::GoodFeaturesToTrackDetector_OCL::releaseMemory()
-
-ocl::FarnebackOpticalFlow
--------------------------------
-.. ocv:class:: ocl::FarnebackOpticalFlow
-
-Class computing a dense optical flow using the Gunnar Farneback's algorithm. ::
-
-    class CV_EXPORTS FarnebackOpticalFlow
-    {
-    public:
-        FarnebackOpticalFlow();
-
-        int numLevels;
-        double pyrScale;
-        bool fastPyramids;
-        int winSize;
-        int numIters;
-        int polyN;
-        double polySigma;
-        int flags;
-
-        void operator ()(const oclMat &frame0, const oclMat &frame1, oclMat &flowx, oclMat &flowy);
-
-        void releaseMemory();
-
-    private:
-        /* hidden */
-    };
-
-ocl::FarnebackOpticalFlow::operator ()
-------------------------------------------
-Computes a dense optical flow using the Gunnar Farneback's algorithm.
-
-.. ocv:function:: void ocl::FarnebackOpticalFlow::operator ()(const oclMat &frame0, const oclMat &frame1, oclMat &flowx, oclMat &flowy)
-
-    :param frame0: First 8-bit gray-scale input image
-    :param frame1: Second 8-bit gray-scale input image
-    :param flowx: Flow horizontal component
-    :param flowy: Flow vertical component
-
-.. seealso:: :ocv:func:`calcOpticalFlowFarneback`
-
-ocl::FarnebackOpticalFlow::releaseMemory
---------------------------------------------
-Releases unused auxiliary memory buffers.
-
-.. ocv:function:: void ocl::FarnebackOpticalFlow::releaseMemory()
-
-
-ocl::PyrLKOpticalFlow
--------------------------
-.. ocv:class:: ocl::PyrLKOpticalFlow
-
-Class used for calculating an optical flow. ::
-
-    class PyrLKOpticalFlow
-    {
-    public:
-        PyrLKOpticalFlow();
-
-        void sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts,
-            oclMat& status, oclMat* err = 0);
-
-        void dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err = 0);
-
-        Size winSize;
-        int maxLevel;
-        int iters;
-        double derivLambda;
-        bool useInitialFlow;
-        float minEigThreshold;
-        bool getMinEigenVals;
-
-        void releaseMemory();
-
-    private:
-        /* hidden */
-    };
-
-The class can calculate an optical flow for a sparse feature set or dense optical flow using the iterative Lucas-Kanade method with pyramids.
-
-.. seealso:: :ocv:func:`calcOpticalFlowPyrLK`
-
-ocl::PyrLKOpticalFlow::sparse
----------------------------------
-Calculate an optical flow for a sparse feature set.
-
-.. ocv:function:: void ocl::PyrLKOpticalFlow::sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts, oclMat& status, oclMat* err = 0)
-
-    :param prevImg: First 8-bit input image (supports both grayscale and color images).
-
-    :param nextImg: Second input image of the same size and the same type as  ``prevImg`` .
-
-    :param prevPts: Vector of 2D points for which the flow needs to be found. It must be one row matrix with CV_32FC2 type.
-
-    :param nextPts: Output vector of 2D points (with single-precision floating-point coordinates) containing the calculated new positions of input features in the second image. When ``useInitialFlow`` is true, the vector must have the same size as in the input.
-
-    :param status: Output status vector (CV_8UC1 type). Each element of the vector is set to 1 if the flow for the corresponding features has been found. Otherwise, it is set to 0.
-
-    :param err: Output vector (CV_32FC1 type) that contains the difference between patches around the original and moved points or min eigen value if ``getMinEigenVals`` is checked. It can be NULL, if not needed.
-
-.. seealso:: :ocv:func:`calcOpticalFlowPyrLK`
-
-
-ocl::PyrLKOpticalFlow::dense
----------------------------------
-Calculate dense optical flow.
-
-.. ocv:function:: void ocl::PyrLKOpticalFlow::dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err = 0)
-
-    :param prevImg: First 8-bit grayscale input image.
-
-    :param nextImg: Second input image of the same size and the same type as  ``prevImg`` .
-
-    :param u: Horizontal component of the optical flow of the same size as input images, 32-bit floating-point, single-channel
-
-    :param v: Vertical component of the optical flow of the same size as input images, 32-bit floating-point, single-channel
-
-    :param err: Output vector (CV_32FC1 type) that contains the difference between patches around the original and moved points or min eigen value if ``getMinEigenVals`` is checked. It can be NULL, if not needed.
-
-
-ocl::PyrLKOpticalFlow::releaseMemory
-----------------------------------------
-Releases inner buffers memory.
-
-.. ocv:function:: void ocl::PyrLKOpticalFlow::releaseMemory()
-
-ocl::interpolateFrames
---------------------------
-Interpolates frames (images) using provided optical flow (displacement field).
-
-.. ocv:function:: void ocl::interpolateFrames(const oclMat& frame0, const oclMat& frame1, const oclMat& fu, const oclMat& fv, const oclMat& bu, const oclMat& bv, float pos, oclMat& newFrame, oclMat& buf)
-
-    :param frame0: First frame (32-bit floating point images, single channel).
-
-    :param frame1: Second frame. Must have the same type and size as ``frame0`` .
-
-    :param fu: Forward horizontal displacement.
-
-    :param fv: Forward vertical displacement.
-
-    :param bu: Backward horizontal displacement.
-
-    :param bv: Backward vertical displacement.
-
-    :param pos: New frame position.
-
-    :param newFrame: Output image.
-
-    :param buf: Temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 oclMat: occlusion masks for first frame, occlusion masks for second, interpolated forward horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow, interpolated backward vertical flow.
-
-ocl::KalmanFilter
---------------------
-.. ocv:class:: ocl::KalmanFilter
-
-Kalman filter class. ::
-
-    class CV_EXPORTS KalmanFilter
-    {
-    public:
-        KalmanFilter();
-        //! the full constructor taking the dimensionality of the state, of the measurement and of the control vector
-        KalmanFilter(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
-        //! re-initializes Kalman filter. The previous content is destroyed.
-        void init(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
-
-        const oclMat& predict(const oclMat& control=oclMat());
-        const oclMat& correct(const oclMat& measurement);
-
-        oclMat statePre; //!< predicted state (x'(k)): x(k)=A*x(k-1)+B*u(k)
-        oclMat statePost; //!< corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
-        oclMat transitionMatrix; //!< state transition matrix (A)
-        oclMat controlMatrix; //!< control matrix (B) (not used if there is no control)
-        oclMat measurementMatrix; //!< measurement matrix (H)
-        oclMat processNoiseCov; //!< process noise covariance matrix (Q)
-        oclMat measurementNoiseCov;//!< measurement noise covariance matrix (R)
-        oclMat errorCovPre; //!< priori error estimate covariance matrix (P'(k)): P'(k)=A*P(k-1)*At + Q)*/
-        oclMat gain; //!< Kalman gain matrix (K(k)): K(k)=P'(k)*Ht*inv(H*P'(k)*Ht+R)
-        oclMat errorCovPost; //!< posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k)
-    private:
-        /* hidden */
-    };
-
-ocl::KalmanFilter::KalmanFilter
-----------------------------------
-The constructors.
-
-.. ocv:function:: ocl::KalmanFilter::KalmanFilter()
-
-.. ocv:function:: ocl::KalmanFilter::KalmanFilter(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F)
-
-    The full constructor.
-
-    :param dynamParams: Dimensionality of the state.
-
-    :param measureParams: Dimensionality of the measurement.
-
-    :param controlParams: Dimensionality of the control vector.
-
-    :param type: Type of the created matrices that should be ``CV_32F`` or ``CV_64F``.
-
-
-ocl::KalmanFilter::init
----------------------------
-Re-initializes Kalman filter. The previous content is destroyed.
-
-.. ocv:function:: void ocl::KalmanFilter::init(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F)
-
-    :param dynamParams: Dimensionalityensionality of the state.
-
-    :param measureParams: Dimensionality of the measurement.
-
-    :param controlParams: Dimensionality of the control vector.
-
-    :param type: Type of the created matrices that should be ``CV_32F`` or ``CV_64F``.
-
-
-ocl::KalmanFilter::predict
-------------------------------
-Computes a predicted state.
-
-.. ocv:function:: const oclMat& ocl::KalmanFilter::predict(const oclMat& control=oclMat())
-
-    :param control: The optional input control
-
-
-ocl::KalmanFilter::correct
------------------------------
-Updates the predicted state from the measurement.
-
-.. ocv:function:: const oclMat& ocl::KalmanFilter::correct(const oclMat& measurement)
-
-    :param measurement: The measured system parameters
-
-
-ocl::BackgroundSubtractor
-----------------------------
-.. ocv:class:: ocl::BackgroundSubtractor
-
-Base class for background/foreground segmentation. ::
-
-    class CV_EXPORTS BackgroundSubtractor
-    {
-    public:
-        //! the virtual destructor
-        virtual ~BackgroundSubtractor();
-        //! the update operator that takes the next video frame and returns the current foreground mask as 8-bit binary image.
-        virtual void operator()(const oclMat& image, oclMat& fgmask, float learningRate);
-
-        //! computes a background image
-        virtual void getBackgroundImage(oclMat& backgroundImage) const = 0;
-    };
-
-
-The class is only used to define the common interface for the whole family of background/foreground segmentation algorithms.
-
-
-ocl::BackgroundSubtractor::operator()
------------------------------------------
-Computes a foreground mask.
-
-.. ocv:function:: void ocl::BackgroundSubtractor::operator()(const oclMat& image, oclMat& fgmask, float learningRate)
-
-    :param image: Next video frame.
-
-    :param fgmask: The output foreground mask as an 8-bit binary image.
-
-
-ocl::BackgroundSubtractor::getBackgroundImage
--------------------------------------------------
-Computes a background image.
-
-.. ocv:function:: void ocl::BackgroundSubtractor::getBackgroundImage(oclMat& backgroundImage) const
-
-    :param backgroundImage: The output background image.
-
-.. note:: Sometimes the background image can be very blurry, as it contain the average background statistics.
-
-ocl::MOG
-------------
-.. ocv:class:: ocl::MOG : public ocl::BackgroundSubtractor
-
-Gaussian Mixture-based Backbround/Foreground Segmentation Algorithm. ::
-
-    class CV_EXPORTS MOG: public cv::ocl::BackgroundSubtractor
-    {
-    public:
-        //! the default constructor
-        MOG(int nmixtures = -1);
-
-        //! re-initiaization method
-        void initialize(Size frameSize, int frameType);
-
-        //! the update operator
-        void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = 0.f);
-
-        //! computes a background image which are the mean of all background gaussians
-        void getBackgroundImage(oclMat& backgroundImage) const;
-
-        //! releases all inner buffers
-        void release();
-
-        int history;
-        float varThreshold;
-        float backgroundRatio;
-        float noiseSigma;
-
-    private:
-        /* hidden */
-    };
-
-The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [MOG2001]_.
-
-.. seealso:: :ocv:class:`BackgroundSubtractorMOG`
-
-
-ocl::MOG::MOG
----------------------
-The constructor.
-
-.. ocv:function:: ocl::MOG::MOG(int nmixtures = -1)
-
-    :param nmixtures: Number of Gaussian mixtures.
-
-Default constructor sets all parameters to default values.
-
-
-ocl::MOG::operator()
-------------------------
-Updates the background model and returns the foreground mask.
-
-.. ocv:function:: void ocl::MOG::operator()(const oclMat& frame, oclMat& fgmask, float learningRate = 0.f)
-
-    :param frame: Next video frame.
-
-    :param fgmask: The output foreground mask as an 8-bit binary image.
-
-
-ocl::MOG::getBackgroundImage
---------------------------------
-Computes a background image.
-
-.. ocv:function:: void ocl::MOG::getBackgroundImage(oclMat& backgroundImage) const
-
-    :param backgroundImage: The output background image.
-
-
-ocl::MOG::release
----------------------
-Releases all inner buffer's memory.
-
-.. ocv:function:: void ocl::MOG::release()
-
-
-ocl::MOG2
--------------
-.. ocv:class:: ocl::MOG2 : public ocl::BackgroundSubtractor
-
-  Gaussian Mixture-based Background/Foreground Segmentation Algorithm.
-
-  The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [MOG2004]_. ::
-
-    class CV_EXPORTS MOG2: public cv::ocl::BackgroundSubtractor
-    {
-    public:
-        //! the default constructor
-        MOG2(int nmixtures = -1);
-
-        //! re-initiaization method
-        void initialize(Size frameSize, int frameType);
-
-        //! the update operator
-        void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = -1.0f);
-
-        //! computes a background image which are the mean of all background gaussians
-        void getBackgroundImage(oclMat& backgroundImage) const;
-
-        //! releases all inner buffers
-        void release();
-
-        int history;
-
-        float varThreshold;
-
-        float backgroundRatio;
-
-        float varThresholdGen;
-
-        float fVarInit;
-        float fVarMin;
-        float fVarMax;
-
-        float fCT;
-
-        bool bShadowDetection;
-        unsigned char nShadowDetection;
-        float fTau;
-
-    private:
-        /* hidden */
-    };
-
-  .. ocv:member:: float backgroundRatio
-
-      Threshold defining whether the component is significant enough to be included into the background model. ``cf=0.1 => TB=0.9`` is default. For ``alpha=0.001``, it means that the mode should exist for approximately 105 frames before it is considered foreground.
-
-  .. ocv:member:: float varThreshold
-
-      Threshold for the squared Mahalanobis distance that helps decide when a sample is close to the existing components (corresponds to ``Tg``). If it is not close to any component, a new component is generated. ``3 sigma => Tg=3*3=9`` is default. A smaller ``Tg`` value generates more components. A higher ``Tg`` value may result in a small number of components but they can grow too large.
-
-  .. ocv:member:: float fVarInit
-
-      Initial variance for the newly generated components. It affects the speed of adaptation. The parameter value is based on your estimate of the typical standard deviation from the images. OpenCV uses 15 as a reasonable value.
-
-  .. ocv:member:: float fVarMin
-
-      Parameter used to further control the variance.
-
-  .. ocv:member:: float fVarMax
-
-      Parameter used to further control the variance.
-
-  .. ocv:member:: float fCT
-
-      Complexity reduction parameter. This parameter defines the number of samples needed to accept to prove the component exists. ``CT=0.05`` is a default value for all the samples. By setting ``CT=0`` you get an algorithm very similar to the standard Stauffer&Grimson algorithm.
-
-  .. ocv:member:: uchar nShadowDetection
-
-      The value for marking shadow pixels in the output foreground mask. Default value is 127.
-
-  .. ocv:member:: float fTau
-
-      Shadow threshold. The shadow is detected if the pixel is a darker version of the background. ``Tau`` is a threshold defining how much darker the shadow can be. ``Tau= 0.5`` means that if a pixel is more than twice darker then it is not shadow. See [ShadowDetect2003]_.
-
-  .. ocv:member:: bool bShadowDetection
-
-      Parameter defining whether shadow detection should be enabled.
-
-
-.. seealso:: :ocv:class:`BackgroundSubtractorMOG2`
-
-
-ocl::MOG2::MOG2
------------------------
-The constructor.
-
-.. ocv:function:: ocl::MOG2::MOG2(int nmixtures = -1)
-
-    :param nmixtures: Number of Gaussian mixtures.
-
-Default constructor sets all parameters to default values.
-
-
-ocl::MOG2::operator()
--------------------------
-Updates the background model and returns the foreground mask.
-
-.. ocv:function:: void ocl::MOG2::operator()( const oclMat& frame, oclMat& fgmask, float learningRate=-1.0f)
-
-    :param frame: Next video frame.
-
-    :param fgmask: The output foreground mask as an 8-bit binary image.
-
-
-ocl::MOG2::getBackgroundImage
----------------------------------
-Computes a background image.
-
-.. ocv:function:: void ocl::MOG2::getBackgroundImage(oclMat& backgroundImage) const
-
-    :param backgroundImage: The output background image.
-
-
-ocl::MOG2::release
-----------------------
-Releases all inner buffer's memory.
-
-.. ocv:function:: void ocl::MOG2::release()
-
-
-.. [ShadowDetect2003] Prati, Mikic, Trivedi and Cucchiarra. *Detecting Moving Shadows...*. IEEE PAMI, 2003
diff --git a/modules/ocl/include/opencv2/ocl.hpp b/modules/ocl/include/opencv2/ocl.hpp
deleted file mode 100644
index 542dbeb0b..000000000
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ /dev/null
@@ -1,2104 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-
-#ifndef __OPENCV_OCL_HPP__
-#define __OPENCV_OCL_HPP__
-
-#include <memory>
-#include <vector>
-
-#include "opencv2/core.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/objdetect.hpp"
-#include "opencv2/ml.hpp"
-
-namespace cv
-{
-    namespace ocl
-    {
-        enum DeviceType
-        {
-            CVCL_DEVICE_TYPE_DEFAULT     = (1 << 0),
-            CVCL_DEVICE_TYPE_CPU         = (1 << 1),
-            CVCL_DEVICE_TYPE_GPU         = (1 << 2),
-            CVCL_DEVICE_TYPE_ACCELERATOR = (1 << 3),
-            //CVCL_DEVICE_TYPE_CUSTOM      = (1 << 4)
-            CVCL_DEVICE_TYPE_ALL         = 0xFFFFFFFF
-        };
-
-        enum DevMemRW
-        {
-            DEVICE_MEM_R_W = 0,
-            DEVICE_MEM_R_ONLY,
-            DEVICE_MEM_W_ONLY
-        };
-
-        enum DevMemType
-        {
-            DEVICE_MEM_DEFAULT = 0,
-            DEVICE_MEM_AHP,         //alloc host pointer
-            DEVICE_MEM_UHP,         //use host pointer
-            DEVICE_MEM_CHP,         //copy host pointer
-            DEVICE_MEM_PM           //persistent memory
-        };
-
-        // these classes contain OpenCL runtime information
-
-        struct PlatformInfo;
-
-        struct DeviceInfo
-        {
-        public:
-            int _id; // reserved, don't use it
-
-            DeviceType deviceType;
-            std::string deviceProfile;
-            std::string deviceVersion;
-            std::string deviceName;
-            std::string deviceVendor;
-            int deviceVendorId;
-            std::string deviceDriverVersion;
-            std::string deviceExtensions;
-
-            size_t maxWorkGroupSize;
-            std::vector<size_t> maxWorkItemSizes;
-            int maxComputeUnits;
-            size_t localMemorySize;
-            size_t maxMemAllocSize;
-
-            int deviceVersionMajor;
-            int deviceVersionMinor;
-
-            bool haveDoubleSupport;
-            bool isUnifiedMemory; // 1 means integrated GPU, otherwise this value is 0
-            bool isIntelDevice;
-
-            std::string compilationExtraOptions;
-
-            const PlatformInfo* platform;
-
-            DeviceInfo();
-            ~DeviceInfo();
-        };
-
-        struct PlatformInfo
-        {
-            int _id; // reserved, don't use it
-
-            std::string platformProfile;
-            std::string platformVersion;
-            std::string platformName;
-            std::string platformVendor;
-            std::string platformExtensons;
-
-            int platformVersionMajor;
-            int platformVersionMinor;
-
-            std::vector<const DeviceInfo*> devices;
-
-            PlatformInfo();
-            ~PlatformInfo();
-        };
-
-        //////////////////////////////// Initialization & Info ////////////////////////
-        typedef std::vector<const PlatformInfo*> PlatformsInfo;
-
-        CV_EXPORTS int getOpenCLPlatforms(PlatformsInfo& platforms);
-
-        typedef std::vector<const DeviceInfo*> DevicesInfo;
-
-        CV_EXPORTS int getOpenCLDevices(DevicesInfo& devices, int deviceType = CVCL_DEVICE_TYPE_GPU,
-                const PlatformInfo* platform = NULL);
-
-        // set device you want to use
-        CV_EXPORTS void setDevice(const DeviceInfo* info);
-
-        // Initialize from OpenCL handles directly.
-        // Argument types is (pointers): cl_platform_id*, cl_context*, cl_device_id*
-        CV_EXPORTS void initializeContext(void* pClPlatform, void* pClContext, void* pClDevice);
-
-        enum FEATURE_TYPE
-        {
-            FEATURE_CL_DOUBLE = 1,
-            FEATURE_CL_UNIFIED_MEM,
-            FEATURE_CL_VER_1_2,
-            FEATURE_CL_INTEL_DEVICE
-        };
-
-        // Represents OpenCL context, interface
-        class CV_EXPORTS Context
-        {
-        protected:
-            Context() { }
-            ~Context() { }
-        public:
-            static Context *getContext();
-
-            bool supportsFeature(FEATURE_TYPE featureType) const;
-            const DeviceInfo& getDeviceInfo() const;
-
-            const void* getOpenCLContextPtr() const;
-            const void* getOpenCLCommandQueuePtr() const;
-            const void* getOpenCLDeviceIDPtr() const;
-        };
-
-        inline const void *getClContextPtr()
-        {
-            return Context::getContext()->getOpenCLContextPtr();
-        }
-
-        inline const void *getClCommandQueuePtr()
-        {
-            return Context::getContext()->getOpenCLCommandQueuePtr();
-        }
-
-        CV_EXPORTS bool supportsFeature(FEATURE_TYPE featureType);
-
-        CV_EXPORTS void finish();
-
-        enum BINARY_CACHE_MODE
-        {
-            CACHE_NONE    = 0,        // do not cache OpenCL binary
-            CACHE_DEBUG   = 0x1 << 0, // cache OpenCL binary when built in debug mode
-            CACHE_RELEASE = 0x1 << 1, // default behavior, only cache when built in release mode
-            CACHE_ALL     = CACHE_DEBUG | CACHE_RELEASE, // cache opencl binary
-        };
-        //! Enable or disable OpenCL program binary caching onto local disk
-        // After a program (*.cl files in opencl/ folder) is built at runtime, we allow the
-        // compiled OpenCL program to be cached to the path automatically as "path/*.clb"
-        // binary file, which will be reused when the OpenCV executable is started again.
-        //
-        // This feature is enabled by default.
-        CV_EXPORTS void setBinaryDiskCache(int mode = CACHE_RELEASE, cv::String path = "./");
-
-        //! set where binary cache to be saved to
-        CV_EXPORTS void setBinaryPath(const char *path);
-
-        struct ProgramSource
-        {
-            const char* name;
-            const char* programStr;
-            const char* programHash;
-
-            // Cache in memory by name (should be unique). Caching on disk disabled.
-            inline ProgramSource(const char* _name, const char* _programStr)
-                : name(_name), programStr(_programStr), programHash(NULL)
-            {
-            }
-
-            // Cache in memory by name (should be unique). Caching on disk uses programHash mark.
-            inline ProgramSource(const char* _name, const char* _programStr, const char* _programHash)
-                : name(_name), programStr(_programStr), programHash(_programHash)
-            {
-            }
-        };
-
-        //! Calls OpenCL kernel. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing.
-        //! Deprecated, will be replaced
-        CV_EXPORTS void openCLExecuteKernelInterop(Context *clCxt,
-                const cv::ocl::ProgramSource& source, String kernelName,
-                size_t globalThreads[3], size_t localThreads[3],
-                std::vector< std::pair<size_t, const void *> > &args,
-                int channels, int depth, const char *build_options);
-
-        class CV_EXPORTS oclMatExpr;
-        //////////////////////////////// oclMat ////////////////////////////////
-        class CV_EXPORTS oclMat
-        {
-        public:
-            //! default constructor
-            oclMat();
-            //! constructs oclMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
-            oclMat(int rows, int cols, int type);
-            oclMat(Size size, int type);
-            //! constucts oclMatrix and fills it with the specified value _s.
-            oclMat(int rows, int cols, int type, const Scalar &s);
-            oclMat(Size size, int type, const Scalar &s);
-            //! copy constructor
-            oclMat(const oclMat &m);
-
-            //! constructor for oclMatrix headers pointing to user-allocated data
-            oclMat(int rows, int cols, int type, void *data, size_t step = Mat::AUTO_STEP);
-            oclMat(Size size, int type, void *data, size_t step = Mat::AUTO_STEP);
-
-            //! creates a matrix header for a part of the bigger matrix
-            oclMat(const oclMat &m, const Range &rowRange, const Range &colRange);
-            oclMat(const oclMat &m, const Rect &roi);
-
-            //! builds oclMat from Mat. Perfom blocking upload to device.
-            explicit oclMat (const Mat &m);
-
-            //! destructor - calls release()
-            ~oclMat();
-
-            //! assignment operators
-            oclMat &operator = (const oclMat &m);
-            //! assignment operator. Perfom blocking upload to device.
-            oclMat &operator = (const Mat &m);
-            oclMat &operator = (const oclMatExpr& expr);
-
-            //! pefroms blocking upload data to oclMat.
-            void upload(const cv::Mat &m);
-
-
-            //! downloads data from device to host memory. Blocking calls.
-            operator Mat() const;
-            void download(cv::Mat &m) const;
-
-            //! convert to _InputArray
-            operator _InputArray();
-
-            //! convert to _OutputArray
-            operator _OutputArray();
-
-            //! returns a new oclMatrix header for the specified row
-            oclMat row(int y) const;
-            //! returns a new oclMatrix header for the specified column
-            oclMat col(int x) const;
-            //! ... for the specified row span
-            oclMat rowRange(int startrow, int endrow) const;
-            oclMat rowRange(const Range &r) const;
-            //! ... for the specified column span
-            oclMat colRange(int startcol, int endcol) const;
-            oclMat colRange(const Range &r) const;
-
-            //! returns deep copy of the oclMatrix, i.e. the data is copied
-            oclMat clone() const;
-
-            //! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
-            // It calls m.create(this->size(), this->type()).
-            // It supports any data type
-            void copyTo( oclMat &m, const oclMat &mask = oclMat()) const;
-
-            //! converts oclMatrix to another datatype with optional scalng. See cvConvertScale.
-            void convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const;
-
-            void assignTo( oclMat &m, int type = -1 ) const;
-
-            //! sets every oclMatrix element to s
-            oclMat& operator = (const Scalar &s);
-            //! sets some of the oclMatrix elements to s, according to the mask
-            oclMat& setTo(const Scalar &s, const oclMat &mask = oclMat());
-            //! creates alternative oclMatrix header for the same data, with different
-            // number of channels and/or different number of rows. see cvReshape.
-            oclMat reshape(int cn, int rows = 0) const;
-
-            //! allocates new oclMatrix data unless the oclMatrix already has specified size and type.
-            // previous data is unreferenced if needed.
-            void create(int rows, int cols, int type);
-            void create(Size size, int type);
-
-            //! allocates new oclMatrix with specified device memory type.
-            void createEx(int rows, int cols, int type,
-                          DevMemRW rw_type, DevMemType mem_type);
-            void createEx(Size size, int type, DevMemRW rw_type,
-                          DevMemType mem_type);
-
-            //! decreases reference counter;
-            // deallocate the data when reference counter reaches 0.
-            void release();
-
-            //! swaps with other smart pointer
-            void swap(oclMat &mat);
-
-            //! locates oclMatrix header within a parent oclMatrix. See below
-            void locateROI( Size &wholeSize, Point &ofs ) const;
-            //! moves/resizes the current oclMatrix ROI inside the parent oclMatrix.
-            oclMat& adjustROI( int dtop, int dbottom, int dleft, int dright );
-            //! extracts a rectangular sub-oclMatrix
-            // (this is a generalized form of row, rowRange etc.)
-            oclMat operator()( Range rowRange, Range colRange ) const;
-            oclMat operator()( const Rect &roi ) const;
-
-            oclMat& operator+=( const oclMat& m );
-            oclMat& operator-=( const oclMat& m );
-            oclMat& operator*=( const oclMat& m );
-            oclMat& operator/=( const oclMat& m );
-
-            //! returns true if the oclMatrix data is continuous
-            // (i.e. when there are no gaps between successive rows).
-            // similar to CV_IS_oclMat_CONT(cvoclMat->type)
-            bool isContinuous() const;
-            //! returns element size in bytes,
-            // similar to CV_ELEM_SIZE(cvMat->type)
-            size_t elemSize() const;
-            //! returns the size of element channel in bytes.
-            size_t elemSize1() const;
-            //! returns element type, similar to CV_MAT_TYPE(cvMat->type)
-            int type() const;
-            //! returns element type, i.e. 8UC3 returns 8UC4 because in ocl
-            //! 3 channels element actually use 4 channel space
-            int ocltype() const;
-            //! returns element type, similar to CV_MAT_DEPTH(cvMat->type)
-            int depth() const;
-            //! returns element type, similar to CV_MAT_CN(cvMat->type)
-            int channels() const;
-            //! returns element type, return 4 for 3 channels element,
-            //!becuase 3 channels element actually use 4 channel space
-            int oclchannels() const;
-            //! returns step/elemSize1()
-            size_t step1() const;
-            //! returns oclMatrix size:
-            // width == number of columns, height == number of rows
-            Size size() const;
-            //! returns true if oclMatrix data is NULL
-            bool empty() const;
-
-            //! matrix transposition
-            oclMat t() const;
-
-            /*! includes several bit-fields:
-              - the magic signature
-              - continuity flag
-              - depth
-              - number of channels
-              */
-            int flags;
-            //! the number of rows and columns
-            int rows, cols;
-            //! a distance between successive rows in bytes; includes the gap if any
-            size_t step;
-            //! pointer to the data(OCL memory object)
-            uchar *data;
-
-            //! pointer to the reference counter;
-            // when oclMatrix points to user-allocated data, the pointer is NULL
-            int *refcount;
-
-            //! helper fields used in locateROI and adjustROI
-            //datastart and dataend are not used in current version
-            uchar *datastart;
-            uchar *dataend;
-
-            //! OpenCL context associated with the oclMat object.
-            Context *clCxt; // TODO clCtx
-            //add offset for handle ROI, calculated in byte
-            int offset;
-            //add wholerows and wholecols for the whole matrix, datastart and dataend are no longer used
-            int wholerows;
-            int wholecols;
-        };
-
-        // convert InputArray/OutputArray to oclMat references
-        CV_EXPORTS oclMat& getOclMatRef(InputArray src);
-        CV_EXPORTS oclMat& getOclMatRef(OutputArray src);
-
-        ///////////////////// mat split and merge /////////////////////////////////
-        //! Compose a multi-channel array from several single-channel arrays
-        // Support all types
-        CV_EXPORTS void merge(const oclMat *src, size_t n, oclMat &dst);
-        CV_EXPORTS void merge(const std::vector<oclMat> &src, oclMat &dst);
-
-        //! Divides multi-channel array into several single-channel arrays
-        // Support all types
-        CV_EXPORTS void split(const oclMat &src, oclMat *dst);
-        CV_EXPORTS void split(const oclMat &src, std::vector<oclMat> &dst);
-
-        ////////////////////////////// Arithmetics ///////////////////////////////////
-
-        //! adds one matrix to another with scale (dst = src1 * alpha + src2 * beta + gama)
-        // supports all data types
-        CV_EXPORTS void addWeighted(const oclMat &src1, double  alpha, const oclMat &src2, double beta, double gama, oclMat &dst);
-
-        //! adds one matrix to another (dst = src1 + src2)
-        // supports all data types
-        CV_EXPORTS void add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
-        //! adds scalar to a matrix (dst = src1 + s)
-        // supports all data types
-        CV_EXPORTS void add(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
-
-        //! subtracts one matrix from another (dst = src1 - src2)
-        // supports all data types
-        CV_EXPORTS void subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
-        //! subtracts scalar from a matrix (dst = src1 - s)
-        // supports all data types
-        CV_EXPORTS void subtract(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
-
-        //! computes element-wise product of the two arrays (dst = src1 * scale * src2)
-        // supports all data types
-        CV_EXPORTS void multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
-        //! multiplies matrix to a number (dst = scalar * src)
-        // supports all data types
-        CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
-
-        //! computes element-wise quotient of the two arrays (dst = src1 * scale / src2)
-        // supports all data types
-        CV_EXPORTS void divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
-        //! computes element-wise quotient of the two arrays (dst = scale / src)
-        // supports all data types
-        CV_EXPORTS void divide(double scale, const oclMat &src1, oclMat &dst);
-
-        //! computes element-wise minimum of the two arrays (dst = min(src1, src2))
-        // supports all data types
-        CV_EXPORTS void min(const oclMat &src1, const oclMat &src2, oclMat &dst);
-
-        //! computes element-wise maximum of the two arrays (dst = max(src1, src2))
-        // supports all data types
-        CV_EXPORTS void max(const oclMat &src1, const oclMat &src2, oclMat &dst);
-
-        //! compares elements of two arrays (dst = src1 <cmpop> src2)
-        // supports all data types
-        CV_EXPORTS void compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop);
-
-        //! transposes the matrix
-        // supports all data types
-        CV_EXPORTS void transpose(const oclMat &src, oclMat &dst);
-
-        //! computes element-wise absolute values of an array (dst = abs(src))
-        // supports all data types
-        CV_EXPORTS void abs(const oclMat &src, oclMat &dst);
-
-        //! computes element-wise absolute difference of two arrays (dst = abs(src1 - src2))
-        // supports all data types
-        CV_EXPORTS void absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst);
-        //! computes element-wise absolute difference of array and scalar (dst = abs(src1 - s))
-        // supports all data types
-        CV_EXPORTS void absdiff(const oclMat &src1, const Scalar &s, oclMat &dst);
-
-        //! computes mean value and standard deviation of all or selected array elements
-        // supports all data types
-        CV_EXPORTS void meanStdDev(const oclMat &mtx, Scalar &mean, Scalar &stddev);
-
-        //! computes norm of array
-        // supports NORM_INF, NORM_L1, NORM_L2
-        // supports all data types
-        CV_EXPORTS double norm(const oclMat &src1, int normType = NORM_L2);
-
-        //! computes norm of the difference between two arrays
-        // supports NORM_INF, NORM_L1, NORM_L2
-        // supports all data types
-        CV_EXPORTS double norm(const oclMat &src1, const oclMat &src2, int normType = NORM_L2);
-
-        //! reverses the order of the rows, columns or both in a matrix
-        // supports all types
-        CV_EXPORTS void flip(const oclMat &src, oclMat &dst, int flipCode);
-
-        //! computes sum of array elements
-        // support all types
-        CV_EXPORTS Scalar sum(const oclMat &m);
-        CV_EXPORTS Scalar absSum(const oclMat &m);
-        CV_EXPORTS Scalar sqrSum(const oclMat &m);
-
-        //! finds global minimum and maximum array elements and returns their values
-        // support all C1 types
-        CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
-
-        //! finds global minimum and maximum array elements and returns their values with locations
-        // support all C1 types
-        CV_EXPORTS void minMaxLoc(const oclMat &src, double *minVal, double *maxVal = 0, Point *minLoc = 0, Point *maxLoc = 0,
-                                  const oclMat &mask = oclMat());
-
-        //! counts non-zero array elements
-        // support all types
-        CV_EXPORTS int countNonZero(const oclMat &src);
-
-        //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
-        // destination array will have the depth type as lut and the same channels number as source
-        //It supports 8UC1 8UC4 only
-        CV_EXPORTS void LUT(const oclMat &src, const oclMat &lut, oclMat &dst);
-
-        //! only 8UC1 and 256 bins is supported now
-        CV_EXPORTS void calcHist(const oclMat &mat_src, oclMat &mat_hist);
-        //! only 8UC1 and 256 bins is supported now
-        CV_EXPORTS void equalizeHist(const oclMat &mat_src, oclMat &mat_dst);
-
-        //! only 8UC1 is supported now
-        CV_EXPORTS Ptr<cv::CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8));
-
-        //! bilateralFilter
-        // supports 8UC1 8UC4
-        CV_EXPORTS void bilateralFilter(const oclMat& src, oclMat& dst, int d, double sigmaColor, double sigmaSpace, int borderType=BORDER_DEFAULT);
-
-        //! Applies an adaptive bilateral filter to the input image
-        //  Unlike the usual bilateral filter that uses fixed value for sigmaColor,
-        //  the adaptive version calculates the local variance in he ksize neighborhood
-        //  and use this as sigmaColor, for the value filtering. However, the local standard deviation is
-        //  clamped to the maxSigmaColor.
-        //  supports 8UC1, 8UC3
-        CV_EXPORTS void adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, double maxSigmaColor=20.0, Point anchor = Point(-1, -1), int borderType=BORDER_DEFAULT);
-
-        //! computes exponent of each matrix element (dst = e**src)
-        // supports only CV_32FC1, CV_64FC1 type
-        CV_EXPORTS void exp(const oclMat &src, oclMat &dst);
-
-        //! computes natural logarithm of absolute value of each matrix element: dst = log(abs(src))
-        // supports only CV_32FC1, CV_64FC1 type
-        CV_EXPORTS void log(const oclMat &src, oclMat &dst);
-
-        //! computes square root of each matrix element
-        // supports only CV_32FC1, CV_64FC1 type
-        CV_EXPORTS void sqrt(const oclMat &src, oclMat &dst);
-
-        //! computes magnitude of each (x(i), y(i)) vector
-        // supports only CV_32F, CV_64F type
-        CV_EXPORTS void magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude);
-
-        //! computes angle (angle(i)) of each (x(i), y(i)) vector
-        // supports only CV_32F, CV_64F type
-        CV_EXPORTS void phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false);
-
-        //! the function raises every element of tne input array to p
-        // support only CV_32F, CV_64F type
-        CV_EXPORTS void pow(const oclMat &x, double p, oclMat &y);
-
-        //! converts Cartesian coordinates to polar
-        // supports only CV_32F CV_64F type
-        CV_EXPORTS void cartToPolar(const oclMat &x, const oclMat &y, oclMat &magnitude, oclMat &angle, bool angleInDegrees = false);
-
-        //! converts polar coordinates to Cartesian
-        // supports only CV_32F CV_64F type
-        CV_EXPORTS void polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees = false);
-
-        //! perfroms per-elements bit-wise inversion
-        // supports all types
-        CV_EXPORTS void bitwise_not(const oclMat &src, oclMat &dst);
-
-        //! calculates per-element bit-wise disjunction of two arrays
-        // supports all types
-        CV_EXPORTS void bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
-        CV_EXPORTS void bitwise_or(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
-
-        //! calculates per-element bit-wise conjunction of two arrays
-        // supports all types
-        CV_EXPORTS void bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
-        CV_EXPORTS void bitwise_and(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
-
-        //! calculates per-element bit-wise "exclusive or" operation
-        // supports all types
-        CV_EXPORTS void bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
-        CV_EXPORTS void bitwise_xor(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
-
-        //! Logical operators
-        CV_EXPORTS oclMat operator ~ (const oclMat &);
-        CV_EXPORTS oclMat operator | (const oclMat &, const oclMat &);
-        CV_EXPORTS oclMat operator & (const oclMat &, const oclMat &);
-        CV_EXPORTS oclMat operator ^ (const oclMat &, const oclMat &);
-
-
-        //! Mathematics operators
-        CV_EXPORTS oclMatExpr operator + (const oclMat &src1, const oclMat &src2);
-        CV_EXPORTS oclMatExpr operator - (const oclMat &src1, const oclMat &src2);
-        CV_EXPORTS oclMatExpr operator * (const oclMat &src1, const oclMat &src2);
-        CV_EXPORTS oclMatExpr operator / (const oclMat &src1, const oclMat &src2);
-
-        struct CV_EXPORTS ConvolveBuf
-        {
-            Size result_size;
-            Size block_size;
-            Size user_block_size;
-            Size dft_size;
-
-            oclMat image_spect, templ_spect, result_spect;
-            oclMat image_block, templ_block, result_data;
-
-            void create(Size image_size, Size templ_size);
-            static Size estimateBlockSize(Size result_size, Size templ_size);
-        };
-
-        //! computes convolution of two images, may use discrete Fourier transform
-        // support only CV_32FC1 type
-        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr = false);
-        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr, ConvolveBuf& buf);
-
-        //! Performs a per-element multiplication of two Fourier spectrums.
-        //! Only full (not packed) CV_32FC2 complex spectrums in the interleaved format are supported for now.
-        //! support only CV_32FC2 type
-        CV_EXPORTS void mulSpectrums(const oclMat &a, const oclMat &b, oclMat &c, int flags, float scale, bool conjB = false);
-
-        CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code, int dcn = 0);
-
-        //! initializes a scaled identity matrix
-        CV_EXPORTS void setIdentity(oclMat& src, const Scalar & val = Scalar(1));
-
-        //! fills the output array with repeated copies of the input array
-        CV_EXPORTS void repeat(const oclMat & src, int ny, int nx, oclMat & dst);
-
-        //////////////////////////////// Filter Engine ////////////////////////////////
-
-        /*!
-          The Base Class for 1D or Row-wise Filters
-
-          This is the base class for linear or non-linear filters that process 1D data.
-          In particular, such filters are used for the "horizontal" filtering parts in separable filters.
-          */
-        class CV_EXPORTS BaseRowFilter_GPU
-        {
-        public:
-            BaseRowFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {}
-            virtual ~BaseRowFilter_GPU() {}
-            virtual void operator()(const oclMat &src, oclMat &dst) = 0;
-            int ksize, anchor, bordertype;
-        };
-
-        /*!
-          The Base Class for Column-wise Filters
-
-          This is the base class for linear or non-linear filters that process columns of 2D arrays.
-          Such filters are used for the "vertical" filtering parts in separable filters.
-          */
-        class CV_EXPORTS BaseColumnFilter_GPU
-        {
-        public:
-            BaseColumnFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {}
-            virtual ~BaseColumnFilter_GPU() {}
-            virtual void operator()(const oclMat &src, oclMat &dst) = 0;
-            int ksize, anchor, bordertype;
-        };
-
-        /*!
-          The Base Class for Non-Separable 2D Filters.
-
-          This is the base class for linear or non-linear 2D filters.
-          */
-        class CV_EXPORTS BaseFilter_GPU
-        {
-        public:
-            BaseFilter_GPU(const Size &ksize_, const Point &anchor_, const int &borderType_)
-                : ksize(ksize_), anchor(anchor_), borderType(borderType_) {}
-            virtual ~BaseFilter_GPU() {}
-            virtual void operator()(const oclMat &src, oclMat &dst) = 0;
-            Size ksize;
-            Point anchor;
-            int borderType;
-        };
-
-        /*!
-          The Base Class for Filter Engine.
-
-          The class can be used to apply an arbitrary filtering operation to an image.
-          It contains all the necessary intermediate buffers.
-          */
-        class CV_EXPORTS FilterEngine_GPU
-        {
-        public:
-            virtual ~FilterEngine_GPU() {}
-
-            virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) = 0;
-        };
-
-        //! returns the non-separable filter engine with the specified filter
-        CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D);
-
-        //! returns the primitive row filter with the specified kernel
-        CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat &rowKernel,
-                int anchor = -1, int bordertype = BORDER_DEFAULT);
-
-        //! returns the primitive column filter with the specified kernel
-        CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat &columnKernel,
-                int anchor = -1, int bordertype = BORDER_DEFAULT, double delta = 0.0);
-
-        //! returns the separable linear filter engine
-        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel,
-                const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
-
-        //! returns the separable filter engine with the specified filters
-        CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter,
-                const Ptr<BaseColumnFilter_GPU> &columnFilter);
-
-        //! returns the Gaussian filter engine
-        CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
-
-        //! returns filter engine for the generalized Sobel operator
-        CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT );
-
-        //! applies Laplacian operator to the image
-        // supports only ksize = 1 and ksize = 3
-        CV_EXPORTS void Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1,
-                double delta=0, int borderType=BORDER_DEFAULT);
-
-        //! returns 2D box filter
-        // dst type must be the same as source type
-        CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType,
-                const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
-
-        //! returns box filter engine
-        CV_EXPORTS Ptr<FilterEngine_GPU> createBoxFilter_GPU(int srcType, int dstType, const Size &ksize,
-                const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
-
-        //! returns 2D filter with the specified kernel
-        // supports: dst type must be the same as source type
-        CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
-                const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
-
-        //! returns the non-separable linear filter engine
-        // supports: dst type must be the same as source type
-        CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel,
-                const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
-
-        //! smooths the image using the normalized box filter
-        CV_EXPORTS void boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize,
-                                  Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
-
-        //! returns 2D morphological filter
-        //! only MORPH_ERODE and MORPH_DILATE are supported
-        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
-        // kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-        CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize,
-                Point anchor = Point(-1, -1));
-
-        //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
-        CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat &kernel,
-                const Point &anchor = Point(-1, -1), int iterations = 1);
-
-        //! a synonym for normalized box filter
-        static inline void blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1),
-                                int borderType = BORDER_CONSTANT)
-        {
-            boxFilter(src, dst, -1, ksize, anchor, borderType);
-        }
-
-        //! applies non-separable 2D linear filter to the image
-        CV_EXPORTS void filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel,
-                                 Point anchor = Point(-1, -1), double delta = 0.0, int borderType = BORDER_DEFAULT);
-
-        //! applies separable 2D linear filter to the image
-        CV_EXPORTS void sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY,
-                                    Point anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
-
-        //! applies generalized Sobel operator to the image
-        // dst.type must equalize src.type
-        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
-        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
-        CV_EXPORTS void Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT);
-
-        //! applies the vertical or horizontal Scharr operator to the image
-        // dst.type must equalize src.type
-        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
-        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
-        CV_EXPORTS void Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT);
-
-        //! smooths the image using Gaussian filter.
-        // dst.type must equalize src.type
-        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
-        // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
-        CV_EXPORTS void GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
-
-        //! erodes the image (applies the local minimum operator)
-        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
-        CV_EXPORTS void erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1,
-
-                               int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue());
-
-
-        //! dilates the image (applies the local maximum operator)
-        // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
-        CV_EXPORTS void dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1,
-
-                                int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue());
-
-
-        //! applies an advanced morphological operation to the image
-        CV_EXPORTS void morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1,
-
-                                      int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue());
-
-
-        ////////////////////////////// Image processing //////////////////////////////
-        //! Does mean shift filtering on GPU.
-        CV_EXPORTS void meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr,
-                                           TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
-
-        //! Does mean shift procedure on GPU.
-        CV_EXPORTS void meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr,
-                                      TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
-
-        //! Does mean shift segmentation with elimiation of small regions.
-        CV_EXPORTS void meanShiftSegmentation(const oclMat &src, Mat &dst, int sp, int sr, int minsize,
-                                              TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1));
-
-        //! applies fixed threshold to the image.
-        // supports CV_8UC1 and CV_32FC1 data type
-        // supports threshold type: THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV
-        CV_EXPORTS double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type = THRESH_TRUNC);
-
-        //! resizes the image
-        // Supports INTER_NEAREST, INTER_LINEAR
-        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
-        CV_EXPORTS void resize(const oclMat &src, oclMat &dst, Size dsize, double fx = 0, double fy = 0, int interpolation = INTER_LINEAR);
-
-        //! Applies a generic geometrical transformation to an image.
-
-        // Supports INTER_NEAREST, INTER_LINEAR.
-        // Map1 supports CV_16SC2, CV_32FC2  types.
-        // Src supports CV_8UC1, CV_8UC2, CV_8UC4.
-        CV_EXPORTS void remap(const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int bordertype, const Scalar &value = Scalar());
-
-        //! copies 2D array to a larger destination array and pads borders with user-specifiable constant
-        // supports CV_8UC1, CV_8UC4, CV_32SC1 types
-        CV_EXPORTS void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int boardtype, const Scalar &value = Scalar());
-
-        //! Smoothes image using median filter
-        // The source 1- or 4-channel image. m should be 3 or 5, the image depth should be CV_8U or CV_32F.
-        CV_EXPORTS void medianFilter(const oclMat &src, oclMat &dst, int m);
-
-        //! warps the image using affine transformation
-        // Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
-        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
-        CV_EXPORTS void warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR);
-
-        //! warps the image using perspective transformation
-        // Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
-        // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types
-        CV_EXPORTS void warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR);
-
-        //! computes the integral image and integral for the squared image
-        // sum will support CV_32S, CV_32F, sqsum - support CV32F, CV_64F
-        // supports only CV_8UC1 source type
-        CV_EXPORTS void integral(const oclMat &src, oclMat &sum, oclMat &sqsum, int sdepth=-1 );
-        CV_EXPORTS void integral(const oclMat &src, oclMat &sum, int sdepth=-1 );
-        CV_EXPORTS void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
-        CV_EXPORTS void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
-            int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT);
-        CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
-        CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy,
-            int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT);
-
-
-        /////////////////////////////////// ML ///////////////////////////////////////////
-
-        //! Compute closest centers for each lines in source and lable it after center's index
-        // supports CV_32FC1/CV_32FC2/CV_32FC4 data type
-        // supports NORM_L1 and NORM_L2 distType
-        // if indices is provided, only the indexed rows will be calculated and their results are in the same
-        // order of indices
-        CV_EXPORTS void distanceToCenters(const oclMat &src, const oclMat &centers, Mat &dists, Mat &labels, int distType = NORM_L2SQR);
-
-        //!Does k-means procedure on GPU
-        // supports CV_32FC1/CV_32FC2/CV_32FC4 data type
-        CV_EXPORTS double kmeans(const oclMat &src, int K, oclMat &bestLabels,
-                                     TermCriteria criteria, int attemps, int flags, oclMat &centers);
-
-
-        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-        ///////////////////////////////////////////CascadeClassifier//////////////////////////////////////////////////////////////////
-        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-        class CV_EXPORTS OclCascadeClassifier : public  cv::CascadeClassifier
-        {
-        public:
-            void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces,
-                double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
-                Size minSize = Size(), Size maxSize = Size());
-        };
-
-        /////////////////////////////// Pyramid /////////////////////////////////////
-        CV_EXPORTS void pyrDown(const oclMat &src, oclMat &dst);
-
-        //! upsamples the source image and then smoothes it
-        CV_EXPORTS void pyrUp(const oclMat &src, oclMat &dst);
-
-        //! performs linear blending of two images
-        //! to avoid accuracy errors sum of weigths shouldn't be very close to zero
-        // supports only CV_8UC1 source type
-        CV_EXPORTS void blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &weights1, const oclMat &weights2, oclMat &result);
-
-        //! computes vertical sum, supports only CV_32FC1 images
-        CV_EXPORTS void columnSum(const oclMat &src, oclMat &sum);
-
-        ///////////////////////////////////////// match_template /////////////////////////////////////////////////////////////
-        struct CV_EXPORTS MatchTemplateBuf
-        {
-            Size user_block_size;
-            oclMat imagef, templf;
-            std::vector<oclMat> images;
-            std::vector<oclMat> image_sums;
-            std::vector<oclMat> image_sqsums;
-        };
-
-        //! computes the proximity map for the raster template and the image where the template is searched for
-        // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
-        // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
-        CV_EXPORTS void matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method);
-
-        //! computes the proximity map for the raster template and the image where the template is searched for
-        // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
-        // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
-        CV_EXPORTS void matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf);
-
-
-
-        ///////////////////////////////////////////// Canny /////////////////////////////////////////////
-        struct CV_EXPORTS CannyBuf;
-
-        //! compute edges of the input image using Canny operator
-        // Support CV_8UC1 only
-        CV_EXPORTS void Canny(const oclMat &image, oclMat &edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-        CV_EXPORTS void Canny(const oclMat &image, CannyBuf &buf, oclMat &edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
-        CV_EXPORTS void Canny(const oclMat &dx, const oclMat &dy, oclMat &edges, double low_thresh, double high_thresh, bool L2gradient = false);
-        CV_EXPORTS void Canny(const oclMat &dx, const oclMat &dy, CannyBuf &buf, oclMat &edges, double low_thresh, double high_thresh, bool L2gradient = false);
-
-        struct CV_EXPORTS CannyBuf
-        {
-            CannyBuf() : counter(1, 1, CV_32S) { }
-            ~CannyBuf()
-            {
-                release();
-            }
-            explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(1, 1, CV_32S)
-            {
-                create(image_size, apperture_size);
-            }
-            CannyBuf(const oclMat &dx_, const oclMat &dy_);
-            void create(const Size &image_size, int apperture_size = 3);
-            void release();
-
-            oclMat dx, dy;
-            oclMat dx_buf, dy_buf;
-            oclMat magBuf, mapBuf;
-            oclMat trackBuf1, trackBuf2;
-            oclMat counter;
-            Ptr<FilterEngine_GPU> filterDX, filterDY;
-        };
-
-        ///////////////////////////////////////// Hough Transform /////////////////////////////////////////
-        //! HoughCircles
-        struct HoughCirclesBuf
-        {
-            oclMat edges;
-            oclMat accum;
-            oclMat srcPoints;
-            oclMat centers;
-            CannyBuf cannyBuf;
-        };
-
-        CV_EXPORTS void HoughCircles(const oclMat& src, oclMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
-        CV_EXPORTS void HoughCircles(const oclMat& src, oclMat& circles, HoughCirclesBuf& buf, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096);
-        CV_EXPORTS void HoughCirclesDownload(const oclMat& d_circles, OutputArray h_circles);
-
-
-        ///////////////////////////////////////// clAmdFft related /////////////////////////////////////////
-        //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
-        //! Param dft_size is the size of DFT transform.
-        //!
-        //! For complex-to-real transform it is assumed that the source matrix is packed in CLFFT's format.
-        // support src type of CV32FC1, CV32FC2
-        // support flags: DFT_INVERSE, DFT_REAL_OUTPUT, DFT_COMPLEX_OUTPUT, DFT_ROWS
-        // dft_size is the size of original input, which is used for transformation from complex to real.
-        // dft_size must be powers of 2, 3 and 5
-        // real to complex dft requires at least v1.8 clAmdFft
-        // real to complex dft output is not the same with cpu version
-        // real to complex and complex to real does not support DFT_ROWS
-        CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(), int flags = 0);
-
-        //! implements generalized matrix product algorithm GEMM from BLAS
-        // The functionality requires clAmdBlas library
-        // only support type CV_32FC1
-        // flag GEMM_3_T is not supported
-        CV_EXPORTS void gemm(const oclMat &src1, const oclMat &src2, double alpha,
-                             const oclMat &src3, double beta, oclMat &dst, int flags = 0);
-
-        //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
-
-        struct CV_EXPORTS HOGDescriptor
-
-        {
-
-            enum { DEFAULT_WIN_SIGMA = -1 };
-
-            enum { DEFAULT_NLEVELS = 64 };
-
-            enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
-
-
-
-            HOGDescriptor(Size win_size = Size(64, 128), Size block_size = Size(16, 16),
-
-                          Size block_stride = Size(8, 8), Size cell_size = Size(8, 8),
-
-                          int nbins = 9, double win_sigma = DEFAULT_WIN_SIGMA,
-
-                          double threshold_L2hys = 0.2, bool gamma_correction = true,
-
-                          int nlevels = DEFAULT_NLEVELS);
-
-
-
-            size_t getDescriptorSize() const;
-
-            size_t getBlockHistogramSize() const;
-
-
-
-            void setSVMDetector(const std::vector<float> &detector);
-
-
-
-            static std::vector<float> getDefaultPeopleDetector();
-
-            static std::vector<float> getPeopleDetector48x96();
-
-            static std::vector<float> getPeopleDetector64x128();
-
-
-
-            void detect(const oclMat &img, std::vector<Point> &found_locations,
-
-                        double hit_threshold = 0, Size win_stride = Size(),
-
-                        Size padding = Size());
-
-
-
-            void detectMultiScale(const oclMat &img, std::vector<Rect> &found_locations,
-
-                                  double hit_threshold = 0, Size win_stride = Size(),
-
-                                  Size padding = Size(), double scale0 = 1.05,
-
-                                  int group_threshold = 2);
-
-
-
-            void getDescriptors(const oclMat &img, Size win_stride,
-
-                                oclMat &descriptors,
-
-                                int descr_format = DESCR_FORMAT_COL_BY_COL);
-
-
-
-            Size win_size;
-
-            Size block_size;
-
-            Size block_stride;
-
-            Size cell_size;
-
-            int nbins;
-
-            double win_sigma;
-
-            double threshold_L2hys;
-
-            bool gamma_correction;
-
-            int nlevels;
-
-
-
-        protected:
-
-            // initialize buffers; only need to do once in case of multiscale detection
-
-            void init_buffer(const oclMat &img, Size win_stride);
-
-
-
-            void computeBlockHistograms(const oclMat &img);
-
-            void computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle);
-
-
-
-            double getWinSigma() const;
-
-            bool checkDetectorSize() const;
-
-
-
-            static int numPartsWithin(int size, int part_size, int stride);
-
-            static Size numPartsWithin(Size size, Size part_size, Size stride);
-
-
-
-            // Coefficients of the separating plane
-
-            float free_coef;
-
-            oclMat detector;
-
-
-
-            // Results of the last classification step
-
-            oclMat labels;
-
-            Mat labels_host;
-
-
-
-            // Results of the last histogram evaluation step
-
-            oclMat block_hists;
-
-
-
-            // Gradients conputation results
-
-            oclMat grad, qangle;
-
-
-
-            // scaled image
-
-            oclMat image_scale;
-
-
-
-            // effect size of input image (might be different from original size after scaling)
-
-            Size effect_size;
-
-        };
-
-
-        ////////////////////////feature2d_ocl/////////////////
-        /****************************************************************************************\
-        *                                      Distance                                          *
-        \****************************************************************************************/
-        template<typename T>
-        struct CV_EXPORTS Accumulator
-        {
-            typedef T Type;
-        };
-        template<> struct Accumulator<unsigned char>
-        {
-            typedef float Type;
-        };
-        template<> struct Accumulator<unsigned short>
-        {
-            typedef float Type;
-        };
-        template<> struct Accumulator<char>
-        {
-            typedef float Type;
-        };
-        template<> struct Accumulator<short>
-        {
-            typedef float Type;
-        };
-
-        /*
-         * Manhattan distance (city block distance) functor
-         */
-        template<class T>
-        struct CV_EXPORTS L1
-        {
-            enum { normType = NORM_L1 };
-            typedef T ValueType;
-            typedef typename Accumulator<T>::Type ResultType;
-
-            ResultType operator()( const T *a, const T *b, int size ) const
-            {
-                return normL1<ValueType, ResultType>(a, b, size);
-            }
-        };
-
-        /*
-         * Euclidean distance functor
-         */
-        template<class T>
-        struct CV_EXPORTS L2
-        {
-            enum { normType = NORM_L2 };
-            typedef T ValueType;
-            typedef typename Accumulator<T>::Type ResultType;
-
-            ResultType operator()( const T *a, const T *b, int size ) const
-            {
-                return (ResultType)std::sqrt((double)normL2Sqr<ValueType, ResultType>(a, b, size));
-            }
-        };
-
-        /*
-         * Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor
-         * bit count of A exclusive XOR'ed with B
-         */
-        struct CV_EXPORTS Hamming
-        {
-            enum { normType = NORM_HAMMING };
-            typedef unsigned char ValueType;
-            typedef int ResultType;
-
-            /** this will count the bits in a ^ b
-             */
-            ResultType operator()( const unsigned char *a, const unsigned char *b, int size ) const
-            {
-                return normHamming(a, b, size);
-            }
-        };
-
-        ////////////////////////////////// BruteForceMatcher //////////////////////////////////
-
-        class CV_EXPORTS BruteForceMatcher_OCL_base
-        {
-        public:
-            enum DistType {L1Dist = 0, L2Dist, HammingDist};
-            explicit BruteForceMatcher_OCL_base(DistType distType = L2Dist);
-
-            // Add descriptors to train descriptor collection
-            void add(const std::vector<oclMat> &descCollection);
-
-            // Get train descriptors collection
-            const std::vector<oclMat> &getTrainDescriptors() const;
-
-            // Clear train descriptors collection
-            void clear();
-
-            // Return true if there are not train descriptors in collection
-            bool empty() const;
-
-            // Return true if the matcher supports mask in match methods
-            bool isMaskSupported() const;
-
-            // Find one best match for each query descriptor
-            void matchSingle(const oclMat &query, const oclMat &train,
-                             oclMat &trainIdx, oclMat &distance,
-                             const oclMat &mask = oclMat());
-
-            // Download trainIdx and distance and convert it to CPU vector with DMatch
-            static void matchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector<DMatch> &matches);
-            // Convert trainIdx and distance to vector with DMatch
-            static void matchConvert(const Mat &trainIdx, const Mat &distance, std::vector<DMatch> &matches);
-
-            // Find one best match for each query descriptor
-            void match(const oclMat &query, const oclMat &train, std::vector<DMatch> &matches, const oclMat &mask = oclMat());
-
-            // Make gpu collection of trains and masks in suitable format for matchCollection function
-            void makeGpuCollection(oclMat &trainCollection, oclMat &maskCollection, const std::vector<oclMat> &masks = std::vector<oclMat>());
-
-            // Find one best match from train collection for each query descriptor
-            void matchCollection(const oclMat &query, const oclMat &trainCollection,
-                                 oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
-                                 const oclMat &masks = oclMat());
-
-            // Download trainIdx, imgIdx and distance and convert it to vector with DMatch
-            static void matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, std::vector<DMatch> &matches);
-            // Convert trainIdx, imgIdx and distance to vector with DMatch
-            static void matchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, std::vector<DMatch> &matches);
-
-            // Find one best match from train collection for each query descriptor.
-            void match(const oclMat &query, std::vector<DMatch> &matches, const std::vector<oclMat> &masks = std::vector<oclMat>());
-
-            // Find k best matches for each query descriptor (in increasing order of distances)
-            void knnMatchSingle(const oclMat &query, const oclMat &train,
-                                oclMat &trainIdx, oclMat &distance, oclMat &allDist, int k,
-                                const oclMat &mask = oclMat());
-
-            // Download trainIdx and distance and convert it to vector with DMatch
-            // compactResult is used when mask is not empty. If compactResult is false matches
-            // vector will have the same size as queryDescriptors rows. If compactResult is true
-            // matches vector will not contain matches for fully masked out query descriptors.
-            static void knnMatchDownload(const oclMat &trainIdx, const oclMat &distance,
-                                         std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
-            // Convert trainIdx and distance to vector with DMatch
-            static void knnMatchConvert(const Mat &trainIdx, const Mat &distance,
-                                        std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
-
-            // Find k best matches for each query descriptor (in increasing order of distances).
-            // compactResult is used when mask is not empty. If compactResult is false matches
-            // vector will have the same size as queryDescriptors rows. If compactResult is true
-            // matches vector will not contain matches for fully masked out query descriptors.
-            void knnMatch(const oclMat &query, const oclMat &train,
-                          std::vector< std::vector<DMatch> > &matches, int k, const oclMat &mask = oclMat(),
-                          bool compactResult = false);
-
-            // Find k best matches from train collection for each query descriptor (in increasing order of distances)
-            void knnMatch2Collection(const oclMat &query, const oclMat &trainCollection,
-                                     oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
-                                     const oclMat &maskCollection = oclMat());
-
-            // Download trainIdx and distance and convert it to vector with DMatch
-            // compactResult is used when mask is not empty. If compactResult is false matches
-            // vector will have the same size as queryDescriptors rows. If compactResult is true
-            // matches vector will not contain matches for fully masked out query descriptors.
-            static void knnMatch2Download(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance,
-                                          std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
-            // Convert trainIdx and distance to vector with DMatch
-            static void knnMatch2Convert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance,
-                                         std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
-
-            // Find k best matches  for each query descriptor (in increasing order of distances).
-            // compactResult is used when mask is not empty. If compactResult is false matches
-            // vector will have the same size as queryDescriptors rows. If compactResult is true
-            // matches vector will not contain matches for fully masked out query descriptors.
-            void knnMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, int k,
-                          const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
-
-            // Find best matches for each query descriptor which have distance less than maxDistance.
-            // nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
-            // carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches,
-            // because it didn't have enough memory.
-            // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
-            // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
-            // Matches doesn't sorted.
-            void radiusMatchSingle(const oclMat &query, const oclMat &train,
-                                   oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
-                                   const oclMat &mask = oclMat());
-
-            // Download trainIdx, nMatches and distance and convert it to vector with DMatch.
-            // matches will be sorted in increasing order of distances.
-            // compactResult is used when mask is not empty. If compactResult is false matches
-            // vector will have the same size as queryDescriptors rows. If compactResult is true
-            // matches vector will not contain matches for fully masked out query descriptors.
-            static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches,
-                                            std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
-            // Convert trainIdx, nMatches and distance to vector with DMatch.
-            static void radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &nMatches,
-                                           std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
-
-            // Find best matches for each query descriptor which have distance less than maxDistance
-            // in increasing order of distances).
-            void radiusMatch(const oclMat &query, const oclMat &train,
-                             std::vector< std::vector<DMatch> > &matches, float maxDistance,
-                             const oclMat &mask = oclMat(), bool compactResult = false);
-
-            // Find best matches for each query descriptor which have distance less than maxDistance.
-            // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
-            // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
-            // Matches doesn't sorted.
-            void radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
-                                       const std::vector<oclMat> &masks = std::vector<oclMat>());
-
-            // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
-            // matches will be sorted in increasing order of distances.
-            // compactResult is used when mask is not empty. If compactResult is false matches
-            // vector will have the same size as queryDescriptors rows. If compactResult is true
-            // matches vector will not contain matches for fully masked out query descriptors.
-            static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, const oclMat &nMatches,
-                                            std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
-            // Convert trainIdx, nMatches and distance to vector with DMatch.
-            static void radiusMatchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, const Mat &nMatches,
-                                           std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
-
-            // Find best matches from train collection for each query descriptor which have distance less than
-            // maxDistance (in increasing order of distances).
-            void radiusMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, float maxDistance,
-                             const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
-
-            DistType distType;
-
-        private:
-            std::vector<oclMat> trainDescCollection;
-        };
-
-        template <class Distance>
-        class CV_EXPORTS BruteForceMatcher_OCL;
-
-        template <typename T>
-        class CV_EXPORTS BruteForceMatcher_OCL< L1<T> > : public BruteForceMatcher_OCL_base
-        {
-        public:
-            explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L1Dist) {}
-            explicit BruteForceMatcher_OCL(L1<T> /*d*/) : BruteForceMatcher_OCL_base(L1Dist) {}
-        };
-        template <typename T>
-        class CV_EXPORTS BruteForceMatcher_OCL< L2<T> > : public BruteForceMatcher_OCL_base
-        {
-        public:
-            explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L2Dist) {}
-            explicit BruteForceMatcher_OCL(L2<T> /*d*/) : BruteForceMatcher_OCL_base(L2Dist) {}
-        };
-        template <> class CV_EXPORTS BruteForceMatcher_OCL< Hamming > : public BruteForceMatcher_OCL_base
-        {
-        public:
-            explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(HammingDist) {}
-            explicit BruteForceMatcher_OCL(Hamming /*d*/) : BruteForceMatcher_OCL_base(HammingDist) {}
-        };
-
-        class CV_EXPORTS BFMatcher_OCL : public BruteForceMatcher_OCL_base
-        {
-        public:
-            explicit BFMatcher_OCL(int norm = NORM_L2) : BruteForceMatcher_OCL_base(norm == NORM_L1 ? L1Dist : norm == NORM_L2 ? L2Dist : HammingDist) {}
-        };
-
-        class CV_EXPORTS GoodFeaturesToTrackDetector_OCL
-        {
-        public:
-            explicit GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0,
-                int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04);
-
-            //! return 1 rows matrix with CV_32FC2 type
-            void operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat());
-            //! download points of type Point2f to a vector. the vector's content will be erased
-            void downloadPoints(const oclMat &points, std::vector<Point2f> &points_v);
-
-            int maxCorners;
-            double qualityLevel;
-            double minDistance;
-
-            int blockSize;
-            bool useHarrisDetector;
-            double harrisK;
-            void releaseMemory()
-            {
-                Dx_.release();
-                Dy_.release();
-                eig_.release();
-                minMaxbuf_.release();
-                tmpCorners_.release();
-            }
-        private:
-            oclMat Dx_;
-            oclMat Dy_;
-            oclMat eig_;
-            oclMat minMaxbuf_;
-            oclMat tmpCorners_;
-        };
-
-        inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,
-            int blockSize_, bool useHarrisDetector_, double harrisK_)
-        {
-            maxCorners = maxCorners_;
-            qualityLevel = qualityLevel_;
-            minDistance = minDistance_;
-            blockSize = blockSize_;
-            useHarrisDetector = useHarrisDetector_;
-            harrisK = harrisK_;
-        }
-
-        ////////////////////////////////// FAST Feature Detector //////////////////////////////////
-        class CV_EXPORTS FAST_OCL
-        {
-        public:
-            enum
-            {
-                X_ROW = 0,
-                Y_ROW,
-                RESPONSE_ROW,
-                ROWS_COUNT
-            };
-
-            // all features have same size
-            static const int FEATURE_SIZE = 7;
-
-            explicit FAST_OCL(int threshold, bool nonmaxSupression = true, double keypointsRatio = 0.05);
-
-            //! finds the keypoints using FAST detector
-            //! supports only CV_8UC1 images
-            void operator ()(const oclMat& image, const oclMat& mask, oclMat& keypoints);
-            void operator ()(const oclMat& image, const oclMat& mask, std::vector<KeyPoint>& keypoints);
-
-            //! download keypoints from device to host memory
-            static void downloadKeypoints(const oclMat& d_keypoints, std::vector<KeyPoint>& keypoints);
-
-            //! convert keypoints to KeyPoint vector
-            static void convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints);
-
-            //! release temporary buffer's memory
-            void release();
-
-            bool nonmaxSupression;
-
-            int threshold;
-
-            //! max keypoints = keypointsRatio * img.size().area()
-            double keypointsRatio;
-
-            //! find keypoints and compute it's response if nonmaxSupression is true
-            //! return count of detected keypoints
-            int calcKeyPointsLocation(const oclMat& image, const oclMat& mask);
-
-            //! get final array of keypoints
-            //! performs nonmax supression if needed
-            //! return final count of keypoints
-            int getKeyPoints(oclMat& keypoints);
-
-        private:
-            oclMat kpLoc_;
-            int count_;
-
-            oclMat score_;
-
-            oclMat d_keypoints_;
-
-            int calcKeypointsOCL(const oclMat& img, const oclMat& mask, int maxKeypoints);
-            int nonmaxSupressionOCL(oclMat& keypoints);
-        };
-
-        /////////////////////////////// PyrLKOpticalFlow /////////////////////////////////////
-
-        class CV_EXPORTS PyrLKOpticalFlow
-        {
-        public:
-            PyrLKOpticalFlow()
-            {
-                winSize = Size(21, 21);
-                maxLevel = 3;
-                iters = 30;
-                derivLambda = 0.5;
-                useInitialFlow = false;
-                minEigThreshold = 1e-4f;
-                getMinEigenVals = false;
-                isDeviceArch11_ = false;
-            }
-
-            void sparse(const oclMat &prevImg, const oclMat &nextImg, const oclMat &prevPts, oclMat &nextPts,
-                        oclMat &status, oclMat *err = 0);
-
-            void dense(const oclMat &prevImg, const oclMat &nextImg, oclMat &u, oclMat &v, oclMat *err = 0);
-
-            Size winSize;
-            int maxLevel;
-            int iters;
-            double derivLambda;
-            bool useInitialFlow;
-            float minEigThreshold;
-            bool getMinEigenVals;
-
-            void releaseMemory()
-            {
-                dx_calcBuf_.release();
-                dy_calcBuf_.release();
-
-                prevPyr_.clear();
-                nextPyr_.clear();
-
-                dx_buf_.release();
-                dy_buf_.release();
-            }
-
-        private:
-            void calcSharrDeriv(const oclMat &src, oclMat &dx, oclMat &dy);
-
-            void buildImagePyramid(const oclMat &img0, std::vector<oclMat> &pyr, bool withBorder);
-
-            oclMat dx_calcBuf_;
-            oclMat dy_calcBuf_;
-
-            std::vector<oclMat> prevPyr_;
-            std::vector<oclMat> nextPyr_;
-
-            oclMat dx_buf_;
-            oclMat dy_buf_;
-
-            oclMat uPyr_[2];
-            oclMat vPyr_[2];
-
-            bool isDeviceArch11_;
-        };
-
-        class CV_EXPORTS FarnebackOpticalFlow
-        {
-        public:
-            FarnebackOpticalFlow();
-
-            int numLevels;
-            double pyrScale;
-            bool fastPyramids;
-            int winSize;
-            int numIters;
-            int polyN;
-            double polySigma;
-            int flags;
-
-            void operator ()(const oclMat &frame0, const oclMat &frame1, oclMat &flowx, oclMat &flowy);
-
-            void releaseMemory();
-
-        private:
-            void prepareGaussian(
-                int n, double sigma, float *g, float *xg, float *xxg,
-                double &ig11, double &ig03, double &ig33, double &ig55);
-
-            void setPolynomialExpansionConsts(int n, double sigma);
-
-            void updateFlow_boxFilter(
-                const oclMat& R0, const oclMat& R1, oclMat& flowx, oclMat &flowy,
-                oclMat& M, oclMat &bufM, int blockSize, bool updateMatrices);
-
-            void updateFlow_gaussianBlur(
-                const oclMat& R0, const oclMat& R1, oclMat& flowx, oclMat& flowy,
-                oclMat& M, oclMat &bufM, int blockSize, bool updateMatrices);
-
-            oclMat frames_[2];
-            oclMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
-            std::vector<oclMat> pyramid0_, pyramid1_;
-        };
-
-        //////////////// build warping maps ////////////////////
-        //! builds plane warping maps
-        CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, const Mat &T, float scale, oclMat &map_x, oclMat &map_y);
-        //! builds cylindrical warping maps
-        CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, float scale, oclMat &map_x, oclMat &map_y);
-        //! builds spherical warping maps
-        CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, float scale, oclMat &map_x, oclMat &map_y);
-        //! builds Affine warping maps
-        CV_EXPORTS void buildWarpAffineMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap);
-
-        //! builds Perspective warping maps
-        CV_EXPORTS void buildWarpPerspectiveMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap);
-
-        ///////////////////////////////////// interpolate frames //////////////////////////////////////////////
-        //! Interpolate frames (images) using provided optical flow (displacement field).
-        //! frame0   - frame 0 (32-bit floating point images, single channel)
-        //! frame1   - frame 1 (the same type and size)
-        //! fu       - forward horizontal displacement
-        //! fv       - forward vertical displacement
-        //! bu       - backward horizontal displacement
-        //! bv       - backward vertical displacement
-        //! pos      - new frame position
-        //! newFrame - new frame
-        //! buf      - temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 oclMat;
-        //!            occlusion masks            0, occlusion masks            1,
-        //!            interpolated forward flow  0, interpolated forward flow  1,
-        //!            interpolated backward flow 0, interpolated backward flow 1
-        //!
-        CV_EXPORTS void interpolateFrames(const oclMat &frame0, const oclMat &frame1,
-                                          const oclMat &fu, const oclMat &fv,
-                                          const oclMat &bu, const oclMat &bv,
-                                          float pos, oclMat &newFrame, oclMat &buf);
-
-        //! computes moments of the rasterized shape or a vector of points
-        //! _array should be a vector a points standing for the contour
-        CV_EXPORTS Moments ocl_moments(InputArray contour);
-        //! src should be a general image uploaded to the GPU.
-        //! the supported oclMat type are CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 and CV_64FC1
-        //! to use type of CV_64FC1, the GPU should support CV_64FC1
-        CV_EXPORTS Moments ocl_moments(oclMat& src, bool binary);
-
-        class CV_EXPORTS StereoBM_OCL
-        {
-        public:
-            enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
-
-            enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
-
-            //! the default constructor
-            StereoBM_OCL();
-            //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be multiple of 8.
-            StereoBM_OCL(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ);
-
-            //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair
-            //! Output disparity has CV_8U type.
-            void operator() ( const oclMat &left, const oclMat &right, oclMat &disparity);
-
-            //! Some heuristics that tries to estmate
-            // if current GPU will be faster then CPU in this algorithm.
-            // It queries current active device.
-            static bool checkIfGpuCallReasonable();
-
-            int preset;
-            int ndisp;
-            int winSize;
-
-            // If avergeTexThreshold  == 0 => post procesing is disabled
-            // If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image
-            // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold
-            // i.e. input left image is low textured.
-            float avergeTexThreshold;
-        private:
-            oclMat minSSD, leBuf, riBuf;
-        };
-
-        class CV_EXPORTS StereoBeliefPropagation
-        {
-        public:
-            enum { DEFAULT_NDISP  = 64 };
-            enum { DEFAULT_ITERS  = 5  };
-            enum { DEFAULT_LEVELS = 5  };
-            static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels);
-            explicit StereoBeliefPropagation(int ndisp  = DEFAULT_NDISP,
-                                             int iters  = DEFAULT_ITERS,
-                                             int levels = DEFAULT_LEVELS,
-                                             int msg_type = CV_16S);
-            StereoBeliefPropagation(int ndisp, int iters, int levels,
-                                    float max_data_term, float data_weight,
-                                    float max_disc_term, float disc_single_jump,
-                                    int msg_type = CV_32F);
-            void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
-            void operator()(const oclMat &data, oclMat &disparity);
-            int ndisp;
-            int iters;
-            int levels;
-            float max_data_term;
-            float data_weight;
-            float max_disc_term;
-            float disc_single_jump;
-            int msg_type;
-        private:
-            oclMat u, d, l, r, u2, d2, l2, r2;
-            std::vector<oclMat> datas;
-            oclMat out;
-        };
-
-        class CV_EXPORTS StereoConstantSpaceBP
-        {
-        public:
-            enum { DEFAULT_NDISP    = 128 };
-            enum { DEFAULT_ITERS    = 8   };
-            enum { DEFAULT_LEVELS   = 4   };
-            enum { DEFAULT_NR_PLANE = 4   };
-            static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane);
-            explicit StereoConstantSpaceBP(
-                int ndisp    = DEFAULT_NDISP,
-                int iters    = DEFAULT_ITERS,
-                int levels   = DEFAULT_LEVELS,
-                int nr_plane = DEFAULT_NR_PLANE,
-                int msg_type = CV_32F);
-            StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
-                float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
-                int min_disp_th = 0,
-                int msg_type = CV_32F);
-            void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
-            int ndisp;
-            int iters;
-            int levels;
-            int nr_plane;
-            float max_data_term;
-            float data_weight;
-            float max_disc_term;
-            float disc_single_jump;
-            int min_disp_th;
-            int msg_type;
-            bool use_local_init_data_cost;
-        private:
-            oclMat u[2], d[2], l[2], r[2];
-            oclMat disp_selected_pyr[2];
-            oclMat data_cost;
-            oclMat data_cost_selected;
-            oclMat temp;
-            oclMat out;
-        };
-
-        // Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
-        //
-        // see reference:
-        //   [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
-        //   [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
-        class CV_EXPORTS OpticalFlowDual_TVL1_OCL
-        {
-        public:
-            OpticalFlowDual_TVL1_OCL();
-
-            void operator ()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy);
-
-            void collectGarbage();
-
-            /**
-            * Time step of the numerical scheme.
-            */
-            double tau;
-
-            /**
-            * Weight parameter for the data term, attachment parameter.
-            * This is the most relevant parameter, which determines the smoothness of the output.
-            * The smaller this parameter is, the smoother the solutions we obtain.
-            * It depends on the range of motions of the images, so its value should be adapted to each image sequence.
-            */
-            double lambda;
-
-            /**
-            * Weight parameter for (u - v)^2, tightness parameter.
-            * It serves as a link between the attachment and the regularization terms.
-            * In theory, it should have a small value in order to maintain both parts in correspondence.
-            * The method is stable for a large range of values of this parameter.
-            */
-            double theta;
-
-            /**
-            * Number of scales used to create the pyramid of images.
-            */
-            int nscales;
-
-            /**
-            * Number of warpings per scale.
-            * Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale.
-            * This is a parameter that assures the stability of the method.
-            * It also affects the running time, so it is a compromise between speed and accuracy.
-            */
-            int warps;
-
-            /**
-            * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
-            * A small value will yield more accurate solutions at the expense of a slower convergence.
-            */
-            double epsilon;
-
-            /**
-            * Stopping criterion iterations number used in the numerical scheme.
-            */
-            int iterations;
-
-            bool useInitialFlow;
-
-        private:
-            void procOneScale(const oclMat& I0, const oclMat& I1, oclMat& u1, oclMat& u2);
-
-            std::vector<oclMat> I0s;
-            std::vector<oclMat> I1s;
-            std::vector<oclMat> u1s;
-            std::vector<oclMat> u2s;
-
-            oclMat I1x_buf;
-            oclMat I1y_buf;
-
-            oclMat I1w_buf;
-            oclMat I1wx_buf;
-            oclMat I1wy_buf;
-
-            oclMat grad_buf;
-            oclMat rho_c_buf;
-
-            oclMat p11_buf;
-            oclMat p12_buf;
-            oclMat p21_buf;
-            oclMat p22_buf;
-
-            oclMat diff_buf;
-            oclMat norm_buf;
-        };
-        // current supported sorting methods
-        enum
-        {
-            SORT_BITONIC,   // only support power-of-2 buffer size
-            SORT_SELECTION, // cannot sort duplicate keys
-            SORT_MERGE,
-            SORT_RADIX      // only support signed int/float keys(CV_32S/CV_32F)
-        };
-        //! Returns the sorted result of all the elements in input based on equivalent keys.
-        //
-        //  The element unit in the values to be sorted is determined from the data type,
-        //  i.e., a CV_32FC2 input {a1a2, b1b2} will be considered as two elements, regardless its
-        //  matrix dimension.
-        //  both keys and values will be sorted inplace
-        //  Key needs to be single channel oclMat.
-        //
-        //  Example:
-        //  input -
-        //    keys   = {2,    3,   1}   (CV_8UC1)
-        //    values = {10,5, 4,3, 6,2} (CV_8UC2)
-        //  sortByKey(keys, values, SORT_SELECTION, false);
-        //  output -
-        //    keys   = {1,    2,   3}   (CV_8UC1)
-        //    values = {6,2, 10,5, 4,3} (CV_8UC2)
-        CV_EXPORTS void sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false);
-        /*!Base class for MOG and MOG2!*/
-        class CV_EXPORTS BackgroundSubtractor
-        {
-        public:
-            //! the virtual destructor
-            virtual ~BackgroundSubtractor();
-            //! the update operator that takes the next video frame and returns the current foreground mask as 8-bit binary image.
-            virtual void operator()(const oclMat& image, oclMat& fgmask, float learningRate);
-
-            //! computes a background image
-            virtual void getBackgroundImage(oclMat& backgroundImage) const = 0;
-        };
-                /*!
-        Gaussian Mixture-based Backbround/Foreground Segmentation Algorithm
-
-        The class implements the following algorithm:
-        "An improved adaptive background mixture model for real-time tracking with shadow detection"
-        P. KadewTraKuPong and R. Bowden,
-        Proc. 2nd European Workshp on Advanced Video-Based Surveillance Systems, 2001."
-        http://personal.ee.surrey.ac.uk/Personal/R.Bowden/publications/avbs01/avbs01.pdf
-        */
-        class CV_EXPORTS MOG: public cv::ocl::BackgroundSubtractor
-        {
-        public:
-            //! the default constructor
-            MOG(int nmixtures = -1);
-
-            //! re-initiaization method
-            void initialize(Size frameSize, int frameType);
-
-            //! the update operator
-            void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = 0.f);
-
-            //! computes a background image which are the mean of all background gaussians
-            void getBackgroundImage(oclMat& backgroundImage) const;
-
-            //! releases all inner buffers
-            void release();
-
-            int history;
-            float varThreshold;
-            float backgroundRatio;
-            float noiseSigma;
-
-        private:
-            int nmixtures_;
-
-            Size frameSize_;
-            int frameType_;
-            int nframes_;
-
-            oclMat weight_;
-            oclMat sortKey_;
-            oclMat mean_;
-            oclMat var_;
-        };
-
-        /*!
-        The class implements the following algorithm:
-        "Improved adaptive Gausian mixture model for background subtraction"
-        Z.Zivkovic
-        International Conference Pattern Recognition, UK, August, 2004.
-        http://www.zoranz.net/Publications/zivkovic2004ICPR.pdf
-        */
-        class CV_EXPORTS MOG2: public cv::ocl::BackgroundSubtractor
-        {
-        public:
-            //! the default constructor
-            MOG2(int nmixtures = -1);
-
-            //! re-initiaization method
-            void initialize(Size frameSize, int frameType);
-
-            //! the update operator
-            void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = -1.0f);
-
-            //! computes a background image which are the mean of all background gaussians
-            void getBackgroundImage(oclMat& backgroundImage) const;
-
-            //! releases all inner buffers
-            void release();
-
-            // parameters
-            // you should call initialize after parameters changes
-
-            int history;
-
-            //! here it is the maximum allowed number of mixture components.
-            //! Actual number is determined dynamically per pixel
-            float varThreshold;
-            // threshold on the squared Mahalanobis distance to decide if it is well described
-            // by the background model or not. Related to Cthr from the paper.
-            // This does not influence the update of the background. A typical value could be 4 sigma
-            // and that is varThreshold=4*4=16; Corresponds to Tb in the paper.
-
-            /////////////////////////
-            // less important parameters - things you might change but be carefull
-            ////////////////////////
-
-            float backgroundRatio;
-            // corresponds to fTB=1-cf from the paper
-            // TB - threshold when the component becomes significant enough to be included into
-            // the background model. It is the TB=1-cf from the paper. So I use cf=0.1 => TB=0.
-            // For alpha=0.001 it means that the mode should exist for approximately 105 frames before
-            // it is considered foreground
-            // float noiseSigma;
-            float varThresholdGen;
-
-            //correspondts to Tg - threshold on the squared Mahalan. dist. to decide
-            //when a sample is close to the existing components. If it is not close
-            //to any a new component will be generated. I use 3 sigma => Tg=3*3=9.
-            //Smaller Tg leads to more generated components and higher Tg might make
-            //lead to small number of components but they can grow too large
-            float fVarInit;
-            float fVarMin;
-            float fVarMax;
-
-            //initial variance  for the newly generated components.
-            //It will will influence the speed of adaptation. A good guess should be made.
-            //A simple way is to estimate the typical standard deviation from the images.
-            //I used here 10 as a reasonable value
-            // min and max can be used to further control the variance
-            float fCT; //CT - complexity reduction prior
-            //this is related to the number of samples needed to accept that a component
-            //actually exists. We use CT=0.05 of all the samples. By setting CT=0 you get
-            //the standard Stauffer&Grimson algorithm (maybe not exact but very similar)
-
-            //shadow detection parameters
-            bool bShadowDetection; //default 1 - do shadow detection
-            unsigned char nShadowDetection; //do shadow detection - insert this value as the detection result - 127 default value
-            float fTau;
-            // Tau - shadow threshold. The shadow is detected if the pixel is darker
-            //version of the background. Tau is a threshold on how much darker the shadow can be.
-            //Tau= 0.5 means that if pixel is more than 2 times darker then it is not shadow
-            //See: Prati,Mikic,Trivedi,Cucchiarra,"Detecting Moving Shadows...",IEEE PAMI,2003.
-
-        private:
-            int nmixtures_;
-
-            Size frameSize_;
-            int frameType_;
-            int nframes_;
-
-            oclMat weight_;
-            oclMat variance_;
-            oclMat mean_;
-
-            oclMat bgmodelUsedModes_; //keep track of number of modes per pixel
-        };
-
-        /*!***************Kalman Filter*************!*/
-        class CV_EXPORTS KalmanFilter
-        {
-        public:
-            KalmanFilter();
-            //! the full constructor taking the dimensionality of the state, of the measurement and of the control vector
-            KalmanFilter(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
-            //! re-initializes Kalman filter. The previous content is destroyed.
-            void init(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F);
-
-            const oclMat& predict(const oclMat& control=oclMat());
-            const oclMat& correct(const oclMat& measurement);
-
-            oclMat statePre;           //!< predicted state (x'(k)): x(k)=A*x(k-1)+B*u(k)
-            oclMat statePost;          //!< corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
-            oclMat transitionMatrix;   //!< state transition matrix (A)
-            oclMat controlMatrix;      //!< control matrix (B) (not used if there is no control)
-            oclMat measurementMatrix;  //!< measurement matrix (H)
-            oclMat processNoiseCov;    //!< process noise covariance matrix (Q)
-            oclMat measurementNoiseCov;//!< measurement noise covariance matrix (R)
-            oclMat errorCovPre;        //!< priori error estimate covariance matrix (P'(k)): P'(k)=A*P(k-1)*At + Q)*/
-            oclMat gain;               //!< Kalman gain matrix (K(k)): K(k)=P'(k)*Ht*inv(H*P'(k)*Ht+R)
-            oclMat errorCovPost;       //!< posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k)
-        private:
-            oclMat temp1;
-            oclMat temp2;
-            oclMat temp3;
-            oclMat temp4;
-            oclMat temp5;
-        };
-
-        /*!***************K Nearest Neighbour*************!*/
-        class CV_EXPORTS KNearestNeighbour: public CvKNearest
-        {
-        public:
-            KNearestNeighbour();
-            ~KNearestNeighbour();
-
-            bool train(const Mat& trainData, Mat& labels, Mat& sampleIdx = Mat().setTo(Scalar::all(0)),
-                bool isRegression = false, int max_k = 32, bool updateBase = false);
-
-            void clear();
-
-            void find_nearest(const oclMat& samples, int k, oclMat& lables);
-
-        private:
-            oclMat samples_ocl;
-        };
-
-        /*!***************  SVM  *************!*/
-        class CV_EXPORTS CvSVM_OCL : public CvSVM
-        {
-        public:
-            CvSVM_OCL();
-
-            CvSVM_OCL(const cv::Mat& trainData, const cv::Mat& responses,
-                      const cv::Mat& varIdx=cv::Mat(), const cv::Mat& sampleIdx=cv::Mat(),
-                      CvSVMParams params=CvSVMParams());
-            CV_WRAP float predict( const int row_index, Mat& src, bool returnDFVal=false ) const;
-            CV_WRAP void predict( cv::InputArray samples, cv::OutputArray results ) const;
-            CV_WRAP float predict( const cv::Mat& sample, bool returnDFVal=false ) const;
-            float predict( const CvMat* samples, CV_OUT CvMat* results ) const;
-
-        protected:
-            float predict( const int row_index, int row_len, Mat& src, bool returnDFVal=false ) const;
-            void create_kernel();
-            void create_solver();
-        };
-
-        /*!***************  END  *************!*/
-    }
-}
-#if defined _MSC_VER && _MSC_VER >= 1200
-#  pragma warning( push)
-#  pragma warning( disable: 4267)
-#endif
-#include "opencv2/ocl/matrix_operations.hpp"
-#if defined _MSC_VER && _MSC_VER >= 1200
-#  pragma warning( pop)
-#endif
-
-#endif /* __OPENCV_OCL_HPP__ */
diff --git a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp
deleted file mode 100644
index 410adbd8b..000000000
--- a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp
+++ /dev/null
@@ -1,490 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_OCL_MATRIX_OPERATIONS_HPP__
-#define __OPENCV_OCL_MATRIX_OPERATIONS_HPP__
-
-#include "opencv2/ocl.hpp"
-
-namespace cv
-{
-
-    namespace ocl
-    {
-
-        enum
-        {
-            MAT_ADD = 1,
-            MAT_SUB,
-            MAT_MUL,
-            MAT_DIV,
-            MAT_NOT,
-            MAT_AND,
-            MAT_OR,
-            MAT_XOR
-        };
-
-        class CV_EXPORTS oclMatExpr
-        {
-            public:
-                oclMatExpr() : a(oclMat()), b(oclMat()), op(0) {}
-                oclMatExpr(const oclMat& _a, const oclMat& _b, int _op)
-                    : a(_a), b(_b), op(_op) {}
-                operator oclMat() const;
-                void assign(oclMat& m) const;
-
-            protected:
-                oclMat a, b;
-                int op;
-        };
-        ////////////////////////////////////////////////////////////////////////
-        //////////////////////////////// oclMat ////////////////////////////////
-        ////////////////////////////////////////////////////////////////////////
-
-        inline oclMat::oclMat() : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0) {}
-
-        inline oclMat::oclMat(int _rows, int _cols, int _type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
-        {
-            if( _rows > 0 && _cols > 0 )
-                create( _rows, _cols, _type );
-        }
-
-        inline oclMat::oclMat(Size _size, int _type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
-        {
-            if( _size.height > 0 && _size.width > 0 )
-                create( _size.height, _size.width, _type );
-        }
-
-        inline oclMat::oclMat(int _rows, int _cols, int _type, const Scalar &_s)
-            : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
-        {
-            if(_rows > 0 && _cols > 0)
-            {
-                create(_rows, _cols, _type);
-                *this = _s;
-            }
-        }
-
-        inline oclMat::oclMat(Size _size, int _type, const Scalar &_s)
-            : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
-        {
-            if( _size.height > 0 && _size.width > 0 )
-            {
-                create( _size.height, _size.width, _type );
-                *this = _s;
-            }
-        }
-
-        inline oclMat::oclMat(const oclMat &m)
-            : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data),
-              refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), clCxt(m.clCxt), offset(m.offset), wholerows(m.wholerows), wholecols(m.wholecols)
-        {
-            if( refcount )
-                CV_XADD(refcount, 1);
-        }
-
-        inline oclMat::oclMat(int _rows, int _cols, int _type, void *_data, size_t _step)
-            : flags(0), rows(0), cols(0), step(0), data(0), refcount(0),
-              datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
-        {
-            cv::Mat m(_rows, _cols, _type, _data, _step);
-            upload(m);
-            //size_t minstep = cols * elemSize();
-            //if( step == Mat::AUTO_STEP )
-            //{
-            //    step = minstep;
-            //    flags |= Mat::CONTINUOUS_FLAG;
-            //}
-            //else
-            //{
-            //    if( rows == 1 ) step = minstep;
-            //    CV_DbgAssert( step >= minstep );
-            //    flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
-            //}
-            //dataend += step * (rows - 1) + minstep;
-        }
-
-        inline oclMat::oclMat(Size _size, int _type, void *_data, size_t _step)
-            : flags(0), rows(0), cols(0),
-              step(0), data(0), refcount(0),
-              datastart(0), dataend(0), offset(0), wholerows(0), wholecols(0)
-        {
-            cv::Mat m(_size, _type, _data, _step);
-            upload(m);
-            //size_t minstep = cols * elemSize();
-            //if( step == Mat::AUTO_STEP )
-            //{
-            //    step = minstep;
-            //    flags |= Mat::CONTINUOUS_FLAG;
-            //}
-            //else
-            //{
-            //    if( rows == 1 ) step = minstep;
-            //    CV_DbgAssert( step >= minstep );
-            //    flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
-            //}
-            //dataend += step * (rows - 1) + minstep;
-        }
-
-
-        inline oclMat::oclMat(const oclMat &m, const Range &rRange, const Range &cRange)
-        {
-            flags = m.flags;
-            step = m.step;
-            refcount = m.refcount;
-            data = m.data;
-            datastart = m.datastart;
-            dataend = m.dataend;
-            clCxt = m.clCxt;
-            wholerows = m.wholerows;
-            wholecols = m.wholecols;
-            offset = m.offset;
-            if( rRange == Range::all() )
-                rows = m.rows;
-            else
-            {
-                CV_Assert( 0 <= rRange.start && rRange.start <= rRange.end && rRange.end <= m.rows );
-                rows = rRange.size();
-                offset += step * rRange.start;
-            }
-
-            if( cRange == Range::all() )
-                cols = m.cols;
-            else
-            {
-                CV_Assert( 0 <= cRange.start && cRange.start <= cRange.end && cRange.end <= m.cols );
-                cols = cRange.size();
-                offset += cRange.start * elemSize();
-                flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
-            }
-
-            if( rows == 1 )
-                flags |= Mat::CONTINUOUS_FLAG;
-
-            if( refcount )
-                CV_XADD(refcount, 1);
-            if( rows <= 0 || cols <= 0 )
-                rows = cols = 0;
-        }
-
-        inline oclMat::oclMat(const oclMat &m, const Rect &roi)
-            : flags(m.flags), rows(roi.height), cols(roi.width),
-              step(m.step), data(m.data), refcount(m.refcount),
-              datastart(m.datastart), dataend(m.dataend), clCxt(m.clCxt), offset(m.offset), wholerows(m.wholerows), wholecols(m.wholecols)
-        {
-            flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
-            offset += roi.y * step + roi.x * elemSize();
-            CV_Assert( 0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.wholecols &&
-                       0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.wholerows );
-            if( refcount )
-                CV_XADD(refcount, 1);
-            if( rows <= 0 || cols <= 0 )
-                rows = cols = 0;
-        }
-
-        inline oclMat::oclMat(const Mat &m)
-            : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) , offset(0), wholerows(0), wholecols(0)
-        {
-            //clCxt = Context::getContext();
-            upload(m);
-        }
-
-        inline oclMat::~oclMat()
-        {
-            release();
-        }
-
-        inline oclMat &oclMat::operator = (const oclMat &m)
-        {
-            if( this != &m )
-            {
-                if( m.refcount )
-                    CV_XADD(m.refcount, 1);
-                release();
-                clCxt = m.clCxt;
-                flags = m.flags;
-                rows = m.rows;
-                cols = m.cols;
-                step = m.step;
-                data = m.data;
-                datastart = m.datastart;
-                dataend = m.dataend;
-                offset = m.offset;
-                wholerows = m.wholerows;
-                wholecols = m.wholecols;
-                refcount = m.refcount;
-            }
-            return *this;
-        }
-
-        inline oclMat &oclMat::operator = (const Mat &m)
-        {
-            //clCxt = Context::getContext();
-            upload(m);
-            return *this;
-        }
-
-        inline oclMat& oclMat::operator = (const oclMatExpr& expr)
-        {
-            expr.assign(*this);
-            return *this;
-        }
-
-        /* Fixme! To be supported in OpenCL later. */
-#if 0
-        template <class T> inline oclMat::operator DevMem2D_<T>() const
-        {
-            return DevMem2D_<T>(rows, cols, (T *)data, step);
-        }
-        template <class T> inline oclMat::operator PtrStep_<T>() const
-        {
-            return PtrStep_<T>(static_cast< DevMem2D_<T> >(*this));
-        }
-#endif
-
-        //CPP: void oclMat::upload(const Mat& m);
-
-        inline oclMat::operator Mat() const
-        {
-            Mat m;
-            download(m);
-            return m;
-        }
-
-        //CPP void oclMat::download(cv::Mat& m) const;
-
-        inline oclMat oclMat::row(int y) const
-        {
-            return oclMat(*this, Range(y, y + 1), Range::all());
-        }
-        inline oclMat oclMat::col(int x) const
-        {
-            return oclMat(*this, Range::all(), Range(x, x + 1));
-        }
-        inline oclMat oclMat::rowRange(int startrow, int endrow) const
-        {
-            return oclMat(*this, Range(startrow, endrow), Range::all());
-        }
-        inline oclMat oclMat::rowRange(const Range &r) const
-        {
-            return oclMat(*this, r, Range::all());
-        }
-        inline oclMat oclMat::colRange(int startcol, int endcol) const
-        {
-            return oclMat(*this, Range::all(), Range(startcol, endcol));
-        }
-        inline oclMat oclMat::colRange(const Range &r) const
-        {
-            return oclMat(*this, Range::all(), r);
-        }
-
-        inline oclMat oclMat::clone() const
-        {
-            oclMat m;
-            copyTo(m);
-            return m;
-        }
-
-        //CPP void oclMat::copyTo( oclMat& m ) const;
-        //CPP void oclMat::copyTo( oclMat& m, const oclMat& mask  ) const;
-        //CPP void oclMat::convertTo( oclMat& m, int rtype, double alpha=1, double beta=0 ) const;
-
-        inline void oclMat::assignTo( oclMat &m, int mtype ) const
-        {
-            if( mtype < 0 )
-                m = *this;
-            else
-                convertTo(m, mtype);
-        }
-
-        //CPP oclMat& oclMat::operator = (const Scalar& s);
-        //CPP oclMat& oclMat::setTo(const Scalar& s, const oclMat& mask=oclMat());
-        //CPP oclMat oclMat::reshape(int _cn, int _rows=0) const;
-        inline void oclMat::create(Size _size, int _type)
-        {
-            create(_size.height, _size.width, _type);
-        }
-        //CPP void oclMat::create(int _rows, int _cols, int _type);
-        //CPP void oclMat::release();
-
-        inline void oclMat::swap(oclMat &b)
-        {
-            std::swap( flags, b.flags );
-            std::swap( rows, b.rows );
-            std::swap( cols, b.cols );
-            std::swap( step, b.step );
-            std::swap( data, b.data );
-            std::swap( datastart, b.datastart );
-            std::swap( dataend, b.dataend );
-            std::swap( refcount, b.refcount );
-            std::swap( offset, b.offset );
-            std::swap( clCxt,  b.clCxt );
-            std::swap( wholerows, b.wholerows );
-            std::swap( wholecols, b.wholecols );
-        }
-
-        inline void oclMat::locateROI( Size &wholeSize, Point &ofs ) const
-        {
-            size_t esz = elemSize();//, minstep;
-            //ptrdiff_t delta1 = offset;//, delta2 = dataend - datastart;
-            CV_DbgAssert( step > 0 );
-            if( offset == 0 )
-                ofs.x = ofs.y = 0;
-            else
-            {
-                ofs.y = (int)(offset / step);
-                ofs.x = (int)((offset - step * ofs.y) / esz);
-                //CV_DbgAssert( data == datastart + ofs.y*step + ofs.x*esz );
-            }
-            //minstep = (ofs.x + cols)*esz;
-            //wholeSize.height = (int)((delta2 - minstep)/step + 1);
-            //wholeSize.height = std::max(wholeSize.height, ofs.y + rows);
-            //wholeSize.width = (int)((delta2 - step*(wholeSize.height-1))/esz);
-            //wholeSize.width = std::max(wholeSize.width, ofs.x + cols);
-            wholeSize.height = wholerows;
-            wholeSize.width = wholecols;
-        }
-
-        inline oclMat &oclMat::adjustROI( int dtop, int dbottom, int dleft, int dright )
-        {
-            Size wholeSize;
-            Point ofs;
-            size_t esz = elemSize();
-            locateROI( wholeSize, ofs );
-            int row1 = std::max(ofs.y - dtop, 0), row2 = std::min(ofs.y + rows + dbottom, wholeSize.height);
-            int col1 = std::max(ofs.x - dleft, 0), col2 = std::min(ofs.x + cols + dright, wholeSize.width);
-            offset += (row1 - ofs.y) * step + (col1 - ofs.x) * esz;
-            rows = row2 - row1;
-            cols = col2 - col1;
-            if( esz * cols == step || rows == 1 )
-                flags |= Mat::CONTINUOUS_FLAG;
-            else
-                flags &= ~Mat::CONTINUOUS_FLAG;
-            return *this;
-        }
-
-        inline oclMat oclMat::operator()( Range rRange, Range cRange ) const
-        {
-            return oclMat(*this, rRange, cRange);
-        }
-        inline oclMat oclMat::operator()( const Rect &roi ) const
-        {
-            return oclMat(*this, roi);
-        }
-
-        inline bool oclMat::isContinuous() const
-        {
-            return (flags & Mat::CONTINUOUS_FLAG) != 0;
-        }
-        inline size_t oclMat::elemSize() const
-        {
-            return CV_ELEM_SIZE((CV_MAKE_TYPE(type(), oclchannels())));
-        }
-        inline size_t oclMat::elemSize1() const
-        {
-            return CV_ELEM_SIZE1(flags);
-        }
-        inline int oclMat::type() const
-        {
-            return CV_MAT_TYPE(flags);
-        }
-        inline int oclMat::ocltype() const
-        {
-            return CV_MAKE_TYPE(depth(), oclchannels());
-        }
-        inline int oclMat::depth() const
-        {
-            return CV_MAT_DEPTH(flags);
-        }
-        inline int oclMat::channels() const
-        {
-            return CV_MAT_CN(flags);
-        }
-        inline int oclMat::oclchannels() const
-        {
-            return (CV_MAT_CN(flags)) == 3 ? 4 : (CV_MAT_CN(flags));
-        }
-        inline size_t oclMat::step1() const
-        {
-            return step / elemSize1();
-        }
-        inline Size oclMat::size() const
-        {
-            return Size(cols, rows);
-        }
-        inline bool oclMat::empty() const
-        {
-            return data == 0;
-        }
-
-        inline oclMat oclMat::t() const
-        {
-            oclMat tmp;
-            transpose(*this, tmp);
-            return tmp;
-        }
-
-        static inline void swap( oclMat &a, oclMat &b )
-        {
-            a.swap(b);
-        }
-
-        inline void ensureSizeIsEnough(int rows, int cols, int type, oclMat &m)
-        {
-            if (m.type() == type && m.rows >= rows && m.cols >= cols)
-                m = m(Rect(0, 0, cols, rows));
-            else
-                m.create(rows, cols, type);
-        }
-
-        inline void ensureSizeIsEnough(Size size, int type, oclMat &m)
-        {
-            ensureSizeIsEnough(size.height, size.width, type, m);
-        }
-
-
-    } /* end of namespace ocl */
-
-} /* end of namespace cv */
-
-#endif /* __OPENCV_OCL_MATRIX_OPERATIONS_HPP__ */
diff --git a/modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp b/modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp
deleted file mode 100644
index ee0f703ee..000000000
--- a/modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp
+++ /dev/null
@@ -1,154 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined(DUMP_MESSAGE_STDOUT) && !defined(DUMP_PROPERTY_XML)
-#error Invalid usage
-#endif
-
-#if !defined(DUMP_PROPERTY_XML)
-#define DUMP_PROPERTY_XML(...)
-#endif
-
-#if !defined(DUMP_MESSAGE_STDOUT)
-#define DUMP_MESSAGE_STDOUT(...)
-#endif
-
-#include <sstream>
-
-static std::string bytesToStringRepr(size_t value)
-{
-    size_t b = value % 1024;
-    value /= 1024;
-
-    size_t kb = value % 1024;
-    value /= 1024;
-
-    size_t mb = value % 1024;
-    value /= 1024;
-
-    size_t gb = value;
-
-    std::ostringstream stream;
-
-    if (gb > 0)
-        stream << gb << " GB ";
-    if (mb > 0)
-        stream << mb << " MB ";
-    if (kb > 0)
-        stream << kb << " kB ";
-    if (b > 0)
-        stream << b << " B";
-
-    return stream.str();
-}
-
-static void dumpOpenCLDevice()
-{
-    using namespace cv::ocl;
-    try
-    {
-        cv::ocl::PlatformsInfo platforms;
-        cv::ocl::getOpenCLPlatforms(platforms);
-        DUMP_MESSAGE_STDOUT("OpenCL Platforms: ");
-        const char* deviceTypeStr;
-        for(unsigned int i=0; i < platforms.size(); i++)
-        {
-            DUMP_MESSAGE_STDOUT("    " << platforms.at(i)->platformName);
-            const cv::ocl::DevicesInfo& devices = platforms.at(i)->devices;
-            for(unsigned int j=0; j < devices.size(); j++)
-            {
-                const cv::ocl::DeviceInfo& current_device = *devices.at(j);
-                deviceTypeStr = current_device.deviceType == CVCL_DEVICE_TYPE_CPU
-                            ? ("CPU") : (current_device.deviceType == CVCL_DEVICE_TYPE_GPU ? "GPU" : "unknown");
-                DUMP_MESSAGE_STDOUT( "        " << deviceTypeStr << " : " << current_device.deviceName << " : " << current_device.deviceVersion );
-                DUMP_PROPERTY_XML("cv_ocl_platform_"<< i<<"_device_"<<j, "(Platform=" << current_device.platform->platformName << ")(Type="
-                    << deviceTypeStr <<")(Name="<< current_device.deviceName << ")(Version="<< current_device.deviceVersion<<")");
-            }
-        }
-        DUMP_MESSAGE_STDOUT("Current OpenCL device: ");
-
-        const cv::ocl::DeviceInfo& deviceInfo = cv::ocl::Context::getContext()->getDeviceInfo();
-
-        DUMP_MESSAGE_STDOUT("    Platform = "<< deviceInfo.platform->platformName);
-        DUMP_PROPERTY_XML("cv_ocl_current_platformName", deviceInfo.platform->platformName);
-
-        deviceTypeStr = deviceInfo.deviceType == CVCL_DEVICE_TYPE_CPU
-                        ? "CPU" : (deviceInfo.deviceType == CVCL_DEVICE_TYPE_GPU ? "GPU" : "unknown");
-        DUMP_MESSAGE_STDOUT("    Type = "<< deviceTypeStr);
-        DUMP_PROPERTY_XML("cv_ocl_current_deviceType", deviceTypeStr);
-
-        DUMP_MESSAGE_STDOUT("    Name = "<< deviceInfo.deviceName);
-        DUMP_PROPERTY_XML("cv_ocl_current_deviceName", deviceInfo.deviceName);
-
-        DUMP_MESSAGE_STDOUT("    Version = " << deviceInfo.deviceVersion);
-        DUMP_PROPERTY_XML("cv_ocl_current_deviceVersion", deviceInfo.deviceVersion);
-
-        DUMP_MESSAGE_STDOUT("    Compute units = "<< deviceInfo.maxComputeUnits);
-        DUMP_PROPERTY_XML("cv_ocl_current_maxComputeUnits", deviceInfo.maxComputeUnits);
-
-        DUMP_MESSAGE_STDOUT("    Max work group size = "<< deviceInfo.maxWorkGroupSize);
-        DUMP_PROPERTY_XML("cv_ocl_current_maxWorkGroupSize", deviceInfo.maxWorkGroupSize);
-
-        std::string localMemorySizeStr = bytesToStringRepr(deviceInfo.localMemorySize);
-        DUMP_MESSAGE_STDOUT("    Local memory size = "<< localMemorySizeStr.c_str());
-        DUMP_PROPERTY_XML("cv_ocl_current_localMemorySize", deviceInfo.localMemorySize);
-
-        std::string maxMemAllocSizeStr = bytesToStringRepr(deviceInfo.maxMemAllocSize);
-        DUMP_MESSAGE_STDOUT("    Max memory allocation size = "<< maxMemAllocSizeStr.c_str());
-        DUMP_PROPERTY_XML("cv_ocl_current_maxMemAllocSize", deviceInfo.maxMemAllocSize);
-
-        const char* doubleSupportStr = deviceInfo.haveDoubleSupport ? "Yes" : "No";
-        DUMP_MESSAGE_STDOUT("    Double support = "<< doubleSupportStr);
-        DUMP_PROPERTY_XML("cv_ocl_current_haveDoubleSupport", deviceInfo.haveDoubleSupport);
-
-        const char* isUnifiedMemoryStr = deviceInfo.isUnifiedMemory ? "Yes" : "No";
-        DUMP_MESSAGE_STDOUT("    Unified memory = "<< isUnifiedMemoryStr);
-        DUMP_PROPERTY_XML("cv_ocl_current_isUnifiedMemory", deviceInfo.isUnifiedMemory);
-    }
-    catch (...)
-    {
-        DUMP_MESSAGE_STDOUT("OpenCL device not available");
-        DUMP_PROPERTY_XML("cv_ocl", "not available");
-    }
-}
-
-#undef DUMP_MESSAGE_STDOUT
-#undef DUMP_PROPERTY_XML
diff --git a/modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp b/modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp
deleted file mode 100644
index dfc658372..000000000
--- a/modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_OCL_PRIVATE_OPENCL_UTILS_HPP__
-#define __OPENCV_OCL_PRIVATE_OPENCL_UTILS_HPP__
-
-#include "opencv2/core/opencl/runtime/opencl_core.hpp"
-#include <vector>
-#include <string>
-
-namespace cl_utils {
-
-inline cl_int getPlatforms(std::vector<cl_platform_id>& platforms)
-{
-    cl_uint n = 0;
-
-    cl_int err = ::clGetPlatformIDs(0, NULL, &n);
-    if (err != CL_SUCCESS)
-        return err;
-
-    platforms.clear(); platforms.resize(n);
-    err = ::clGetPlatformIDs(n, &platforms[0], NULL);
-    if (err != CL_SUCCESS)
-        return err;
-
-    return CL_SUCCESS;
-}
-
-inline cl_int getDevices(cl_platform_id platform, cl_device_type type, std::vector<cl_device_id>& devices)
-{
-    cl_uint n = 0;
-
-    cl_int err = ::clGetDeviceIDs(platform, type, 0, NULL, &n);
-    if (err != CL_SUCCESS)
-        return err;
-
-    devices.clear(); devices.resize(n);
-    err = ::clGetDeviceIDs(platform, type, n, &devices[0], NULL);
-    if (err != CL_SUCCESS)
-        return err;
-
-    return CL_SUCCESS;
-}
-
-
-
-
-template <typename Functor, typename ObjectType, typename T>
-inline cl_int getScalarInfo(Functor f, ObjectType obj, cl_uint name, T& param)
-{
-    return f(obj, name, sizeof(T), &param, NULL);
-}
-
-template <typename Functor, typename ObjectType>
-inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
-{
-    ::size_t required;
-    cl_int err = f(obj, name, 0, NULL, &required);
-    if (err != CL_SUCCESS)
-        return err;
-
-    param.clear();
-    if (required > 0)
-    {
-        std::vector<char> buf(required + 1, char(0));
-        err = f(obj, name, required, &buf[0], NULL);
-        if (err != CL_SUCCESS)
-            return err;
-        param = &buf[0];
-    }
-
-    return CL_SUCCESS;
-};
-
-} // namespace cl_utils
-
-#endif // __OPENCV_OCL_PRIVATE_OPENCL_UTILS_HPP__
diff --git a/modules/ocl/include/opencv2/ocl/private/util.hpp b/modules/ocl/include/opencv2/ocl/private/util.hpp
deleted file mode 100644
index b1ceacd7a..000000000
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_OCL_PRIVATE_UTIL__
-#define __OPENCV_OCL_PRIVATE_UTIL__
-
-#include "opencv2/core/opencl/runtime/opencl_core.hpp"
-#include "opencv2/core/ocl_genbase.hpp"
-
-#include "opencv2/ocl.hpp"
-
-namespace cv
-{
-namespace ocl
-{
-
-inline cl_device_id getClDeviceID(const Context *ctx)
-{
-    return *(cl_device_id*)(ctx->getOpenCLDeviceIDPtr());
-}
-
-inline cl_context getClContext(const Context *ctx)
-{
-    return *(cl_context*)(ctx->getOpenCLContextPtr());
-}
-
-inline cl_command_queue getClCommandQueue(const Context *ctx)
-{
-    return *(cl_command_queue*)(ctx->getOpenCLCommandQueuePtr());
-}
-
-CV_EXPORTS cv::Mutex& getInitializationMutex();
-
-enum openCLMemcpyKind
-{
-    clMemcpyHostToDevice = 0,
-    clMemcpyDeviceToHost,
-    clMemcpyDeviceToDevice
-};
-///////////////////////////OpenCL call wrappers////////////////////////////
-CV_EXPORTS void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
-        size_t widthInBytes, size_t height);
-CV_EXPORTS void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
-        size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
-CV_EXPORTS void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
-        const void *src, size_t spitch,
-        size_t width, size_t height, openCLMemcpyKind kind, int channels = -1);
-CV_EXPORTS void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
-        const void *src, size_t spitch,
-        size_t width, size_t height, int src_offset);
-CV_EXPORTS void openCLFree(void *devPtr);
-CV_EXPORTS cl_mem openCLCreateBuffer(Context *clCxt, size_t flag, size_t size);
-CV_EXPORTS void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size);
-CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
-        const cv::ocl::ProgramEntry* source, String kernelName);
-CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
-        const cv::ocl::ProgramEntry* source, String kernelName, const char *build_options);
-CV_EXPORTS cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source,
-        String kernelName, int channels, int depth, const char *build_options);
-CV_EXPORTS void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
-CV_EXPORTS void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
-                          size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args);
-CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, String kernelName, std::vector< std::pair<size_t, const void *> > &args,
-        int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
-CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName,
-        size_t globalThreads[3], size_t localThreads[3],
-        std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options);
-CV_EXPORTS void openCLExecuteKernel(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
-        size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth);
-CV_EXPORTS void openCLExecuteKernel(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
-        size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
-        int depth, const char *build_options);
-
-CV_EXPORTS cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
-        const size_t size);
-
-CV_EXPORTS cl_mem openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr);
-
-enum FLUSH_MODE
-{
-    CLFINISH = 0,
-    CLFLUSH,
-    DISABLE
-};
-
-CV_EXPORTS void openCLExecuteKernel2(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
-        size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
-CV_EXPORTS void openCLExecuteKernel2(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
-        size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
-        int depth, const char *build_options, FLUSH_MODE finish_mode = DISABLE);
-
-// bind oclMat to OpenCL image textures
-// note:
-//   1. there is no memory management. User need to explicitly release the resource
-//   2. for faster clamping, there is no buffer padding for the constructed texture
-CV_EXPORTS cl_mem bindTexture(const oclMat &mat);
-CV_EXPORTS void releaseTexture(cl_mem& texture);
-
-//Represents an image texture object
-class CV_EXPORTS TextureCL
-{
-public:
-    TextureCL(cl_mem tex, int r, int c, int t)
-        : tex_(tex), rows(r), cols(c), type(t) {}
-    ~TextureCL()
-    {
-        openCLFree(tex_);
-    }
-    operator cl_mem()
-    {
-        return tex_;
-    }
-    cl_mem const tex_;
-    const int rows;
-    const int cols;
-    const int type;
-private:
-    //disable assignment
-    void operator=(const TextureCL&);
-};
-// bind oclMat to OpenCL image textures and retunrs an TextureCL object
-// note:
-//   for faster clamping, there is no buffer padding for the constructed texture
-CV_EXPORTS Ptr<TextureCL> bindTexturePtr(const oclMat &mat);
-
-CV_EXPORTS bool isCpuDevice();
-
-CV_EXPORTS size_t queryWaveFrontSize(cl_kernel kernel);
-
-
-inline size_t divUp(size_t total, size_t grain)
-{
-    return (total + grain - 1) / grain;
-}
-
-inline size_t roundUp(size_t sz, size_t n)
-{
-    // we don't assume that n is a power of 2 (see alignSize)
-    // equal to divUp(sz, n) * n
-    size_t t = sz + n - 1;
-    size_t rem = t % n;
-    size_t result = t - rem;
-    return result;
-}
-
-}//namespace ocl
-}//namespace cv
-
-#endif //__OPENCV_OCL_PRIVATE_UTIL__
diff --git a/modules/ocl/perf/main.cpp b/modules/ocl/perf/main.cpp
deleted file mode 100644
index c3b2f362f..000000000
--- a/modules/ocl/perf/main.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-#define DUMP_PROPERTY_XML(propertyName, propertyValue) \
-    do { \
-        std::stringstream ssName, ssValue;\
-        ssName << propertyName;\
-        ssValue << propertyValue; \
-        ::testing::Test::RecordProperty(ssName.str(), ssValue.str()); \
-    } while (false)
-
-#define DUMP_MESSAGE_STDOUT(msg) \
-    do { \
-        std::cout << msg << std::endl; \
-    } while (false)
-
-
-#include "opencv2/ocl/private/opencl_dumpinfo.hpp"
-
-static const char * impls[] =
-{
-    IMPL_OCL,
-    IMPL_PLAIN,
-#ifdef HAVE_OPENCV_GPU
-    IMPL_GPU
-#endif
-};
-
-
-int main(int argc, char ** argv)
-{
-    ::perf::TestBase::setModulePerformanceStrategy(::perf::PERF_STRATEGY_SIMPLE);
-
-    CV_PERF_TEST_MAIN_INTERNALS(ocl, impls, dumpOpenCLDevice())
-}
diff --git a/modules/ocl/perf/perf_arithm.cpp b/modules/ocl/perf/perf_arithm.cpp
deleted file mode 100644
index 592c65285..000000000
--- a/modules/ocl/perf/perf_arithm.cpp
+++ /dev/null
@@ -1,1127 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-using namespace perf;
-using std::tr1::get;
-using std::tr1::tuple;
-
-///////////// Lut ////////////////////////
-
-typedef Size_MatType LUTFixture;
-
-PERF_TEST_P(LUTFixture, LUT,
-          ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                             OCL_PERF_ENUM(CV_8UC1, CV_8UC3)))
-{
-    // getting params
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    // creating src data
-    Mat src(srcSize, type), lut(1, 256, CV_8UC1);
-    int dstType = CV_MAKETYPE(lut.depth(), src.channels());
-    Mat dst(srcSize, dstType);
-
-    randu(lut, 0, 2);
-    declare.in(src, WARMUP_RNG).in(lut).out(dst);
-
-    // select implementation
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclLut(lut), oclDst(srcSize, dstType);
-
-        OCL_TEST_CYCLE() cv::ocl::LUT(oclSrc, oclLut, oclDst);
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::LUT(src, lut, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Exp ////////////////////////
-
-typedef TestBaseWithParam<Size> ExpFixture;
-
-PERF_TEST_P(ExpFixture, Exp, OCL_TYPICAL_MAT_SIZES)
-{
-    // getting params
-    const Size srcSize = GetParam();
-    const double eps = 1e-6;
-
-    // creating src data
-    Mat src(srcSize, CV_32FC1), dst(srcSize, CV_32FC1);
-    declare.in(src).out(dst);
-    randu(src, 5, 16);
-
-    // select implementation
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, src.type());
-
-        OCL_TEST_CYCLE() cv::ocl::exp(oclSrc, oclDst);
-
-        oclDst.download(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::exp(src, dst);
-    }
-    else
-        OCL_PERF_ELSE
-
-    SANITY_CHECK(dst, eps, ERROR_RELATIVE);
-}
-
-///////////// LOG ////////////////////////
-
-typedef TestBaseWithParam<Size> LogFixture;
-
-PERF_TEST_P(LogFixture, Log, OCL_TYPICAL_MAT_SIZES)
-{
-    // getting params
-    const Size srcSize = GetParam();
-    const double eps = 1e-6;
-
-    // creating src data
-    Mat src(srcSize, CV_32F), dst(srcSize, src.type());
-    randu(src, 1, 10);
-    declare.in(src).out(dst);
-
-    if (srcSize == OCL_SIZE_4000)
-        declare.time(3.6);
-
-    // select implementation
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, src.type());
-
-        OCL_TEST_CYCLE() cv::ocl::log(oclSrc, oclDst);
-
-        oclDst.download(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::log(src, dst);
-    }
-    else
-        OCL_PERF_ELSE
-
-    SANITY_CHECK(dst, eps, ERROR_RELATIVE);
-}
-
-///////////// SQRT ///////////////////////
-
-typedef TestBaseWithParam<Size> SqrtFixture;
-
-PERF_TEST_P(SqrtFixture, Sqrt, OCL_TYPICAL_MAT_SIZES)
-{
-    // getting params
-    const Size srcSize = GetParam();
-    const double eps = 1e-6;
-
-    // creating src data
-    Mat src(srcSize, CV_32F), dst(srcSize, src.type());
-    randu(src, 0, 10);
-    declare.in(src).out(dst);
-
-    // select implementation
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, src.type());
-
-        OCL_TEST_CYCLE() cv::ocl::sqrt(oclSrc, oclDst);
-
-        oclDst.download(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::sqrt(src, dst);
-    }
-    else
-        OCL_PERF_ELSE
-
-    SANITY_CHECK(dst, eps, ERROR_RELATIVE);
-}
-
-///////////// Add ////////////////////////
-
-typedef Size_MatType AddFixture;
-
-PERF_TEST_P(AddFixture, Add,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
-{
-    // getting params
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    // creating src data
-    Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
-    randu(src1, 0, 1);
-    randu(src2, 0, 1);
-    declare.in(src1, src2).out(dst);
-
-    // select implementation
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::add(oclSrc1, oclSrc2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::add(src1, src2, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Mul ////////////////////////
-
-typedef Size_MatType MulFixture;
-
-PERF_TEST_P(MulFixture, Mul, ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                                                OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    // getting params
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    // creating src data
-    Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
-    randu(src1, 0, 256);
-    randu(src2, 0, 256);
-    declare.in(src1, src2).out(dst);
-
-    // select implementation
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::multiply(oclSrc1, oclSrc2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::multiply(src1, src2, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Div ////////////////////////
-
-typedef Size_MatType DivFixture;
-
-PERF_TEST_P(DivFixture, Div,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    // getting params
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    // creating src data
-    Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
-    declare.in(src1, src2).out(dst);
-    randu(src1, 0, 256);
-    randu(src2, 0, 256);
-
-    if ((srcSize == OCL_SIZE_4000 && type == CV_8UC1) ||
-            (srcSize == OCL_SIZE_2000 && type == CV_8UC4))
-        declare.time(4.2);
-    else if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
-        declare.time(16.6);
-
-    // select implementation
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::divide(oclSrc1, oclSrc2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::divide(src1, src2, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Absdiff ////////////////////////
-
-typedef Size_MatType AbsDiffFixture;
-
-PERF_TEST_P(AbsDiffFixture, Absdiff,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
-    declare.in(src1, src2).in(dst);
-    randu(src1, 0, 256);
-    randu(src2, 0, 256);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::absdiff(oclSrc1, oclSrc2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::absdiff(src1, src2, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// CartToPolar ////////////////////////
-
-typedef TestBaseWithParam<Size> CartToPolarFixture;
-
-PERF_TEST_P(CartToPolarFixture, CartToPolar, OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-    const double eps = 8e-3;
-
-    Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
-            dst1(srcSize, CV_32FC1), dst2(srcSize, CV_32FC1);
-    declare.in(src1, src2).out(dst1, dst2);
-    randu(src1, 0, 256);
-    randu(src2, 0, 256);
-
-    if (srcSize == OCL_SIZE_4000)
-        declare.time(3.6);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2),
-                oclDst1(srcSize, src1.type()), oclDst2(srcSize, src1.type());
-
-        OCL_TEST_CYCLE() cv::ocl::cartToPolar(oclSrc1, oclSrc2, oclDst1, oclDst2);
-
-        oclDst1.download(dst1);
-        oclDst2.download(dst2);
-
-        SANITY_CHECK(dst1, eps);
-        SANITY_CHECK(dst2, eps);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::cartToPolar(src1, src2, dst1, dst2);
-
-        SANITY_CHECK(dst1, eps);
-        SANITY_CHECK(dst2, eps);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// PolarToCart ////////////////////////
-
-typedef TestBaseWithParam<Size> PolarToCartFixture;
-
-PERF_TEST_P(PolarToCartFixture, PolarToCart, OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-
-    Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
-            dst1(srcSize, CV_32FC1), dst2(srcSize, CV_32FC1);
-    declare.in(src1, src2).out(dst1, dst2);
-    randu(src1, 0, 256);
-    randu(src2, 0, 256);
-
-    if (srcSize == OCL_SIZE_4000)
-        declare.time(5.4);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2),
-                oclDst1(srcSize, src1.type()), oclDst2(srcSize, src1.type());
-
-        OCL_TEST_CYCLE() cv::ocl::polarToCart(oclSrc1, oclSrc2, oclDst1, oclDst2);
-
-        oclDst1.download(dst1);
-        oclDst2.download(dst2);
-
-        SANITY_CHECK(dst1, 5e-5);
-        SANITY_CHECK(dst2, 5e-5);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::polarToCart(src1, src2, dst1, dst2);
-
-        SANITY_CHECK(dst1, 5e-5);
-        SANITY_CHECK(dst2, 5e-5);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Magnitude ////////////////////////
-
-typedef TestBaseWithParam<Size> MagnitudeFixture;
-
-PERF_TEST_P(MagnitudeFixture, Magnitude, OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-
-    Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
-            dst(srcSize, CV_32FC1);
-    randu(src1, 0, 1);
-    randu(src2, 0, 1);
-    declare.in(src1, src2).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2),
-                oclDst(srcSize, src1.type());
-
-        OCL_TEST_CYCLE() cv::ocl::magnitude(oclSrc1, oclSrc2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 1e-6);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::magnitude(src1, src2, dst);
-
-        SANITY_CHECK(dst, 1e-6);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Transpose ////////////////////////
-
-typedef Size_MatType TransposeFixture;
-
-PERF_TEST_P(TransposeFixture, Transpose,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::transpose(oclSrc, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::transpose(src, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Flip ////////////////////////
-
-typedef Size_MatType FlipFixture;
-
-PERF_TEST_P(FlipFixture, Flip,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::flip(oclSrc, oclDst, 0);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::flip(src, dst, 0);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// minMax ////////////////////////
-
-typedef Size_MatType minMaxFixture;
-
-PERF_TEST_P(minMaxFixture, minMax,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src(srcSize, type);
-    declare.in(src, WARMUP_RNG);
-
-    double min_val = std::numeric_limits<double>::max(),
-            max_val = std::numeric_limits<double>::min();
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src);
-
-        OCL_TEST_CYCLE() cv::ocl::minMax(oclSrc, &min_val, &max_val);
-
-        ASSERT_GE(max_val, min_val);
-        SANITY_CHECK(min_val);
-        SANITY_CHECK(max_val);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        Point min_loc, max_loc;
-
-        TEST_CYCLE() cv::minMaxLoc(src, &min_val, &max_val, &min_loc, &max_loc);
-
-        ASSERT_GE(max_val, min_val);
-        SANITY_CHECK(min_val);
-        SANITY_CHECK(max_val);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// minMaxLoc ////////////////////////
-
-typedef Size_MatType minMaxLocFixture;
-
-PERF_TEST_P(minMaxLocFixture, minMaxLoc,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src(srcSize, type);
-    randu(src, 0, 1);
-    declare.in(src);
-
-    double min_val = 0.0, max_val = 0.0;
-    Point min_loc, max_loc;
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src);
-
-        OCL_TEST_CYCLE() cv::ocl::minMaxLoc(oclSrc, &min_val, &max_val, &min_loc, &max_loc);
-
-        ASSERT_GE(max_val, min_val);
-        SANITY_CHECK(min_val);
-        SANITY_CHECK(max_val);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::minMaxLoc(src, &min_val, &max_val, &min_loc, &max_loc);
-
-        ASSERT_GE(max_val, min_val);
-        SANITY_CHECK(min_val);
-        SANITY_CHECK(max_val);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Sum ////////////////////////
-
-typedef Size_MatType SumFixture;
-
-PERF_TEST_P(SumFixture, Sum,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32SC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src(srcSize, type);
-    Scalar result;
-    randu(src, 0, 60);
-    declare.in(src);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src);
-
-        OCL_TEST_CYCLE() result = cv::ocl::sum(oclSrc);
-
-        SANITY_CHECK(result);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() result = cv::sum(src);
-
-        SANITY_CHECK(result);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// countNonZero ////////////////////////
-
-typedef Size_MatType countNonZeroFixture;
-
-PERF_TEST_P(countNonZeroFixture, countNonZero,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src(srcSize, type);
-    int result = 0;
-    randu(src, 0, 256);
-    declare.in(src);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src);
-
-        OCL_TEST_CYCLE() result = cv::ocl::countNonZero(oclSrc);
-
-        SANITY_CHECK(result);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() result = cv::countNonZero(src);
-
-        SANITY_CHECK(result);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Phase ////////////////////////
-
-typedef TestBaseWithParam<Size> PhaseFixture;
-
-PERF_TEST_P(PhaseFixture, Phase, OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-
-    Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
-            dst(srcSize, CV_32FC1);
-    declare.in(src1, src2).out(dst);
-    randu(src1, 0, 256);
-    randu(src2, 0, 256);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2),
-                oclDst(srcSize, src1.type());
-
-        OCL_TEST_CYCLE() cv::ocl::phase(oclSrc1, oclSrc2, oclDst, 1);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 1e-2);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::phase(src1, src2, dst, 1);
-
-        SANITY_CHECK(dst, 1e-2);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// bitwise_and////////////////////////
-
-typedef Size_MatType BitwiseAndFixture;
-
-PERF_TEST_P(BitwiseAndFixture, bitwise_and,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32SC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
-    declare.in(src1, src2).out(dst);
-    randu(src1, 0, 256);
-    randu(src2, 0, 256);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type());
-
-        OCL_TEST_CYCLE() cv::ocl::bitwise_and(oclSrc1, oclSrc2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::bitwise_and(src1, src2, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// bitwise_xor ////////////////////////
-
-typedef Size_MatType BitwiseXorFixture;
-
-PERF_TEST_P(BitwiseXorFixture, bitwise_xor,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32SC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
-    declare.in(src1, src2).out(dst);
-    randu(src1, 0, 256);
-    randu(src2, 0, 256);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type());
-
-        OCL_TEST_CYCLE() cv::ocl::bitwise_xor(oclSrc1, oclSrc2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::bitwise_xor(src1, src2, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// bitwise_or ////////////////////////
-
-typedef Size_MatType BitwiseOrFixture;
-
-PERF_TEST_P(BitwiseOrFixture, bitwise_or,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32SC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
-    declare.in(src1, src2).out(dst);
-    randu(src1, 0, 256);
-    randu(src2, 0, 256);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type());
-
-        OCL_TEST_CYCLE() cv::ocl::bitwise_or(oclSrc1, oclSrc2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::bitwise_or(src1, src2, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// bitwise_not////////////////////////
-
-typedef Size_MatType BitwiseNotFixture;
-
-PERF_TEST_P(BitwiseAndFixture, bitwise_not,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32SC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::bitwise_not(oclSrc, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::bitwise_not(src, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// compare////////////////////////
-
-typedef Size_MatType CompareFixture;
-
-PERF_TEST_P(CompareFixture, compare,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, CV_8UC1);
-    declare.in(src1, src2, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, CV_8UC1);
-
-        OCL_TEST_CYCLE() cv::ocl::compare(oclSrc1, oclSrc2, oclDst, CMP_EQ);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::compare(src1, src2, dst, CMP_EQ);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// pow ////////////////////////
-
-typedef TestBaseWithParam<Size> PowFixture;
-
-PERF_TEST_P(PowFixture, pow, OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-    const double eps = 1e-6;
-
-    Mat src(srcSize, CV_32F), dst(srcSize, CV_32F);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, src.type());
-
-        OCL_TEST_CYCLE() cv::ocl::pow(oclSrc, -2.0, oclDst);
-
-        oclDst.download(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::pow(src, -2.0, dst);
-    }
-    else
-        OCL_PERF_ELSE
-
-    SANITY_CHECK(dst, eps, ERROR_RELATIVE);
-}
-
-///////////// AddWeighted////////////////////////
-
-typedef Size_MatType AddWeightedFixture;
-
-PERF_TEST_P(AddWeightedFixture, AddWeighted,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
-    declare.in(src1, src2, WARMUP_RNG).out(dst);
-    double alpha = 2.0, beta = 1.0, gama = 3.0;
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::addWeighted(oclSrc1, alpha, oclSrc2, beta, gama, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::addWeighted(src1, alpha, src2, beta, gama, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Min ////////////////////////
-
-typedef Size_MatType MinFixture;
-
-PERF_TEST_P(MinFixture, Min,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
-    declare.in(src1, src2, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::min(oclSrc1, oclSrc2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() dst = cv::min(src1, src2);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Max ////////////////////////
-
-typedef Size_MatType MaxFixture;
-
-PERF_TEST_P(MaxFixture, Max,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
-    declare.in(src1, src2, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::max(oclSrc1, oclSrc2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() dst = cv::max(src1, src2);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Max ////////////////////////
-
-typedef Size_MatType AbsFixture;
-
-PERF_TEST_P(AbsFixture, Abs,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::abs(oclSrc, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() dst = cv::abs(src);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Repeat ////////////////////////
-
-typedef Size_MatType RepeatFixture;
-
-PERF_TEST_P(RepeatFixture, Repeat,
-            ::testing::Combine(::testing::Values(OCL_SIZE_1000, OCL_SIZE_2000),
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-    const int nx = 3, ny = 2;
-    const Size dstSize(srcSize.width * nx, srcSize.height * ny);
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type);
-    checkDeviceMaxMemoryAllocSize(dstSize, type);
-
-    Mat src(srcSize, type), dst(dstSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(dstSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::repeat(oclSrc, ny, nx, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::repeat(src, ny, nx, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
diff --git a/modules/ocl/perf/perf_bgfg.cpp b/modules/ocl/perf/perf_bgfg.cpp
deleted file mode 100644
index 95099640f..000000000
--- a/modules/ocl/perf/perf_bgfg.cpp
+++ /dev/null
@@ -1,289 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "perf_precomp.hpp"
-
-using namespace perf;
-using namespace std;
-using namespace cv::ocl;
-using namespace cv;
-using std::tr1::tuple;
-using std::tr1::get;
-
-#if defined(HAVE_XINE)         || \
-    defined(HAVE_GSTREAMER)    || \
-    defined(HAVE_QUICKTIME)    || \
-    defined(HAVE_AVFOUNDATION) || \
-    defined(HAVE_FFMPEG)       || \
-    defined(WIN32)
-
-#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
-#else
-#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
-#endif
-
-#if BUILD_WITH_VIDEO_INPUT_SUPPORT
-
-static void cvtFrameFmt(vector<Mat>& input, vector<Mat>& output)
-{
-    for(int i = 0; i< (int)(input.size()); i++)
-    {
-        cvtColor(input[i], output[i], COLOR_RGB2GRAY);
-    }
-}
-
-//prepare data for CPU
-static void prepareData(VideoCapture& cap, int cn, vector<Mat>& frame_buffer)
-{
-    cv::Mat frame;
-    std::vector<Mat> frame_buffer_init;
-    int nFrame = (int)frame_buffer.size();
-    for(int i = 0; i < nFrame; i++)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-        frame_buffer_init.push_back(frame);
-    }
-
-    if(cn == 1)
-        cvtFrameFmt(frame_buffer_init, frame_buffer);
-    else
-        frame_buffer = frame_buffer_init;
-}
-
-//copy CPU data to GPU
-static void prepareData(vector<Mat>& frame_buffer, vector<oclMat>& frame_buffer_ocl)
-{
-    for(int i = 0; i < (int)frame_buffer.size(); i++)
-        frame_buffer_ocl.push_back(cv::ocl::oclMat(frame_buffer[i]));
-}
-
-///////////// MOG ////////////////////////
-
-typedef tuple<string, int, double> VideoMOGParamType;
-typedef TestBaseWithParam<VideoMOGParamType> VideoMOGFixture;
-
-PERF_TEST_P(VideoMOGFixture, MOG,
-            ::testing::Combine(::testing::Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"),
-            ::testing::Values(1, 3),
-            ::testing::Values(0.0, 0.01)))
-{
-    VideoMOGParamType params = GetParam();
-
-    const string inputFile = perf::TestBase::getDataPath(get<0>(params));
-    const int cn = get<1>(params);
-    const float learningRate = static_cast<float>(get<2>(params));
-
-    const int nFrame = 5;
-
-    Mat foreground_cpu;
-    std::vector<Mat> frame_buffer(nFrame);
-    std::vector<oclMat> frame_buffer_ocl;
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    prepareData(cap, cn, frame_buffer);
-
-    cv::Mat foreground;
-    cv::ocl::oclMat foreground_d;
-    if(RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE()
-        {
-            cv::Ptr<cv::BackgroundSubtractorMOG> mog = createBackgroundSubtractorMOG();
-            foreground.release();
-            for (int i = 0; i < nFrame; i++)
-            {
-                mog->apply(frame_buffer[i], foreground, learningRate);
-            }
-        }
-        SANITY_CHECK(foreground);
-    }
-    else if(RUN_OCL_IMPL)
-    {
-        prepareData(frame_buffer, frame_buffer_ocl);
-        CV_Assert((int)(frame_buffer_ocl.size()) == nFrame);
-        OCL_TEST_CYCLE()
-        {
-            cv::ocl::MOG d_mog;
-            foreground_d.release();
-            for (int i = 0; i < nFrame; ++i)
-            {
-                d_mog(frame_buffer_ocl[i], foreground_d, learningRate);
-            }
-        }
-        foreground_d.download(foreground);
-        SANITY_CHECK(foreground);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// MOG2 ////////////////////////
-
-typedef tuple<string, int> VideoMOG2ParamType;
-typedef TestBaseWithParam<VideoMOG2ParamType> VideoMOG2Fixture;
-
-PERF_TEST_P(VideoMOG2Fixture, DISABLED_MOG2, // TODO Disabled: random hungs on buildslave
-            ::testing::Combine(::testing::Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"),
-            ::testing::Values(1, 3)))
-{
-    VideoMOG2ParamType params = GetParam();
-
-    const string inputFile = perf::TestBase::getDataPath(get<0>(params));
-    const int cn = get<1>(params);
-    int nFrame = 5;
-
-    std::vector<cv::Mat> frame_buffer(nFrame);
-    std::vector<cv::ocl::oclMat> frame_buffer_ocl;
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-    prepareData(cap, cn, frame_buffer);
-    cv::Mat foreground;
-    cv::ocl::oclMat foreground_d;
-
-    if(RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE()
-        {
-            cv::Ptr<cv::BackgroundSubtractorMOG2> mog2 = createBackgroundSubtractorMOG2();
-            mog2->setDetectShadows(false);
-            foreground.release();
-
-            for (int i = 0; i < nFrame; i++)
-            {
-                mog2->apply(frame_buffer[i], foreground);
-            }
-        }
-        SANITY_CHECK(foreground);
-    }
-    else if(RUN_OCL_IMPL)
-    {
-        prepareData(frame_buffer, frame_buffer_ocl);
-        CV_Assert((int)(frame_buffer_ocl.size()) == nFrame);
-        OCL_TEST_CYCLE()
-        {
-            cv::ocl::MOG2 d_mog2;
-            foreground_d.release();
-            for (int i = 0; i < nFrame; i++)
-            {
-                d_mog2(frame_buffer_ocl[i], foreground_d);
-            }
-        }
-        foreground_d.download(foreground);
-        SANITY_CHECK(foreground);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// MOG2_GetBackgroundImage //////////////////
-
-typedef TestBaseWithParam<VideoMOG2ParamType> Video_MOG2GetBackgroundImage;
-
-PERF_TEST_P(Video_MOG2GetBackgroundImage, MOG2,
-            ::testing::Combine(::testing::Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"),
-            ::testing::Values(3)))
-{
-    VideoMOG2ParamType params = GetParam();
-
-    const string inputFile = perf::TestBase::getDataPath(get<0>(params));
-    const int cn = get<1>(params);
-    int nFrame = 5;
-
-    std::vector<cv::Mat> frame_buffer(nFrame);
-    std::vector<cv::ocl::oclMat> frame_buffer_ocl;
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    prepareData(cap, cn, frame_buffer);
-
-    cv::Mat foreground;
-    cv::Mat background;
-    cv::ocl::oclMat foreground_d;
-    cv::ocl::oclMat background_d;
-
-    if(RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE()
-        {
-            cv::Ptr<cv::BackgroundSubtractorMOG2> mog2 = createBackgroundSubtractorMOG2();
-            mog2->setDetectShadows(false);
-            foreground.release();
-            background.release();
-            for (int i = 0; i < nFrame; i++)
-            {
-                mog2->apply(frame_buffer[i], foreground);
-            }
-            mog2->getBackgroundImage(background);
-        }
-        SANITY_CHECK(background);
-    }
-    else if(RUN_OCL_IMPL)
-    {
-        prepareData(frame_buffer, frame_buffer_ocl);
-        CV_Assert((int)(frame_buffer_ocl.size()) == nFrame);
-        OCL_TEST_CYCLE()
-        {
-            cv::ocl::MOG2 d_mog2;
-            foreground_d.release();
-            background_d.release();
-            for (int i = 0; i < nFrame; i++)
-            {
-                d_mog2(frame_buffer_ocl[i], foreground_d);
-            }
-            d_mog2.getBackgroundImage(background_d);
-        }
-        background_d.download(background);
-        SANITY_CHECK(background);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-#endif
diff --git a/modules/ocl/perf/perf_blend.cpp b/modules/ocl/perf/perf_blend.cpp
deleted file mode 100644
index 6f611bbc3..000000000
--- a/modules/ocl/perf/perf_blend.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-using namespace perf;
-using namespace cv;
-using std::tr1::get;
-
-///////////// blend ////////////////////////
-
-template <typename T>
-static void blendLinearGold(const Mat &img1, const Mat &img2,
-                            const Mat &weights1, const Mat &weights2,
-                            Mat &result_gold)
-{
-    CV_Assert(img1.size() == img2.size() && img1.type() == img2.type());
-    CV_Assert(weights1.size() == weights2.size() && weights1.size() == img1.size() &&
-              weights1.type() == CV_32FC1 && weights2.type() == CV_32FC1);
-
-    result_gold.create(img1.size(), img1.type());
-
-    int cn = img1.channels();
-    int step1 = img1.cols * img1.channels();
-
-    for (int y = 0; y < img1.rows; ++y)
-    {
-        const float * const weights1_row = weights1.ptr<float>(y);
-        const float * const weights2_row = weights2.ptr<float>(y);
-        const T * const img1_row = img1.ptr<T>(y);
-        const T * const img2_row = img2.ptr<T>(y);
-        T * const result_gold_row = result_gold.ptr<T>(y);
-
-        for (int x = 0; x < step1; ++x)
-        {
-            int x1 = x / cn;
-            float w1 = weights1_row[x1], w2 = weights2_row[x1];
-            result_gold_row[x] = saturate_cast<T>(((float)img1_row[x] * w1
-                                                 + (float)img2_row[x] * w2) / (w1 + w2 + 1e-5f));
-        }
-    }
-}
-
-typedef void (*blendFunction)(const Mat &img1, const Mat &img2,
-                              const Mat &weights1, const Mat &weights2,
-                              Mat &result_gold);
-
-typedef Size_MatType blendLinearFixture;
-
-PERF_TEST_P(blendLinearFixture, blendLinear, ::testing::Combine(
-                OCL_TYPICAL_MAT_SIZES, testing::Values(CV_8UC1, CV_8UC3, CV_32FC1)))
-{
-    Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int srcType = get<1>(params);
-    const double eps = CV_MAT_DEPTH(srcType) <= CV_32S ? 1.0 : 0.2;
-
-    Mat src1(srcSize, srcType), src2(srcSize, srcType), dst(srcSize, srcType);
-    Mat weights1(srcSize, CV_32FC1), weights2(srcSize, CV_32FC1);
-
-    declare.in(src1, src2, WARMUP_RNG).out(dst);
-    randu(weights1, 0.0f, 1.0f);
-    randu(weights2, 0.0f, 1.0f);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst;
-        ocl::oclMat oclWeights1(weights1), oclWeights2(weights2);
-
-        OCL_TEST_CYCLE() ocl::blendLinear(oclSrc1, oclSrc2, oclWeights1, oclWeights2, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, eps);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        blendFunction funcs[] = { (blendFunction)blendLinearGold<uchar>, (blendFunction)blendLinearGold<float> };
-        int funcIdx = CV_MAT_DEPTH(srcType) == CV_8UC1 ? 0 : 1;
-
-        TEST_CYCLE() (funcs[funcIdx])(src1, src2, weights1, weights2, dst);
-
-        SANITY_CHECK(dst, eps);
-    }
-    else
-        OCL_PERF_ELSE
-}
diff --git a/modules/ocl/perf/perf_brute_force_matcher.cpp b/modules/ocl/perf/perf_brute_force_matcher.cpp
deleted file mode 100644
index d124428e9..000000000
--- a/modules/ocl/perf/perf_brute_force_matcher.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "perf_precomp.hpp"
-
-using namespace perf;
-
-#define OCL_BFMATCHER_TYPICAL_MAT_SIZES ::testing::Values(cv::Size(128, 500), cv::Size(128, 1000), cv::Size(128, 2000))
-
-//////////////////// BruteForceMatch /////////////////
-
-typedef TestBaseWithParam<Size> BruteForceMatcherFixture;
-
-PERF_TEST_P(BruteForceMatcherFixture, match,
-            OCL_BFMATCHER_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-
-    vector<DMatch> matches;
-    Mat query(srcSize, CV_32F), train(srcSize, CV_32F);
-    declare.in(query, train).time(srcSize.height == 2000 ? 9 : 4 );
-    randu(query, 0.0f, 1.0f);
-    randu(train, 0.0f, 1.0f);
-
-    if (RUN_PLAIN_IMPL)
-    {
-        BFMatcher matcher(NORM_L2);
-        TEST_CYCLE() matcher.match(query, train, matches);
-
-        SANITY_CHECK_MATCHES(matches);
-    }
-    else if (RUN_OCL_IMPL)
-    {
-        ocl::BruteForceMatcher_OCL_base oclMatcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
-        ocl::oclMat oclQuery(query), oclTrain(train);
-        ocl::oclMat oclTrainIdx, oclDistance;
-
-        OCL_TEST_CYCLE()
-            oclMatcher.matchSingle(oclQuery, oclTrain, oclTrainIdx, oclDistance);
-
-        oclMatcher.matchDownload(oclTrainIdx, oclDistance, matches);
-
-        SANITY_CHECK_MATCHES(matches, 1e-5);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-PERF_TEST_P(BruteForceMatcherFixture, knnMatch,
-            OCL_BFMATCHER_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-
-    vector<vector<DMatch> > matches(2);
-    Mat query(srcSize, CV_32F), train(srcSize, CV_32F);
-    randu(query, 0.0f, 1.0f);
-    randu(train, 0.0f, 1.0f);
-
-    declare.in(query, train);
-    if (srcSize.height == 2000)
-        declare.time(9);
-
-    if (RUN_PLAIN_IMPL)
-    {
-        BFMatcher matcher(NORM_L2);
-        TEST_CYCLE() matcher.knnMatch(query, train, matches, 2);
-
-        std::vector<DMatch> & matches0 = matches[0], & matches1 = matches[1];
-        SANITY_CHECK_MATCHES(matches0);
-        SANITY_CHECK_MATCHES(matches1);
-    }
-    else if (RUN_OCL_IMPL)
-    {
-        ocl::BruteForceMatcher_OCL_base oclMatcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
-        ocl::oclMat oclQuery(query), oclTrain(train);
-        ocl::oclMat oclTrainIdx, oclDistance, oclAllDist;
-
-        OCL_TEST_CYCLE()
-                oclMatcher.knnMatchSingle(oclQuery, oclTrain, oclTrainIdx, oclDistance, oclAllDist, 2);
-
-        oclMatcher.knnMatchDownload(oclTrainIdx, oclDistance, matches);
-
-        std::vector<DMatch> & matches0 = matches[0], & matches1 = matches[1];
-        SANITY_CHECK_MATCHES(matches0, 1e-5);
-        SANITY_CHECK_MATCHES(matches1, 1e-5);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-PERF_TEST_P(BruteForceMatcherFixture, radiusMatch,
-            OCL_BFMATCHER_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-
-    const float max_distance = 2.0f;
-    vector<vector<DMatch> > matches(2);
-    Mat query(srcSize, CV_32F), train(srcSize, CV_32F);
-    declare.in(query, train);
-
-    randu(query, 0.0f, 1.0f);
-    randu(train, 0.0f, 1.0f);
-
-    if (srcSize.height == 2000)
-        declare.time(9.15);
-
-    if (RUN_PLAIN_IMPL)
-    {
-        cv::BFMatcher matcher(NORM_L2);
-        TEST_CYCLE() matcher.radiusMatch(query, train, matches, max_distance);
-
-        std::vector<DMatch> & matches0 = matches[0], & matches1 = matches[1];
-        SANITY_CHECK_MATCHES(matches0);
-        SANITY_CHECK_MATCHES(matches1);
-    }
-    else if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclQuery(query), oclTrain(train);
-        ocl::BruteForceMatcher_OCL_base oclMatcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
-        ocl::oclMat oclTrainIdx, oclDistance, oclNMatches;
-
-        OCL_TEST_CYCLE()
-                oclMatcher.radiusMatchSingle(oclQuery, oclTrain, oclTrainIdx, oclDistance, oclNMatches, max_distance);
-
-        oclMatcher.radiusMatchDownload(oclTrainIdx, oclDistance, oclNMatches, matches);
-
-        std::vector<DMatch> & matches0 = matches[0], & matches1 = matches[1];
-        SANITY_CHECK_MATCHES(matches0);
-        SANITY_CHECK_MATCHES(matches1);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-#undef OCL_BFMATCHER_TYPICAL_MAT_SIZES
diff --git a/modules/ocl/perf/perf_calib3d.cpp b/modules/ocl/perf/perf_calib3d.cpp
deleted file mode 100644
index 12fee549b..000000000
--- a/modules/ocl/perf/perf_calib3d.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-///////////// StereoMatchBM ////////////////////////
-
-PERF_TEST(StereoMatchBMFixture, StereoMatchBM)
-{
-    Mat left_image = imread(getDataPath("gpu/stereobm/aloe-L.png"), cv::IMREAD_GRAYSCALE);
-    Mat right_image = imread(getDataPath("gpu/stereobm/aloe-R.png"), cv::IMREAD_GRAYSCALE);
-
-    ASSERT_TRUE(!left_image.empty()) << "no input image";
-    ASSERT_TRUE(!right_image.empty()) << "no input image";
-    ASSERT_TRUE(right_image.size() == left_image.size());
-    ASSERT_TRUE(right_image.size() == left_image.size());
-
-    const int n_disp = 128, winSize = 19;
-    Mat disp(left_image.size(), CV_16SC1);
-
-    declare.in(left_image, right_image).out(disp);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclLeft(left_image), oclRight(right_image),
-                oclDisp(left_image.size(), CV_16SC1);
-        ocl::StereoBM_OCL oclBM(0, n_disp, winSize);
-
-        OCL_TEST_CYCLE() oclBM(oclLeft, oclRight, oclDisp);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        Ptr<StereoBM> bm = createStereoBM(n_disp, winSize);
-
-        TEST_CYCLE() bm->compute(left_image, right_image, disp);
-    }
-    else
-        OCL_PERF_ELSE
-
-    int value = 0;
-    SANITY_CHECK(value);
-}
diff --git a/modules/ocl/perf/perf_canny.cpp b/modules/ocl/perf/perf_canny.cpp
deleted file mode 100644
index 33723daa3..000000000
--- a/modules/ocl/perf/perf_canny.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "perf_precomp.hpp"
-
-using namespace perf;
-
-///////////// Canny ////////////////////////
-
-PERF_TEST(CannyFixture, Canny)
-{
-    Mat img = imread(getDataPath("gpu/stereobm/aloe-L.png"), cv::IMREAD_GRAYSCALE),
-            edges(img.size(), CV_8UC1);
-    ASSERT_TRUE(!img.empty()) << "can't open aloe-L.png";
-
-    declare.in(img).out(edges);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclImg(img), oclEdges(img.size(), CV_8UC1);
-
-        OCL_TEST_CYCLE() ocl::Canny(oclImg, oclEdges, 50.0, 100.0);
-        oclEdges.download(edges);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() Canny(img, edges, 50.0, 100.0);
-    }
-    else
-        OCL_PERF_ELSE
-
-    int value = 0;
-    SANITY_CHECK(value);
-}
diff --git a/modules/ocl/perf/perf_filters.cpp b/modules/ocl/perf/perf_filters.cpp
deleted file mode 100644
index 7e5389df6..000000000
--- a/modules/ocl/perf/perf_filters.cpp
+++ /dev/null
@@ -1,416 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "perf_precomp.hpp"
-
-using namespace perf;
-using std::tr1::get;
-using std::tr1::tuple;
-
-///////////// Blur////////////////////////
-
-typedef Size_MatType BlurFixture;
-
-PERF_TEST_P(BlurFixture, Blur,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params), ksize(3, 3);
-    const int type = get<1>(params), bordertype = BORDER_CONSTANT;
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
-        declare.time(5);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::blur(oclSrc, oclDst, ksize, Point(-1, -1), bordertype);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 1 + DBL_EPSILON);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::blur(src, dst, ksize, Point(-1, -1), bordertype);
-
-        SANITY_CHECK(dst, 1 + DBL_EPSILON);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Laplacian////////////////////////
-
-typedef Size_MatType LaplacianFixture;
-
-PERF_TEST_P(LaplacianFixture, Laplacian,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params), ksize = 3;
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
-        declare.time(6);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::Laplacian(oclSrc, oclDst, -1, ksize, 1);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize, 1);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Erode ////////////////////
-
-typedef Size_MatType ErodeFixture;
-
-PERF_TEST_P(ErodeFixture, Erode,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params), ksize = 3;
-    const Mat ker = getStructuringElement(MORPH_RECT, Size(ksize, ksize));
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst).in(ker);
-
-    if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
-        declare.time(5);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type), oclKer(ker);
-
-        OCL_TEST_CYCLE() cv::ocl::erode(oclSrc, oclDst, oclKer);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::erode(src, dst, ker);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Sobel ////////////////////////
-
-typedef Size_MatType SobelFixture;
-
-PERF_TEST_P(SobelFixture, Sobel,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params), dx = 1, dy = 1;
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type, sizeof(float) * 2);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if ((srcSize == OCL_SIZE_2000 && type == CV_8UC4) ||
-            (srcSize == OCL_SIZE_4000 && type == CV_8UC1))
-        declare.time(5.5);
-    else if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
-        declare.time(20);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::Sobel(oclSrc, oclDst, -1, dx, dy);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::Sobel(src, dst, -1, dx, dy);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Scharr ////////////////////////
-
-typedef Size_MatType ScharrFixture;
-
-PERF_TEST_P(ScharrFixture, Scharr,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params), dx = 1, dy = 0;
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type, sizeof(float) * 2);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if ((srcSize == OCL_SIZE_2000 && type == CV_8UC4) ||
-            (srcSize == OCL_SIZE_4000 && type == CV_8UC1))
-        declare.time(5.5);
-    else if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
-        declare.time(21);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::Scharr(oclSrc, oclDst, -1, dx, dy);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::Scharr(src, dst, -1, dx, dy);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// GaussianBlur ////////////////////////
-
-typedef Size_MatType GaussianBlurFixture;
-
-PERF_TEST_P(GaussianBlurFixture, GaussianBlur,
-            ::testing::Combine(::testing::Values(OCL_SIZE_1000, OCL_SIZE_2000),
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params), ksize = 7;
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    const double eps = src.depth() == CV_8U ? 1 + DBL_EPSILON : 3e-4;
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::GaussianBlur(oclSrc, oclDst, Size(ksize, ksize), 0);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, eps);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::GaussianBlur(src, dst, Size(ksize, ksize), 0);
-
-        SANITY_CHECK(dst, eps);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// filter2D////////////////////////
-
-typedef Size_MatType filter2DFixture;
-
-PERF_TEST_P(filter2DFixture, filter2D,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params), ksize = 3;
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type);
-
-    Mat src(srcSize, type), dst(srcSize, type), kernel(ksize, ksize, CV_32SC1);
-    declare.in(src, WARMUP_RNG).in(kernel).out(dst);
-    randu(kernel, -3.0, 3.0);
-
-    if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
-        declare.time(8);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type), oclKernel(kernel);
-
-        OCL_TEST_CYCLE() cv::ocl::filter2D(oclSrc, oclDst, -1, oclKernel);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Bilateral////////////////////////
-
-typedef Size_MatType BilateralFixture;
-
-PERF_TEST_P(BilateralFixture, Bilateral,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC3)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params), d = 7;
-    const double sigmacolor = 50.0, sigmaspace = 50.0;
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (srcSize == OCL_SIZE_4000)
-        declare.time(type == CV_8UC3 ? 8 : 4.5);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::bilateralFilter(oclSrc, oclDst, d, sigmacolor, sigmaspace);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::bilateralFilter(src, dst, d, sigmacolor, sigmaspace);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// adaptiveBilateral////////////////////////
-
-typedef Size_MatType adaptiveBilateralFixture;
-
-PERF_TEST_P(adaptiveBilateralFixture, adaptiveBilateral,
-            ::testing::Combine(::testing::Values(OCL_SIZE_1000), OCL_PERF_ENUM(CV_8UC1, CV_8UC3)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-    const double sigmaspace = 10.0;
-    Size ksize(9, 9);
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::adaptiveBilateralFilter(oclSrc, oclDst, ksize, sigmaspace);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 1.0);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::adaptiveBilateralFilter(src, dst, ksize, sigmaspace);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
diff --git a/modules/ocl/perf/perf_gftt.cpp b/modules/ocl/perf/perf_gftt.cpp
deleted file mode 100644
index af24c3489..000000000
--- a/modules/ocl/perf/perf_gftt.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-
-#include "perf_precomp.hpp"
-
-using namespace perf;
-using std::tr1::tuple;
-using std::tr1::get;
-
-///////////// GoodFeaturesToTrack ////////////////////////
-
-typedef tuple<string, double> GoodFeaturesToTrackParams;
-typedef TestBaseWithParam<GoodFeaturesToTrackParams> GoodFeaturesToTrackFixture;
-
-PERF_TEST_P(GoodFeaturesToTrackFixture, GoodFeaturesToTrack,
-            ::testing::Combine(::testing::Values(string("gpu/opticalflow/rubberwhale1.png"),
-                                                 string("gpu/stereobm/aloe-L.png")),
-                               ::testing::Range(0.0, 4.0, 3.0)))
-{
-
-    const GoodFeaturesToTrackParams param = GetParam();
-    const string fileName = getDataPath(get<0>(param));
-    const int maxCorners = 2000;
-    const double qualityLevel = 0.01, minDistance = get<1>(param);
-
-    Mat frame = imread(fileName, IMREAD_GRAYSCALE);
-    ASSERT_TRUE(!frame.empty()) << "no input image";
-
-    vector<Point2f> pts_gold;
-    declare.in(frame);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclFrame(frame), pts_oclmat;
-        ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
-
-        OCL_TEST_CYCLE() detector(oclFrame, pts_oclmat);
-
-        detector.downloadPoints(pts_oclmat, pts_gold);
-
-        SANITY_CHECK(pts_gold);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::goodFeaturesToTrack(frame, pts_gold,
-                                             maxCorners, qualityLevel, minDistance);
-
-        SANITY_CHECK(pts_gold);
-    }
-    else
-        OCL_PERF_ELSE
-}
diff --git a/modules/ocl/perf/perf_hough.cpp b/modules/ocl/perf/perf_hough.cpp
deleted file mode 100644
index e90356acb..000000000
--- a/modules/ocl/perf/perf_hough.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace perf;
-
-//////////////////////////////////////////////////////////////////////
-// HoughCircles
-
-typedef std::tr1::tuple<cv::Size, float, float> Size_Dp_MinDist_t;
-typedef perf::TestBaseWithParam<Size_Dp_MinDist_t> Size_Dp_MinDist;
-
-PERF_TEST_P(Size_Dp_MinDist, OCL_HoughCircles,
-            testing::Combine(
-                testing::Values(perf::sz720p, perf::szSXGA, perf::sz1080p),
-                testing::Values(1.0f, 2.0f, 4.0f),
-                testing::Values(1.0f, 10.0f)))
-{
-    const Size_Dp_MinDist_t params = GetParam();
-    const cv::Size size = std::tr1::get<0>(params);
-    const float dp      = std::tr1::get<1>(params);
-    const float minDist = std::tr1::get<2>(params);
-
-    const int minRadius = 10;
-    const int maxRadius = 30;
-    const int cannyThreshold = 100;
-    const int votesThreshold = 15;
-
-    cv::RNG rng(123456789);
-
-    cv::Mat src(size, CV_8UC1, cv::Scalar::all(0)), circles;
-
-    const int numCircles = rng.uniform(50, 100);
-    for (int i = 0; i < numCircles; ++i)
-    {
-        cv::Point center(rng.uniform(0, src.cols), rng.uniform(0, src.rows));
-        const int radius = rng.uniform(minRadius, maxRadius + 1);
-
-        cv::circle(src, center, radius, cv::Scalar::all(255), -1);
-    }
-
-    declare.time(10.0).iterations(25);
-
-    if (RUN_OCL_IMPL)
-    {
-        cv::ocl::oclMat ocl_src(src), ocl_circles;
-
-        OCL_TEST_CYCLE() cv::ocl::HoughCircles(ocl_src, ocl_circles, HOUGH_GRADIENT, dp, minDist,
-                                               cannyThreshold, votesThreshold, minRadius, maxRadius);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::HoughCircles(src, circles, HOUGH_GRADIENT, dp, minDist, cannyThreshold,
-                                      votesThreshold, minRadius, maxRadius);
-    }
-    else
-        OCL_PERF_ELSE
-
-    int value = 0;
-    SANITY_CHECK(value);
-}
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/perf/perf_imgproc.cpp b/modules/ocl/perf/perf_imgproc.cpp
deleted file mode 100644
index 51b354f41..000000000
--- a/modules/ocl/perf/perf_imgproc.cpp
+++ /dev/null
@@ -1,737 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "perf_precomp.hpp"
-
-using namespace perf;
-using std::tr1::tuple;
-using std::tr1::get;
-
-///////////// equalizeHist ////////////////////////
-
-typedef TestBaseWithParam<Size> equalizeHistFixture;
-
-PERF_TEST_P(equalizeHistFixture, equalizeHist, OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-    const double eps = 1 + DBL_EPSILON;
-
-    Mat src(srcSize, CV_8UC1), dst(srcSize, CV_8UC1);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, src.type());
-
-        OCL_TEST_CYCLE() cv::ocl::equalizeHist(oclSrc, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, eps);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::equalizeHist(src, dst);
-
-        SANITY_CHECK(dst, eps);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-/////////// CopyMakeBorder //////////////////////
-
-CV_ENUM(Border, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,
-        BORDER_WRAP, BORDER_REFLECT_101)
-
-typedef tuple<Size, MatType, Border> CopyMakeBorderParamType;
-typedef TestBaseWithParam<CopyMakeBorderParamType> CopyMakeBorderFixture;
-
-PERF_TEST_P(CopyMakeBorderFixture, CopyMakeBorder,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4),
-                               Border::all()))
-{
-    const CopyMakeBorderParamType params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params), borderType = get<2>(params);
-
-    Mat src(srcSize, type), dst;
-    const Size dstSize = srcSize + Size(12, 12);
-    dst.create(dstSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(dstSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::copyMakeBorder(oclSrc, oclDst, 7, 5, 5, 7, borderType, cv::Scalar(1.0));
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::copyMakeBorder(src, dst, 7, 5, 5, 7, borderType, cv::Scalar(1.0));
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// cornerMinEigenVal ////////////////////////
-
-typedef Size_MatType cornerMinEigenValFixture;
-
-PERF_TEST_P(cornerMinEigenValFixture, cornerMinEigenVal,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params), borderType = BORDER_REFLECT;
-    const int blockSize = 7, apertureSize = 1 + 2 * 3;
-
-    Mat src(srcSize, type), dst(srcSize, CV_32FC1);
-    declare.in(src, WARMUP_RNG).out(dst)
-            .time(srcSize == OCL_SIZE_4000 ? 20 : srcSize == OCL_SIZE_2000 ? 5 : 3);
-
-    const int depth = CV_MAT_DEPTH(type);
-    const ERROR_TYPE errorType = depth == CV_8U ? ERROR_ABSOLUTE : ERROR_RELATIVE;
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, CV_32FC1);
-
-        OCL_TEST_CYCLE() cv::ocl::cornerMinEigenVal(oclSrc, oclDst, blockSize, apertureSize, borderType);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 1e-6, errorType);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::cornerMinEigenVal(src, dst, blockSize, apertureSize, borderType);
-
-        SANITY_CHECK(dst, 1e-6, errorType);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// cornerHarris ////////////////////////
-
-typedef Size_MatType cornerHarrisFixture;
-
-PERF_TEST_P(cornerHarrisFixture, cornerHarris,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params), borderType = BORDER_REFLECT;
-
-    Mat src(srcSize, type), dst(srcSize, CV_32FC1);
-    randu(src, 0, 1);
-    declare.in(src).out(dst)
-            .time(srcSize == OCL_SIZE_4000 ? 20 : srcSize == OCL_SIZE_2000 ? 5 : 3);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, CV_32FC1);
-
-        OCL_TEST_CYCLE() cv::ocl::cornerHarris(oclSrc, oclDst, 5, 7, 0.1, borderType);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 3e-5);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::cornerHarris(src, dst, 5, 7, 0.1, borderType);
-
-        SANITY_CHECK(dst, 3e-5);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// integral ////////////////////////
-
-typedef TestBaseWithParam<Size> integralFixture;
-
-PERF_TEST_P(integralFixture, integral, OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-
-    Mat src(srcSize, CV_8UC1), dst;
-    declare.in(src, WARMUP_RNG);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst;
-
-        OCL_TEST_CYCLE() cv::ocl::integral(oclSrc, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::integral(src, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// threshold////////////////////////
-
-CV_ENUM(ThreshType, THRESH_BINARY, THRESH_TOZERO_INV)
-
-typedef tuple<Size, MatType, ThreshType> ThreshParams;
-typedef TestBaseWithParam<ThreshParams> ThreshFixture;
-
-PERF_TEST_P(ThreshFixture, threshold,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC4, CV_32FC1),
-                               ThreshType::all()))
-{
-    const ThreshParams params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int srcType = get<1>(params);
-    const int threshType = get<2>(params);
-    const double maxValue = 220.0, threshold = 50;
-
-    Mat src(srcSize, srcType), dst(srcSize, srcType);
-    randu(src, 0, 100);
-    declare.in(src).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, CV_8U);
-
-        OCL_TEST_CYCLE() cv::ocl::threshold(oclSrc, oclDst, threshold, maxValue, threshType);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::threshold(src, dst, threshold, maxValue, threshType);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// meanShiftFiltering////////////////////////
-
-typedef struct _COOR
-{
-    short x;
-    short y;
-} COOR;
-
-static COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, cv::Size size, int sp, int sr, int maxIter, float eps, int *tab)
-{
-
-    int isr2 = sr * sr;
-    int c0, c1, c2, c3;
-    int iter;
-    uchar *ptr = NULL;
-    uchar *pstart = NULL;
-    int revx = 0, revy = 0;
-    c0 = sptr[0];
-    c1 = sptr[1];
-    c2 = sptr[2];
-    c3 = sptr[3];
-    // iterate meanshift procedure
-    for(iter = 0; iter < maxIter; iter++ )
-    {
-        int count = 0;
-        int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
-
-        //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
-        int minx = x0 - sp;
-        int miny = y0 - sp;
-        int maxx = x0 + sp;
-        int maxy = y0 + sp;
-
-        //deal with the image boundary
-        if(minx < 0) minx = 0;
-        if(miny < 0) miny = 0;
-        if(maxx >= size.width) maxx = size.width - 1;
-        if(maxy >= size.height) maxy = size.height - 1;
-        if(iter == 0)
-        {
-            pstart = sptr;
-        }
-        else
-        {
-            pstart = pstart + revy * sstep + (revx << 2); //point to the new position
-        }
-        ptr = pstart;
-        ptr = ptr + (miny - y0) * sstep + ((minx - x0) << 2); //point to the start in the row
-
-        for( int y = miny; y <= maxy; y++, ptr += sstep - ((maxx - minx + 1) << 2))
-        {
-            int rowCount = 0;
-            int x = minx;
-#if CV_ENABLE_UNROLLED
-            for( ; x + 4 <= maxx; x += 4, ptr += 16)
-            {
-                int t0, t1, t2;
-                t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
-                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x;
-                    rowCount++;
-                }
-                t0 = ptr[4], t1 = ptr[5], t2 = ptr[6];
-                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 1;
-                    rowCount++;
-                }
-                t0 = ptr[8], t1 = ptr[9], t2 = ptr[10];
-                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 2;
-                    rowCount++;
-                }
-                t0 = ptr[12], t1 = ptr[13], t2 = ptr[14];
-                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 3;
-                    rowCount++;
-                }
-            }
-#endif
-            for(; x <= maxx; x++, ptr += 4)
-            {
-                int t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
-                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x;
-                    rowCount++;
-                }
-            }
-            if(rowCount == 0)
-                continue;
-            count += rowCount;
-            sy += y * rowCount;
-        }
-
-        if( count == 0 )
-            break;
-
-        int x1 = sx / count;
-        int y1 = sy / count;
-        s0 = s0 / count;
-        s1 = s1 / count;
-        s2 = s2 / count;
-
-        bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1 - x0) + abs(y1 - y0) +
-            tab[s0 - c0 + 255] + tab[s1 - c1 + 255] + tab[s2 - c2 + 255] <= eps);
-
-        //revise the pointer corresponding to the new (y0,x0)
-        revx = x1 - x0;
-        revy = y1 - y0;
-
-        x0 = x1;
-        y0 = y1;
-        c0 = s0;
-        c1 = s1;
-        c2 = s2;
-
-        if( stopFlag )
-            break;
-    } //for iter
-
-    dptr[0] = (uchar)c0;
-    dptr[1] = (uchar)c1;
-    dptr[2] = (uchar)c2;
-    dptr[3] = (uchar)c3;
-
-    COOR coor;
-    coor.x = static_cast<short>(x0);
-    coor.y = static_cast<short>(y0);
-    return coor;
-}
-
-static void meanShiftFiltering_(const Mat &src_roi, Mat &dst_roi, int sp, int sr, cv::TermCriteria crit)
-{
-    if( src_roi.empty() )
-        CV_Error( Error::StsBadArg, "The input image is empty" );
-
-    if( src_roi.depth() != CV_8U || src_roi.channels() != 4 )
-        CV_Error( Error::StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
-
-    dst_roi.create(src_roi.size(), src_roi.type());
-
-    CV_Assert( (src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) );
-    CV_Assert( !(dst_roi.step & 0x3) );
-
-    if( !(crit.type & cv::TermCriteria::MAX_ITER) )
-        crit.maxCount = 5;
-    int maxIter = std::min(std::max(crit.maxCount, 1), 100);
-    float eps;
-    if( !(crit.type & cv::TermCriteria::EPS) )
-        eps = 1.f;
-    eps = (float)std::max(crit.epsilon, 0.0);
-
-    int tab[512];
-    for(int i = 0; i < 512; i++)
-        tab[i] = (i - 255) * (i - 255);
-    uchar *sptr = src_roi.data;
-    uchar *dptr = dst_roi.data;
-    int sstep = (int)src_roi.step;
-    int dstep = (int)dst_roi.step;
-    cv::Size size = src_roi.size();
-
-    for(int i = 0; i < size.height; i++, sptr += sstep - (size.width << 2),
-        dptr += dstep - (size.width << 2))
-    {
-        for(int j = 0; j < size.width; j++, sptr += 4, dptr += 4)
-        {
-            do_meanShift(j, i, sptr, dptr, sstep, size, sp, sr, maxIter, eps, tab);
-        }
-    }
-}
-
-typedef TestBaseWithParam<Size> meanShiftFilteringFixture;
-
-PERF_TEST_P(meanShiftFilteringFixture, meanShiftFiltering,
-            OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-    const int sp = 5, sr = 6;
-    cv::TermCriteria crit(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 5, 1);
-
-    Mat src(srcSize, CV_8UC4), dst(srcSize, CV_8UC4);
-    declare.in(src, WARMUP_RNG).out(dst)
-            .time(srcSize == OCL_SIZE_4000 ?
-                      56 : srcSize == OCL_SIZE_2000 ? 15 : 3.8);
-
-    if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() meanShiftFiltering_(src, dst, sp, sr, crit);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, CV_8UC4);
-
-        OCL_TEST_CYCLE() ocl::meanShiftFiltering(oclSrc, oclDst, sp, sr, crit);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-static void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp, int sr, cv::TermCriteria crit)
-{
-    if (src_roi.empty())
-    {
-        CV_Error(Error::StsBadArg, "The input image is empty");
-    }
-    if (src_roi.depth() != CV_8U || src_roi.channels() != 4)
-    {
-        CV_Error(Error::StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported");
-    }
-
-    dst_roi.create(src_roi.size(), src_roi.type());
-    dstCoor_roi.create(src_roi.size(), CV_16SC2);
-
-    CV_Assert((src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) &&
-              (src_roi.cols == dstCoor_roi.cols) && (src_roi.rows == dstCoor_roi.rows));
-    CV_Assert(!(dstCoor_roi.step & 0x3));
-
-    if (!(crit.type & cv::TermCriteria::MAX_ITER))
-    {
-        crit.maxCount = 5;
-    }
-
-    int maxIter = std::min(std::max(crit.maxCount, 1), 100);
-    float eps;
-
-    if (!(crit.type & cv::TermCriteria::EPS))
-    {
-        eps = 1.f;
-    }
-
-    eps = (float)std::max(crit.epsilon, 0.0);
-
-    int tab[512];
-
-    for (int i = 0; i < 512; i++)
-    {
-        tab[i] = (i - 255) * (i - 255);
-    }
-
-    uchar *sptr = src_roi.data;
-    uchar *dptr = dst_roi.data;
-    short *dCoorptr = (short *)dstCoor_roi.data;
-    int sstep = (int)src_roi.step;
-    int dstep = (int)dst_roi.step;
-    int dCoorstep = (int)dstCoor_roi.step >> 1;
-    cv::Size size = src_roi.size();
-
-    for (int i = 0; i < size.height; i++, sptr += sstep - (size.width << 2),
-            dptr += dstep - (size.width << 2), dCoorptr += dCoorstep - (size.width << 1))
-    {
-        for (int j = 0; j < size.width; j++, sptr += 4, dptr += 4, dCoorptr += 2)
-        {
-            *((COOR *)dCoorptr) = do_meanShift(j, i, sptr, dptr, sstep, size, sp, sr, maxIter, eps, tab);
-        }
-    }
-
-}
-
-typedef TestBaseWithParam<Size> meanShiftProcFixture;
-
-PERF_TEST_P(meanShiftProcFixture, meanShiftProc,
-            OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-    TermCriteria crit(TermCriteria::COUNT + TermCriteria::EPS, 5, 1);
-
-    Mat src(srcSize, CV_8UC4), dst1(srcSize, CV_8UC4),
-            dst2(srcSize, CV_16SC2);
-    declare.in(src, WARMUP_RNG).out(dst1, dst2)
-            .time(srcSize == OCL_SIZE_4000 ?
-                      56 : srcSize == OCL_SIZE_2000 ? 15 : 3.8);;
-
-    if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() meanShiftProc_(src, dst1, dst2, 5, 6, crit);
-
-        SANITY_CHECK(dst1);
-        SANITY_CHECK(dst2);
-    }
-    else if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst1(srcSize, CV_8UC4),
-                oclDst2(srcSize, CV_16SC2);
-
-        OCL_TEST_CYCLE() ocl::meanShiftProc(oclSrc, oclDst1, oclDst2, 5, 6, crit);
-
-        oclDst1.download(dst1);
-        oclDst2.download(dst2);
-
-        SANITY_CHECK(dst1);
-        SANITY_CHECK(dst2);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// CLAHE ////////////////////////
-
-typedef TestBaseWithParam<Size> CLAHEFixture;
-
-PERF_TEST_P(CLAHEFixture, CLAHE, OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-    const string impl = getSelectedImpl();
-
-    Mat src(srcSize, CV_8UC1), dst;
-    const double clipLimit = 40.0;
-    declare.in(src, WARMUP_RNG);
-
-    if (srcSize == OCL_SIZE_4000)
-        declare.time(11);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst;
-        cv::Ptr<cv::CLAHE> oclClahe = cv::ocl::createCLAHE(clipLimit);
-
-        OCL_TEST_CYCLE() oclClahe->apply(oclSrc, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        cv::Ptr<cv::CLAHE> clahe = cv::createCLAHE(clipLimit);
-        TEST_CYCLE() clahe->apply(src, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// columnSum////////////////////////
-
-typedef TestBaseWithParam<Size> columnSumFixture;
-
-static void columnSumPerfTest(const Mat & src, Mat & dst)
-{
-    for (int j = 0; j < src.cols; j++)
-        dst.at<float>(0, j) = src.at<float>(0, j);
-
-    for (int i = 1; i < src.rows; ++i)
-        for (int j = 0; j < src.cols; ++j)
-            dst.at<float>(i, j) = dst.at<float>(i - 1 , j) + src.at<float>(i , j);
-}
-
-PERF_TEST_P(columnSumFixture, columnSum, OCL_TYPICAL_MAT_SIZES)
-{
-    const Size srcSize = GetParam();
-
-    Mat src(srcSize, CV_32FC1), dst(srcSize, CV_32FC1);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (srcSize == OCL_SIZE_4000)
-        declare.time(5);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, CV_32FC1);
-
-        OCL_TEST_CYCLE() cv::ocl::columnSum(oclSrc, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() columnSumPerfTest(src, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-//////////////////////////////distanceToCenters////////////////////////////////////////////////
-
-CV_ENUM(DistType, NORM_L1, NORM_L2SQR)
-
-typedef tuple<Size, DistType> distanceToCentersParameters;
-typedef TestBaseWithParam<distanceToCentersParameters> distanceToCentersFixture;
-
-static void distanceToCentersPerfTest(Mat& src, Mat& centers, Mat& dists, Mat& labels, int distType)
-{
-    Mat batch_dists;
-    cv::batchDistance(src, centers, batch_dists, CV_32FC1, noArray(), distType);
-
-    std::vector<float> dists_v;
-    std::vector<int> labels_v;
-
-    for (int i = 0; i < batch_dists.rows; i++)
-    {
-        Mat r = batch_dists.row(i);
-        double mVal;
-        Point mLoc;
-
-        minMaxLoc(r, &mVal, NULL, &mLoc, NULL);
-        dists_v.push_back(static_cast<float>(mVal));
-        labels_v.push_back(mLoc.x);
-    }
-
-    Mat(dists_v).copyTo(dists);
-    Mat(labels_v).copyTo(labels);
-}
-
-PERF_TEST_P(distanceToCentersFixture, distanceToCenters, ::testing::Combine(::testing::Values(cv::Size(256,256), cv::Size(512,512)), DistType::all()) )
-{
-    Size size = get<0>(GetParam());
-    int distType = get<1>(GetParam());
-
-    Mat src(size, CV_32FC1), centers(size, CV_32FC1);
-    Mat dists(src.rows, 1, CV_32FC1), labels(src.rows, 1, CV_32SC1);
-
-    declare.in(src, centers, WARMUP_RNG).out(dists, labels);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat ocl_src(src), ocl_centers(centers);
-
-        OCL_TEST_CYCLE() ocl::distanceToCenters(ocl_src, ocl_centers, dists, labels, distType);
-
-        SANITY_CHECK(dists, 1e-6, ERROR_RELATIVE);
-        SANITY_CHECK(labels);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() distanceToCentersPerfTest(src, centers, dists, labels, distType);
-
-        SANITY_CHECK(dists, 1e-6, ERROR_RELATIVE);
-        SANITY_CHECK(labels);
-    }
-    else
-        OCL_PERF_ELSE
-}
diff --git a/modules/ocl/perf/perf_imgwarp.cpp b/modules/ocl/perf/perf_imgwarp.cpp
deleted file mode 100644
index e768d6621..000000000
--- a/modules/ocl/perf/perf_imgwarp.cpp
+++ /dev/null
@@ -1,364 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "perf_precomp.hpp"
-
-using namespace perf;
-using std::tr1::tuple;
-using std::tr1::get;
-
-///////////// WarpAffine ////////////////////////
-
-typedef Size_MatType WarpAffineFixture;
-
-PERF_TEST_P(WarpAffineFixture, WarpAffine,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    static const double coeffs[2][3] =
-    {
-        { cos(CV_PI / 6), -sin(CV_PI / 6), 100.0 },
-        { sin(CV_PI / 6), cos(CV_PI / 6), -100.0 }
-    };
-    Mat M(2, 3, CV_64F, (void *)coeffs);
-    const int interpolation = INTER_NEAREST;
-
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::warpAffine(oclSrc, oclDst, M, srcSize, interpolation);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::warpAffine(src, dst, M, srcSize, interpolation);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// WarpPerspective ////////////////////////
-
-typedef Size_MatType WarpPerspectiveFixture;
-
-PERF_TEST_P(WarpPerspectiveFixture, WarpPerspective,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    static const double coeffs[3][3] =
-    {
-        {cos(CV_PI / 6), -sin(CV_PI / 6), 100.0},
-        {sin(CV_PI / 6), cos(CV_PI / 6), -100.0},
-        {0.0, 0.0, 1.0}
-    };
-    Mat M(3, 3, CV_64F, (void *)coeffs);
-    const int interpolation = INTER_LINEAR;
-
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst)
-            .time(srcSize == OCL_SIZE_4000 ? 18 : srcSize == OCL_SIZE_2000 ? 5 : 2);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::warpPerspective(oclSrc, oclDst, M, srcSize, interpolation);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::warpPerspective(src, dst, M, srcSize, interpolation);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// resize ////////////////////////
-
-CV_ENUM(resizeInterType, INTER_NEAREST, INTER_LINEAR)
-
-typedef tuple<Size, MatType, resizeInterType, double> resizeParams;
-typedef TestBaseWithParam<resizeParams> resizeFixture;
-
-PERF_TEST_P(resizeFixture, resize,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4),
-                               resizeInterType::all(),
-                               ::testing::Values(0.5, 2.0)))
-{
-    const resizeParams params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params), interType = get<2>(params);
-    double scale = get<3>(params);
-    const Size dstSize(cvRound(srcSize.width * scale), cvRound(srcSize.height * scale));
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type);
-    checkDeviceMaxMemoryAllocSize(dstSize, type);
-
-    Mat src(srcSize, type), dst;
-    dst.create(dstSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-    if (interType == INTER_LINEAR && type == CV_8UC4 && OCL_SIZE_4000 == srcSize)
-        declare.time(11);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(dstSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::resize(oclSrc, oclDst, Size(), scale, scale, interType);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 1 + DBL_EPSILON);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::resize(src, dst, Size(), scale, scale, interType);
-
-        SANITY_CHECK(dst, 1 + DBL_EPSILON);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-typedef tuple<Size, MatType, double> resizeAreaParams;
-typedef TestBaseWithParam<resizeAreaParams> resizeAreaFixture;
-
-PERF_TEST_P(resizeAreaFixture, resize,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                               ::testing::Values(0.3, 0.5, 0.6)))
-{
-    const resizeAreaParams params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-    double scale = get<2>(params);
-    const Size dstSize(cvRound(srcSize.width * scale), cvRound(srcSize.height * scale));
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type);
-
-    Mat src(srcSize, type), dst;
-    dst.create(dstSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(dstSize, type);
-
-        OCL_TEST_CYCLE() cv::ocl::resize(oclSrc, oclDst, Size(), scale, scale, cv::INTER_AREA);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 1 + DBL_EPSILON);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::resize(src, dst, Size(), scale, scale, cv::INTER_AREA);
-
-        SANITY_CHECK(dst, 1 + DBL_EPSILON);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// remap////////////////////////
-
-CV_ENUM(RemapInterType, INTER_NEAREST, INTER_LINEAR)
-
-typedef tuple<Size, MatType, RemapInterType> remapParams;
-typedef TestBaseWithParam<remapParams> remapFixture;
-
-PERF_TEST_P(remapFixture, remap,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4),
-                               RemapInterType::all()))
-{
-    const remapParams params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params), interpolation = get<2>(params);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (srcSize == OCL_SIZE_4000 && interpolation == INTER_LINEAR)
-        declare.time(9);
-
-    Mat xmap, ymap;
-    xmap.create(srcSize, CV_32FC1);
-    ymap.create(srcSize, CV_32FC1);
-
-    for (int i = 0; i < srcSize.height; ++i)
-    {
-        float * const xmap_row = xmap.ptr<float>(i);
-        float * const ymap_row = ymap.ptr<float>(i);
-
-        for (int j = 0; j < srcSize.width; ++j)
-        {
-            xmap_row[j] = (j - srcSize.width * 0.5f) * 0.75f + srcSize.width * 0.5f;
-            ymap_row[j] = (i - srcSize.height * 0.5f) * 0.75f + srcSize.height * 0.5f;
-        }
-    }
-
-    const int borderMode = BORDER_CONSTANT;
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-        ocl::oclMat oclXMap(xmap), oclYMap(ymap);
-
-        OCL_TEST_CYCLE() cv::ocl::remap(oclSrc, oclDst, oclXMap, oclYMap, interpolation, borderMode);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst, 1 + DBL_EPSILON);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
-
-        SANITY_CHECK(dst, 1 + DBL_EPSILON);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-
-///////////// buildWarpPerspectiveMaps ////////////////////////
-
-static void buildWarpPerspectiveMaps(const Mat &M, bool inverse, Size dsize, Mat &xmap, Mat &ymap)
-{
-    CV_Assert(M.rows == 3 && M.cols == 3);
-    CV_Assert(dsize.area() > 0);
-
-    xmap.create(dsize, CV_32FC1);
-    ymap.create(dsize, CV_32FC1);
-
-    float coeffs[3 * 3];
-    Mat coeffsMat(3, 3, CV_32F, (void *)coeffs);
-
-    if (inverse)
-        M.convertTo(coeffsMat, coeffsMat.type());
-    else
-    {
-        cv::Mat iM;
-        invert(M, iM);
-        iM.convertTo(coeffsMat, coeffsMat.type());
-    }
-
-    for (int y = 0; y < dsize.height; ++y)
-    {
-        float * const xmap_ptr = xmap.ptr<float>(y);
-        float * const ymap_ptr = ymap.ptr<float>(y);
-
-        for (int x = 0; x < dsize.width; ++x)
-        {
-            float coeff = 1.0f / (x * coeffs[6] + y * coeffs[7] + coeffs[8]);
-            xmap_ptr[x] = (x * coeffs[0] + y * coeffs[1] + coeffs[2]) * coeff;
-            ymap_ptr[x] = (x * coeffs[3] + y * coeffs[4] + coeffs[5]) * coeff;
-        }
-    }
-}
-
-typedef TestBaseWithParam<Size> buildWarpPerspectiveMapsFixture;
-
-PERF_TEST_P(buildWarpPerspectiveMapsFixture, Inverse, OCL_TYPICAL_MAT_SIZES)
-{
-    static const double coeffs[3][3] =
-    {
-        {cos(CV_PI / 6), -sin(CV_PI / 6), 100.0},
-        {sin(CV_PI / 6), cos(CV_PI / 6), -100.0},
-        {0.0, 0.0, 1.0}
-    };
-    Mat M(3, 3, CV_64F, (void *)coeffs);
-    const Size dsize = GetParam();
-    const double eps = 5e-4;
-
-    Mat xmap(dsize, CV_32FC1), ymap(dsize, CV_32FC1);
-    declare.in(M).out(xmap, ymap);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclXMap(dsize, CV_32FC1), oclYMap(dsize, CV_32FC1);
-
-        OCL_TEST_CYCLE() cv::ocl::buildWarpPerspectiveMaps(M, true, dsize, oclXMap, oclYMap);
-
-        oclXMap.download(xmap);
-        oclYMap.download(ymap);
-
-        SANITY_CHECK(xmap, eps);
-        SANITY_CHECK(ymap, eps);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() buildWarpPerspectiveMaps(M, true, dsize, xmap, ymap);
-
-        SANITY_CHECK(xmap, eps);
-        SANITY_CHECK(ymap, eps);
-    }
-    else
-        OCL_PERF_ELSE
-}
diff --git a/modules/ocl/perf/perf_matrix_operation.cpp b/modules/ocl/perf/perf_matrix_operation.cpp
deleted file mode 100644
index 5ca322e22..000000000
--- a/modules/ocl/perf/perf_matrix_operation.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "perf_precomp.hpp"
-
-using namespace perf;
-using std::tr1::tuple;
-using std::tr1::get;
-
-///////////// ConvertTo////////////////////////
-
-typedef Size_MatType ConvertToFixture;
-
-PERF_TEST_P(ConvertToFixture, ConvertTo,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src(srcSize, type), dst;
-    const int dstType = CV_MAKE_TYPE(CV_32F, src.channels());
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type);
-    checkDeviceMaxMemoryAllocSize(srcSize, dstType);
-
-    dst.create(srcSize, dstType);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, dstType);
-
-        OCL_TEST_CYCLE() oclSrc.convertTo(oclDst, dstType);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() src.convertTo(dst, dstType);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// copyTo////////////////////////
-
-typedef Size_MatType copyToFixture;
-
-PERF_TEST_P(copyToFixture, copyTo,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-
-    Mat src(srcSize, type), dst(srcSize, type);
-    declare.in(src, WARMUP_RNG).out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src), oclDst(srcSize, type);
-
-        OCL_TEST_CYCLE() oclSrc.copyTo(oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() src.copyTo(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// setTo////////////////////////
-
-typedef Size_MatType setToFixture;
-
-PERF_TEST_P(setToFixture, setTo,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-    const Scalar val(1, 2, 3, 4);
-
-    Mat src(srcSize, type);
-    declare.in(src);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(srcSize, type);
-
-        OCL_TEST_CYCLE() oclSrc.setTo(val);
-        oclSrc.download(src);
-
-        SANITY_CHECK(src);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() src.setTo(val);
-
-        SANITY_CHECK(src);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-#if 0
-
-/////////////////// upload ///////////////////////////
-
-typedef tuple<Size, MatDepth, int> uploadParams;
-typedef TestBaseWithParam<uploadParams> uploadFixture;
-
-PERF_TEST_P(uploadFixture, upload,
-            testing::Combine(
-                OCL_TYPICAL_MAT_SIZES,
-                testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F),
-                testing::Range(1, 5)))
-{
-    const uploadParams params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int depth = get<1>(params), cn = get<2>(params);
-    const int type = CV_MAKE_TYPE(depth, cn);
-
-    Mat src(srcSize, type), dst;
-    declare.in(src, WARMUP_RNG);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclDst;
-
-        for(; startTimer(), next(); ocl::finish(), stopTimer(), oclDst.release())
-            oclDst.upload(src);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        for(; startTimer(), next(); ocl::finish(), stopTimer(), dst.release())
-            dst = src.clone();
-    }
-    else
-        OCL_PERF_ELSE
-
-    SANITY_CHECK_NOTHING();
-}
-
-/////////////////// download ///////////////////////////
-
-typedef TestBaseWithParam<uploadParams> downloadFixture;
-
-PERF_TEST_P(downloadFixture, download,
-            testing::Combine(
-                OCL_TYPICAL_MAT_SIZES,
-                testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F),
-                testing::Range(1, 5)))
-{
-    const uploadParams params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int depth = get<1>(params), cn = get<2>(params);
-    const int type = CV_MAKE_TYPE(depth, cn);
-
-    Mat src(srcSize, type), dst;
-    declare.in(src, WARMUP_RNG);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src);
-
-        for(; startTimer(), next(); ocl::finish(), stopTimer(), dst.release())
-            oclSrc.download(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        for(; startTimer(), next(); ocl::finish(), stopTimer(), dst.release())
-            dst = src.clone();
-    }
-    else
-        OCL_PERF_ELSE
-
-    SANITY_CHECK_NOTHING();
-}
-
-#endif
diff --git a/modules/ocl/perf/perf_ml.cpp b/modules/ocl/perf/perf_ml.cpp
deleted file mode 100644
index db45eceb8..000000000
--- a/modules/ocl/perf/perf_ml.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jin Ma, jin@multicorewareinc.com
-//    Xiaopeng Fu, fuxiaopeng2222@163.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "perf_precomp.hpp"
-using namespace perf;
-using namespace std;
-using namespace cv::ocl;
-using namespace cv;
-using std::tr1::tuple;
-using std::tr1::get;
-////////////////////////////////// K-NEAREST NEIGHBOR ////////////////////////////////////
-static void genData(Mat& trainData, Size size, Mat& trainLabel = Mat().setTo(Scalar::all(0)), int nClasses = 0)
-{
-    trainData.create(size, CV_32FC1);
-    randu(trainData, 1.0, 100.0);
-
-    if(nClasses != 0)
-    {
-        trainLabel.create(size.height, 1, CV_8UC1);
-        randu(trainLabel, 0, nClasses - 1);
-        trainLabel.convertTo(trainLabel, CV_32FC1);
-    }
-}
-
-typedef tuple<int> KNNParamType;
-typedef TestBaseWithParam<KNNParamType> KNNFixture;
-
-PERF_TEST_P(KNNFixture, KNN,
-            testing::Values(1000, 2000, 4000))
-{
-    KNNParamType params = GetParam();
-    const int rows = get<0>(params);
-    int columns = 100;
-    int k = rows/250;
-
-    Mat trainData, trainLabels;
-    Size size(columns, rows);
-    genData(trainData, size, trainLabels, 3);
-
-    Mat testData;
-    genData(testData, size);
-    Mat best_label;
-
-    if(RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE()
-        {
-            CvKNearest knn_cpu;
-            knn_cpu.train(trainData, trainLabels);
-            knn_cpu.find_nearest(testData, k, &best_label);
-        }
-    }else if(RUN_OCL_IMPL)
-    {
-        cv::ocl::oclMat best_label_ocl;
-        cv::ocl::oclMat testdata;
-        testdata.upload(testData);
-
-        OCL_TEST_CYCLE()
-        {
-            cv::ocl::KNearestNeighbour knn_ocl;
-            knn_ocl.train(trainData, trainLabels);
-            knn_ocl.find_nearest(testdata, k, best_label_ocl);
-        }
-        best_label_ocl.download(best_label);
-    }else
-        OCL_PERF_ELSE
-    SANITY_CHECK(best_label);
-}
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_moments.cpp b/modules/ocl/perf/perf_moments.cpp
deleted file mode 100644
index 631031ecb..000000000
--- a/modules/ocl/perf/perf_moments.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other Materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-using namespace perf;
-using std::tr1::tuple;
-using std::tr1::get;
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-
-///////////// Moments ////////////////////////
-//*! performance of image
-typedef tuple<Size, MatType, bool> MomentsParamType;
-typedef TestBaseWithParam<MomentsParamType> MomentsFixture;
-
-PERF_TEST_P(MomentsFixture, Moments,
-    ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                       OCL_PERF_ENUM(CV_8UC1, CV_16SC1, CV_16UC1, CV_32FC1), ::testing::Bool()))
-{
-    const MomentsParamType params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int type = get<1>(params);
-    const bool binaryImage = get<2>(params);
-
-    Mat  src(srcSize, type), dst(7, 1, CV_64F);
-    randu(src, 0, 255);
-
-    cv::Moments mom;
-    if (RUN_OCL_IMPL)
-    {
-        oclMat src_d(src);
-        OCL_TEST_CYCLE() mom = cv::ocl::ocl_moments(src_d, binaryImage);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() mom = cv::moments(src, binaryImage);
-    }
-    else
-        OCL_PERF_ELSE
-    cv::HuMoments(mom, dst);
-    SANITY_CHECK(dst, 2e-1);
-}
diff --git a/modules/ocl/perf/perf_opticalflow.cpp b/modules/ocl/perf/perf_opticalflow.cpp
deleted file mode 100644
index bc1761b49..000000000
--- a/modules/ocl/perf/perf_opticalflow.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "perf_precomp.hpp"
-
-///////////// PyrLKOpticalFlow ////////////////////////
-
-using namespace perf;
-using std::tr1::get;
-using std::tr1::tuple;
-using std::tr1::make_tuple;
-
-CV_ENUM(LoadMode, IMREAD_GRAYSCALE, IMREAD_COLOR)
-
-typedef tuple<int, tuple<string, string, LoadMode> > PyrLKOpticalFlowParamType;
-typedef TestBaseWithParam<PyrLKOpticalFlowParamType> PyrLKOpticalFlowFixture;
-
-PERF_TEST_P(PyrLKOpticalFlowFixture,
-            PyrLKOpticalFlow,
-            ::testing::Combine(
-                ::testing::Values(1000, 2000, 4000),
-                ::testing::Values(
-                    make_tuple<string, string, LoadMode>
-                    (
-                        string("gpu/opticalflow/rubberwhale1.png"),
-                        string("gpu/opticalflow/rubberwhale2.png"),
-                        LoadMode(IMREAD_COLOR)
-                        ),
-                    make_tuple<string, string, LoadMode>
-                    (
-                        string("gpu/stereobm/aloe-L.png"),
-                        string("gpu/stereobm/aloe-R.png"),
-                        LoadMode(IMREAD_GRAYSCALE)
-                        )
-                    )
-                )
-            )
-{
-    PyrLKOpticalFlowParamType params = GetParam();
-    tuple<string, string, LoadMode> fileParam = get<1>(params);
-    const int pointsCount = get<0>(params);
-    const int openMode = static_cast<int>(get<2>(fileParam));
-    const string fileName0 = get<0>(fileParam), fileName1 = get<1>(fileParam);
-    Mat frame0 = imread(getDataPath(fileName0), openMode);
-    Mat frame1 = imread(getDataPath(fileName1), openMode);
-
-    declare.in(frame0, frame1);
-
-    ASSERT_FALSE(frame0.empty()) << "can't load " << fileName0;
-    ASSERT_FALSE(frame1.empty()) << "can't load " << fileName1;
-
-    Mat grayFrame;
-    if (openMode == IMREAD_COLOR)
-        cvtColor(frame0, grayFrame, COLOR_BGR2GRAY);
-    else
-        grayFrame = frame0;
-
-    vector<Point2f> pts, nextPts;
-    vector<unsigned char> status;
-    vector<float> err;
-    goodFeaturesToTrack(grayFrame, pts, pointsCount, 0.01, 0.0);
-    Mat ptsMat(1, static_cast<int>(pts.size()), CV_32FC2, (void *)&pts[0]);
-
-    if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE()
-                cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-    }
-    else if (RUN_OCL_IMPL)
-    {
-        ocl::PyrLKOpticalFlow oclPyrLK;
-        ocl::oclMat oclFrame0(frame0), oclFrame1(frame1);
-        ocl::oclMat oclPts(ptsMat);
-        ocl::oclMat oclNextPts, oclStatus, oclErr;
-
-        OCL_TEST_CYCLE()
-                oclPyrLK.sparse(oclFrame0, oclFrame1, oclPts, oclNextPts, oclStatus, &oclErr);
-    }
-    else
-        OCL_PERF_ELSE
-
-    int value = 0;
-    SANITY_CHECK(value);
-}
-
-PERF_TEST(tvl1flowFixture, tvl1flow)
-{
-    Mat frame0 = imread(getDataPath("gpu/opticalflow/rubberwhale1.png"), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty()) << "can't load rubberwhale1.png";
-
-    Mat frame1 = imread(getDataPath("gpu/opticalflow/rubberwhale2.png"), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty()) << "can't load rubberwhale2.png";
-
-    const Size srcSize = frame0.size();
-    const double eps = 1.2;
-    Mat flow(srcSize, CV_32FC2), flow1(srcSize, CV_32FC1), flow2(srcSize, CV_32FC1);
-    declare.in(frame0, frame1).out(flow1, flow2).time(159);
-
-    if (RUN_PLAIN_IMPL)
-    {
-        Ptr<DenseOpticalFlow> alg = createOptFlow_DualTVL1();
-
-        TEST_CYCLE() alg->calc(frame0, frame1, flow);
-
-        alg->collectGarbage();
-        Mat flows[2] = { flow1, flow2 };
-        split(flow, flows);
-
-        SANITY_CHECK(flow1, eps);
-        SANITY_CHECK(flow2, eps);
-    }
-    else if (RUN_OCL_IMPL)
-    {
-        ocl::OpticalFlowDual_TVL1_OCL oclAlg;
-        ocl::oclMat oclFrame0(frame0), oclFrame1(frame1), oclFlow1(srcSize, CV_32FC1),
-                oclFlow2(srcSize, CV_32FC1);
-
-        OCL_TEST_CYCLE() oclAlg(oclFrame0, oclFrame1, oclFlow1, oclFlow2);
-
-        oclAlg.collectGarbage();
-
-        oclFlow1.download(flow1);
-        oclFlow2.download(flow2);
-
-        SANITY_CHECK(flow1, eps);
-        SANITY_CHECK(flow2, eps);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// FarnebackOpticalFlow ////////////////////////
-
-CV_ENUM(farneFlagType, 0, OPTFLOW_FARNEBACK_GAUSSIAN)
-
-typedef tuple<tuple<int, double>, farneFlagType, bool> FarnebackOpticalFlowParams;
-typedef TestBaseWithParam<FarnebackOpticalFlowParams> FarnebackOpticalFlowFixture;
-
-PERF_TEST_P(FarnebackOpticalFlowFixture, FarnebackOpticalFlow,
-            ::testing::Combine(
-                ::testing::Values(make_tuple<int, double>(5, 1.1),
-                                  make_tuple<int, double>(7, 1.5)),
-                farneFlagType::all(),
-                ::testing::Bool()))
-{
-    Mat frame0 = imread(getDataPath("gpu/opticalflow/rubberwhale1.png"), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty()) << "can't load rubberwhale1.png";
-
-    Mat frame1 = imread(getDataPath("gpu/opticalflow/rubberwhale2.png"), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty()) << "can't load rubberwhale2.png";
-
-    const Size srcSize = frame0.size();
-
-    const FarnebackOpticalFlowParams params = GetParam();
-    const tuple<int, double> polyParams = get<0>(params);
-    const int polyN = get<0>(polyParams), flags = get<1>(params);
-    const double polySigma = get<1>(polyParams), pyrScale = 0.5;
-    const bool useInitFlow = get<2>(params);
-    const double eps = 1.5;
-
-    Mat flowx(srcSize, CV_32FC1), flowy(srcSize, CV_32FC1), flow(srcSize, CV_32FC2);
-    declare.in(frame0, frame1).out(flowx, flowy);
-
-    ocl::FarnebackOpticalFlow farn;
-    farn.pyrScale = pyrScale;
-    farn.polyN = polyN;
-    farn.polySigma = polySigma;
-    farn.flags = flags;
-
-    if (RUN_PLAIN_IMPL)
-    {
-        if (useInitFlow)
-        {
-            calcOpticalFlowFarneback(
-                        frame0, frame1, flow, farn.pyrScale, farn.numLevels, farn.winSize,
-                        farn.numIters, farn.polyN, farn.polySigma, farn.flags);
-            farn.flags |= OPTFLOW_USE_INITIAL_FLOW;
-        }
-
-        TEST_CYCLE()
-                calcOpticalFlowFarneback(
-                    frame0, frame1, flow, farn.pyrScale, farn.numLevels, farn.winSize,
-                    farn.numIters, farn.polyN, farn.polySigma, farn.flags);
-
-        Mat flowxy[2] = { flowx, flowy };
-        split(flow, flowxy);
-
-        SANITY_CHECK(flowx, eps);
-        SANITY_CHECK(flowy, eps);
-    }
-    else if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclFrame0(frame0), oclFrame1(frame1),
-                oclFlowx(srcSize, CV_32FC1), oclFlowy(srcSize, CV_32FC1);
-
-        if (useInitFlow)
-        {
-            farn(oclFrame0, oclFrame1, oclFlowx, oclFlowy);
-            farn.flags |= OPTFLOW_USE_INITIAL_FLOW;
-        }
-
-        OCL_TEST_CYCLE()
-                farn(oclFrame0, oclFrame1, oclFlowx, oclFlowy);
-
-        oclFlowx.download(flowx);
-        oclFlowy.download(flowy);
-
-        SANITY_CHECK(flowx, eps);
-        SANITY_CHECK(flowy, eps);
-    }
-    else
-        OCL_PERF_ELSE
-}
diff --git a/modules/ocl/perf/perf_precomp.hpp b/modules/ocl/perf/perf_precomp.hpp
deleted file mode 100644
index 01626d5a7..000000000
--- a/modules/ocl/perf/perf_precomp.hpp
+++ /dev/null
@@ -1,196 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wmissing-declarations"
-#  if defined __clang__ || defined __APPLE__
-#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
-#    pragma GCC diagnostic ignored "-Wextra"
-#  endif
-#endif
-
-#ifndef __OPENCV_PERF_PRECOMP_HPP__
-#define __OPENCV_PERF_PRECOMP_HPP__
-
-#ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wmissing-declarations"
-#  if defined __clang__ || defined __APPLE__
-#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
-#    pragma GCC diagnostic ignored "-Wextra"
-#  endif
-#endif
-
-#include <iomanip>
-#include <stdexcept>
-#include <string>
-#include <iostream>
-#include <cstdio>
-#include <vector>
-#include <numeric>
-
-#include "cvconfig.h"
-#include "opencv2/core.hpp"
-#include "opencv2/core/utility.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/highgui.hpp"
-#include "opencv2/calib3d.hpp"
-#include "opencv2/video.hpp"
-#include "opencv2/objdetect.hpp"
-#include "opencv2/features2d.hpp"
-#include "opencv2/ocl.hpp"
-#include "opencv2/ts.hpp"
-
-using namespace std;
-using namespace cv;
-
-#define OCL_SIZE_1000 Size(1000, 1000)
-#define OCL_SIZE_2000 Size(2000, 2000)
-#define OCL_SIZE_4000 Size(4000, 4000)
-
-#define OCL_TYPICAL_MAT_SIZES ::testing::Values(OCL_SIZE_1000, OCL_SIZE_2000, OCL_SIZE_4000)
-
-#define OCL_PERF_ENUM(type, ...) ::testing::Values(type, ## __VA_ARGS__ )
-
-#define IMPL_OCL "ocl"
-#define IMPL_GPU "gpu"
-#define IMPL_PLAIN "plain"
-
-#define RUN_OCL_IMPL (IMPL_OCL == getSelectedImpl())
-#define RUN_PLAIN_IMPL (IMPL_PLAIN == getSelectedImpl())
-
-#ifdef HAVE_OPENCV_GPU
-# define RUN_GPU_IMPL (IMPL_GPU == getSelectedImpl())
-#endif
-
-#ifdef HAVE_OPENCV_GPU
-#define OCL_PERF_ELSE               \
-        if (RUN_GPU_IMPL)           \
-            CV_TEST_FAIL_NO_IMPL(); \
-        else                        \
-            CV_TEST_FAIL_NO_IMPL();
-#else
-#define OCL_PERF_ELSE               \
-            CV_TEST_FAIL_NO_IMPL();
-#endif
-
-#define OCL_TEST_CYCLE_N(n) for(declare.iterations(n); startTimer(), next(); cv::ocl::finish(), stopTimer())
-#define OCL_TEST_CYCLE() for(; startTimer(), next(); cv::ocl::finish(), stopTimer())
-#define OCL_TEST_CYCLE_MULTIRUN(runsNum) for(declare.runs(runsNum); startTimer(), next(); stopTimer()) for(int r = 0; r < runsNum; cv::ocl::finish(), ++r)
-
-// TODO: Move to the ts module
-namespace cvtest {
-namespace ocl {
-inline void checkDeviceMaxMemoryAllocSize(const Size& size, int type, int factor = 1)
-{
-    assert(factor > 0);
-    if (!(IMPL_OCL == perf::TestBase::getSelectedImpl()))
-        return; // OpenCL devices are not used
-    int cn = CV_MAT_CN(type);
-    int cn_ocl = cn == 3 ? 4 : cn;
-    int type_ocl = CV_MAKE_TYPE(CV_MAT_DEPTH(type), cn_ocl);
-    size_t memSize = size.area() * CV_ELEM_SIZE(type_ocl);
-    const cv::ocl::DeviceInfo& devInfo = cv::ocl::Context::getContext()->getDeviceInfo();
-    if (memSize * factor >= devInfo.maxMemAllocSize)
-    {
-        throw perf::TestBase::PerfSkipTestException();
-    }
-}
-
-struct KeypointIdxCompare
-{
-    std::vector<cv::KeyPoint>* keypoints;
-
-    explicit KeypointIdxCompare(std::vector<cv::KeyPoint>* _keypoints) : keypoints(_keypoints) {}
-
-    bool operator ()(size_t i1, size_t i2) const
-    {
-        cv::KeyPoint kp1 = (*keypoints)[i1];
-        cv::KeyPoint kp2 = (*keypoints)[i2];
-        if (kp1.pt.x != kp2.pt.x)
-            return kp1.pt.x < kp2.pt.x;
-        if (kp1.pt.y != kp2.pt.y)
-            return kp1.pt.y < kp2.pt.y;
-        if (kp1.response != kp2.response)
-            return kp1.response < kp2.response;
-        return kp1.octave < kp2.octave;
-    }
-};
-
-inline void sortKeyPoints(std::vector<cv::KeyPoint>& keypoints, cv::InputOutputArray _descriptors = cv::noArray())
-{
-    std::vector<size_t> indexies(keypoints.size());
-    for (size_t i = 0; i < indexies.size(); ++i)
-        indexies[i] = i;
-
-    std::sort(indexies.begin(), indexies.end(), KeypointIdxCompare(&keypoints));
-
-    std::vector<cv::KeyPoint> new_keypoints;
-    cv::Mat new_descriptors;
-
-    new_keypoints.resize(keypoints.size());
-
-    cv::Mat descriptors;
-    if (_descriptors.needed())
-    {
-        descriptors = _descriptors.getMat();
-        new_descriptors.create(descriptors.size(), descriptors.type());
-    }
-
-    for (size_t i = 0; i < indexies.size(); ++i)
-    {
-        size_t new_idx = indexies[i];
-        new_keypoints[i] = keypoints[new_idx];
-        if (!new_descriptors.empty())
-            descriptors.row((int) new_idx).copyTo(new_descriptors.row((int) i));
-    }
-
-    keypoints.swap(new_keypoints);
-    if (_descriptors.needed())
-        new_descriptors.copyTo(_descriptors);
-}
-
-} // namespace cvtest::ocl
-} // namespace cvtest
-
-using namespace cvtest::ocl;
-
-#endif
diff --git a/modules/ocl/perf/perf_split_merge.cpp b/modules/ocl/perf/perf_split_merge.cpp
deleted file mode 100644
index ecfc49e33..000000000
--- a/modules/ocl/perf/perf_split_merge.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Fangfang Bai, fangfang@multicorewareinc.com
-//    Jin Ma,       jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "perf_precomp.hpp"
-
-using namespace perf;
-using std::tr1::tuple;
-using std::tr1::get;
-
-///////////// Merge////////////////////////
-
-typedef Size_MatType MergeFixture;
-
-PERF_TEST_P(MergeFixture, Merge,
-            ::testing::Combine(::testing::Values(OCL_SIZE_1000, OCL_SIZE_2000),
-                               OCL_PERF_ENUM(CV_8U, CV_32F)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int depth = get<1>(params), channels = 3;
-    const int dstType = CV_MAKE_TYPE(depth, channels);
-
-    checkDeviceMaxMemoryAllocSize(srcSize, dstType);
-
-    Mat dst(srcSize, dstType);
-    vector<Mat> src(channels);
-    for (vector<Mat>::iterator i = src.begin(), end = src.end(); i != end; ++i)
-    {
-        i->create(srcSize, CV_MAKE_TYPE(depth, 1));
-        declare.in(*i, WARMUP_RNG);
-    }
-    declare.out(dst);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclDst(srcSize, dstType);
-        vector<ocl::oclMat> oclSrc(src.size());
-        for (vector<ocl::oclMat>::size_type i = 0, end = src.size(); i < end; ++i)
-            oclSrc[i] = src[i];
-
-        OCL_TEST_CYCLE() cv::ocl::merge(oclSrc, oclDst);
-
-        oclDst.download(dst);
-
-        SANITY_CHECK(dst);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        TEST_CYCLE() cv::merge(src, dst);
-
-        SANITY_CHECK(dst);
-    }
-    else
-        OCL_PERF_ELSE
-}
-
-///////////// Split////////////////////////
-
-typedef Size_MatType SplitFixture;
-
-PERF_TEST_P(SplitFixture, Split,
-            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
-                               OCL_PERF_ENUM(CV_8U, CV_32F)))
-{
-    const Size_MatType_t params = GetParam();
-    const Size srcSize = get<0>(params);
-    const int depth = get<1>(params), channels = 3;
-    const int type = CV_MAKE_TYPE(depth, channels);
-
-    checkDeviceMaxMemoryAllocSize(srcSize, type);
-
-    Mat src(srcSize, type);
-    declare.in(src, WARMUP_RNG);
-
-    if (RUN_OCL_IMPL)
-    {
-        ocl::oclMat oclSrc(src);
-        vector<ocl::oclMat> oclDst(channels, ocl::oclMat(srcSize, CV_MAKE_TYPE(depth, 1)));
-
-        OCL_TEST_CYCLE() cv::ocl::split(oclSrc, oclDst);
-
-        ASSERT_EQ(3, channels);
-        Mat dst0, dst1, dst2;
-        oclDst[0].download(dst0);
-        oclDst[1].download(dst1);
-        oclDst[2].download(dst2);
-        SANITY_CHECK(dst0);
-        SANITY_CHECK(dst1);
-        SANITY_CHECK(dst2);
-    }
-    else if (RUN_PLAIN_IMPL)
-    {
-        vector<Mat> dst(channels, Mat(srcSize, CV_MAKE_TYPE(depth, 1)));
-        TEST_CYCLE() cv::split(src, dst);
-
-        ASSERT_EQ(3, channels);
-        Mat & dst0 = dst[0], & dst1 = dst[1], & dst2 = dst[2];
-        SANITY_CHECK(dst0);
-        SANITY_CHECK(dst1);
-        SANITY_CHECK(dst2);
-    }
-    else
-        OCL_PERF_ELSE
-}
diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
deleted file mode 100644
index d008e8b40..000000000
--- a/modules/ocl/src/arithm.cpp
+++ /dev/null
@@ -1,1804 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Shengen Yan, yanshengen@gmail.com
-//    Jiang Liyuan, jlyuan001.good@163.com
-//    Rock Li, Rock.Li@amd.com
-//    Zailong Wu, bullet@yeah.net
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-static std::vector<uchar> scalarToVector(const cv::Scalar & sc, int depth, int ocn, int cn)
-{
-    CV_Assert(ocn == cn || (ocn == 4 && cn == 3));
-
-    static const int sizeMap[] = { sizeof(uchar), sizeof(char), sizeof(ushort),
-                               sizeof(short), sizeof(int), sizeof(float), sizeof(double) };
-
-    int elemSize1 = sizeMap[depth];
-    int bufSize = elemSize1 * ocn;
-    std::vector<uchar> _buf(bufSize);
-    uchar * buf = &_buf[0];
-    scalarToRawData(sc, buf, CV_MAKE_TYPE(depth, cn));
-    memset(buf + elemSize1 * cn, 0, (ocn - cn) * elemSize1);
-
-    return _buf;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-/////////////// add subtract multiply divide min max /////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-enum { ADD = 0, SUB, MUL, DIV, ABS, ABS_DIFF, MIN, MAX };
-
-static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const Scalar & scalar, const oclMat & mask,
-                            oclMat &dst, int op_type, bool use_scalar = false)
-{
-    Context *clCxt = src1.clCxt;
-    bool hasDouble = clCxt->supportsFeature(FEATURE_CL_DOUBLE);
-    if (!hasDouble && (src1.depth() == CV_64F || src2.depth() == CV_64F || dst.depth() == CV_64F))
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    CV_Assert(src2.empty() || (!src2.empty() && src1.type() == src2.type() && src1.size() == src2.size()));
-    CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size()));
-    CV_Assert(op_type >= ADD && op_type <= MAX);
-
-    dst.create(src1.size(), src1.type());
-
-    int oclChannels = src1.oclchannels(), depth = src1.depth();
-    int src1step1 = src1.step / src1.elemSize(), src1offset1 = src1.offset / src1.elemSize();
-    int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize();
-    int maskstep1 = mask.step, maskoffset1 = mask.offset / mask.elemSize();
-    int dststep1 = dst.step / dst.elemSize(), dstoffset1 = dst.offset / dst.elemSize();
-    std::vector<uchar> m;
-
-#ifdef ANDROID
-    size_t localThreads[3]  = { 16, 10, 1 };
-#else
-    size_t localThreads[3]  = { 16, 16, 1 };
-#endif
-    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-
-    std::string kernelName = "arithm_binary_op";
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    const char * const WTypeMap[] = { "short", "short", "int", "int", "int", "float", "double" };
-    const char * const funcMap[] = { "FUNC_ADD", "FUNC_SUB", "FUNC_MUL", "FUNC_DIV", "FUNC_ABS", "FUNC_ABS_DIFF", "FUNC_MIN", "FUNC_MAX" };
-    const char * const channelMap[] = { "", "", "2", "4", "4" };
-    bool haveScalar = use_scalar || src2.empty();
-
-    int WDepth = depth;
-    if (haveScalar)
-        WDepth = hasDouble && WDepth == CV_64F ? CV_64F : CV_32F;
-    if (op_type == DIV)
-        WDepth = hasDouble ? CV_64F : CV_32F;
-    else if (op_type == MUL)
-        WDepth = hasDouble && (depth == CV_32S || depth == CV_64F) ? CV_64F : CV_32F;
-
-    std::string buildOptions = format("-D T=%s%s -D WT=%s%s -D convertToT=convert_%s%s%s -D %s "
-                                      "-D convertToWT=convert_%s%s",
-                                      typeMap[depth], channelMap[oclChannels],
-                                      WTypeMap[WDepth], channelMap[oclChannels],
-                                      typeMap[depth], channelMap[oclChannels], (depth >= CV_32F ? "" : (depth == CV_32S ? "_rte" : "_sat_rte")),
-                                      funcMap[op_type], WTypeMap[WDepth], channelMap[oclChannels]);
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1 ));
-
-    if (!src2.empty())
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1 ));
-
-        kernelName += "_mat";
-
-        if (haveScalar)
-            buildOptions += " -D HAVE_SCALAR";
-    }
-
-    if (haveScalar)
-    {
-        const int WDepthMap[] = { CV_16S, CV_16S, CV_32S, CV_32S, CV_32S, CV_32F, CV_64F };
-        m = scalarToVector(scalar, WDepthMap[WDepth], oclChannels, src1.channels());
-
-        args.push_back( std::make_pair( m.size(), (void *)&m[0]));
-
-        kernelName += "_scalar";
-    }
-
-    if (!mask.empty())
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&maskstep1 ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&maskoffset1 ));
-
-        kernelName += "_mask";
-    }
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
-
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-
-    openCLExecuteKernel(clCxt, mask.empty() ?
-                            (!src2.empty() ? &arithm_add : &arithm_add_scalar) :
-                            (!src2.empty() ? &arithm_add_mask : &arithm_add_scalar_mask),
-                        kernelName, globalThreads, localThreads,
-                        args, -1, -1, buildOptions.c_str());
-}
-
-void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
-{
-    arithmetic_run_generic(src1, src2, Scalar(), mask, dst, ADD);
-}
-
-void cv::ocl::add(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
-{
-    arithmetic_run_generic(src1, oclMat(), src2, mask, dst, ADD);
-}
-
-void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
-{
-    arithmetic_run_generic(src1, src2, Scalar(), mask, dst, SUB);
-}
-
-void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
-{
-    arithmetic_run_generic(src1, oclMat(), src2, mask, dst, SUB);
-}
-
-void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
-{
-    const bool use_scalar = !(std::abs(scalar - 1.0) < std::numeric_limits<double>::epsilon());
-    arithmetic_run_generic(src1, src2, Scalar::all(scalar), oclMat(), dst, MUL, use_scalar);
-}
-
-void cv::ocl::multiply(double scalar, const oclMat &src, oclMat &dst)
-{
-    arithmetic_run_generic(src, oclMat(), Scalar::all(scalar), oclMat(), dst, MUL);
-}
-
-void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
-{
-    const bool use_scalar = !(std::abs(scalar - 1.0) < std::numeric_limits<double>::epsilon());
-    arithmetic_run_generic(src1, src2, Scalar::all(scalar), oclMat(), dst, DIV, use_scalar);
-}
-
-void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)
-{
-    arithmetic_run_generic(src, oclMat(), Scalar::all(scalar), oclMat(), dst, DIV);
-}
-
-void cv::ocl::min(const oclMat &src1, const oclMat &src2, oclMat &dst)
-{
-    arithmetic_run_generic(src1, src2, Scalar::all(0), oclMat(), dst, MIN);
-}
-
-void cv::ocl::max(const oclMat &src1, const oclMat &src2, oclMat &dst)
-{
-    arithmetic_run_generic(src1, src2, Scalar::all(0), oclMat(), dst, MAX);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-/////////////////////////////Abs, Absdiff ////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-void cv::ocl::abs(const oclMat &src, oclMat &dst)
-{
-    // explicitly uses use_scalar (even if zero) so that the correct kernel is used
-    arithmetic_run_generic(src, oclMat(), Scalar(), oclMat(), dst, ABS, true);
-}
-
-void cv::ocl::absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst)
-{
-    arithmetic_run_generic(src1, src2, Scalar(), oclMat(), dst, ABS_DIFF);
-}
-
-void cv::ocl::absdiff(const oclMat &src1, const Scalar &src2, oclMat &dst)
-{
-    arithmetic_run_generic(src1, oclMat(), src2, oclMat(), dst, ABS_DIFF);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////  compare ///////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpOp,
-                        String kernelName, const cv::ocl::ProgramEntry* source)
-{
-    dst.create(src1.size(), CV_8UC1);
-
-    int depth = src1.depth();
-    size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-
-    int src1step1 = src1.step1(), src1offset1 = src1.offset / src1.elemSize1();
-    int src2step1 = src2.step1(), src2offset1 = src2.offset / src2.elemSize1();
-    int dststep1 = dst.step1(), dstoffset1 = dst.offset / dst.elemSize1();
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    const char * operationMap[] = { "==", ">", ">=", "<", "<=", "!=" };
-    std::string buildOptions = format("-D T=%s -D Operation=%s", typeMap[depth], operationMap[cmpOp]);
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1 ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1 ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-
-    openCLExecuteKernel(src1.clCxt, source, kernelName, globalThreads, localThreads,
-                        args, -1, -1, buildOptions.c_str());
-}
-
-void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int cmpOp)
-{
-    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    CV_Assert(src1.type() == src2.type() && src1.channels() == 1);
-    CV_Assert(cmpOp >= CMP_EQ && cmpOp <= CMP_NE);
-
-    compare_run(src1, src2, dst, cmpOp, "arithm_compare", &arithm_compare);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-////////////////////////////////// sum  //////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-enum { SUM = 0, ABS_SUM, SQR_SUM };
-
-static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int groupnum, int type, int ddepth)
-{
-    int ochannels = src.oclchannels();
-    int all_cols = src.step / src.elemSize();
-    int pre_cols = (src.offset % src.step) / src.elemSize();
-    int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
-    int invalid_cols = pre_cols + sec_cols;
-    int cols = all_cols - invalid_cols , elemnum = cols * src.rows;;
-    int offset = src.offset / src.elemSize();
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    const char * const funcMap[] = { "FUNC_SUM", "FUNC_ABS_SUM", "FUNC_SQR_SUM" };
-    const char * const channelMap[] = { " ", " ", "2", "4", "4" };
-    String buildOptions = format("-D srcT=%s%s -D dstT=%s%s -D convertToDstT=convert_%s%s -D %s",
-                                 typeMap[src.depth()], channelMap[ochannels],
-                                 typeMap[ddepth], channelMap[ochannels],
-                                 typeMap[ddepth], channelMap[ochannels],
-                                 funcMap[type]);
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
-    size_t globalThreads[3] = { groupnum * 256, 1, 1 };
-
-#ifdef ANDROID
-    openCLExecuteKernel(src.clCxt, &arithm_sum, "arithm_op_sum", globalThreads, NULL,
-                        args, -1, -1, buildOptions.c_str());
-#else
-    size_t localThreads[3] = { 256, 1, 1 };
-    openCLExecuteKernel(src.clCxt, &arithm_sum, "arithm_op_sum", globalThreads, localThreads,
-                        args, -1, -1, buildOptions.c_str());
-#endif
-}
-
-template <typename T>
-Scalar arithmetic_sum(const oclMat &src, int type, int ddepth)
-{
-    CV_Assert(src.step % src.elemSize() == 0);
-
-    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
-    CV_Assert(groupnum != 0);
-
-    int dbsize = groupnum * src.oclchannels();
-    Context *clCxt = src.clCxt;
-
-    AutoBuffer<T> _buf(dbsize);
-    T *p = (T*)_buf;
-    memset(p, 0, dbsize * sizeof(T));
-
-    cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(T));
-    arithmetic_sum_buffer_run(src, dstBuffer, groupnum, type, ddepth);
-    openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(T));
-    openCLFree(dstBuffer);
-
-    Scalar s = Scalar::all(0.0);
-    for (int i = 0; i < dbsize;)
-         for (int j = 0; j < src.oclchannels(); j++, i++)
-            s.val[j] += p[i];
-
-    return s;
-}
-
-typedef Scalar (*sumFunc)(const oclMat &src, int type, int ddepth);
-
-Scalar cv::ocl::sum(const oclMat &src)
-{
-    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return Scalar::all(0);
-    }
-    static sumFunc functab[3] =
-    {
-        arithmetic_sum<int>,
-        arithmetic_sum<float>,
-        arithmetic_sum<double>
-    };
-
-    int ddepth = std::max(src.depth(), CV_32S);
-    sumFunc func = functab[ddepth - CV_32S];
-    return func(src, SUM, ddepth);
-}
-
-Scalar cv::ocl::absSum(const oclMat &src)
-{
-    int sdepth = src.depth();
-    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && sdepth == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return cv::Scalar::all(0);
-    }
-
-    if (sdepth == CV_8U || sdepth == CV_16U)
-        return sum(src);
-
-    static sumFunc functab[3] =
-    {
-        arithmetic_sum<int>,
-        arithmetic_sum<float>,
-        arithmetic_sum<double>
-    };
-
-    int ddepth = std::max(sdepth, CV_32S);
-    sumFunc func = functab[ddepth - CV_32S];
-    return func(src, ABS_SUM, ddepth);
-}
-
-Scalar cv::ocl::sqrSum(const oclMat &src)
-{
-    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return cv::Scalar::all(0);
-    }
-    static sumFunc functab[3] =
-    {
-        arithmetic_sum<int>,
-        arithmetic_sum<float>,
-        arithmetic_sum<double>
-    };
-
-    int ddepth = std::max(src.depth(), CV_32S);
-    sumFunc func = functab[ddepth - CV_32S];
-    return func(src, SQR_SUM, ddepth);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//////////////////////////////// meanStdDev //////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev)
-{
-    if (src.depth() == CV_64F && !src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    double total = 1.0 / src.size().area();
-
-    mean = sum(src);
-    stddev = sqrSum(src);
-
-    for (int i = 0; i < 4; ++i)
-    {
-        mean[i] *= total;
-        stddev[i] = std::sqrt(std::max(stddev[i] * total - mean.val[i] * mean.val[i] , 0.));
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////// minMax  /////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-template <typename T, typename WT>
-static void arithmetic_minMax_run(const oclMat &src, const oclMat & mask, cl_mem &dst, int groupnum, String kernelName)
-{
-    int all_cols = src.step / src.elemSize();
-    int pre_cols = (src.offset % src.step) / src.elemSize();
-    int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
-    int invalid_cols = pre_cols + sec_cols;
-    int cols = all_cols - invalid_cols , elemnum = cols * src.rows;
-    int offset = src.offset / src.elemSize();
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    const char * const channelMap[] = { " ", " ", "2", "4", "4" };
-
-    std::ostringstream stream;
-    stream << "-D T=" << typeMap[src.depth()] << channelMap[src.channels()];
-    if (std::numeric_limits<T>::is_integer)
-    {
-        stream << " -D MAX_VAL=" << (WT)std::numeric_limits<T>::max();
-        stream << " -D MIN_VAL=" << (WT)std::numeric_limits<T>::min();
-    }
-    else
-        stream << " -D DEPTH_" << src.depth();
-    std::string buildOptions = stream.str();
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
-
-    int minvalid_cols = 0, moffset = 0;
-    if (!mask.empty())
-    {
-        int mall_cols = mask.step / mask.elemSize();
-        int mpre_cols = (mask.offset % mask.step) / mask.elemSize();
-        int msec_cols = mall_cols - (mask.offset % mask.step + mask.cols * mask.elemSize() - 1) / mask.elemSize() - 1;
-        minvalid_cols = mpre_cols + msec_cols;
-        moffset = mask.offset / mask.elemSize();
-
-        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&minvalid_cols ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&moffset ));
-
-        kernelName += "_mask";
-    }
-
-    size_t globalThreads[3] = {groupnum * 256, 1, 1};
-    size_t localThreads[3] = {256, 1, 1};
-
-    // kernel use fixed grid size, replace lt on NULL is imposible without kernel changes
-    openCLExecuteKernel(src.clCxt, &arithm_minMax, kernelName, globalThreads, localThreads,
-                        args, -1, -1, buildOptions.c_str());
-}
-
-template <typename T, typename WT>
-void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
-{
-    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
-    CV_Assert(groupnum != 0);
-
-    int dbsize = groupnum * 2 * src.elemSize();
-    oclMat buf;
-    ensureSizeIsEnough(1, dbsize, CV_8UC1, buf);
-
-    cl_mem buf_data = reinterpret_cast<cl_mem>(buf.data);
-    arithmetic_minMax_run<T, WT>(src, mask, buf_data, groupnum, "arithm_op_minMax");
-
-    Mat matbuf = Mat(buf);
-    T *p = matbuf.ptr<T>();
-    if (minVal != NULL)
-    {
-        *minVal = std::numeric_limits<double>::max();
-        for (int i = 0, end = src.oclchannels() * (int)groupnum; i < end; i++)
-            *minVal = *minVal < p[i] ? *minVal : p[i];
-    }
-    if (maxVal != NULL)
-    {
-        *maxVal = -std::numeric_limits<double>::max();
-        for (int i = src.oclchannels() * (int)groupnum, end = i << 1; i < end; i++)
-            *maxVal = *maxVal > p[i] ? *maxVal : p[i];
-    }
-}
-
-typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask);
-
-void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
-{
-    CV_Assert(src.channels() == 1);
-    CV_Assert(src.size() == mask.size() || mask.empty());
-    CV_Assert(src.step % src.elemSize() == 0);
-
-    if (minVal == NULL && maxVal == NULL)
-        return;
-
-    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    static minMaxFunc functab[] =
-    {
-        arithmetic_minMax<uchar, int>,
-        arithmetic_minMax<char, int>,
-        arithmetic_minMax<ushort, int>,
-        arithmetic_minMax<short, int>,
-        arithmetic_minMax<int, int>,
-        arithmetic_minMax<float, float>,
-        arithmetic_minMax<double, double>,
-        0
-    };
-
-    minMaxFunc func = functab[src.depth()];
-    CV_Assert(func != 0);
-
-    func(src, minVal, maxVal, mask);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////// norm /////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-double cv::ocl::norm(const oclMat &src1, int normType)
-{
-    CV_Assert((normType & NORM_RELATIVE) == 0);
-    return norm(src1, oclMat(), normType);
-}
-
-static void arithm_absdiff_nonsaturate_run(const oclMat & src1, const oclMat & src2, oclMat & diff, int ntype)
-{
-    Context *clCxt = src1.clCxt;
-    if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-    CV_Assert(src1.step % src1.elemSize() == 0 && (src2.empty() || src2.step % src2.elemSize() == 0));
-
-    if (src2.empty() && (src1.depth() == CV_8U || src1.depth() == CV_16U))
-    {
-        src1.convertTo(diff, CV_32S);
-        return;
-    }
-
-    int ddepth = std::max(src1.depth(), CV_32S);
-    if (ntype == NORM_L2)
-        ddepth = std::max<int>(CV_32F, ddepth);
-
-    diff.create(src1.size(), CV_MAKE_TYPE(ddepth, src1.channels()));
-    CV_Assert(diff.step % diff.elemSize() == 0);
-
-    int oclChannels = src1.oclchannels(), sdepth = src1.depth();
-    int src1step1 = src1.step / src1.elemSize(), src1offset1 = src1.offset / src1.elemSize();
-    int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize();
-    int diffstep1 = diff.step / diff.elemSize(), diffoffset1 = diff.offset / diff.elemSize();
-
-    String kernelName = "arithm_absdiff_nonsaturate";
-#ifdef ANDROID
-    size_t localThreads[3]  = { 16, 10, 1 };
-#else
-    size_t localThreads[3]  = { 16, 16, 1 };
-#endif
-    size_t globalThreads[3] = { diff.cols, diff.rows, 1 };
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    const char * const channelMap[] = { "", "", "2", "4", "4" };
-
-    std::string buildOptions = format("-D srcT=%s%s -D dstT=%s%s -D convertToDstT=convert_%s%s",
-                                      typeMap[sdepth], channelMap[oclChannels],
-                                      typeMap[ddepth], channelMap[oclChannels],
-                                      typeMap[ddepth], channelMap[oclChannels]);
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1 ));
-
-    if (!src2.empty())
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1 ));
-
-        kernelName += "_binary";
-        buildOptions += " -D BINARY";
-    }
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&diff.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&diffstep1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&diffoffset1 ));
-
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-
-    openCLExecuteKernel(clCxt, &arithm_absdiff_nonsaturate,
-                        kernelName, globalThreads, localThreads,
-                        args, -1, -1, buildOptions.c_str());
-}
-
-double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
-{
-    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return -1;
-    }
-    CV_Assert(src2.empty() || (src1.type() == src2.type() && src1.size() == src2.size()));
-
-    bool isRelative = (normType & NORM_RELATIVE) != 0;
-    normType &= NORM_TYPE_MASK;
-    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
-
-    Scalar s;
-    int cn = src1.channels();
-    double r = 0;
-    oclMat diff;
-
-    arithm_absdiff_nonsaturate_run(src1, src2, diff, normType);
-
-    switch (normType)
-    {
-    case NORM_INF:
-        diff = diff.reshape(1);
-        minMax(diff, NULL, &r);
-        break;
-    case NORM_L1:
-        s = sum(diff);
-        for (int i = 0; i < cn; ++i)
-            r += s[i];
-        break;
-    case NORM_L2:
-        s = sqrSum(diff);
-        for (int i = 0; i < cn; ++i)
-            r += s[i];
-        r = std::sqrt(r);
-        break;
-    }
-    if (isRelative)
-        r = r / (norm(src2, normType) + DBL_EPSILON);
-
-    return r;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-////////////////////////////////// flip //////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS };
-
-static void arithmetic_flip_run(const oclMat &src, oclMat &dst, String kernelName, int flipType)
-{
-    int cols = dst.cols, rows = dst.rows;
-    if ((cols == 1 && flipType == FLIP_COLS) ||
-            (rows == 1 && flipType == FLIP_ROWS) ||
-            (rows == 1 && cols == 1 && flipType == FLIP_BOTH))
-    {
-        src.copyTo(dst);
-        return;
-    }
-
-    cols = flipType == FLIP_COLS ? divUp(cols, 2) : cols;
-    rows = flipType & FLIP_ROWS ? divUp(rows, 2) : rows;
-
-    const char * const channelMap[] = { "", "", "2", "4", "4" };
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    std::string buildOptions = format("-D T=%s%s", typeMap[dst.depth()], channelMap[dst.oclchannels()]);
-
-    size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { cols, rows, 1 };
-
-    int elemSize = src.elemSize();
-    int src_step = src.step / elemSize, src_offset = src.offset / elemSize;
-    int dst_step = dst.step / elemSize, dst_offset = dst.offset / elemSize;
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-
-    openCLExecuteKernel(src.clCxt, &arithm_flip, kernelName, globalThreads, localThreads, args,
-                        -1, -1, buildOptions.c_str());
-}
-
-void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode)
-{
-    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    dst.create(src.size(), src.type());
-
-    if (flipCode == 0)
-        arithmetic_flip_run(src, dst, "arithm_flip_rows", FLIP_ROWS);
-    else if (flipCode > 0)
-        arithmetic_flip_run(src, dst, "arithm_flip_cols", FLIP_COLS);
-    else
-        arithmetic_flip_run(src, dst, "arithm_flip_rows_cols", FLIP_BOTH);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-////////////////////////////////// LUT  //////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-static void arithmetic_lut_run(const oclMat &src, const oclMat &lut, oclMat &dst, String kernelName)
-{
-    int sdepth = src.depth();
-    int src_step1 = src.step1(), dst_step1 = dst.step1();
-    int src_offset1 = src.offset / src.elemSize1(), dst_offset1 = dst.offset / dst.elemSize1();
-    int lut_offset1 = lut.offset / lut.elemSize1() + (sdepth == CV_8U ? 0 : 128) * lut.channels();
-    int cols1 = src.cols * src.oclchannels();
-
-    size_t localSize[] = { 16, 16, 1 };
-    size_t globalSize[] = { lut.channels() == 1 ? cols1 : src.cols, src.rows, 1 };
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    std::string buildOptions = format("-D srcT=%s -D dstT=%s", typeMap[sdepth], typeMap[dst.depth()]);
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&lut.data ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols1));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut_offset1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-
-    openCLExecuteKernel(src.clCxt, &arithm_LUT, kernelName, globalSize, localSize,
-                        args, lut.oclchannels(), -1, buildOptions.c_str());
-}
-
-void cv::ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst)
-{
-    if (!lut.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && lut.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    int cn = src.channels(), depth = src.depth();
-
-    CV_Assert(depth == CV_8U || depth == CV_8S);
-    CV_Assert(lut.channels() == 1 || lut.channels() == src.channels());
-    CV_Assert(lut.rows == 1 && lut.cols == 256);
-
-    dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn));
-    arithmetic_lut_run(src, lut, dst, "LUT");
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//////////////////////////////// exp log /////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-static void arithmetic_exp_log_sqrt_run(const oclMat &src, oclMat &dst, String kernelName, const cv::ocl::ProgramEntry* source)
-{
-    Context  *clCxt = src.clCxt;
-    if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    CV_Assert( src.depth() == CV_32F || src.depth() == CV_64F);
-    dst.create(src.size(), src.type());
-
-    int ddepth = dst.depth();
-    int cols1 = src.cols * src.oclchannels();
-    int srcoffset1 = src.offset / src.elemSize1(), dstoffset1 = dst.offset / dst.elemSize1();
-    int srcstep1 = src.step1(), dststep1 = dst.step1();
-
-#ifdef ANDROID
-    size_t localThreads[3]  = { 64, 2, 1 };
-#else
-    size_t localThreads[3]  = { 64, 4, 1 };
-#endif
-    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-
-    std::string buildOptions = format("-D srcT=%s",
-                                      ddepth == CV_32F ? "float" : "double");
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcoffset1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcstep1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
-
-    openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads,
-                        args, src.oclchannels(), -1, buildOptions.c_str());
-}
-
-void cv::ocl::exp(const oclMat &src, oclMat &dst)
-{
-    arithmetic_exp_log_sqrt_run(src, dst, "arithm_exp", &arithm_exp);
-}
-
-void cv::ocl::log(const oclMat &src, oclMat &dst)
-{
-    arithmetic_exp_log_sqrt_run(src, dst, "arithm_log", &arithm_log);
-}
-
-void cv::ocl::sqrt(const oclMat &src, oclMat &dst)
-{
-    arithmetic_exp_log_sqrt_run(src, dst, "arithm_sqrt", &arithm_sqrt);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-////////////////////////////// magnitude phase ///////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName)
-{
-    int depth = dst.depth();
-
-#ifdef ANDROID
-    size_t localThreads[3]  = { 64, 2, 1 };
-#else
-    size_t localThreads[3]  = { 64, 4, 1 };
-#endif
-    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-
-    int src1_step = src1.step / src1.elemSize(), src1_offset = src1.offset / src1.elemSize();
-    int src2_step = src2.step / src2.elemSize(), src2_offset = src2.offset / src2.elemSize();
-    int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
-
-    const char * const channelMap[] = { "", "", "2", "4", "4" };
-    std::string buildOptions = format("-D T=%s%s", depth == CV_32F ? "float" : "double", channelMap[dst.channels()]);
-
-    openCLExecuteKernel(src1.clCxt, &arithm_magnitude, kernelName, globalThreads, localThreads, args, -1, -1, buildOptions.c_str());
-}
-
-void cv::ocl::magnitude(const oclMat &src1, const oclMat &src2, oclMat &dst)
-{
-    if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    CV_Assert(src1.type() == src2.type() && src1.size() == src2.size() &&
-              (src1.depth() == CV_32F || src1.depth() == CV_64F));
-
-    dst.create(src1.size(), src1.type());
-    arithmetic_magnitude_phase_run(src1, src2, dst, "arithm_magnitude");
-}
-
-static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const cv::ocl::ProgramEntry* source)
-{
-    int depth = dst.depth(), cols1 = src1.cols * src1.oclchannels();
-    int src1step1 = src1.step / src1.elemSize1(), src1offset1 = src1.offset / src1.elemSize1();
-    int src2step1 = src2.step / src2.elemSize1(), src2offset1 = src2.offset / src2.elemSize1();
-    int dststep1 = dst.step / dst.elemSize1(), dstoffset1 = dst.offset / dst.elemSize1();
-
-#ifdef ANDROID
-    size_t localThreads[3]  = { 64, 2, 1 };
-#else
-    size_t localThreads[3]  = { 64, 4, 1 };
-#endif
-    size_t globalThreads[3] = { cols1, dst.rows, 1 };
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1 ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1 ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
-
-    openCLExecuteKernel(src1.clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
-}
-
-void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle, bool angleInDegrees)
-{
-    if (!x.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && x.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    CV_Assert(x.type() == y.type() && x.size() == y.size() && (x.depth() == CV_32F || x.depth() == CV_64F));
-    CV_Assert(x.step % x.elemSize() == 0 && y.step % y.elemSize() == 0);
-
-    Angle.create(x.size(), x.type());
-    arithmetic_phase_run(x, y, Angle, angleInDegrees ? "arithm_phase_indegrees" : "arithm_phase_inradians", &arithm_phase);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-////////////////////////////////// cartToPolar ///////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, oclMat &dst_mag, oclMat &dst_cart,
-                                String kernelName, bool angleInDegrees)
-{
-    int channels = src1.oclchannels();
-    int depth = src1.depth();
-
-    int cols = src1.cols * channels;
-
-#ifdef ANDROID
-    size_t localThreads[3]  = { 64, 2, 1 };
-#else
-    size_t localThreads[3]  = { 64, 4, 1 };
-#endif
-    size_t globalThreads[3] = { cols, src1.rows, 1 };
-
-    int src1_step = src1.step / src1.elemSize1(), src1_offset = src1.offset / src1.elemSize1();
-    int src2_step = src2.step / src2.elemSize1(), src2_offset = src2.offset / src2.elemSize1();
-    int dst_mag_step = dst_mag.step / dst_mag.elemSize1(), dst_mag_offset = dst_mag.offset / dst_mag.elemSize1();
-    int dst_cart_step = dst_cart.step / dst_cart.elemSize1(), dst_cart_offset = dst_cart.offset / dst_cart.elemSize1();
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst_mag.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_mag_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_mag_offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst_cart.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_cart_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_cart_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-
-    openCLExecuteKernel(src1.clCxt, &arithm_cartToPolar, kernelName, globalThreads, localThreads, args,
-                        -1, depth, angleInDegrees ? "-D DEGREE" : "-D RADIAN");
-}
-
-void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat &angle, bool angleInDegrees)
-{
-    if (!x.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && x.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    CV_Assert(x.type() == y.type() && x.size() == y.size() && (x.depth() == CV_32F || x.depth() == CV_64F));
-
-    mag.create(x.size(), x.type());
-    angle.create(x.size(), x.type());
-
-    arithmetic_cartToPolar_run(x, y, mag, angle, "arithm_cartToPolar", angleInDegrees);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-////////////////////////////////// polarToCart ///////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees,
-                        String kernelName)
-{
-    int channels = src2.oclchannels(), depth = src2.depth();
-    int cols = src2.cols * channels, rows = src2.rows;
-
-#ifdef ANDROID
-    size_t localThreads[3]  = { 64, 2, 1 };
-#else
-    size_t localThreads[3]  = { 64, 4, 1 };
-#endif
-    size_t globalThreads[3] = { cols, rows, 1 };
-
-    int src1_step = src1.step / src1.elemSize1(), src1_offset = src1.offset / src1.elemSize1();
-    int src2_step = src2.step / src2.elemSize1(), src2_offset = src2.offset / src2.elemSize1();
-    int dst1_step = dst1.step / dst1.elemSize1(), dst1_offset = dst1.offset / dst1.elemSize1();
-    int dst2_step = dst2.step / dst2.elemSize1(), dst2_offset = dst2.offset / dst2.elemSize1();
-
-    std::vector<std::pair<size_t , const void *> > args;
-    if (src1.data)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_offset ));
-    }
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst1_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst1_offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst2_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst2_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-
-    openCLExecuteKernel(src1.clCxt, &arithm_polarToCart, kernelName, globalThreads, localThreads,
-                        args, -1, depth, angleInDegrees ? "-D DEGREE" : "-D RADIAN");
-}
-
-void cv::ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees)
-{
-    if (!magnitude.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && magnitude.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    CV_Assert(angle.depth() == CV_32F || angle.depth() == CV_64F);
-    CV_Assert(magnitude.size() == angle.size() && magnitude.type() == angle.type());
-
-    x.create(angle.size(), angle.type());
-    y.create(angle.size(), angle.type());
-
-    if ( magnitude.data )
-        arithmetic_ptc_run(magnitude, angle, x, y, angleInDegrees, "arithm_polarToCart_mag");
-    else
-        arithmetic_ptc_run(magnitude, angle, x, y, angleInDegrees, "arithm_polarToCart");
-}
-
-//////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////// minMaxLoc ////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-static void arithmetic_minMaxLoc_run(const oclMat &src, cl_mem &dst, int vlen , int groupnum)
-{
-    std::vector<std::pair<size_t , const void *> > args;
-    int all_cols = src.step / (vlen * src.elemSize1());
-    int pre_cols = (src.offset % src.step) / (vlen * src.elemSize1());
-    int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize1() - 1) / (vlen * src.elemSize1()) - 1;
-    int invalid_cols = pre_cols + sec_cols;
-    int cols = all_cols - invalid_cols , elemnum = cols * src.rows;;
-    int offset = src.offset / (vlen * src.elemSize1());
-    int repeat_s = src.offset / src.elemSize1() - offset * vlen;
-    int repeat_e = (offset + cols) * vlen - src.offset / src.elemSize1() - src.cols;
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
-    char build_options[50];
-    sprintf(build_options, "-D DEPTH_%d -D REPEAT_S%d -D REPEAT_E%d", src.depth(), repeat_s, repeat_e);
-    size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
-
-    // kernel use fixed grid size, replace lt on NULL is imposible without kernel changes
-    openCLExecuteKernel(src.clCxt, &arithm_minMaxLoc, "arithm_op_minMaxLoc", gt, lt, args, -1, -1, build_options);
-}
-
-static void arithmetic_minMaxLoc_mask_run(const oclMat &src, const oclMat &mask, cl_mem &dst, int vlen, int groupnum)
-{
-    std::vector<std::pair<size_t , const void *> > args;
-    size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
-    char build_options[50];
-    if (src.oclchannels() == 1)
-    {
-        int cols = (src.cols - 1) / vlen + 1;
-        int invalid_cols = src.step / (vlen * src.elemSize1()) - cols;
-        int offset = src.offset / src.elemSize1();
-        int repeat_me = vlen - (mask.cols % vlen == 0 ? vlen : mask.cols % vlen);
-        int minvalid_cols = mask.step / (vlen * mask.elemSize1()) - cols;
-        int moffset = mask.offset / mask.elemSize1();
-        int elemnum = cols * src.rows;
-        sprintf(build_options, "-D DEPTH_%d -D REPEAT_E%d", src.depth(), repeat_me);
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
-        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&minvalid_cols ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&moffset ));
-        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
-
-        // kernel use fixed grid size, replace lt on NULL is imposible without kernel changes
-        openCLExecuteKernel(src.clCxt, &arithm_minMaxLoc_mask, "arithm_op_minMaxLoc_mask", gt, lt, args, -1, -1, build_options);
-    }
-}
-
-template <typename T>
-void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
-                          Point *minLoc, Point *maxLoc, const oclMat &mask)
-{
-    CV_Assert(src.oclchannels() == 1);
-    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
-    CV_Assert(groupnum != 0);
-    int minloc = -1 , maxloc = -1;
-    int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) ;
-    Context *clCxt = src.clCxt;
-    cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
-    *minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
-
-    if (mask.empty())
-        arithmetic_minMaxLoc_run(src, dstBuffer, vlen, groupnum);
-    else
-        arithmetic_minMaxLoc_mask_run(src, mask, dstBuffer, vlen, groupnum);
-
-    AutoBuffer<T> _buf(groupnum * vlen * 4);
-    T *p = (T*)_buf;
-    memset(p, 0, dbsize);
-
-    openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
-    for (int i = 0; i < vlen * (int)groupnum; i++)
-    {
-        *minVal = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? *minVal : p[i];
-        minloc = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? minloc : cvRound(p[i + 2 * vlen * groupnum]);
-    }
-    for (int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
-    {
-        *maxVal = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? *maxVal : p[i];
-        maxloc = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? maxloc : cvRound(p[i + 2 * vlen * groupnum]);
-    }
-
-    int pre_rows = src.offset / src.step;
-    int pre_cols = (src.offset % src.step) / src.elemSize1();
-    int wholecols = src.step / src.elemSize1();
-    if ( minLoc )
-    {
-        if ( minloc >= 0 )
-        {
-            minLoc->y = minloc / wholecols - pre_rows;
-            minLoc->x = minloc % wholecols - pre_cols;
-        }
-        else
-            minLoc->x = minLoc->y = -1;
-    }
-    if ( maxLoc )
-    {
-        if ( maxloc >= 0 )
-        {
-            maxLoc->y = maxloc / wholecols - pre_rows;
-            maxLoc->x = maxloc % wholecols - pre_cols;
-        }
-        else
-            maxLoc->x = maxLoc->y = -1;
-    }
-
-    openCLSafeCall(clReleaseMemObject(dstBuffer));
-}
-
-typedef void (*minMaxLocFunc)(const oclMat &src, double *minVal, double *maxVal,
-                              Point *minLoc, Point *maxLoc, const oclMat &mask);
-
-void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
-                        Point *minLoc, Point *maxLoc, const oclMat &mask)
-{
-    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    static minMaxLocFunc functab[2] =
-    {
-        arithmetic_minMaxLoc<float>,
-        arithmetic_minMaxLoc<double>
-    };
-
-    minMaxLocFunc func;
-    func = functab[(int)src.clCxt->supportsFeature(FEATURE_CL_DOUBLE)];
-    func(src, minVal, maxVal, minLoc, maxLoc, mask);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-///////////////////////////// countNonZero ///////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int groupnum, String kernelName)
-{
-    int ochannels = src.oclchannels();
-    int all_cols = src.step / src.elemSize();
-    int pre_cols = (src.offset % src.step) / src.elemSize();
-    int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
-    int invalid_cols = pre_cols + sec_cols;
-    int cols = all_cols - invalid_cols , elemnum = cols * src.rows;;
-    int offset = src.offset / src.elemSize();
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    const char * const channelMap[] = { " ", " ", "2", "4", "4" };
-    String buildOptions = format("-D srcT=%s%s -D dstT=int%s", typeMap[src.depth()], channelMap[ochannels],
-                                 channelMap[ochannels]);
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
-
-    size_t globalThreads[3] = { groupnum * 256, 1, 1 };
-
-#ifdef ANDROID
-    openCLExecuteKernel(src.clCxt, &arithm_nonzero, kernelName, globalThreads, NULL,
-                        args, -1, -1, buildOptions.c_str());
-#else
-    size_t localThreads[3] = { 256, 1, 1 };
-    openCLExecuteKernel(src.clCxt, &arithm_nonzero, kernelName, globalThreads, localThreads,
-                        args, -1, -1, buildOptions.c_str());
-#endif
-}
-
-int cv::ocl::countNonZero(const oclMat &src)
-{
-    CV_Assert(src.step % src.elemSize() == 0);
-    CV_Assert(src.channels() == 1);
-
-    Context *clCxt = src.clCxt;
-    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "selected device doesn't support double");
-        return -1;
-    }
-
-    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
-    CV_Assert(groupnum != 0);
-    int dbsize = groupnum;
-
-    String kernelName = "arithm_op_nonzero";
-
-    AutoBuffer<int> _buf(dbsize);
-    int *p = (int*)_buf, nonzero = 0;
-    memset(p, 0, dbsize * sizeof(int));
-
-    cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(int));
-    arithmetic_countNonZero_run(src, dstBuffer, groupnum, kernelName);
-    openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(int));
-
-    for (int i = 0; i < dbsize; i++)
-        nonzero += p[i];
-
-    openCLSafeCall(clReleaseMemObject(dstBuffer));
-
-    return nonzero;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-////////////////////////////////bitwise_op////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-static void bitwise_unary_run(const oclMat &src1, oclMat &dst, String kernelName, const cv::ocl::ProgramEntry* source)
-{
-    dst.create(src1.size(), src1.type());
-
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
-
-    int vector_lengths[4][7] = {{4, 4, 4, 4, 1, 1, 1},
-        {4, 4, 4, 4, 1, 1, 1},
-        {4, 4, 4, 4, 1, 1, 1},
-        {4, 4, 4, 4, 1, 1, 1}
-    };
-
-    size_t vector_length = vector_lengths[channels - 1][depth];
-    int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
-    int cols = divUp(dst.cols * channels + offset_cols, vector_length);
-
-#ifdef ANDROID
-    size_t localThreads[3]  = { 64, 2, 1 };
-#else
-    size_t localThreads[3]  = { 64, 4, 1 };
-#endif
-    size_t globalThreads[3] = { cols, dst.rows, 1 };
-
-    int dst_step1 = dst.cols * dst.elemSize();
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
-
-    openCLExecuteKernel(src1.clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
-}
-
-enum { AND = 0, OR, XOR };
-
-static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Scalar& src3, const oclMat &mask,
-                               oclMat &dst, int operationType)
-{
-    CV_Assert(operationType >= AND && operationType <= XOR);
-    CV_Assert(src2.empty() || (!src2.empty() && src1.type() == src2.type() && src1.size() == src2.size()));
-    CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size()));
-
-    dst.create(src1.size(), src1.type());
-    oclMat m;
-
-    const char operationMap[] = { '&', '|', '^' };
-    std::string kernelName("arithm_bitwise_binary");
-
-    int vlen = std::min<int>(8, src1.elemSize1() * src1.oclchannels());
-    std::string vlenstr = vlen > 1 ? format("%d", vlen) : "";
-    std::string buildOptions = format("-D Operation=%c -D vloadn=vload%s -D vstoren=vstore%s -D elemSize=%d -D vlen=%d"
-                                      " -D ucharv=uchar%s",
-                                      operationMap[operationType], vlenstr.c_str(), vlenstr.c_str(),
-                                      (int)src1.elemSize(), vlen, vlenstr.c_str());
-
-#ifdef ANDROID
-    size_t localThreads[3]  = { 16, 10, 1 };
-#else
-    size_t localThreads[3]  = { 16, 16, 1 };
-#endif
-    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
-
-    if (src2.empty())
-    {
-        m.create(1, 1, dst.type());
-        m.setTo(src3);
-
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&m.data ));
-
-        kernelName += "_scalar";
-    }
-    else
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
-    }
-
-    if (!mask.empty())
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mask.step ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mask.offset ));
-
-        kernelName += "_mask";
-    }
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
-
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-
-    openCLExecuteKernel(src1.clCxt, mask.empty() ? (!src2.empty() ? &arithm_bitwise_binary : &arithm_bitwise_binary_scalar) :
-                                              (!src2.empty() ? &arithm_bitwise_binary_mask : &arithm_bitwise_binary_scalar_mask),
-                        kernelName, globalThreads, localThreads,
-                        args, -1, -1, buildOptions.c_str());
-}
-
-void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
-{
-    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    dst.create(src.size(), src.type());
-    bitwise_unary_run(src, dst, "arithm_bitwise_not", &arithm_bitwise_not);
-}
-
-void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
-{
-    bitwise_binary_run(src1, src2, Scalar(), mask, dst, OR);
-}
-
-void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
-{
-    bitwise_binary_run(src1, oclMat(), src2, mask, dst, OR);
-}
-
-void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
-{
-    bitwise_binary_run(src1, src2, Scalar(), mask, dst, AND);
-}
-
-void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
-{
-    bitwise_binary_run(src1, oclMat(), src2, mask, dst, AND);
-}
-
-void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
-{
-    bitwise_binary_run(src1, src2, Scalar(), mask, dst, XOR);
-}
-
-void cv::ocl::bitwise_xor(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
-{
-    bitwise_binary_run(src1, oclMat(), src2, mask, dst, XOR);
-}
-
-oclMat cv::ocl::operator ~ (const oclMat &src)
-{
-    return oclMatExpr(src, oclMat(), MAT_NOT);
-}
-
-oclMat cv::ocl::operator | (const oclMat &src1, const oclMat &src2)
-{
-    return oclMatExpr(src1, src2, MAT_OR);
-}
-
-oclMat cv::ocl::operator & (const oclMat &src1, const oclMat &src2)
-{
-    return oclMatExpr(src1, src2, MAT_AND);
-}
-
-oclMat cv::ocl::operator ^ (const oclMat &src1, const oclMat &src2)
-{
-    return oclMatExpr(src1, src2, MAT_XOR);
-}
-
-cv::ocl::oclMatExpr cv::ocl::operator + (const oclMat &src1, const oclMat &src2)
-{
-    return oclMatExpr(src1, src2, cv::ocl::MAT_ADD);
-}
-
-cv::ocl::oclMatExpr cv::ocl::operator - (const oclMat &src1, const oclMat &src2)
-{
-    return oclMatExpr(src1, src2, cv::ocl::MAT_SUB);
-}
-
-cv::ocl::oclMatExpr cv::ocl::operator * (const oclMat &src1, const oclMat &src2)
-{
-    return oclMatExpr(src1, src2, cv::ocl::MAT_MUL);
-}
-
-cv::ocl::oclMatExpr cv::ocl::operator / (const oclMat &src1, const oclMat &src2)
-{
-    return oclMatExpr(src1, src2, cv::ocl::MAT_DIV);
-}
-
-void oclMatExpr::assign(oclMat& m) const
-{
-    switch (op)
-    {
-        case MAT_ADD:
-            add(a, b, m);
-            break;
-        case MAT_SUB:
-            subtract(a, b, m);
-            break;
-        case MAT_MUL:
-            multiply(a, b, m);
-            break;
-        case MAT_DIV:
-            divide(a, b, m);
-            break;
-        case MAT_NOT:
-            bitwise_not(a, m);
-            break;
-        case MAT_AND:
-            bitwise_and(a, b, m);
-            break;
-        case MAT_OR:
-            bitwise_or(a, b, m);
-            break;
-        case MAT_XOR:
-            bitwise_xor(a, b, m);
-            break;
-    }
-}
-
-oclMatExpr::operator oclMat() const
-{
-    oclMat m;
-    assign(m);
-    return m;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-/////////////////////////////// transpose ////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-#define TILE_DIM   (32)
-#define BLOCK_ROWS (256 / TILE_DIM)
-
-static void transpose_run(const oclMat &src, oclMat &dst, String kernelName, bool inplace = false)
-{
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    const char channelsString[] = { ' ', ' ', '2', '4', '4' };
-    std::string buildOptions = format("-D T=%s%c", typeMap[src.depth()],
-                                      channelsString[src.channels()]);
-
-    size_t localThreads[3]  = { TILE_DIM, BLOCK_ROWS, 1 };
-    size_t globalThreads[3] = { src.cols, inplace ? src.rows : divUp(src.rows, TILE_DIM) * BLOCK_ROWS, 1 };
-
-    int srcstep1 = src.step / src.elemSize(), dststep1 = dst.step / dst.elemSize();
-    int srcoffset1 = src.offset / src.elemSize(), dstoffset1 = dst.offset / dst.elemSize();
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcstep1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcoffset1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
-
-    openCLExecuteKernel(src.clCxt, &arithm_transpose, kernelName, globalThreads, localThreads,
-                        args, -1, -1, buildOptions.c_str());
-}
-
-void cv::ocl::transpose(const oclMat &src, oclMat &dst)
-{
-    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    if ( src.data == dst.data && src.cols == src.rows && dst.offset == src.offset
-         && dst.size() == src.size())
-        transpose_run( src, dst, "transpose_inplace", true);
-    else
-    {
-        dst.create(src.cols, src.rows, src.type());
-        transpose_run( src, dst, "transpose");
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-////////////////////////////// addWeighted ///////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst)
-{
-    Context *clCxt = src1.clCxt;
-    bool hasDouble = clCxt->supportsFeature(FEATURE_CL_DOUBLE);
-    if (!hasDouble && src1.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    CV_Assert(src1.size() ==  src2.size() && src1.type() == src2.type());
-    dst.create(src1.size(), src1.type());
-
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
-
-    int cols1 = src1.cols * channels;
-    int src1step1 = src1.step1(), src1offset1 = src1.offset / src1.elemSize1();
-    int src2step1 = src2.step1(), src2offset1 = src2.offset / src1.elemSize1();
-    int dststep1 = dst.step1(), dstoffset1 = dst.offset / dst.elemSize1();
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    std::string buildOptions = format("-D T=%s -D WT=%s -D convertToT=convert_%s%s",
-                                      typeMap[depth], hasDouble ? "double" : "float", typeMap[depth],
-                                      depth >= CV_32F ? "" : "_sat_rte");
-
-    size_t globalThreads[3] = { cols1, dst.rows, 1};
-
-    float alpha_f = static_cast<float>(alpha),
-            beta_f = static_cast<float>(beta),
-            gama_f = static_cast<float>(gama);
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1));
-
-    if (!hasDouble)
-    {
-        args.push_back( std::make_pair( sizeof(cl_float), (void *)&alpha_f ));
-        args.push_back( std::make_pair( sizeof(cl_float), (void *)&beta_f ));
-        args.push_back( std::make_pair( sizeof(cl_float), (void *)&gama_f ));
-    }
-    else
-    {
-        args.push_back( std::make_pair( sizeof(cl_double), (void *)&alpha ));
-        args.push_back( std::make_pair( sizeof(cl_double), (void *)&beta ));
-        args.push_back( std::make_pair( sizeof(cl_double), (void *)&gama ));
-    }
-
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
-
-#ifdef ANDROID
-    openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, NULL,
-                        args, -1, -1, buildOptions.c_str());
-#else
-    size_t localThreads[3] = { 256, 1, 1};
-    openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads,
-                        args, -1, -1, buildOptions.c_str());
-#endif
-}
-
-//////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////// Pow //////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-static void arithmetic_pow_run(const oclMat &src, double p, oclMat &dst, String kernelName, const cv::ocl::ProgramEntry* source)
-{
-    int channels = dst.oclchannels();
-    int depth = dst.depth();
-
-    size_t localThreads[3]  = { 64, 4, 1 };
-    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-
-    const char * const typeStr = depth == CV_32F ? "float" : "double";
-    const char * const channelMap[] = { "", "", "2", "4", "4" };
-    std::string buildOptions = format("-D VT=%s%s -D T=%s", typeStr, channelMap[channels], typeStr);
-
-    int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize();
-    int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
-
-    float pf = static_cast<float>(p);
-    if(src.depth() == CV_32F)
-        args.push_back( std::make_pair( sizeof(cl_float), (void *)&pf ));
-    else
-        args.push_back( std::make_pair( sizeof(cl_double), (void *)&p ));
-
-    openCLExecuteKernel(src.clCxt, source, kernelName, globalThreads, localThreads, args, -1, -1, buildOptions.c_str());
-}
-
-void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
-{
-    if (!x.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && x.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    CV_Assert(x.depth() == CV_32F || x.depth() == CV_64F);
-    y.create(x.size(), x.type());
-
-    arithmetic_pow_run(x, p, y, "arithm_pow", &arithm_pow);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-/////////////////////////////// setIdentity //////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-void cv::ocl::setIdentity(oclMat& src, const Scalar & scalar)
-{
-    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    CV_Assert(src.step % src.elemSize() == 0);
-
-    int src_step1 = src.step / src.elemSize(), src_offset1 = src.offset / src.elemSize();
-    size_t local_threads[] = { 16, 16, 1 };
-    size_t global_threads[] = { src.cols, src.rows, 1 };
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    const char * const channelMap[] = { "", "", "2", "4", "4" };
-    String buildOptions = format("-D T=%s%s", typeMap[src.depth()], channelMap[src.oclchannels()]);
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset1 ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows));
-
-    oclMat sc(1, 1, src.type(), scalar);
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sc.data ));
-
-    openCLExecuteKernel(src.clCxt, &arithm_setidentity, "setIdentity", global_threads, local_threads,
-                        args, -1, -1, buildOptions.c_str());
-}
-
-//////////////////////////////////////////////////////////////////////////////
-////////////////////////////////// Repeat ////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-
-void cv::ocl::repeat(const oclMat & src, int ny, int nx, oclMat & dst)
-{
-    CV_Assert(nx > 0 && ny > 0);
-    dst.create(src.rows * ny, src.cols * nx, src.type());
-
-    for (int y = 0; y < ny; ++y)
-        for (int x = 0; x < nx; ++x)
-        {
-            Rect roi(x * src.cols, y * src.rows, src.cols, src.rows);
-            oclMat hdr = dst(roi);
-            src.copyTo(hdr);
-        }
-}
diff --git a/modules/ocl/src/bgfg_mog.cpp b/modules/ocl/src/bgfg_mog.cpp
deleted file mode 100644
index c6883661b..000000000
--- a/modules/ocl/src/bgfg_mog.cpp
+++ /dev/null
@@ -1,639 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jin Ma, jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-namespace cv
-{
-    namespace ocl
-    {
-        typedef struct _contant_struct
-        {
-            cl_float c_Tb;
-            cl_float c_TB;
-            cl_float c_Tg;
-            cl_float c_varInit;
-            cl_float c_varMin;
-            cl_float c_varMax;
-            cl_float c_tau;
-            cl_uchar c_shadowVal;
-        }contant_struct;
-
-        cl_mem cl_constants = NULL;
-        float c_TB;
-    }
-}
-
-#if defined _MSC_VER
-#define snprintf sprintf_s
-#endif
-
-namespace cv { namespace ocl { namespace device
-{
-    namespace mog
-    {
-        void mog_ocl(const oclMat& frame, int cn, oclMat& fgmask, oclMat& weight, oclMat& sortKey, oclMat& mean, oclMat& var,
-            int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma);
-
-        void getBackgroundImage_ocl(int cn, const oclMat& weight, const oclMat& mean, oclMat& dst, int nmixtures, float backgroundRatio);
-
-        void loadConstants(float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau,
-                            unsigned char shadowVal);
-
-        void mog2_ocl(const oclMat& frame, int cn, oclMat& fgmask, oclMat& modesUsed, oclMat& weight, oclMat& variance, oclMat& mean,
-                      float alphaT, float prune, bool detectShadows, int nmixtures);
-
-        void getBackgroundImage2_ocl(int cn, const oclMat& modesUsed, const oclMat& weight, const oclMat& mean, oclMat& dst, int nmixtures);
-    }
-}}}
-
-namespace mog
-{
-    const int defaultNMixtures = 5;
-    const int defaultHistory = 200;
-    const float defaultBackgroundRatio = 0.7f;
-    const float defaultVarThreshold = 2.5f * 2.5f;
-    const float defaultNoiseSigma = 30.0f * 0.5f;
-    const float defaultInitialWeight = 0.05f;
-}
-void cv::ocl::BackgroundSubtractor::operator()(const oclMat&, oclMat&, float)
-{
-
-}
-cv::ocl::BackgroundSubtractor::~BackgroundSubtractor()
-{
-
-}
-
-cv::ocl::MOG::MOG(int nmixtures) :
-frameSize_(0, 0), frameType_(0), nframes_(0)
-{
-    nmixtures_ = std::min(nmixtures > 0 ? nmixtures : mog::defaultNMixtures, 8);
-    history = mog::defaultHistory;
-    varThreshold = mog::defaultVarThreshold;
-    backgroundRatio = mog::defaultBackgroundRatio;
-    noiseSigma = mog::defaultNoiseSigma;
-}
-
-void cv::ocl::MOG::initialize(cv::Size frameSize, int frameType)
-{
-    CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4);
-
-    frameSize_ = frameSize;
-    frameType_ = frameType;
-
-    int ch = CV_MAT_CN(frameType);
-    int work_ch = ch;
-
-    // for each gaussian mixture of each pixel bg model we store
-    // the mixture sort key (w/sum_of_variances), the mixture weight (w),
-    // the mean (nchannels values) and
-    // the diagonal covariance matrix (another nchannels values)
-
-    weight_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
-    sortKey_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
-    mean_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch));
-    var_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch));
-
-    weight_.setTo(cv::Scalar::all(0));
-    sortKey_.setTo(cv::Scalar::all(0));
-    mean_.setTo(cv::Scalar::all(0));
-    var_.setTo(cv::Scalar::all(0));
-
-    nframes_ = 0;
-}
-
-void cv::ocl::MOG::operator()(const cv::ocl::oclMat& frame, cv::ocl::oclMat& fgmask, float learningRate)
-{
-    using namespace cv::ocl::device::mog;
-
-    CV_Assert(frame.depth() == CV_8U);
-
-    int ch = frame.oclchannels();
-    int work_ch = ch;
-
-    if (nframes_ == 0 || learningRate >= 1.0 || frame.size() != frameSize_ || work_ch != mean_.oclchannels())
-        initialize(frame.size(), frame.type());
-
-    fgmask.create(frameSize_, CV_8UC1);
-
-    ++nframes_;
-    learningRate = learningRate >= 0.0f && nframes_ > 1 ? learningRate : 1.0f / std::min(nframes_, history);
-    CV_Assert(learningRate >= 0.0f);
-
-    mog_ocl(frame, ch, fgmask, weight_, sortKey_, mean_, var_, nmixtures_,
-        varThreshold, learningRate, backgroundRatio, noiseSigma);
-}
-
-void cv::ocl::MOG::getBackgroundImage(oclMat& backgroundImage) const
-{
-    using namespace cv::ocl::device::mog;
-
-    backgroundImage.create(frameSize_, frameType_);
-
-    cv::ocl::device::mog::getBackgroundImage_ocl(backgroundImage.oclchannels(), weight_, mean_, backgroundImage, nmixtures_, backgroundRatio);
-}
-
-void cv::ocl::MOG::release()
-{
-    frameSize_ = Size(0, 0);
-    frameType_ = 0;
-    nframes_ = 0;
-
-    weight_.release();
-    sortKey_.release();
-    mean_.release();
-    var_.release();
-    clReleaseMemObject(cl_constants);
-}
-
-static void mog_withoutLearning(const oclMat& frame, int cn, oclMat& fgmask, oclMat& weight, oclMat& mean, oclMat& var,
-    int nmixtures, float varThreshold, float backgroundRatio)
-{
-    Context* clCxt = Context::getContext();
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {frame.cols, frame.rows, 1};
-
-    int frame_step = (int)(frame.step/frame.elemSize());
-    int fgmask_step = (int)(fgmask.step/fgmask.elemSize());
-    int weight_step = (int)(weight.step/weight.elemSize());
-    int mean_step = (int)(mean.step/mean.elemSize());
-    int var_step = (int)(var.step/var.elemSize());
-
-    int fgmask_offset_y = (int)(fgmask.offset/fgmask.step);
-    int fgmask_offset_x = (int)(fgmask.offset%fgmask.step);
-    fgmask_offset_x = fgmask_offset_x/(int)fgmask.elemSize();
-
-    int frame_offset_y = (int)(frame.offset/frame.step);
-    int frame_offset_x = (int)(frame.offset%frame.step);
-    frame_offset_x = frame_offset_x/(int)frame.elemSize();
-
-    char build_option[50];
-    if(cn == 1)
-    {
-        snprintf(build_option, 50, "-D CN1 -D NMIXTURES=%d", nmixtures);
-    }else
-    {
-        snprintf(build_option, 50, "-D NMIXTURES=%d", nmixtures);
-    }
-
-    String kernel_name = "mog_withoutLearning_kernel";
-    std::vector<std::pair<size_t, const void*> > args;
-
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&frame.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&fgmask.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&weight.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&mean.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&var.data));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame.cols));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&fgmask_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&weight_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&mean_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&var_step));
-
-    args.push_back(std::make_pair(sizeof(cl_float), (void*)&varThreshold));
-    args.push_back(std::make_pair(sizeof(cl_float), (void*)&backgroundRatio));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&fgmask_offset_x));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&fgmask_offset_y));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame_offset_x));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame_offset_y));
-
-    openCLExecuteKernel(clCxt, &bgfg_mog, kernel_name, global_thread, local_thread, args, -1, -1, build_option);
-}
-
-
-static void mog_withLearning(const oclMat& frame, int cn, oclMat& fgmask_raw, oclMat& weight, oclMat& sortKey, oclMat& mean, oclMat& var,
-    int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar)
-{
-    Context* clCxt = Context::getContext();
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {frame.cols, frame.rows, 1};
-
-    oclMat fgmask(fgmask_raw.size(), CV_32SC1);
-
-    int frame_step = (int)(frame.step/frame.elemSize());
-    int fgmask_step = (int)(fgmask.step/fgmask.elemSize());
-    int weight_step = (int)(weight.step/weight.elemSize());
-    int sortKey_step = (int)(sortKey.step/sortKey.elemSize());
-    int mean_step = (int)(mean.step/mean.elemSize());
-    int var_step = (int)(var.step/var.elemSize());
-
-    int fgmask_offset_y = (int)(fgmask.offset/fgmask.step);
-    int fgmask_offset_x = (int)(fgmask.offset%fgmask.step);
-    fgmask_offset_x = fgmask_offset_x/(int)fgmask.elemSize();
-
-    int frame_offset_y = (int)(frame.offset/frame.step);
-    int frame_offset_x = (int)(frame.offset%frame.step);
-    frame_offset_x = frame_offset_x/(int)frame.elemSize();
-
-    char build_option[50];
-    if(cn == 1)
-    {
-        snprintf(build_option, 50, "-D CN1 -D NMIXTURES=%d", nmixtures);
-    }else
-    {
-        snprintf(build_option, 50, "-D NMIXTURES=%d", nmixtures);
-    }
-
-    String kernel_name = "mog_withLearning_kernel";
-    std::vector<std::pair<size_t, const void*> > args;
-
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&frame.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&fgmask.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&weight.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&sortKey.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&mean.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&var.data));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame.cols));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&fgmask_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&weight_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&sortKey_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&mean_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&var_step));
-
-    args.push_back(std::make_pair(sizeof(cl_float), (void*)&varThreshold));
-    args.push_back(std::make_pair(sizeof(cl_float), (void*)&backgroundRatio));
-    args.push_back(std::make_pair(sizeof(cl_float), (void*)&learningRate));
-    args.push_back(std::make_pair(sizeof(cl_float), (void*)&minVar));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&fgmask_offset_x));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&fgmask_offset_y));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame_offset_x));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame_offset_y));
-
-    openCLExecuteKernel(clCxt, &bgfg_mog, kernel_name, global_thread, local_thread, args, -1, -1, build_option);
-    fgmask.convertTo(fgmask, CV_8U);
-    fgmask.copyTo(fgmask_raw);
-}
-
-void cv::ocl::device::mog::mog_ocl(const oclMat& frame, int cn, oclMat& fgmask, oclMat& weight, oclMat& sortKey, oclMat& mean, oclMat& var,
-    int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma)
-{
-    const float minVar = noiseSigma * noiseSigma;
-
-    if(learningRate > 0.0f)
-        mog_withLearning(frame, cn, fgmask, weight, sortKey, mean, var, nmixtures,
-                         varThreshold, backgroundRatio, learningRate, minVar);
-    else
-        mog_withoutLearning(frame, cn, fgmask, weight, mean, var, nmixtures, varThreshold, backgroundRatio);
-}
-
-void cv::ocl::device::mog::getBackgroundImage_ocl(int cn, const oclMat& weight, const oclMat& mean, oclMat& dst, int nmixtures, float backgroundRatio)
-{
-    Context* clCxt = Context::getContext();
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {dst.cols, dst.rows, 1};
-
-    int weight_step = (int)(weight.step/weight.elemSize());
-    int mean_step = (int)(mean.step/mean.elemSize());
-    int dst_step = (int)(dst.step/dst.elemSize());
-
-    char build_option[50];
-    if(cn == 1)
-    {
-        snprintf(build_option, 50, "-D CN1 -D NMIXTURES=%d", nmixtures);
-    }else
-    {
-        snprintf(build_option, 50, "-D NMIXTURES=%d", nmixtures);
-    }
-
-    String kernel_name = "getBackgroundImage_kernel";
-    std::vector<std::pair<size_t, const void*> > args;
-
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&weight.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&mean.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&dst.data));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&dst.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&dst.cols));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&weight_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&mean_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&dst_step));
-
-    args.push_back(std::make_pair(sizeof(cl_float), (void*)&backgroundRatio));
-
-    openCLExecuteKernel(clCxt, &bgfg_mog, kernel_name, global_thread, local_thread, args, -1, -1, build_option);
-}
-
-void cv::ocl::device::mog::loadConstants(float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal)
-{
-    varMin = cv::min(varMin, varMax);
-    varMax = cv::max(varMin, varMax);
-
-    c_TB = TB;
-
-    _contant_struct *constants = new _contant_struct;
-    constants->c_Tb = Tb;
-    constants->c_TB = TB;
-    constants->c_Tg = Tg;
-    constants->c_varInit = varInit;
-    constants->c_varMin = varMin;
-    constants->c_varMax = varMax;
-    constants->c_tau = tau;
-    constants->c_shadowVal = shadowVal;
-
-    cl_constants = load_constant(*((cl_context*)getClContextPtr()), *((cl_command_queue*)getClCommandQueuePtr()),
-        (void *)constants, sizeof(_contant_struct));
-}
-
-void cv::ocl::device::mog::mog2_ocl(const oclMat& frame, int cn, oclMat& fgmaskRaw, oclMat& modesUsed, oclMat& weight, oclMat& variance,
-                                oclMat& mean, float alphaT, float prune, bool detectShadows, int nmixtures)
-{
-    oclMat fgmask(fgmaskRaw.size(), CV_32SC1);
-
-    Context* clCxt = Context::getContext();
-
-    const float alpha1 = 1.0f - alphaT;
-
-    cl_int detectShadows_flag = 0;
-    if(detectShadows)
-        detectShadows_flag = 1;
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {frame.cols, frame.rows, 1};
-
-    int frame_step = (int)(frame.step/frame.elemSize());
-    int fgmask_step = (int)(fgmask.step/fgmask.elemSize());
-    int weight_step = (int)(weight.step/weight.elemSize());
-    int modesUsed_step = (int)(modesUsed.step/modesUsed.elemSize());
-    int mean_step = (int)(mean.step/mean.elemSize());
-    int var_step = (int)(variance.step/variance.elemSize());
-
-    int fgmask_offset_y = (int)(fgmask.offset/fgmask.step);
-    int fgmask_offset_x = (int)(fgmask.offset%fgmask.step);
-    fgmask_offset_x = fgmask_offset_x/(int)fgmask.elemSize();
-
-    int frame_offset_y = (int)(frame.offset/frame.step);
-    int frame_offset_x = (int)(frame.offset%frame.step);
-    frame_offset_x = frame_offset_x/(int)frame.elemSize();
-
-    String kernel_name = "mog2_kernel";
-    std::vector<std::pair<size_t, const void*> > args;
-
-    char build_option[50];
-    if(cn == 1)
-    {
-        snprintf(build_option, 50, "-D CN1 -D NMIXTURES=%d", nmixtures);
-    }else
-    {
-        snprintf(build_option, 50, "-D NMIXTURES=%d", nmixtures);
-    }
-
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&frame.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&fgmask.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&weight.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&mean.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&modesUsed.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&variance.data));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame.cols));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&fgmask_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&weight_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&mean_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&modesUsed_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&var_step));
-
-    args.push_back(std::make_pair(sizeof(cl_float), (void*)&alphaT));
-    args.push_back(std::make_pair(sizeof(cl_float), (void*)&alpha1));
-    args.push_back(std::make_pair(sizeof(cl_float), (void*)&prune));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&detectShadows_flag));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&fgmask_offset_x));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&fgmask_offset_y));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame_offset_x));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&frame_offset_y));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&cl_constants));
-
-    openCLExecuteKernel(clCxt, &bgfg_mog, kernel_name, global_thread, local_thread, args, -1, -1, build_option);
-
-    fgmask.convertTo(fgmask, CV_8U);
-    fgmask.copyTo(fgmaskRaw);
-}
-
-void cv::ocl::device::mog::getBackgroundImage2_ocl(int cn, const oclMat& modesUsed, const oclMat& weight, const oclMat& mean, oclMat& dst, int nmixtures)
-{
-    Context* clCxt = Context::getContext();
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {modesUsed.cols, modesUsed.rows, 1};
-
-    int weight_step = (int)(weight.step/weight.elemSize());
-    int modesUsed_step = (int)(modesUsed.step/modesUsed.elemSize());
-    int mean_step = (int)(mean.step/mean.elemSize());
-    int dst_step = (int)(dst.step/dst.elemSize());
-
-    int dst_y = (int)(dst.offset/dst.step);
-    int dst_x = (int)(dst.offset%dst.step);
-    dst_x = dst_x/(int)dst.elemSize();
-
-    String kernel_name = "getBackgroundImage2_kernel";
-    std::vector<std::pair<size_t, const void*> > args;
-
-    char build_option[50];
-    if(cn == 1)
-    {
-        snprintf(build_option, 50, "-D CN1 -D NMIXTURES=%d", nmixtures);
-    }else
-    {
-        snprintf(build_option, 50, "-D NMIXTURES=%d", nmixtures);
-    }
-
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&modesUsed.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&weight.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&mean.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&dst.data));
-    args.push_back(std::make_pair(sizeof(cl_float), (void*)&c_TB));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&modesUsed.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&modesUsed.cols));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&modesUsed_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&weight_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&mean_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&dst_step));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&dst_x));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&dst_y));
-
-    openCLExecuteKernel(clCxt, &bgfg_mog, kernel_name, global_thread, local_thread, args, -1, -1, build_option);
-}
-
-/////////////////////////////////////////////////////////////////
-// MOG2
-
-namespace mog2
-{
-    // default parameters of gaussian background detection algorithm
-    const int defaultHistory = 500; // Learning rate; alpha = 1/defaultHistory2
-    const float defaultVarThreshold = 4.0f * 4.0f;
-    const int defaultNMixtures = 5; // maximal number of Gaussians in mixture
-    const float defaultBackgroundRatio = 0.9f; // threshold sum of weights for background test
-    const float defaultVarThresholdGen = 3.0f * 3.0f;
-    const float defaultVarInit = 15.0f; // initial variance for new components
-    const float defaultVarMax = 5.0f * defaultVarInit;
-    const float defaultVarMin = 4.0f;
-
-    // additional parameters
-    const float defaultfCT = 0.05f; // complexity reduction prior constant 0 - no reduction of number of components
-    const unsigned char defaultnShadowDetection = 127; // value to use in the segmentation mask for shadows, set 0 not to do shadow detection
-    const float defaultfTau = 0.5f; // Tau - shadow threshold, see the paper for explanation
-}
-
-cv::ocl::MOG2::MOG2(int nmixtures) : frameSize_(0, 0), frameType_(0), nframes_(0)
-{
-    nmixtures_ = nmixtures > 0 ? nmixtures : mog2::defaultNMixtures;
-
-    history = mog2::defaultHistory;
-    varThreshold = mog2::defaultVarThreshold;
-    bShadowDetection = true;
-
-    backgroundRatio = mog2::defaultBackgroundRatio;
-    fVarInit = mog2::defaultVarInit;
-    fVarMax  = mog2::defaultVarMax;
-    fVarMin = mog2::defaultVarMin;
-
-    varThresholdGen = mog2::defaultVarThresholdGen;
-    fCT = mog2::defaultfCT;
-    nShadowDetection =  mog2::defaultnShadowDetection;
-    fTau = mog2::defaultfTau;
-}
-
-void cv::ocl::MOG2::initialize(cv::Size frameSize, int frameType)
-{
-    using namespace cv::ocl::device::mog;
-    CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4);
-
-    frameSize_ = frameSize;
-    frameType_ = frameType;
-    nframes_ = 0;
-
-    int ch = CV_MAT_CN(frameType);
-    int work_ch = ch;
-
-    // for each gaussian mixture of each pixel bg model we store ...
-    // the mixture weight (w),
-    // the mean (nchannels values) and
-    // the covariance
-    weight_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
-    weight_.setTo(Scalar::all(0));
-
-    variance_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
-    variance_.setTo(Scalar::all(0));
-
-    mean_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch)); //4 channels
-    mean_.setTo(Scalar::all(0));
-
-    //make the array for keeping track of the used modes per pixel - all zeros at start
-    bgmodelUsedModes_.create(frameSize_, CV_32FC1);
-    bgmodelUsedModes_.setTo(cv::Scalar::all(0));
-
-    loadConstants(varThreshold, backgroundRatio, varThresholdGen, fVarInit, fVarMin, fVarMax, fTau, nShadowDetection);
-}
-
-void cv::ocl::MOG2::operator()(const oclMat& frame, oclMat& fgmask, float learningRate)
-{
-    using namespace cv::ocl::device::mog;
-
-    int ch = frame.oclchannels();
-    int work_ch = ch;
-
-    if (nframes_ == 0 || learningRate >= 1.0f || frame.size() != frameSize_ || work_ch != mean_.oclchannels())
-        initialize(frame.size(), frame.type());
-
-    fgmask.create(frameSize_, CV_8UC1);
-    fgmask.setTo(cv::Scalar::all(0));
-
-    ++nframes_;
-    learningRate = learningRate >= 0.0f && nframes_ > 1 ? learningRate : 1.0f / std::min(2 * nframes_, history);
-    CV_Assert(learningRate >= 0.0f);
-
-    mog2_ocl(frame, frame.oclchannels(), fgmask, bgmodelUsedModes_, weight_, variance_, mean_, learningRate, -learningRate * fCT, bShadowDetection, nmixtures_);
-}
-
-void cv::ocl::MOG2::getBackgroundImage(oclMat& backgroundImage) const
-{
-    using namespace cv::ocl::device::mog;
-
-    backgroundImage.create(frameSize_, frameType_);
-
-    cv::ocl::device::mog::getBackgroundImage2_ocl(backgroundImage.oclchannels(), bgmodelUsedModes_, weight_, mean_, backgroundImage, nmixtures_);
-}
-
-void cv::ocl::MOG2::release()
-{
-    frameSize_ = Size(0, 0);
-    frameType_ = 0;
-    nframes_ = 0;
-
-    weight_.release();
-    variance_.release();
-    mean_.release();
-
-    bgmodelUsedModes_.release();
-}
diff --git a/modules/ocl/src/blend.cpp b/modules/ocl/src/blend.cpp
deleted file mode 100644
index 39f09c47b..000000000
--- a/modules/ocl/src/blend.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Nathan, liujun@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-void cv::ocl::blendLinear(const oclMat &src1, const oclMat &src2, const oclMat &weights1, const oclMat &weights2,
-                          oclMat &dst)
-{
-    CV_Assert(src1.depth() <= CV_32F);
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-    CV_Assert(weights1.size() == weights2.size() && weights1.size() == src1.size() &&
-              weights1.type() == CV_32FC1 && weights2.type() == CV_32FC1);
-
-    dst.create(src1.size(), src1.type());
-
-    size_t globalSize[] = { dst.cols, dst.rows, 1};
-    size_t localSize[] = { 16, 16, 1 };
-
-    int depth = dst.depth(), ocn = dst.oclchannels();
-    int src1_step = src1.step / src1.elemSize(), src1_offset = src1.offset / src1.elemSize();
-    int src2_step = src2.step / src2.elemSize(), src2_offset = src2.offset / src2.elemSize();
-    int weight1_step = weights1.step / weights1.elemSize(), weight1_offset = weights1.offset / weights1.elemSize();
-    int weight2_step = weights2.step / weights2.elemSize(), weight2_offset = weights2.offset / weights2.elemSize();
-    int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
-
-    const char * const channelMap[] = { "", "", "2", "4", "4" };
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    std::string buildOptions = format("-D T=%s%s -D convertToT=convert_%s%s%s -D FT=float%s -D convertToFT=convert_float%s",
-                                      typeMap[depth], channelMap[ocn], typeMap[depth], channelMap[ocn],
-                                      depth >= CV_32S ? "" : "_sat_rte", channelMap[ocn], channelMap[ocn]);
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights1.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight1_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight1_step ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights2.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight2_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight2_step ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
-
-    openCLExecuteKernel(src1.clCxt, &blend_linear, "blendLinear", globalSize, localSize, args,
-                        -1, -1, buildOptions.c_str());
-}
diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp
deleted file mode 100644
index ca16f4346..000000000
--- a/modules/ocl/src/brute_force_matcher.cpp
+++ /dev/null
@@ -1,1213 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Nathan, liujun@multicorewareinc.com
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include <functional>
-#include <iterator>
-#include <vector>
-#include <algorithm>
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-static const int OPT_SIZE = 100;
-
-static const char * T_ARR [] = {
-    "uchar",
-    "char",
-    "ushort",
-    "short",
-    "int",
-    "float -D T_FLOAT",
-    "double"};
-
-template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
-void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
-                         const oclMat &trainIdx, const oclMat &distance, int distType)
-{
-    cv::ocl::Context *ctx = query.clCxt;
-    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-    int block_size = BLOCK_SIZE;
-    int m_size = MAX_DESC_LEN;
-    std::vector< std::pair<size_t, const void *> > args;
-
-    char opt [OPT_SIZE] = "";
-    sprintf(opt,
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
-        T_ARR[query.depth()], distType, block_size, m_size);
-
-    if(globalSize[0] != 0)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
-        args.push_back( std::make_pair( smemSize, (void *)NULL));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
-
-        String kernelName = "BruteForceMatch_UnrollMatch";
-
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
-    }
-}
-
-template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
-void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/,
-                         const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/)
-{
-}
-
-template < int BLOCK_SIZE/*, typename Mask*/ >
-void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
-           const oclMat &trainIdx, const oclMat &distance, int distType)
-{
-    cv::ocl::Context *ctx = query.clCxt;
-    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-    int block_size = BLOCK_SIZE;
-    std::vector< std::pair<size_t, const void *> > args;
-
-    char opt [OPT_SIZE] = "";
-    sprintf(opt,
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
-        T_ARR[query.depth()], distType, block_size);
-    if(globalSize[0] != 0)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
-        args.push_back( std::make_pair( smemSize, (void *)NULL));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
-
-        String kernelName = "BruteForceMatch_Match";
-
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
-    }
-}
-
-template < int BLOCK_SIZE/*, typename Mask*/ >
-void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/,
-           const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/)
-{
-}
-
-//radius_matchUnrolledCached
-template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
-void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
-                         const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
-{
-    cv::ocl::Context *ctx = query.clCxt;
-    size_t globalSize[] = {(train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-    int block_size = BLOCK_SIZE;
-    int m_size = MAX_DESC_LEN;
-    std::vector< std::pair<size_t, const void *> > args;
-
-    char opt [OPT_SIZE] = "";
-    sprintf(opt,
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
-        T_ARR[query.depth()], distType, block_size, m_size);
-
-    if(globalSize[0] != 0)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( std::make_pair( sizeof(cl_float), (void *)&maxDistance ));
-        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
-        args.push_back( std::make_pair( smemSize, (void *)NULL));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&trainIdx.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&trainIdx.step ));
-
-        String kernelName = "BruteForceMatch_RadiusUnrollMatch";
-
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
-    }
-}
-
-//radius_match
-template < int BLOCK_SIZE/*, typename Mask*/ >
-void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
-                  const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
-{
-    cv::ocl::Context *ctx = query.clCxt;
-    size_t globalSize[] = {(train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-    int block_size = BLOCK_SIZE;
-    std::vector< std::pair<size_t, const void *> > args;
-
-    char opt [OPT_SIZE] = "";
-    sprintf(opt,
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
-        T_ARR[query.depth()], distType, block_size);
-
-    if(globalSize[0] != 0)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        args.push_back( std::make_pair( sizeof(cl_float), (void *)&maxDistance ));
-        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
-        args.push_back( std::make_pair( smemSize, (void *)NULL));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&trainIdx.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&trainIdx.step ));
-
-        String kernelName = "BruteForceMatch_RadiusMatch";
-
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
-    }
-}
-
-static void matchDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance, int distType)
-{
-    const oclMat zeroMask;
-    const oclMat &tempMask = mask.data ? mask : zeroMask;
-    bool is_cpu = isCpuDevice();
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64>(query, train, tempMask, trainIdx, distance, distType);
-    }
-    else if (query.cols <= 128 && !is_cpu)
-    {
-        matchUnrolledCached<16, 128>(query, train, tempMask, trainIdx,  distance, distType);
-    }
-    else
-    {
-        match<16>(query, train, tempMask, trainIdx, distance, distType);
-    }
-}
-
-static void matchDispatcher(const oclMat &query, const oclMat *trains, int n, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, int distType)
-{
-    const oclMat zeroMask;
-    const oclMat &tempMask = mask.data ? mask : zeroMask;
-    bool is_cpu = isCpuDevice();
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
-    }
-    else if (query.cols <= 128 && !is_cpu)
-    {
-        matchUnrolledCached<16, 128>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
-    }
-    else
-    {
-        match<16>(query, trains, n, tempMask, trainIdx, imgIdx, distance, distType);
-    }
-}
-
-//radius matchDispatcher
-static void matchDispatcher(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
-                     const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
-{
-    const oclMat zeroMask;
-    const oclMat &tempMask = mask.data ? mask : zeroMask;
-    bool is_cpu = isCpuDevice();
-    if (query.cols <= 64)
-    {
-        matchUnrolledCached<16, 64>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
-    }
-    else if (query.cols <= 128 && !is_cpu)
-    {
-        matchUnrolledCached<16, 128>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
-    }
-    else
-    {
-        radius_match<16>(query, train, maxDistance, tempMask, trainIdx, distance, nMatches, distType);
-    }
-}
-
-//knn match Dispatcher
-template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
-void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
-                             const oclMat &trainIdx, const oclMat &distance, int distType)
-{
-    cv::ocl::Context *ctx = query.clCxt;
-    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-    int block_size = BLOCK_SIZE;
-    int m_size = MAX_DESC_LEN;
-    std::vector< std::pair<size_t, const void *> > args;
-
-    char opt [OPT_SIZE] = "";
-    sprintf(opt,
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
-        T_ARR[query.depth()], distType, block_size, m_size);
-
-    if(globalSize[0] != 0)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
-        args.push_back( std::make_pair( smemSize, (void *)NULL));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
-
-        String kernelName = "BruteForceMatch_knnUnrollMatch";
-
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
-    }
-}
-
-template < int BLOCK_SIZE/*, typename Mask*/ >
-void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
-               const oclMat &trainIdx, const oclMat &distance, int distType)
-{
-    cv::ocl::Context *ctx = query.clCxt;
-    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-    int block_size = BLOCK_SIZE;
-    std::vector< std::pair<size_t, const void *> > args;
-
-    char opt [OPT_SIZE] = "";
-    sprintf(opt,
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
-        T_ARR[query.depth()], distType, block_size);
-
-    if(globalSize[0] != 0)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
-        args.push_back( std::make_pair( smemSize, (void *)NULL));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
-
-        String kernelName = "BruteForceMatch_knnMatch";
-
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
-    }
-}
-
-template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
-void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
-{
-    cv::ocl::Context *ctx = query.clCxt;
-    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-    int block_size = BLOCK_SIZE;
-    int m_size = MAX_DESC_LEN;
-    std::vector< std::pair<size_t, const void *> > args;
-
-    char opt [OPT_SIZE] = "";
-    sprintf(opt,
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
-        T_ARR[query.depth()], distType, block_size, m_size);
-
-    if(globalSize[0] != 0)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&allDist.data ));
-        args.push_back( std::make_pair( smemSize, (void *)NULL));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&m_size ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
-
-        String kernelName = "BruteForceMatch_calcDistanceUnrolled";
-
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
-    }
-}
-
-template < int BLOCK_SIZE/*, typename Mask*/ >
-void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
-{
-    cv::ocl::Context *ctx = query.clCxt;
-    size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
-    size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-    int block_size = BLOCK_SIZE;
-    std::vector< std::pair<size_t, const void *> > args;
-
-    char opt [OPT_SIZE] = "";
-    sprintf(opt,
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
-        T_ARR[query.depth()], distType, block_size);
-
-    if(globalSize[0] != 0)
-    {
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
-        //args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&allDist.data ));
-        args.push_back( std::make_pair( smemSize, (void *)NULL));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
-
-        String kernelName = "BruteForceMatch_calcDistance";
-
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////////
-// Calc Distance dispatcher
-static void calcDistanceDispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
-                            const oclMat &allDist, int distType)
-{
-    if (query.cols <= 64)
-    {
-        calcDistanceUnrolled<16, 64>(query, train, mask, allDist, distType);
-    }
-    else if (query.cols <= 128)
-    {
-        calcDistanceUnrolled<16, 128>(query, train, mask, allDist, distType);
-    }
-    else
-    {
-        calcDistance<16>(query, train, mask, allDist, distType);
-    }
-}
-
-static void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &mask,
-                      const oclMat &trainIdx, const oclMat &distance, int distType)
-{
-    bool is_cpu = isCpuDevice();
-    if (query.cols <= 64)
-    {
-        knn_matchUnrolledCached<16, 64>(query, train, mask, trainIdx, distance, distType);
-    }
-    else if (query.cols <= 128 && !is_cpu)
-    {
-        knn_matchUnrolledCached<16, 128>(query, train, mask, trainIdx, distance, distType);
-    }
-    else
-    {
-        knn_match<16>(query, train, mask, trainIdx, distance, distType);
-    }
-}
-
-template <int BLOCK_SIZE>
-void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int /*distType*/)
-{
-    cv::ocl::Context *ctx = trainIdx.clCxt;
-    size_t globalSize[] = {trainIdx.rows * BLOCK_SIZE, 1, 1};
-    size_t localSize[] = {BLOCK_SIZE, 1, 1};
-    int block_size = BLOCK_SIZE;
-    String kernelName = "BruteForceMatch_findBestMatch";
-
-    for (int i = 0; i < k; ++i)
-    {
-        std::vector< std::pair<size_t, const void *> > args;
-
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&allDist.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&i));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
-        //args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
-        //args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
-        //args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
-
-        openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
-    }
-}
-
-static void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
-{
-    findKnnMatch<256>(k, trainIdx, distance, allDist, distType);
-}
-
-static void kmatchDispatcher(const oclMat &query, const oclMat &train, int k, const oclMat &mask,
-                      const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
-{
-    const oclMat zeroMask;
-    const oclMat &tempMask = mask.data ? mask : zeroMask;
-    if (k == 2)
-    {
-        match2Dispatcher(query, train, tempMask, trainIdx, distance, distType);
-    }
-    else
-    {
-        calcDistanceDispatcher(query, train, tempMask, allDist, distType);
-        findKnnMatchDispatcher(k, trainIdx, distance, allDist, distType);
-    }
-}
-
-cv::ocl::BruteForceMatcher_OCL_base::BruteForceMatcher_OCL_base(DistType distType_) : distType(distType_)
-{
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::add(const std::vector<oclMat> &descCollection)
-{
-    trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end());
-}
-
-const std::vector<oclMat> &cv::ocl::BruteForceMatcher_OCL_base::getTrainDescriptors() const
-{
-    return trainDescCollection;
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::clear()
-{
-    trainDescCollection.clear();
-}
-
-bool cv::ocl::BruteForceMatcher_OCL_base::empty() const
-{
-    return trainDescCollection.empty();
-}
-
-bool cv::ocl::BruteForceMatcher_OCL_base::isMaskSupported() const
-{
-    return true;
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::matchSingle(const oclMat &query, const oclMat &train,
-        oclMat &trainIdx, oclMat &distance, const oclMat &mask)
-{
-    if (query.empty() || train.empty())
-        return;
-
-    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    CV_Assert(train.cols == query.cols && train.type() == query.type());
-
-    ensureSizeIsEnough(1, query.rows, CV_32S, trainIdx);
-    ensureSizeIsEnough(1, query.rows, CV_32F, distance);
-
-    matchDispatcher(query, train, mask, trainIdx, distance, distType);
-
-    return;
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::matchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector<DMatch> &matches)
-{
-    if (trainIdx.empty() || distance.empty())
-        return;
-
-    Mat trainIdxCPU(trainIdx);
-    Mat distanceCPU(distance);
-
-    matchConvert(trainIdxCPU, distanceCPU, matches);
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat &trainIdx, const Mat &distance, std::vector<DMatch> &matches)
-{
-    if (trainIdx.empty() || distance.empty())
-        return;
-
-    CV_Assert(trainIdx.type() == CV_32SC1);
-    CV_Assert(distance.type() == CV_32FC1 && distance.cols == trainIdx.cols);
-
-    const int nQuery = trainIdx.cols;
-
-    matches.clear();
-    matches.reserve(nQuery);
-
-    const int *trainIdx_ptr = trainIdx.ptr<int>();
-    const float *distance_ptr =  distance.ptr<float>();
-    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
-    {
-        int trainIdx = *trainIdx_ptr;
-
-        if (trainIdx == -1)
-            continue;
-
-        float distance = *distance_ptr;
-
-        DMatch m(queryIdx, trainIdx, 0, distance);
-
-        matches.push_back(m);
-    }
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, const oclMat &train, std::vector<DMatch> &matches, const oclMat &mask)
-{
-    CV_Assert(mask.empty()); // mask is not supported at the moment
-    oclMat trainIdx, distance;
-    matchSingle(query, train, trainIdx, distance, mask);
-    matchDownload(trainIdx, distance, matches);
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::makeGpuCollection(oclMat &trainCollection, oclMat &maskCollection, const std::vector<oclMat> &masks)
-{
-
-    if (empty())
-        return;
-
-    if (masks.empty())
-    {
-        Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(oclMat)));
-
-        oclMat *trainCollectionCPU_ptr = trainCollectionCPU.ptr<oclMat>();
-
-        for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr)
-            *trainCollectionCPU_ptr = trainDescCollection[i];
-
-        trainCollection.upload(trainCollectionCPU);
-        maskCollection.release();
-    }
-    else
-    {
-        CV_Assert(masks.size() == trainDescCollection.size());
-
-        Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(oclMat)));
-        Mat maskCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(oclMat)));
-
-        oclMat *trainCollectionCPU_ptr = trainCollectionCPU.ptr<oclMat>();
-        oclMat *maskCollectionCPU_ptr = maskCollectionCPU.ptr<oclMat>();
-
-        for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr, ++maskCollectionCPU_ptr)
-        {
-            const oclMat &train = trainDescCollection[i];
-            const oclMat &mask = masks[i];
-
-            CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.cols == train.rows));
-
-            *trainCollectionCPU_ptr = train;
-            *maskCollectionCPU_ptr = mask;
-        }
-
-        trainCollection.upload(trainCollectionCPU);
-        maskCollection.upload(maskCollectionCPU);
-    }
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, const oclMat &trainCollection, oclMat &trainIdx,
-        oclMat &imgIdx, oclMat &distance, const oclMat &masks)
-{
-    if (query.empty() || trainCollection.empty())
-        return;
-
-    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-
-    const int nQuery = query.rows;
-
-    ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx);
-    ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx);
-    ensureSizeIsEnough(1, nQuery, CV_32F, distance);
-
-    matchDispatcher(query, &trainCollection, trainCollection.cols, masks, trainIdx, imgIdx, distance, distType);
-
-    return;
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, std::vector<DMatch> &matches)
-{
-    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
-        return;
-
-    Mat trainIdxCPU(trainIdx);
-    Mat imgIdxCPU(imgIdx);
-    Mat distanceCPU(distance);
-
-    matchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, matches);
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, std::vector<DMatch> &matches)
-{
-    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
-        return;
-
-    CV_Assert(trainIdx.type() == CV_32SC1);
-    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.cols == trainIdx.cols);
-    CV_Assert(distance.type() == CV_32FC1 && distance.cols == trainIdx.cols);
-
-    const int nQuery = trainIdx.cols;
-
-    matches.clear();
-    matches.reserve(nQuery);
-
-    const int *trainIdx_ptr = trainIdx.ptr<int>();
-    const int *imgIdx_ptr = imgIdx.ptr<int>();
-    const float *distance_ptr =  distance.ptr<float>();
-    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
-    {
-        int trainIdx = *trainIdx_ptr;
-
-        if (trainIdx == -1)
-            continue;
-
-        int imgIdx = *imgIdx_ptr;
-
-        float distance = *distance_ptr;
-
-        DMatch m(queryIdx, trainIdx, imgIdx, distance);
-
-        matches.push_back(m);
-    }
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, std::vector<DMatch> &matches, const std::vector<oclMat> &masks)
-{
-    oclMat trainCollection;
-    oclMat maskCollection;
-
-    makeGpuCollection(trainCollection, maskCollection, masks);
-
-    oclMat trainIdx, imgIdx, distance;
-
-    matchCollection(query, trainCollection, trainIdx, imgIdx, distance, maskCollection);
-    matchDownload(trainIdx, imgIdx, distance, matches);
-}
-
-// knn match
-void cv::ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat &query, const oclMat &train, oclMat &trainIdx,
-        oclMat &distance, oclMat &allDist, int k, const oclMat &mask)
-{
-    if (query.empty() || train.empty())
-        return;
-
-    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    CV_Assert(train.type() == query.type() && train.cols == query.cols);
-
-    const int nQuery = query.rows;
-    const int nTrain = train.rows;
-
-    if (k == 2)
-    {
-        ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
-        ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
-    }
-    else
-    {
-        ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx);
-        ensureSizeIsEnough(nQuery, k, CV_32F, distance);
-        ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
-    }
-
-    trainIdx.setTo(Scalar::all(-1));
-
-    kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType);
-
-    return;
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::knnMatchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector< std::vector<DMatch> > &matches, bool compactResult)
-{
-    if (trainIdx.empty() || distance.empty())
-        return;
-
-    Mat trainIdxCPU(trainIdx);
-    Mat distanceCPU(distance);
-
-    knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::knnMatchConvert(const Mat &trainIdx, const Mat &distance, std::vector< std::vector<DMatch> > &matches, bool compactResult)
-{
-    if (trainIdx.empty() || distance.empty())
-        return;
-
-    CV_Assert(trainIdx.type() == CV_32SC2 || trainIdx.type() == CV_32SC1);
-    CV_Assert(distance.type() == CV_32FC2 || distance.type() == CV_32FC1);
-    CV_Assert(distance.size() == trainIdx.size());
-    CV_Assert(trainIdx.isContinuous() && distance.isContinuous());
-
-    const int nQuery = trainIdx.type() == CV_32SC2 ? trainIdx.cols : trainIdx.rows;
-    const int k = trainIdx.type() == CV_32SC2 ? 2 : trainIdx.cols;
-
-    matches.clear();
-    matches.reserve(nQuery);
-
-    const int *trainIdx_ptr = trainIdx.ptr<int>();
-    const float *distance_ptr = distance.ptr<float>();
-
-    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
-    {
-        matches.push_back(std::vector<DMatch>());
-        std::vector<DMatch> &curMatches = matches.back();
-        curMatches.reserve(k);
-
-        for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
-        {
-            int trainIdx = *trainIdx_ptr;
-
-            if (trainIdx != -1)
-            {
-                float distance = *distance_ptr;
-
-                DMatch m(queryIdx, trainIdx, 0, distance);
-
-                curMatches.push_back(m);
-            }
-        }
-
-        if (compactResult && curMatches.empty())
-            matches.pop_back();
-    }
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, const oclMat &train, std::vector< std::vector<DMatch> > &matches
-        , int k, const oclMat &mask, bool compactResult)
-{
-    oclMat trainIdx, distance, allDist;
-    knnMatchSingle(query, train, trainIdx, distance, allDist, k, mask);
-    knnMatchDownload(trainIdx, distance, matches, compactResult);
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Collection(const oclMat &query, const oclMat &trainCollection,
-        oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, const oclMat &/*maskCollection*/)
-{
-    if (query.empty() || trainCollection.empty())
-        return;
-
-    // typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks,
-    //                          const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance);
-
-    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-
-    const int nQuery = query.rows;
-
-    ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
-    ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx);
-    ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
-
-    trainIdx.setTo(Scalar::all(-1));
-
-    //caller_t func = callers[distType][query.depth()];
-    //CV_Assert(func != 0);
-
-    //func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Download(const oclMat &trainIdx, const oclMat &imgIdx,
-        const oclMat &distance, std::vector< std::vector<DMatch> > &matches, bool compactResult)
-{
-    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
-        return;
-
-    Mat trainIdxCPU(trainIdx);
-    Mat imgIdxCPU(imgIdx);
-    Mat distanceCPU(distance);
-
-    knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult);
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Convert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance,
-        std::vector< std::vector<DMatch> > &matches, bool compactResult)
-{
-    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
-        return;
-
-    CV_Assert(trainIdx.type() == CV_32SC2);
-    CV_Assert(imgIdx.type() == CV_32SC2 && imgIdx.cols == trainIdx.cols);
-    CV_Assert(distance.type() == CV_32FC2 && distance.cols == trainIdx.cols);
-
-    const int nQuery = trainIdx.cols;
-
-    matches.clear();
-    matches.reserve(nQuery);
-
-    const int *trainIdx_ptr = trainIdx.ptr<int>();
-    const int *imgIdx_ptr = imgIdx.ptr<int>();
-    const float *distance_ptr = distance.ptr<float>();
-
-    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
-    {
-        matches.push_back(std::vector<DMatch>());
-        std::vector<DMatch> &curMatches = matches.back();
-        curMatches.reserve(2);
-
-        for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
-        {
-            int trainIdx = *trainIdx_ptr;
-
-            if (trainIdx != -1)
-            {
-                int imgIdx = *imgIdx_ptr;
-
-                float distance = *distance_ptr;
-
-                DMatch m(queryIdx, trainIdx, imgIdx, distance);
-
-                curMatches.push_back(m);
-            }
-        }
-
-        if (compactResult && curMatches.empty())
-            matches.pop_back();
-    }
-}
-
-namespace
-{
-    struct ImgIdxSetter
-    {
-        explicit inline ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
-        inline void operator()(DMatch &m) const
-        {
-            m.imgIdx = imgIdx;
-        }
-        int imgIdx;
-    };
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, int k,
-        const std::vector<oclMat> &masks, bool compactResult)
-{
-    if (k == 2)
-    {
-        oclMat trainCollection;
-        oclMat maskCollection;
-
-        makeGpuCollection(trainCollection, maskCollection, masks);
-
-        oclMat trainIdx, imgIdx, distance;
-
-        knnMatch2Collection(query, trainCollection, trainIdx, imgIdx, distance, maskCollection);
-        knnMatch2Download(trainIdx, imgIdx, distance, matches);
-    }
-    else
-    {
-        if (query.empty() || empty())
-            return;
-
-        std::vector< std::vector<DMatch> > curMatches;
-        std::vector<DMatch> temp;
-        temp.reserve(2 * k);
-
-        matches.resize(query.rows);
-        for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&std::vector<DMatch>::reserve), k));
-
-        for (size_t imgIdx = 0, size = trainDescCollection.size(); imgIdx < size; ++imgIdx)
-        {
-            knnMatch(query, trainDescCollection[imgIdx], curMatches, k, masks.empty() ? oclMat() : masks[imgIdx]);
-
-            for (int queryIdx = 0; queryIdx < query.rows; ++queryIdx)
-            {
-                std::vector<DMatch> &localMatch = curMatches[queryIdx];
-                std::vector<DMatch> &globalMatch = matches[queryIdx];
-
-                std::for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast<int>(imgIdx)));
-
-                temp.clear();
-                std::merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp));
-
-                globalMatch.clear();
-                const size_t count = std::min((size_t)k, temp.size());
-                std::copy(temp.begin(), temp.begin() + count, back_inserter(globalMatch));
-            }
-        }
-
-        if (compactResult)
-        {
-            std::vector< std::vector<DMatch> >::iterator new_end = remove_if(matches.begin(), matches.end(), mem_fun_ref(&std::vector<DMatch>::empty));
-            matches.erase(new_end, matches.end());
-        }
-    }
-}
-
-// radiusMatchSingle
-void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, const oclMat &train,
-        oclMat &trainIdx,   oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
-{
-    if (query.empty() || train.empty())
-        return;
-
-    const int nQuery = query.rows;
-    const int nTrain = train.rows;
-
-    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    CV_Assert(train.type() == query.type() && train.cols == query.cols);
-    CV_Assert(trainIdx.empty() || (trainIdx.rows == query.rows && trainIdx.size() == distance.size()));
-
-    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
-    if (trainIdx.empty())
-    {
-        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
-        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
-    }
-
-    nMatches.setTo(Scalar::all(0));
-
-    matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
-
-    return;
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches,
-        std::vector< std::vector<DMatch> > &matches, bool compactResult)
-{
-    if (trainIdx.empty() || distance.empty() || nMatches.empty())
-        return;
-
-    Mat trainIdxCPU(trainIdx);
-    Mat distanceCPU(distance);
-    Mat nMatchesCPU(nMatches);
-
-    radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &nMatches,
-        std::vector< std::vector<DMatch> > &matches, bool compactResult)
-{
-    if (trainIdx.empty() || distance.empty() || nMatches.empty())
-        return;
-
-    CV_Assert(trainIdx.type() == CV_32SC1);
-    CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
-    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.cols == trainIdx.rows);
-
-    const int nQuery = trainIdx.rows;
-
-    matches.clear();
-    matches.reserve(nQuery);
-
-    const int *nMatches_ptr = nMatches.ptr<int>();
-
-    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
-    {
-        const int *trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
-        const float *distance_ptr = distance.ptr<float>(queryIdx);
-
-        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
-
-        if (nMatches == 0)
-        {
-            if (!compactResult)
-                matches.push_back(std::vector<DMatch>());
-            continue;
-        }
-
-        matches.push_back(std::vector<DMatch>(nMatches));
-        std::vector<DMatch> &curMatches = matches.back();
-
-        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)
-        {
-            int trainIdx = *trainIdx_ptr;
-
-            float distance = *distance_ptr;
-
-            DMatch m(queryIdx, trainIdx, 0, distance);
-
-            curMatches[i] = m;
-        }
-
-        std::sort(curMatches.begin(), curMatches.end());
-    }
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, const oclMat &train, std::vector< std::vector<DMatch> > &matches,
-        float maxDistance, const oclMat &mask, bool compactResult)
-{
-    oclMat trainIdx, distance, nMatches;
-    radiusMatchSingle(query, train, trainIdx, distance, nMatches, maxDistance, mask);
-    radiusMatchDownload(trainIdx, distance, nMatches, matches, compactResult);
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
-        oclMat &nMatches, float /*maxDistance*/, const std::vector<oclMat> &masks)
-{
-    if (query.empty() || empty())
-        return;
-
-#if 0
-    typedef void (*caller_t)(const oclMat & query, const oclMat * trains, int n, float maxDistance, const oclMat * masks,
-                             const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance, const oclMat & nMatches);
-    static const caller_t callers[3][6] =
-    {
-        {
-            ocl_matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
-            ocl_matchL1_gpu<unsigned short>, matchL1_gpu<short>,
-            ocl_matchL1_gpu<int>, matchL1_gpu<float>
-        },
-        {
-            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
-            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
-            0/*matchL2_gpu<int>*/, ocl_matchL2_gpu<float>
-        },
-        {
-            ocl_matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
-            ocl_matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
-            ocl_matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
-        }
-    };
-#endif
-    const int nQuery = query.rows;
-
-    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size()));
-
-    nMatches.create(1, nQuery, CV_32SC1);
-    if (trainIdx.empty())
-    {
-        trainIdx.create(nQuery, std::max((nQuery / 100), 10), CV_32SC1);
-        imgIdx.create(nQuery, std::max((nQuery / 100), 10), CV_32SC1);
-        distance.create(nQuery, std::max((nQuery / 100), 10), CV_32FC1);
-    }
-
-    nMatches.setTo(Scalar::all(0));
-
-    //caller_t func = callers[distType][query.depth()];
-    //CV_Assert(func != 0);
-
-    std::vector<oclMat> trains_(trainDescCollection.begin(), trainDescCollection.end());
-    std::vector<oclMat> masks_(masks.begin(), masks.end());
-
-    /*  func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
-          trainIdx, imgIdx, distance, nMatches));*/
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance,
-        const oclMat &nMatches, std::vector< std::vector<DMatch> > &matches, bool compactResult)
-{
-    if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty())
-        return;
-
-    Mat trainIdxCPU(trainIdx);
-    Mat imgIdxCPU(imgIdx);
-    Mat distanceCPU(distance);
-    Mat nMatchesCPU(nMatches);
-
-    radiusMatchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, const Mat &nMatches,
-        std::vector< std::vector<DMatch> > &matches, bool compactResult)
-{
-    if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty())
-        return;
-
-    CV_Assert(trainIdx.type() == CV_32SC1);
-    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.size() == trainIdx.size());
-    CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
-    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.cols == trainIdx.rows);
-
-    const int nQuery = trainIdx.rows;
-
-    matches.clear();
-    matches.reserve(nQuery);
-
-    const int *nMatches_ptr = nMatches.ptr<int>();
-
-    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
-    {
-        const int *trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
-        const int *imgIdx_ptr = imgIdx.ptr<int>(queryIdx);
-        const float *distance_ptr = distance.ptr<float>(queryIdx);
-
-        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
-
-        if (nMatches == 0)
-        {
-            if (!compactResult)
-                matches.push_back(std::vector<DMatch>());
-            continue;
-        }
-
-        matches.push_back(std::vector<DMatch>());
-        std::vector<DMatch> &curMatches = matches.back();
-        curMatches.reserve(nMatches);
-
-        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
-        {
-            int trainIdx = *trainIdx_ptr;
-            int imgIdx = *imgIdx_ptr;
-            float distance = *distance_ptr;
-
-            DMatch m(queryIdx, trainIdx, imgIdx, distance);
-
-            curMatches.push_back(m);
-        }
-
-        std::sort(curMatches.begin(), curMatches.end());
-    }
-}
-
-void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, float maxDistance,
-        const std::vector<oclMat> &masks, bool compactResult)
-{
-    oclMat trainIdx, imgIdx, distance, nMatches;
-    radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
-    radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
-}
diff --git a/modules/ocl/src/build_warps.cpp b/modules/ocl/src/build_warps.cpp
deleted file mode 100644
index 011672847..000000000
--- a/modules/ocl/src/build_warps.cpp
+++ /dev/null
@@ -1,285 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-//////////////////////////////////////////////////////////////////////////////
-// buildWarpPlaneMaps
-
-void cv::ocl::buildWarpPlaneMaps(Size /*src_size*/, Rect dst_roi, const Mat &K, const Mat &R, const Mat &T,
-                                 float scale, oclMat &xmap, oclMat &ymap)
-{
-    CV_Assert(K.size() == Size(3, 3) && K.type() == CV_32F);
-    CV_Assert(R.size() == Size(3, 3) && R.type() == CV_32F);
-    CV_Assert((T.size() == Size(3, 1) || T.size() == Size(1, 3)) && T.type() == CV_32F && T.isContinuous());
-
-    Mat K_Rinv = K * R.t();
-    CV_Assert(K_Rinv.isContinuous());
-
-    Mat KRT_mat(1, 12, CV_32FC1); // 9 + 3
-    KRT_mat(Range::all(), Range(0, 8)) = K_Rinv.reshape(1, 1);
-    KRT_mat(Range::all(), Range(9, 11)) = T;
-
-    oclMat KRT_oclMat(KRT_mat);
-    // transfer K_Rinv and T into a single cl_mem
-    xmap.create(dst_roi.size(), CV_32F);
-    ymap.create(dst_roi.size(), CV_32F);
-
-    int tl_u = dst_roi.tl().x;
-    int tl_v = dst_roi.tl().y;
-
-    int xmap_step = xmap.step / xmap.elemSize(), xmap_offset = xmap.offset / xmap.elemSize();
-    int ymap_step = ymap.step / ymap.elemSize(), ymap_offset = ymap.offset / ymap.elemSize();
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&xmap.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ymap.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&KRT_mat.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&tl_u));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&tl_v));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_offset));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale));
-
-    size_t globalThreads[3] = { xmap.cols, xmap.rows, 1 };
-#ifdef ANDROID
-    size_t localThreads[3]  = {32, 4, 1};
-#else
-    size_t localThreads[3]  = {32, 8, 1};
-#endif
-    openCLExecuteKernel(Context::getContext(), &build_warps, "buildWarpPlaneMaps", globalThreads, localThreads, args, -1, -1);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// buildWarpCylyndricalMaps
-
-void cv::ocl::buildWarpCylindricalMaps(Size /*src_size*/, Rect dst_roi, const Mat &K, const Mat &R, float scale,
-                                       oclMat &xmap, oclMat &ymap)
-{
-    CV_Assert(K.size() == Size(3, 3) && K.type() == CV_32F);
-    CV_Assert(R.size() == Size(3, 3) && R.type() == CV_32F);
-
-    Mat K_Rinv = K * R.t();
-    CV_Assert(K_Rinv.isContinuous());
-
-    oclMat KR_oclMat(K_Rinv.reshape(1, 1));
-
-    xmap.create(dst_roi.size(), CV_32F);
-    ymap.create(dst_roi.size(), CV_32F);
-
-    int tl_u = dst_roi.tl().x;
-    int tl_v = dst_roi.tl().y;
-
-    int xmap_step = xmap.step / xmap.elemSize(), xmap_offset = xmap.offset / xmap.elemSize();
-    int ymap_step = ymap.step / ymap.elemSize(), ymap_offset = ymap.offset / ymap.elemSize();
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&xmap.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ymap.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&KR_oclMat.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&tl_u));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&tl_v));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_offset));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale));
-
-    size_t globalThreads[3] = { xmap.cols, xmap.rows, 1 };
-#ifdef ANDROID
-    size_t localThreads[3]  = {32, 1, 1};
-#else
-    size_t localThreads[3]  = {32, 8, 1};
-#endif
-    openCLExecuteKernel(Context::getContext(), &build_warps, "buildWarpCylindricalMaps", globalThreads, localThreads, args, -1, -1);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// buildWarpSphericalMaps
-
-void cv::ocl::buildWarpSphericalMaps(Size /*src_size*/, Rect dst_roi, const Mat &K, const Mat &R, float scale,
-                                     oclMat &xmap, oclMat &ymap)
-{
-    CV_Assert(K.size() == Size(3, 3) && K.type() == CV_32F);
-    CV_Assert(R.size() == Size(3, 3) && R.type() == CV_32F);
-
-    Mat K_Rinv = K * R.t();
-    CV_Assert(K_Rinv.isContinuous());
-
-    oclMat KR_oclMat(K_Rinv.reshape(1, 1));
-    // transfer K_Rinv, R_Kinv into a single cl_mem
-    xmap.create(dst_roi.size(), CV_32F);
-    ymap.create(dst_roi.size(), CV_32F);
-
-    int tl_u = dst_roi.tl().x;
-    int tl_v = dst_roi.tl().y;
-
-    int xmap_step = xmap.step / xmap.elemSize(), xmap_offset = xmap.offset / xmap.elemSize();
-    int ymap_step = ymap.step / ymap.elemSize(), ymap_offset = ymap.offset / ymap.elemSize();
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&xmap.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ymap.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&KR_oclMat.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&tl_u));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&tl_v));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_offset));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale));
-
-    size_t globalThreads[3] = { xmap.cols, xmap.rows, 1 };
-#ifdef ANDROID
-    size_t localThreads[3]  = {32, 4, 1};
-#else
-    size_t localThreads[3]  = {32, 8, 1};
-#endif
-    openCLExecuteKernel(Context::getContext(), &build_warps, "buildWarpSphericalMaps", globalThreads, localThreads, args, -1, -1);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// buildWarpAffineMaps
-
-void cv::ocl::buildWarpAffineMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap)
-{
-    CV_Assert(M.rows == 2 && M.cols == 3);
-    CV_Assert(dsize.area());
-
-    xmap.create(dsize, CV_32FC1);
-    ymap.create(dsize, CV_32FC1);
-
-    float coeffs[2 * 3];
-    Mat coeffsMat(2, 3, CV_32F, (void *)coeffs);
-
-    if (inverse)
-        M.convertTo(coeffsMat, coeffsMat.type());
-    else
-    {
-        cv::Mat iM;
-        invertAffineTransform(M, iM);
-        iM.convertTo(coeffsMat, coeffsMat.type());
-    }
-
-    int xmap_step = xmap.step / xmap.elemSize(), xmap_offset = xmap.offset / xmap.elemSize();
-    int ymap_step = ymap.step / ymap.elemSize(), ymap_offset = ymap.offset / ymap.elemSize();
-
-    oclMat coeffsOclMat(coeffsMat.reshape(1, 1));
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&xmap.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ymap.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&coeffsOclMat.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_offset));
-
-    size_t globalThreads[3] = { xmap.cols, xmap.rows, 1 };
-#ifdef ANDROID
-    size_t localThreads[3]  = {32, 4, 1};
-#else
-    size_t localThreads[3]  = {32, 8, 1};
-#endif
-    openCLExecuteKernel(Context::getContext(), &build_warps, "buildWarpAffineMaps", globalThreads, localThreads, args, -1, -1);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// buildWarpPerspectiveMaps
-
-void cv::ocl::buildWarpPerspectiveMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap)
-{
-    CV_Assert(M.rows == 3 && M.cols == 3);
-    CV_Assert(dsize.area() > 0);
-
-    xmap.create(dsize, CV_32FC1);
-    ymap.create(dsize, CV_32FC1);
-
-    float coeffs[3 * 3];
-    Mat coeffsMat(3, 3, CV_32F, (void *)coeffs);
-
-    if (inverse)
-        M.convertTo(coeffsMat, coeffsMat.type());
-    else
-    {
-        cv::Mat iM;
-        invert(M, iM);
-        iM.convertTo(coeffsMat, coeffsMat.type());
-    }
-
-    oclMat coeffsOclMat(coeffsMat.reshape(1, 1));
-
-    int xmap_step = xmap.step / xmap.elemSize(), xmap_offset = xmap.offset / xmap.elemSize();
-    int ymap_step = ymap.step / ymap.elemSize(), ymap_offset = ymap.offset / ymap.elemSize();
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&xmap.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ymap.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&coeffsOclMat.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&xmap_offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&ymap_offset));
-
-    size_t globalThreads[3] = { xmap.cols, xmap.rows, 1 };
-
-    openCLExecuteKernel(Context::getContext(), &build_warps, "buildWarpPerspectiveMaps", globalThreads, NULL, args, -1, -1);
-}
diff --git a/modules/ocl/src/canny.cpp b/modules/ocl/src/canny.cpp
deleted file mode 100644
index 8c68d8bac..000000000
--- a/modules/ocl/src/canny.cpp
+++ /dev/null
@@ -1,387 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-cv::ocl::CannyBuf::CannyBuf(const oclMat &dx_, const oclMat &dy_) : dx(dx_), dy(dy_), counter(1, 1, CV_32SC1)
-{
-    CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size());
-
-    create(dx_.size(), -1);
-}
-
-void cv::ocl::CannyBuf::create(const Size &image_size, int apperture_size)
-{
-    ensureSizeIsEnough(image_size, CV_32SC1, dx);
-    ensureSizeIsEnough(image_size, CV_32SC1, dy);
-
-    if(apperture_size == 3)
-    {
-        ensureSizeIsEnough(image_size, CV_32SC1, dx_buf);
-        ensureSizeIsEnough(image_size, CV_32SC1, dy_buf);
-    }
-    else if(apperture_size > 0)
-    {
-        Mat kx, ky;
-        if (!filterDX)
-        {
-            filterDX = createDerivFilter_GPU(CV_8U, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
-        }
-        if (!filterDY)
-        {
-            filterDY = createDerivFilter_GPU(CV_8U, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
-        }
-    }
-    ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, magBuf);
-    ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, mapBuf);
-
-    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf1);
-    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf2);
-}
-
-void cv::ocl::CannyBuf::release()
-{
-    dx.release();
-    dy.release();
-    dx_buf.release();
-    dy_buf.release();
-    magBuf.release();
-    mapBuf.release();
-    trackBuf1.release();
-    trackBuf2.release();
-}
-
-namespace cv
-{
-    namespace ocl
-    {
-        namespace canny
-        {
-            void calcSobelRowPass_gpu(const oclMat &src, oclMat &dx_buf, oclMat &dy_buf, int rows, int cols);
-
-            void calcMagnitude_gpu(const oclMat &dx_buf, const oclMat &dy_buf, oclMat &dx, oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad);
-            void calcMagnitude_gpu(const oclMat &dx, const oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad);
-
-            void calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int rows, int cols, float low_thresh, float high_thresh);
-
-            void edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, oclMat& counter, int rows, int cols);
-
-            void edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, oclMat& counter, int rows, int cols);
-
-            void getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols);
-        }
-    }
-}// cv::ocl
-
-namespace
-{
-    void CannyCaller(CannyBuf &buf, oclMat &dst, float low_thresh, float high_thresh)
-    {
-        using namespace ::cv::ocl::canny;
-        calcMap_gpu(buf.dx, buf.dy, buf.magBuf, buf.mapBuf, dst.rows, dst.cols, low_thresh, high_thresh);
-
-        edgesHysteresisLocal_gpu(buf.mapBuf, buf.trackBuf1, buf.counter, dst.rows, dst.cols);
-
-        edgesHysteresisGlobal_gpu(buf.mapBuf, buf.trackBuf1, buf.trackBuf2, buf.counter, dst.rows, dst.cols);
-
-        getEdges_gpu(buf.mapBuf, dst, dst.rows, dst.cols);
-    }
-}
-
-void cv::ocl::Canny(const oclMat &src, oclMat &dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
-{
-    CannyBuf buf(src.size(), apperture_size);
-    Canny(src, buf, dst, low_thresh, high_thresh, apperture_size, L2gradient);
-}
-
-void cv::ocl::Canny(const oclMat &src, CannyBuf &buf, oclMat &dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
-{
-    using namespace ::cv::ocl::canny;
-
-    CV_Assert(src.type() == CV_8UC1);
-
-    if( low_thresh > high_thresh )
-        std::swap( low_thresh, high_thresh );
-
-    dst.create(src.size(), CV_8U);
-    dst.setTo(Scalar::all(0));
-
-    buf.create(src.size(), apperture_size);
-    buf.magBuf.setTo(Scalar::all(0));
-
-    if (apperture_size == 3)
-    {
-        calcSobelRowPass_gpu(src, buf.dx_buf, buf.dy_buf, src.rows, src.cols);
-
-        calcMagnitude_gpu(buf.dx_buf, buf.dy_buf, buf.dx, buf.dy, buf.magBuf, src.rows, src.cols, L2gradient);
-    }
-    else
-    {
-        buf.filterDX->apply(src, buf.dx);
-        buf.filterDY->apply(src, buf.dy);
-
-        calcMagnitude_gpu(buf.dx, buf.dy, buf.magBuf, src.rows, src.cols, L2gradient);
-    }
-    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
-}
-void cv::ocl::Canny(const oclMat &dx, const oclMat &dy, oclMat &dst, double low_thresh, double high_thresh, bool L2gradient)
-{
-    CannyBuf buf(dx, dy);
-    Canny(dx, dy, buf, dst, low_thresh, high_thresh, L2gradient);
-}
-
-void cv::ocl::Canny(const oclMat &dx, const oclMat &dy, CannyBuf &buf, oclMat &dst, double low_thresh, double high_thresh, bool L2gradient)
-{
-    using namespace ::cv::ocl::canny;
-
-    CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
-
-    if( low_thresh > high_thresh )
-        std::swap( low_thresh, high_thresh);
-
-    dst.create(dx.size(), CV_8U);
-    dst.setTo(Scalar::all(0));
-
-    buf.dx = dx;
-    buf.dy = dy;
-    buf.create(dx.size(), -1);
-    buf.magBuf.setTo(Scalar::all(0));
-    calcMagnitude_gpu(buf.dx, buf.dy, buf.magBuf, dx.rows, dx.cols, L2gradient);
-
-    CannyCaller(buf, dst, static_cast<float>(low_thresh), static_cast<float>(high_thresh));
-}
-
-void canny::calcSobelRowPass_gpu(const oclMat &src, oclMat &dx_buf, oclMat &dy_buf, int rows, int cols)
-{
-    Context *clCxt = src.clCxt;
-    String kernelName = "calcSobelRowPass";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dx_buf.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dy_buf.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dx_buf.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dx_buf.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dy_buf.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dy_buf.offset));
-
-    size_t globalThreads[3] = {cols, rows, 1};
-    size_t localThreads[3]  = {16, 16, 1};
-    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void canny::calcMagnitude_gpu(const oclMat &dx_buf, const oclMat &dy_buf, oclMat &dx, oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad)
-{
-    Context *clCxt = dx_buf.clCxt;
-    String kernelName = "calcMagnitude_buf";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dx_buf.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dy_buf.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dx.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dy.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mag.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dx_buf.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dx_buf.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dy_buf.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dy_buf.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dx.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dx.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dy.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dy.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mag.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mag.offset));
-
-    size_t globalThreads[3] = {cols, rows, 1};
-    size_t localThreads[3]  = {16, 16, 1};
-
-    const char * build_options = L2Grad ? "-D L2GRAD":"";
-    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
-}
-void canny::calcMagnitude_gpu(const oclMat &dx, const oclMat &dy, oclMat &mag, int rows, int cols, bool L2Grad)
-{
-    Context *clCxt = dx.clCxt;
-    String kernelName = "calcMagnitude";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dx.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dy.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mag.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dx.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dx.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dy.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dy.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mag.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mag.offset));
-
-    size_t globalThreads[3] = {cols, rows, 1};
-    size_t localThreads[3]  = {16, 16, 1};
-
-    const char * build_options = L2Grad ? "-D L2GRAD":"";
-    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
-}
-
-void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int rows, int cols, float low_thresh, float high_thresh)
-{
-    Context *clCxt = dx.clCxt;
-
-    std::vector< std::pair<size_t, const void *> > args;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dx.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dy.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mag.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&low_thresh));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&high_thresh));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dx.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dx.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dy.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dy.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mag.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mag.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset));
-
-
-    size_t globalThreads[3] = {cols, rows, 1};
-    String kernelName = "calcMap";
-    size_t localThreads[3]  = {16, 16, 1};
-
-    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, oclMat& counter, int rows, int cols)
-{
-    Context *clCxt = map.clCxt;
-    std::vector< std::pair<size_t, const void *> > args;
-
-    Mat counterMat(counter.rows, counter.cols, counter.type());
-    counterMat.at<int>(0, 0) = 0;
-    counter.upload(counterMat);
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-    cl_int stepBytes = map.step;
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&stepBytes));
-    cl_int offsetBytes = map.offset;
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&offsetBytes));
-
-    size_t globalThreads[3] = {cols, rows, 1};
-    size_t localThreads[3]  = {16, 16, 1};
-
-    openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisLocal", globalThreads, localThreads, args, -1, -1);
-}
-
-void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, oclMat& counter, int rows, int cols)
-{
-    Context *clCxt = map.clCxt;
-    std::vector< std::pair<size_t, const void *> > args;
-    size_t localThreads[3]  = {128, 1, 1};
-
-    while(1 > 0)
-    {
-        Mat counterMat; counter.download(counterMat);
-        int count = counterMat.at<int>(0, 0);
-        CV_Assert(count >= 0);
-        if (count == 0)
-            break;
-
-        counterMat.at<int>(0, 0) = 0;
-        counter.upload(counterMat);
-
-        args.clear();
-        size_t globalThreads[3] = {std::min((unsigned)count, 65535u) * 128, divUp(count, 65535), 1};
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st2.data));
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter.data));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&count));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset));
-
-        openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisGlobal", globalThreads, localThreads, args, -1, -1);
-        std::swap(st1, st2);
-    }
-}
-
-void canny::getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols)
-{
-    Context *clCxt = map.clCxt;
-    String kernelName = "getEdges";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset));
-
-    size_t globalThreads[3] = {cols, rows, 1};
-    size_t localThreads[3]  = {16, 16, 1};
-
-    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
-}
diff --git a/modules/ocl/src/cl_context.cpp b/modules/ocl/src/cl_context.cpp
deleted file mode 100644
index d6d081fe1..000000000
--- a/modules/ocl/src/cl_context.cpp
+++ /dev/null
@@ -1,944 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Guoping Long, longguoping@gmail.com
-//    Niko Li, newlife20080214@gmail.com
-//    Yao Wang, bitwangyaoyao@gmail.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include <stdlib.h>
-#include <ctype.h>
-#include <iomanip>
-#include <fstream>
-#include "cl_programcache.hpp"
-
-#include "opencv2/ocl/private/opencl_utils.hpp"
-
-namespace cv {
-namespace ocl {
-
-using namespace cl_utils;
-
-#if defined(WIN32)
-static bool __termination = false;
-#endif
-
-struct __Module
-{
-    __Module();
-    ~__Module();
-    cv::Mutex initializationMutex;
-    cv::Mutex currentContextMutex;
-};
-static __Module __module;
-
-cv::Mutex& getInitializationMutex()
-{
-    return __module.initializationMutex;
-}
-
-static cv::Mutex& getCurrentContextMutex()
-{
-    return __module.currentContextMutex;
-}
-
-static bool parseOpenCLVersion(const std::string& versionStr, int& major, int& minor)
-{
-    size_t p0 = versionStr.find(' ');
-    while (true)
-    {
-        if (p0 == std::string::npos)
-            break;
-        if (p0 + 1 >= versionStr.length())
-            break;
-        char c = versionStr[p0 + 1];
-        if (isdigit(c))
-            break;
-        p0 = versionStr.find(' ', p0 + 1);
-    }
-    size_t p1 = versionStr.find('.', p0);
-    size_t p2 = versionStr.find(' ', p1);
-    if (p0 == std::string::npos || p1 == std::string::npos || p2 == std::string::npos)
-    {
-        major = 0;
-        minor = 0;
-        return false;
-    }
-    std::string majorStr = versionStr.substr(p0 + 1, p1 - p0 - 1);
-    std::string minorStr = versionStr.substr(p1 + 1, p2 - p1 - 1);
-    major = atoi(majorStr.c_str());
-    minor = atoi(minorStr.c_str());
-    return true;
-}
-
-struct PlatformInfoImpl : public PlatformInfo
-{
-    cl_platform_id platform_id;
-
-    std::vector<int> deviceIDs;
-
-    PlatformInfoImpl()
-        : platform_id(NULL)
-    {
-    }
-
-    void init(int id, cl_platform_id platform)
-    {
-        CV_Assert(platform_id == NULL);
-
-        this->_id = id;
-        platform_id = platform;
-
-        openCLSafeCall(getStringInfo(clGetPlatformInfo, platform, CL_PLATFORM_PROFILE, this->platformProfile));
-        openCLSafeCall(getStringInfo(clGetPlatformInfo, platform, CL_PLATFORM_VERSION, this->platformVersion));
-        openCLSafeCall(getStringInfo(clGetPlatformInfo, platform, CL_PLATFORM_NAME, this->platformName));
-        openCLSafeCall(getStringInfo(clGetPlatformInfo, platform, CL_PLATFORM_VENDOR, this->platformVendor));
-        openCLSafeCall(getStringInfo(clGetPlatformInfo, platform, CL_PLATFORM_EXTENSIONS, this->platformExtensons));
-
-        parseOpenCLVersion(this->platformVersion,
-                this->platformVersionMajor, this->platformVersionMinor);
-    }
-
-};
-
-struct DeviceInfoImpl: public DeviceInfo
-{
-    cl_platform_id platform_id;
-    cl_device_id device_id;
-
-    DeviceInfoImpl()
-        : platform_id(NULL), device_id(NULL)
-    {
-    }
-
-    void init(int id, PlatformInfoImpl& platformInfoImpl, cl_device_id device)
-    {
-        CV_Assert(device_id == NULL);
-
-        this->_id = id;
-        platform_id = platformInfoImpl.platform_id;
-        device_id = device;
-
-        this->platform = &platformInfoImpl;
-
-        cl_device_type type = cl_device_type(-1);
-        openCLSafeCall(getScalarInfo(clGetDeviceInfo, device, CL_DEVICE_TYPE, type));
-        this->deviceType = DeviceType(type);
-
-        openCLSafeCall(getStringInfo(clGetDeviceInfo, device, CL_DEVICE_PROFILE, this->deviceProfile));
-        openCLSafeCall(getStringInfo(clGetDeviceInfo, device, CL_DEVICE_VERSION, this->deviceVersion));
-        openCLSafeCall(getStringInfo(clGetDeviceInfo, device, CL_DEVICE_NAME, this->deviceName));
-        openCLSafeCall(getStringInfo(clGetDeviceInfo, device, CL_DEVICE_VENDOR, this->deviceVendor));
-        cl_uint vendorID = 0;
-        openCLSafeCall(getScalarInfo(clGetDeviceInfo, device, CL_DEVICE_VENDOR_ID, vendorID));
-        this->deviceVendorId = vendorID;
-        openCLSafeCall(getStringInfo(clGetDeviceInfo, device, CL_DRIVER_VERSION, this->deviceDriverVersion));
-        openCLSafeCall(getStringInfo(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, this->deviceExtensions));
-
-        parseOpenCLVersion(this->deviceVersion,
-                this->deviceVersionMajor, this->deviceVersionMinor);
-
-        size_t maxWorkGroupSize = 0;
-        openCLSafeCall(getScalarInfo(clGetDeviceInfo, device, CL_DEVICE_MAX_WORK_GROUP_SIZE, maxWorkGroupSize));
-        this->maxWorkGroupSize = maxWorkGroupSize;
-
-        cl_uint maxDimensions = 0;
-        openCLSafeCall(getScalarInfo(clGetDeviceInfo, device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, maxDimensions));
-        std::vector<size_t> maxWorkItemSizes(maxDimensions);
-        openCLSafeCall(clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * maxDimensions,
-                (void *)&maxWorkItemSizes[0], 0));
-        this->maxWorkItemSizes = maxWorkItemSizes;
-
-        cl_uint maxComputeUnits = 0;
-        openCLSafeCall(getScalarInfo(clGetDeviceInfo, device, CL_DEVICE_MAX_COMPUTE_UNITS, maxComputeUnits));
-        this->maxComputeUnits = maxComputeUnits;
-
-        cl_ulong localMemorySize = 0;
-        openCLSafeCall(getScalarInfo(clGetDeviceInfo, device, CL_DEVICE_LOCAL_MEM_SIZE, localMemorySize));
-        this->localMemorySize = (size_t)localMemorySize;
-
-        cl_ulong maxMemAllocSize = 0;
-        openCLSafeCall(getScalarInfo(clGetDeviceInfo, device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, maxMemAllocSize));
-        this->maxMemAllocSize = (size_t)maxMemAllocSize;
-
-        cl_bool unifiedMemory = false;
-        openCLSafeCall(getScalarInfo(clGetDeviceInfo, device, CL_DEVICE_HOST_UNIFIED_MEMORY, unifiedMemory));
-        this->isUnifiedMemory = unifiedMemory != 0;
-
-        //initialize extra options for compilation. Currently only fp64 is included.
-        //Assume 4KB is enough to store all possible extensions.
-        openCLSafeCall(getStringInfo(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, this->deviceExtensions));
-
-        size_t fp64_khr = this->deviceExtensions.find("cl_khr_fp64");
-        if(fp64_khr != std::string::npos)
-        {
-            this->compilationExtraOptions += "-D DOUBLE_SUPPORT";
-            this->haveDoubleSupport = true;
-        }
-        else
-        {
-            this->haveDoubleSupport = false;
-        }
-
-        size_t intel_platform = platformInfoImpl.platformVendor.find("Intel");
-        if(intel_platform != std::string::npos)
-        {
-            this->compilationExtraOptions += " -D INTEL_DEVICE";
-            this->isIntelDevice = true;
-        }
-        else
-        {
-            this->isIntelDevice = false;
-        }
-
-        if (id < 0)
-        {
-#ifdef CL_VERSION_1_2
-            if (this->deviceVersionMajor > 1 || (this->deviceVersionMajor == 1 && this->deviceVersionMinor >= 2))
-            {
-                ::clRetainDevice(device);
-            }
-#endif
-        }
-    }
-};
-
-static std::vector<PlatformInfoImpl> global_platforms;
-static std::vector<DeviceInfoImpl> global_devices;
-
-static void split(const std::string &s, char delim, std::vector<std::string> &elems) {
-    std::stringstream ss(s);
-    std::string item;
-    while (std::getline(ss, item, delim)) {
-        elems.push_back(item);
-    }
-}
-
-static std::vector<std::string> split(const std::string &s, char delim) {
-    std::vector<std::string> elems;
-    split(s, delim, elems);
-    return elems;
-}
-
-// Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
-// Sample: AMD:GPU:
-// Sample: AMD:GPU:Tahiti
-// Sample: :GPU|CPU: = '' = ':' = '::'
-static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
-        std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
-{
-    std::string deviceTypesStr;
-    size_t p0 = configurationStr.find(':');
-    if (p0 != std::string::npos)
-    {
-        size_t p1 = configurationStr.find(':', p0 + 1);
-        if (p1 != std::string::npos)
-        {
-            size_t p2 = configurationStr.find(':', p1 + 1);
-            if (p2 != std::string::npos)
-            {
-                std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
-                return false;
-            }
-            else
-            {
-                // assume platform + device types + device name/id
-                platform = configurationStr.substr(0, p0);
-                deviceTypesStr = configurationStr.substr(p0 + 1, p1 - (p0 + 1));
-                deviceNameOrID = configurationStr.substr(p1 + 1, configurationStr.length() - (p1 + 1));
-            }
-        }
-        else
-        {
-            // assume platform + device types
-            platform = configurationStr.substr(0, p0);
-            deviceTypesStr = configurationStr.substr(p0 + 1, configurationStr.length() - (p0 + 1));
-        }
-    }
-    else
-    {
-        // assume only platform
-        platform = configurationStr;
-    }
-    deviceTypes = split(deviceTypesStr, '|');
-    return true;
-}
-
-static bool selectOpenCLDevice()
-{
-    std::string platform;
-    std::vector<std::string> deviceTypes;
-    std::string deviceName;
-    const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
-    if (configuration)
-    {
-        if (!parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName))
-            return false;
-    }
-
-    bool isID = false;
-    int deviceID = -1;
-    if (deviceName.length() == 1)
-    // We limit ID range to 0..9, because we want to write:
-    // - '2500' to mean i5-2500
-    // - '8350' to mean AMD FX-8350
-    // - '650' to mean GeForce 650
-    // To extend ID range change condition to '> 0'
-    {
-        isID = true;
-        for (size_t i = 0; i < deviceName.length(); i++)
-        {
-            if (!isdigit(deviceName[i]))
-            {
-                isID = false;
-                break;
-            }
-        }
-        if (isID)
-        {
-            deviceID = atoi(deviceName.c_str());
-            CV_Assert(deviceID >= 0);
-        }
-    }
-
-    const PlatformInfo* platformInfo = NULL;
-    if (platform.length() > 0)
-    {
-        PlatformsInfo platforms;
-        getOpenCLPlatforms(platforms);
-        for (size_t i = 0; i < platforms.size(); i++)
-        {
-            if (platforms[i]->platformName.find(platform) != std::string::npos)
-            {
-                platformInfo = platforms[i];
-                break;
-            }
-        }
-        if (platformInfo == NULL)
-        {
-            std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
-            goto not_found;
-        }
-    }
-
-    if (deviceTypes.size() == 0)
-    {
-        if (!isID)
-        {
-            deviceTypes.push_back("GPU");
-            deviceTypes.push_back("CPU");
-        }
-        else
-        {
-            deviceTypes.push_back("ALL");
-        }
-    }
-    for (size_t t = 0; t < deviceTypes.size(); t++)
-    {
-        int deviceType = 0;
-        if (deviceTypes[t] == "GPU")
-        {
-            deviceType = CVCL_DEVICE_TYPE_GPU;
-        }
-        else if (deviceTypes[t] == "CPU")
-        {
-            deviceType = CVCL_DEVICE_TYPE_CPU;
-        }
-        else if (deviceTypes[t] == "ACCELERATOR")
-        {
-            deviceType = CVCL_DEVICE_TYPE_ACCELERATOR;
-        }
-        else if (deviceTypes[t] == "ALL")
-        {
-            deviceType = CVCL_DEVICE_TYPE_ALL;
-        }
-        else
-        {
-            std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
-            goto not_found;
-        }
-
-        DevicesInfo devices;
-        getOpenCLDevices(devices, deviceType, platformInfo);
-
-        for (size_t i = (isID ? deviceID : 0);
-             (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
-             i++)
-        {
-            if (isID || devices[i]->deviceName.find(deviceName) != std::string::npos)
-            {
-                // check for OpenCL 1.1
-                if (devices[i]->deviceVersionMajor < 1 ||
-                        (devices[i]->deviceVersionMajor == 1 && devices[i]->deviceVersionMinor < 1))
-                {
-                    std::cerr << "Skip unsupported version of OpenCL device: " << devices[i]->deviceName
-                            << "(" << devices[i]->platform->platformName << ")" << std::endl;
-                    continue; // unsupported version of device, skip it
-                }
-                try
-                {
-                    setDevice(devices[i]);
-                }
-                catch (...)
-                {
-                    std::cerr << "ERROR: Can't select OpenCL device: " << devices[i]->deviceName
-                            << "(" << devices[i]->platform->platformName << ")" << std::endl;
-                    goto not_found;
-                }
-                return true;
-            }
-        }
-    }
-not_found:
-    std::cerr << "ERROR: Required OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
-            << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
-            << "    Device types: ";
-    for (size_t t = 0; t < deviceTypes.size(); t++)
-    {
-        std::cerr << deviceTypes[t] << " ";
-    }
-    std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
-    return false;
-}
-
-static bool __initialized = false;
-static int initializeOpenCLDevices()
-{
-    assert(!__initialized);
-    __initialized = true;
-
-    assert(global_devices.size() == 0);
-
-    std::vector<cl_platform_id> platforms;
-    try
-    {
-        openCLSafeCall(getPlatforms(platforms));
-    }
-    catch (cv::Exception&)
-    {
-        return 0; // OpenCL not found
-    }
-
-    global_platforms.resize(platforms.size());
-
-    for (size_t i = 0; i < platforms.size(); ++i)
-    {
-        PlatformInfoImpl& platformInfo = global_platforms[i];
-
-        cl_platform_id platform = platforms[i];
-        platformInfo.init(i, platform);
-
-        std::vector<cl_device_id> devices;
-        cl_int status = getDevices(platform, CL_DEVICE_TYPE_ALL, devices);
-        if(status != CL_DEVICE_NOT_FOUND)
-            openCLVerifyCall(status);
-
-        if(devices.size() > 0)
-        {
-            int baseIndx = global_devices.size();
-            global_devices.resize(baseIndx + devices.size());
-            platformInfo.deviceIDs.resize(devices.size());
-            platformInfo.devices.resize(devices.size());
-
-            for(size_t j = 0; j < devices.size(); ++j)
-            {
-                cl_device_id device = devices[j];
-
-                DeviceInfoImpl& deviceInfo = global_devices[baseIndx + j];
-                platformInfo.deviceIDs[j] = baseIndx + j;
-                deviceInfo.init(baseIndx + j, platformInfo, device);
-            }
-        }
-    }
-
-    for (size_t i = 0; i < platforms.size(); ++i)
-    {
-        PlatformInfoImpl& platformInfo = global_platforms[i];
-        for(size_t j = 0; j < platformInfo.deviceIDs.size(); ++j)
-        {
-            DeviceInfoImpl& deviceInfo = global_devices[platformInfo.deviceIDs[j]];
-            platformInfo.devices[j] = &deviceInfo;
-        }
-    }
-
-    return global_devices.size();
-}
-
-
-DeviceInfo::DeviceInfo()
-    : _id(-1), deviceType(DeviceType(0)),
-      deviceVendorId(-1),
-      maxWorkGroupSize(0), maxComputeUnits(0), localMemorySize(0), maxMemAllocSize(0),
-      deviceVersionMajor(0), deviceVersionMinor(0),
-      haveDoubleSupport(false), isUnifiedMemory(false),isIntelDevice(false),
-      platform(NULL)
-{
-    // nothing
-}
-
-DeviceInfo::~DeviceInfo() { }
-
-PlatformInfo::PlatformInfo()
-    : _id(-1),
-      platformVersionMajor(0), platformVersionMinor(0)
-{
-    // nothing
-}
-
-PlatformInfo::~PlatformInfo() { }
-
-class ContextImpl;
-
-struct CommandQueue
-{
-    ContextImpl* context_;
-    cl_command_queue clQueue_;
-
-    CommandQueue() : context_(NULL), clQueue_(NULL) { }
-    ~CommandQueue() { release(); }
-
-    void create(ContextImpl* context_);
-    void release()
-    {
-#ifdef WIN32
-        // if process is on termination stage (ExitProcess was called and other threads were terminated)
-        // then disable command queue release because it may cause program hang
-        if (!__termination)
-#endif
-        {
-            if(clQueue_)
-            {
-                openCLSafeCall(clReleaseCommandQueue(clQueue_)); // some cleanup problems are here
-            }
-
-        }
-        clQueue_ = NULL;
-        context_ = NULL;
-    }
-};
-
-cv::TLSData<CommandQueue> commandQueueTLSData;
-
-//////////////////////////////// OpenCL context ////////////////////////
-//This is a global singleton class used to represent a OpenCL context.
-class ContextImpl : public Context
-{
-public:
-    cl_device_id clDeviceID;
-    cl_context clContext;
-    const DeviceInfoImpl& deviceInfoImpl;
-
-protected:
-    ContextImpl(const DeviceInfoImpl& _deviceInfoImpl, cl_context context)
-        : clDeviceID(_deviceInfoImpl.device_id), clContext(context), deviceInfoImpl(_deviceInfoImpl)
-    {
-#ifdef CL_VERSION_1_2
-        if (supportsFeature(FEATURE_CL_VER_1_2))
-        {
-            openCLSafeCall(clRetainDevice(clDeviceID));
-        }
-#endif
-        openCLSafeCall(clRetainContext(clContext));
-
-        ContextImpl* old = NULL;
-        {
-            cv::AutoLock lock(getCurrentContextMutex());
-            old = currentContext;
-            currentContext = this;
-        }
-        if (old != NULL)
-        {
-            delete old;
-        }
-    }
-    ~ContextImpl()
-    {
-        CV_Assert(this != currentContext);
-
-#ifdef CL_VERSION_1_2
-        if (supportsFeature(FEATURE_CL_VER_1_2))
-        {
-            openCLSafeCall(clReleaseDevice(clDeviceID));
-        }
-#endif
-        if (deviceInfoImpl._id < 0) // not in the global registry, so we should cleanup it
-        {
-#ifdef CL_VERSION_1_2
-            if (supportsFeature(FEATURE_CL_VER_1_2))
-            {
-                openCLSafeCall(clReleaseDevice(deviceInfoImpl.device_id));
-            }
-#endif
-            PlatformInfoImpl* platformImpl = (PlatformInfoImpl*)(deviceInfoImpl.platform);
-            delete platformImpl;
-            delete const_cast<DeviceInfoImpl*>(&deviceInfoImpl);
-        }
-        clDeviceID = NULL;
-
-#ifdef WIN32
-        // if process is on termination stage (ExitProcess was called and other threads were terminated)
-        // then disable command queue release because it may cause program hang
-        if (!__termination)
-#endif
-        {
-            if(clContext)
-            {
-                openCLSafeCall(clReleaseContext(clContext));
-            }
-        }
-        clContext = NULL;
-    }
-public:
-    static void setContext(const DeviceInfo* deviceInfo);
-    static void initializeContext(void* pClPlatform, void* pClContext, void* pClDevice);
-
-    bool supportsFeature(FEATURE_TYPE featureType) const;
-
-    static void cleanupContext(void);
-
-    static ContextImpl* getContext();
-private:
-    ContextImpl(const ContextImpl&); // disabled
-    ContextImpl& operator=(const ContextImpl&); // disabled
-
-    static ContextImpl* currentContext;
-};
-
-ContextImpl* ContextImpl::currentContext = NULL;
-
-static bool __deviceSelected = false;
-
-Context* Context::getContext()
-{
-    return ContextImpl::getContext();
-}
-
-ContextImpl* ContextImpl::getContext()
-{
-    if (currentContext == NULL)
-    {
-        static bool defaultInitiaization = false;
-        if (!defaultInitiaization)
-        {
-            cv::AutoLock lock(getInitializationMutex());
-            try
-            {
-                if (!__initialized)
-                {
-                    if (initializeOpenCLDevices() == 0)
-                    {
-                        CV_Error(Error::OpenCLInitError, "OpenCL not available");
-                    }
-                }
-                if (!__deviceSelected)
-                {
-                    if (!selectOpenCLDevice())
-                    {
-                        CV_Error(Error::OpenCLInitError, "Can't select OpenCL device");
-                    }
-                }
-                defaultInitiaization = true;
-            }
-            catch (...)
-            {
-                defaultInitiaization = true;
-                throw;
-            }
-        }
-        CV_Assert(currentContext != NULL);
-    }
-    return currentContext;
-}
-
-bool Context::supportsFeature(FEATURE_TYPE featureType) const
-{
-    return ((ContextImpl*)this)->supportsFeature(featureType);
-}
-
-const DeviceInfo& Context::getDeviceInfo() const
-{
-    return ((ContextImpl*)this)->deviceInfoImpl;
-}
-
-const void* Context::getOpenCLContextPtr() const
-{
-    return &(((ContextImpl*)this)->clContext);
-}
-
-const void* Context::getOpenCLCommandQueuePtr() const
-{
-    ContextImpl* pThis = (ContextImpl*)this;
-    CommandQueue* commandQueue = commandQueueTLSData.get();
-    if (commandQueue->context_ != pThis)
-    {
-        commandQueue->create(pThis);
-    }
-    return &commandQueue->clQueue_;
-}
-
-const void* Context::getOpenCLDeviceIDPtr() const
-{
-    return &(((ContextImpl*)this)->clDeviceID);
-}
-
-
-bool ContextImpl::supportsFeature(FEATURE_TYPE featureType) const
-{
-    switch (featureType)
-    {
-    case FEATURE_CL_INTEL_DEVICE:
-        return deviceInfoImpl.isIntelDevice;
-    case FEATURE_CL_DOUBLE:
-        return deviceInfoImpl.haveDoubleSupport;
-    case FEATURE_CL_UNIFIED_MEM:
-        return deviceInfoImpl.isUnifiedMemory;
-    case FEATURE_CL_VER_1_2:
-        return deviceInfoImpl.deviceVersionMajor > 1 || (deviceInfoImpl.deviceVersionMajor == 1 && deviceInfoImpl.deviceVersionMinor >= 2);
-    }
-    CV_Error(CV_StsBadArg, "Invalid feature type");
-    return false;
-}
-
-void fft_teardown();
-void clBlasTeardown();
-
-void ContextImpl::cleanupContext(void)
-{
-    fft_teardown();
-    clBlasTeardown();
-
-    cv::AutoLock lock(getCurrentContextMutex());
-    if (currentContext)
-    {
-        ContextImpl* ctx = currentContext;
-        currentContext = NULL;
-        delete ctx;
-    }
-}
-
-void ContextImpl::setContext(const DeviceInfo* deviceInfo)
-{
-    CV_Assert(deviceInfo->_id >= 0); // we can't specify custom devices
-    CV_Assert(deviceInfo->_id < (int)global_devices.size());
-
-    {
-        cv::AutoLock lock(getCurrentContextMutex());
-        if (currentContext)
-        {
-            if (currentContext->deviceInfoImpl._id == deviceInfo->_id)
-                return;
-        }
-    }
-
-    DeviceInfoImpl& infoImpl = global_devices[deviceInfo->_id];
-    CV_Assert(deviceInfo == &infoImpl);
-
-    cl_int status = 0;
-    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(infoImpl.platform_id), 0 };
-    cl_context clContext = clCreateContext(cps, 1, &infoImpl.device_id, NULL, NULL, &status);
-    openCLVerifyCall(status);
-
-    ContextImpl* ctx = new ContextImpl(infoImpl, clContext);
-    clReleaseContext(clContext);
-    (void)ctx;
-}
-
-void ContextImpl::initializeContext(void* pClPlatform, void* pClContext, void* pClDevice)
-{
-    CV_Assert(pClPlatform != NULL);
-    CV_Assert(pClContext != NULL);
-    CV_Assert(pClDevice != NULL);
-    cl_platform_id platform = *(cl_platform_id*)pClPlatform;
-    cl_context context = *(cl_context*)pClContext;
-    cl_device_id device = *(cl_device_id*)pClDevice;
-
-    PlatformInfoImpl* platformInfoImpl = new PlatformInfoImpl();
-    platformInfoImpl->init(-1, platform);
-    DeviceInfoImpl* deviceInfoImpl = new DeviceInfoImpl();
-    deviceInfoImpl->init(-1, *platformInfoImpl, device);
-
-    ContextImpl* ctx = new ContextImpl(*deviceInfoImpl, context);
-    (void)ctx;
-}
-
-void CommandQueue::create(ContextImpl* context)
-{
-    release();
-    cl_int status = 0;
-    // TODO add CL_QUEUE_PROFILING_ENABLE
-    cl_command_queue clCmdQueue = clCreateCommandQueue(context->clContext, context->clDeviceID, 0, &status);
-    openCLVerifyCall(status);
-    context_ = context;
-    clQueue_ = clCmdQueue;
-}
-
-int getOpenCLPlatforms(PlatformsInfo& platforms)
-{
-    if (!__initialized)
-        initializeOpenCLDevices();
-
-    platforms.clear();
-
-    for (size_t id = 0; id < global_platforms.size(); ++id)
-    {
-        PlatformInfoImpl& impl = global_platforms[id];
-        platforms.push_back(&impl);
-    }
-
-    return platforms.size();
-}
-
-int getOpenCLDevices(std::vector<const DeviceInfo*> &devices, int deviceType, const PlatformInfo* platform)
-{
-    if (!__initialized)
-        initializeOpenCLDevices();
-
-    devices.clear();
-
-    switch(deviceType)
-    {
-    case CVCL_DEVICE_TYPE_DEFAULT:
-    case CVCL_DEVICE_TYPE_CPU:
-    case CVCL_DEVICE_TYPE_GPU:
-    case CVCL_DEVICE_TYPE_ACCELERATOR:
-    case CVCL_DEVICE_TYPE_ALL:
-        break;
-    default:
-        return 0;
-    }
-
-    if (platform == NULL)
-    {
-        for (size_t id = 0; id < global_devices.size(); ++id)
-        {
-            DeviceInfoImpl& deviceInfo = global_devices[id];
-            if (((int)deviceInfo.deviceType & deviceType) != 0)
-            {
-                devices.push_back(&deviceInfo);
-            }
-        }
-    }
-    else
-    {
-        for (size_t id = 0; id < platform->devices.size(); ++id)
-        {
-            const DeviceInfo* deviceInfo = platform->devices[id];
-            if (((int)deviceInfo->deviceType & deviceType) == deviceType)
-            {
-                devices.push_back(deviceInfo);
-            }
-        }
-    }
-
-    return (int)devices.size();
-}
-
-void setDevice(const DeviceInfo* info)
-{
-    try
-    {
-        ContextImpl::setContext(info);
-        __deviceSelected = true;
-    }
-    catch (...)
-    {
-        __deviceSelected = true;
-        throw;
-    }
-}
-
-void initializeContext(void* pClPlatform, void* pClContext, void* pClDevice)
-{
-    try
-    {
-        ContextImpl::initializeContext(pClPlatform, pClContext, pClDevice);
-        __deviceSelected = true;
-    }
-    catch (...)
-    {
-        __deviceSelected = true;
-        throw;
-    }
-}
-
-bool supportsFeature(FEATURE_TYPE featureType)
-{
-    return Context::getContext()->supportsFeature(featureType);
-}
-
-__Module::__Module()
-{
-    /* moved to Context::getContext(): initializeOpenCLDevices(); */
-}
-
-__Module::~__Module()
-{
-#if defined(WIN32) && defined(CVAPI_EXPORTS)
-    // nothing, see DllMain
-#else
-    ContextImpl::cleanupContext();
-#endif
-}
-
-} // namespace ocl
-} // namespace cv
-
-
-#if defined(WIN32) && defined(CVAPI_EXPORTS)
-
-extern "C"
-BOOL WINAPI DllMain(HINSTANCE /*hInst*/, DWORD fdwReason, LPVOID lpReserved);
-
-extern "C"
-BOOL WINAPI DllMain(HINSTANCE /*hInst*/, DWORD fdwReason, LPVOID lpReserved)
-{
-    if (fdwReason == DLL_PROCESS_DETACH)
-    {
-        if (lpReserved != NULL) // called after ExitProcess() call
-            cv::ocl::__termination = true;
-        cv::ocl::ContextImpl::cleanupContext();
-    }
-    return TRUE;
-}
-
-#endif
diff --git a/modules/ocl/src/cl_operations.cpp b/modules/ocl/src/cl_operations.cpp
deleted file mode 100644
index aa44c4874..000000000
--- a/modules/ocl/src/cl_operations.cpp
+++ /dev/null
@@ -1,549 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Guoping Long, longguoping@gmail.com
-//    Niko Li, newlife20080214@gmail.com
-//    Yao Wang, bitwangyaoyao@gmail.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include <iomanip>
-#include <fstream>
-#include "cl_programcache.hpp"
-
-//#define PRINT_KERNEL_RUN_TIME
-#define RUN_TIMES 100
-#ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
-#define CL_MEM_USE_PERSISTENT_MEM_AMD 0
-#endif
-//#define AMD_DOUBLE_DIFFER
-
-namespace cv {
-namespace ocl {
-
-DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
-DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
-int gDevMemTypeValueMap[5] = {0,
-                              CL_MEM_ALLOC_HOST_PTR,
-                              CL_MEM_USE_HOST_PTR,
-                              CL_MEM_COPY_HOST_PTR,
-                              CL_MEM_USE_PERSISTENT_MEM_AMD};
-int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
-
-void finish()
-{
-    clFinish(getClCommandQueue(Context::getContext()));
-}
-
-bool isCpuDevice()
-{
-    const DeviceInfo& info = Context::getContext()->getDeviceInfo();
-    return (info.deviceType == CVCL_DEVICE_TYPE_CPU);
-}
-
-size_t queryWaveFrontSize(cl_kernel kernel)
-{
-    const DeviceInfo& info = Context::getContext()->getDeviceInfo();
-    if (info.deviceType == CVCL_DEVICE_TYPE_CPU)
-        return 1;
-    size_t wavefront = 0;
-    CV_Assert(kernel != NULL);
-    openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(Context::getContext()),
-            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &wavefront, NULL));
-    return wavefront;
-}
-
-
-void openCLReadBuffer(Context *ctx, cl_mem dst_buffer, void *host_buffer, size_t size)
-{
-    cl_int status;
-    status = clEnqueueReadBuffer(getClCommandQueue(ctx), dst_buffer, CL_TRUE, 0,
-                                 size, host_buffer, 0, NULL, NULL);
-    openCLVerifyCall(status);
-}
-
-cl_mem openCLCreateBuffer(Context *ctx, size_t flag , size_t size)
-{
-    cl_int status;
-    cl_mem buffer = clCreateBuffer(getClContext(ctx), (cl_mem_flags)flag, size, NULL, &status);
-    openCLVerifyCall(status);
-    return buffer;
-}
-
-#define MEMORY_CORRUPTION_GUARD
-#ifdef MEMORY_CORRUPTION_GUARD
-//#define CHECK_MEMORY_CORRUPTION
-#define CHECK_MEMORY_CORRUPTION_PRINT_ERROR
-#define CHECK_MEMORY_CORRUPTION_RAISE_ERROR
-static const int __memory_corruption_guard_bytes = 64*1024;
-#ifdef CHECK_MEMORY_CORRUPTION
-static const int __memory_corruption_check_pattern = 0x14326547; // change pattern for sizeof(int)==8
-#endif
-struct CheckBuffers
-{
-    cl_mem mainBuffer;
-    size_t size;
-    size_t widthInBytes, height;
-    CheckBuffers()
-        : mainBuffer(NULL), size(0), widthInBytes(0), height(0)
-    {
-        // nothing
-    }
-    CheckBuffers(cl_mem _mainBuffer, size_t _size, size_t _widthInBytes, size_t _height)
-        : mainBuffer(_mainBuffer), size(_size), widthInBytes(_widthInBytes), height(_height)
-    {
-        // nothing
-    }
-};
-static std::map<cl_mem, CheckBuffers> __check_buffers;
-#endif
-
-void openCLMallocPitch(Context *ctx, void **dev_ptr, size_t *pitch,
-                       size_t widthInBytes, size_t height)
-{
-    openCLMallocPitchEx(ctx, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
-}
-
-void openCLMallocPitchEx(Context *ctx, void **dev_ptr, size_t *pitch,
-                       size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
-{
-    cl_int status;
-    size_t size = widthInBytes * height;
-    bool useSubBuffers =
-#ifndef MEMORY_CORRUPTION_GUARD
-            false;
-#else
-            true;
-#endif
-    const DeviceInfo& devInfo = ctx->getDeviceInfo();
-    if (useSubBuffers && devInfo.isIntelDevice)
-    {
-        useSubBuffers = false; // TODO FIXIT We observe memory leaks then we working with sub-buffers
-                               // on the CPU device of Intel OpenCL SDK (Linux). We will investigate this later.
-    }
-    if (!useSubBuffers)
-    {
-        *dev_ptr = clCreateBuffer(getClContext(ctx), gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
-                                  size, 0, &status);
-        openCLVerifyCall(status);
-    }
-#ifdef MEMORY_CORRUPTION_GUARD
-    else
-    {
-        size_t allocSize = size + __memory_corruption_guard_bytes * 2;
-        cl_mem mainBuffer = clCreateBuffer(getClContext(ctx), gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
-                allocSize, 0, &status);
-        openCLVerifyCall(status);
-        cl_buffer_region r = {__memory_corruption_guard_bytes, size};
-        *dev_ptr = clCreateSubBuffer(mainBuffer,
-                gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
-                CL_BUFFER_CREATE_TYPE_REGION, &r,
-                &status);
-        openCLVerifyCall(status);
-#ifdef CHECK_MEMORY_CORRUPTION
-        std::vector<int> tmp(__memory_corruption_guard_bytes / sizeof(int),
-                __memory_corruption_check_pattern);
-        CV_Assert(tmp.size() * sizeof(int) == __memory_corruption_guard_bytes);
-        openCLVerifyCall(clEnqueueWriteBuffer(getClCommandQueue(ctx),
-                mainBuffer, CL_FALSE, 0, __memory_corruption_guard_bytes, &tmp[0],
-                0, NULL, NULL));
-        openCLVerifyCall(clEnqueueWriteBuffer(getClCommandQueue(ctx),
-                mainBuffer, CL_FALSE, __memory_corruption_guard_bytes + size, __memory_corruption_guard_bytes, &tmp[0],
-                0, NULL, NULL));
-        clFinish(getClCommandQueue(ctx));
-#endif
-        CheckBuffers data(mainBuffer, size, widthInBytes, height);
-        cv::AutoLock lock(getInitializationMutex());
-        __check_buffers.insert(std::pair<cl_mem, CheckBuffers>((cl_mem)*dev_ptr, data));
-    }
-#endif
-    *pitch = widthInBytes;
-}
-
-void openCLMemcpy2D(Context *ctx, void *dst, size_t dpitch,
-                    const void *src, size_t spitch,
-                    size_t width, size_t height, openCLMemcpyKind kind, int channels)
-{
-    size_t buffer_origin[3] = {0, 0, 0};
-    size_t host_origin[3] = {0, 0, 0};
-    size_t region[3] = {width, height, 1};
-    if(kind == clMemcpyHostToDevice)
-    {
-        if(dpitch == width || channels == 3 || height == 1)
-        {
-            openCLSafeCall(clEnqueueWriteBuffer(getClCommandQueue(ctx), (cl_mem)dst, CL_TRUE,
-                                                0, width * height, src, 0, NULL, NULL));
-        }
-        else
-        {
-            openCLSafeCall(clEnqueueWriteBufferRect(getClCommandQueue(ctx), (cl_mem)dst, CL_TRUE,
-                                                    buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
-        }
-    }
-    else if(kind == clMemcpyDeviceToHost)
-    {
-        if(spitch == width || channels == 3 || height == 1)
-        {
-            openCLSafeCall(clEnqueueReadBuffer(getClCommandQueue(ctx), (cl_mem)src, CL_TRUE,
-                                               0, width * height, dst, 0, NULL, NULL));
-        }
-        else
-        {
-            openCLSafeCall(clEnqueueReadBufferRect(getClCommandQueue(ctx), (cl_mem)src, CL_TRUE,
-                                                   buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
-        }
-    }
-}
-
-void openCLCopyBuffer2D(Context *ctx, void *dst, size_t dpitch, int dst_offset,
-                        const void *src, size_t spitch,
-                        size_t width, size_t height, int src_offset)
-{
-    size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
-    size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
-    size_t region[3] = {width, height, 1};
-
-    openCLSafeCall(clEnqueueCopyBufferRect(getClCommandQueue(ctx), (cl_mem)src, (cl_mem)dst, src_origin, dst_origin,
-                                           region, spitch, 0, dpitch, 0, 0, 0, 0));
-}
-
-void openCLFree(void *devPtr)
-{
-    openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
-#ifdef MEMORY_CORRUPTION_GUARD
-#ifdef CHECK_MEMORY_CORRUPTION
-    bool failBefore = false, failAfter = false;
-#endif
-    CheckBuffers data;
-    {
-        cv::AutoLock lock(getInitializationMutex());
-        std::map<cl_mem, CheckBuffers>::iterator i = __check_buffers.find((cl_mem)devPtr);
-        if (i != __check_buffers.end())
-        {
-            data = i->second;
-            __check_buffers.erase(i);
-        }
-    }
-    if (data.mainBuffer != NULL)
-    {
-#ifdef CHECK_MEMORY_CORRUPTION
-        Context* ctx = Context::getContext();
-        std::vector<uchar> checkBefore(__memory_corruption_guard_bytes);
-        std::vector<uchar> checkAfter(__memory_corruption_guard_bytes);
-        openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(ctx),
-                data.mainBuffer, CL_FALSE, 0, __memory_corruption_guard_bytes, &checkBefore[0],
-                0, NULL, NULL));
-        openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(ctx),
-                data.mainBuffer, CL_FALSE, __memory_corruption_guard_bytes + data.size, __memory_corruption_guard_bytes, &checkAfter[0],
-                0, NULL, NULL));
-        clFinish(getClCommandQueue(ctx));
-
-        std::vector<int> tmp(__memory_corruption_guard_bytes / sizeof(int),
-                __memory_corruption_check_pattern);
-
-        if (memcmp(&checkBefore[0], &tmp[0], __memory_corruption_guard_bytes) != 0)
-        {
-            failBefore = true;
-        }
-        if (memcmp(&checkAfter[0], &tmp[0], __memory_corruption_guard_bytes) != 0)
-        {
-            failAfter = true;
-        }
-#else
-        // TODO FIXIT Attach clReleaseMemObject call to event completion callback
-        // TODO 2013/12/04 Disable workaround
-        // Context* ctx = Context::getContext();
-        // clFinish(getClCommandQueue(ctx));
-#endif
-        openCLSafeCall(clReleaseMemObject(data.mainBuffer));
-    }
-#if defined(CHECK_MEMORY_CORRUPTION)
-    if (failBefore)
-    {
-#ifdef CHECK_MEMORY_CORRUPTION_PRINT_ERROR
-        std::cerr << "ERROR: Memory corruption detected: before buffer: " << cv::format("widthInBytes=%d height=%d", (int)data.widthInBytes, (int)data.height) << std::endl;
-#endif
-#ifdef CHECK_MEMORY_CORRUPTION_RAISE_ERROR
-        CV_Error(CV_StsInternal, "Memory corruption detected: before buffer");
-#endif
-    }
-    if (failAfter)
-    {
-#ifdef CHECK_MEMORY_CORRUPTION_PRINT_ERROR
-        std::cerr << "ERROR: Memory corruption detected: after buffer: " << cv::format("widthInBytes=%d height=%d", (int)data.widthInBytes, (int)data.height) << std::endl;
-#endif
-#ifdef CHECK_MEMORY_CORRUPTION_RAISE_ERROR
-        CV_Error(CV_StsInternal, "Memory corruption detected: after buffer");
-#endif
-    }
-#endif // CHECK_MEMORY_CORRUPTION
-#endif // MEMORY_CORRUPTION_GUARD
-}
-
-cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName)
-{
-    return openCLGetKernelFromSource(ctx, source, kernelName, NULL);
-}
-
-cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName,
-                                    const char *build_options)
-{
-    cl_kernel kernel;
-    cl_int status = 0;
-    CV_Assert(ProgramCache::getProgramCache() != NULL);
-    cl_program program = ProgramCache::getProgramCache()->getProgram(ctx, source, build_options);
-    CV_Assert(program != NULL);
-    kernel = clCreateKernel(program, kernelName.c_str(), &status);
-    openCLVerifyCall(status);
-    openCLVerifyCall(clReleaseProgram(program));
-    return kernel;
-}
-
-void openCLVerifyKernel(const Context *ctx, cl_kernel kernel, size_t *localThreads)
-{
-    size_t kernelWorkGroupSize;
-    openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(ctx),
-                                            CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
-    CV_Assert( localThreads[0] <= ctx->getDeviceInfo().maxWorkItemSizes[0] );
-    CV_Assert( localThreads[1] <= ctx->getDeviceInfo().maxWorkItemSizes[1] );
-    CV_Assert( localThreads[2] <= ctx->getDeviceInfo().maxWorkItemSizes[2] );
-    CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
-    CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= ctx->getDeviceInfo().maxWorkGroupSize );
-}
-
-#ifdef PRINT_KERNEL_RUN_TIME
-static double total_execute_time = 0;
-static double total_kernel_time = 0;
-#endif
-
-static std::string removeDuplicatedWhiteSpaces(const char * buildOptions)
-{
-    if (buildOptions == NULL)
-        return "";
-
-    size_t length = strlen(buildOptions), didx = 0, sidx = 0;
-    while (sidx < length && buildOptions[sidx] == 0)
-        ++sidx;
-
-    std::string opt;
-    opt.resize(length);
-
-    for ( ; sidx < length; ++sidx)
-        if (buildOptions[sidx] != ' ')
-            opt[didx++] = buildOptions[sidx];
-        else if ( !(didx > 0 && opt[didx - 1] == ' ') )
-            opt[didx++] = buildOptions[sidx];
-
-    return opt;
-}
-
-cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, int channels,
-                          int depth, const char *build_options)
-{
-    //construct kernel name
-    //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
-    //for example split_C2_D3, represent the split kernel with channels = 2 and dataType Depth = 3(Data type is short)
-    std::stringstream idxStr;
-    if(channels != -1)
-        idxStr << "_C" << channels;
-    if(depth != -1)
-        idxStr << "_D" << depth;
-    kernelName += idxStr.str();
-
-    std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options);
-    cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
-    return kernel;
-}
-
-void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
-                          size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args)
-{
-    if ( localThreads != NULL)
-    {
-        globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
-        globalThreads[1] = roundUp(globalThreads[1], localThreads[1]);
-        globalThreads[2] = roundUp(globalThreads[2], localThreads[2]);
-
-        cv::ocl::openCLVerifyKernel(ctx, kernel, localThreads);
-    }
-    for(size_t i = 0; i < args.size(); i ++)
-        openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
-
-#ifndef PRINT_KERNEL_RUN_TIME
-    openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
-                                          localThreads, 0, NULL, NULL));
-#else
-    cl_event event = NULL;
-    openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
-                                          localThreads, 0, NULL, &event));
-
-    cl_ulong start_time, end_time, queue_time;
-    double execute_time = 0;
-    double total_time   = 0;
-
-    openCLSafeCall(clWaitForEvents(1, &event));
-    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
-                                           sizeof(cl_ulong), &start_time, 0));
-
-    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
-                                           sizeof(cl_ulong), &end_time, 0));
-
-    openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
-                                           sizeof(cl_ulong), &queue_time, 0));
-
-    execute_time = (double)(end_time - start_time) / (1000 * 1000);
-    total_time = (double)(end_time - queue_time) / (1000 * 1000);
-
-    total_execute_time += execute_time;
-    total_kernel_time += total_time;
-    clReleaseEvent(event);
-#endif
-
-    clFlush(getClCommandQueue(ctx));
-    openCLSafeCall(clReleaseKernel(kernel));
-}
-
-void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
-                          size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
-                          int depth, const char *build_options)
-{
-    cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, channels, depth, build_options);
-
-    openCLExecuteKernel(ctx, kernel, globalThreads, localThreads, args);
-}
-
-void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName,
-                         size_t globalThreads[3], size_t localThreads[3],
-                         std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
-{
-    openCLExecuteKernel(ctx, source, kernelName, globalThreads, localThreads, args,
-                        channels, depth, NULL);
-}
-void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName,
-                         size_t globalThreads[3], size_t localThreads[3],
-                         std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
-
-{
-#ifndef PRINT_KERNEL_RUN_TIME
-    openCLExecuteKernel_(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth,
-                         build_options);
-#else
-    String data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"};
-    std::cout << std::endl;
-    std::cout << "Function Name: " << kernelName;
-    if(depth >= 0)
-        std::cout << " |data type: " << data_type[depth];
-    std::cout << " |channels: " << channels;
-    std::cout << " |Time Unit: " << "ms" << std::endl;
-
-    total_execute_time = 0;
-    total_kernel_time = 0;
-    std::cout << "-------------------------------------" << std::endl;
-
-    std::cout << std::setiosflags(std::ios::left) << std::setw(15) << "execute time";
-    std::cout << std::setiosflags(std::ios::left) << std::setw(15) << "launch time";
-    std::cout << std::setiosflags(std::ios::left) << std::setw(15) << "kernel time" << std::endl;
-    int i = 0;
-    for(i = 0; i < RUN_TIMES; i++)
-        openCLExecuteKernel_(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth,
-                             build_options);
-
-    std::cout << "average kernel execute time: " << total_execute_time / RUN_TIMES << std::endl; // "ms" << std::endl;
-    std::cout << "average kernel total time:  " << total_kernel_time / RUN_TIMES << std::endl; // "ms" << std::endl;
-#endif
-}
-
-void openCLExecuteKernelInterop(Context *ctx, const cv::ocl::ProgramSource& source, String kernelName,
-                         size_t globalThreads[3], size_t localThreads[3],
-                         std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
-
-{
-    //construct kernel name
-    //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
-    //for example split_C2_D2, represent the split kernel with channels = 2 and dataType Depth = 2 (Data type is char)
-    std::stringstream idxStr;
-    if(channels != -1)
-        idxStr << "_C" << channels;
-    if(depth != -1)
-        idxStr << "_D" << depth;
-    kernelName += idxStr.str();
-
-    std::string name = std::string("custom_") + source.name;
-    ProgramEntry program = { name.c_str(), source.programStr, source.programHash };
-    cl_kernel kernel = openCLGetKernelFromSource(ctx, &program, kernelName, build_options);
-
-    CV_Assert(globalThreads != NULL);
-    if ( localThreads != NULL)
-    {
-        globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
-        globalThreads[1] = roundUp(globalThreads[1], localThreads[1]);
-        globalThreads[2] = roundUp(globalThreads[2], localThreads[2]);
-
-        cv::ocl::openCLVerifyKernel(ctx, kernel, localThreads);
-    }
-    for(size_t i = 0; i < args.size(); i ++)
-        openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
-
-    openCLSafeCall(clEnqueueNDRangeKernel(getClCommandQueue(ctx), kernel, 3, NULL, globalThreads,
-                    localThreads, 0, NULL, NULL));
-
-    clFinish(getClCommandQueue(ctx));
-    openCLSafeCall(clReleaseKernel(kernel));
-}
-
-cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
-                     const size_t size)
-{
-    int status;
-    cl_mem con_struct;
-
-    con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
-    openCLSafeCall(status);
-
-    openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size,
-                                        value, 0, 0, 0));
-
-    return con_struct;
-}
-
-}//namespace ocl
-}//namespace cv
diff --git a/modules/ocl/src/cl_programcache.cpp b/modules/ocl/src/cl_programcache.cpp
deleted file mode 100644
index 56f0213c5..000000000
--- a/modules/ocl/src/cl_programcache.cpp
+++ /dev/null
@@ -1,514 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Guoping Long, longguoping@gmail.com
-//    Niko Li, newlife20080214@gmail.com
-//    Yao Wang, bitwangyaoyao@gmail.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include <iomanip>
-#include <fstream>
-#include "cl_programcache.hpp"
-
-namespace cv { namespace ocl {
-
-/*
- * The binary caching system to eliminate redundant program source compilation.
- * Strictly, this is not a cache because we do not implement evictions right now.
- * We shall add such features to trade-off memory consumption and performance when necessary.
- */
-
-cv::Mutex ProgramCache::mutexFiles;
-cv::Mutex ProgramCache::mutexCache;
-
-ProgramCache* _programCache = NULL;
-ProgramCache* ProgramCache::getProgramCache()
-{
-    if (NULL == _programCache)
-    {
-        cv::AutoLock lock(getInitializationMutex());
-        if (NULL == _programCache)
-            _programCache = new ProgramCache();
-    }
-    return _programCache;
-}
-
-ProgramCache::ProgramCache()
-{
-    codeCache.clear();
-    cacheSize = 0;
-}
-
-ProgramCache::~ProgramCache()
-{
-    releaseProgram();
-    if (this == _programCache)
-    {
-        cv::AutoLock lock(getInitializationMutex());
-        if (this == _programCache)
-            _programCache = NULL;
-    }
-}
-
-cl_program ProgramCache::progLookup(const String& srcsign)
-{
-    std::map<String, cl_program>::iterator iter;
-    iter = codeCache.find(srcsign);
-    if(iter != codeCache.end())
-        return iter->second;
-    else
-        return NULL;
-}
-
-void ProgramCache::addProgram(const String& srcsign, cl_program program)
-{
-    if (!progLookup(srcsign))
-    {
-        clRetainProgram(program);
-        codeCache.insert(std::map<String, cl_program>::value_type(srcsign, program));
-    }
-}
-
-void ProgramCache::releaseProgram()
-{
-    std::map<String, cl_program>::iterator iter;
-    for(iter = codeCache.begin(); iter != codeCache.end(); iter++)
-    {
-        openCLSafeCall(clReleaseProgram(iter->second));
-    }
-    codeCache.clear();
-    cacheSize = 0;
-}
-
-static bool enable_disk_cache = true;
-static String binpath = "";
-
-void setBinaryDiskCache(int mode, String path)
-{
-    enable_disk_cache = false;
-    binpath = "";
-
-    if(mode == CACHE_NONE)
-    {
-        return;
-    }
-    enable_disk_cache =
-#if defined(_DEBUG) || defined(DEBUG)
-        (mode & CACHE_DEBUG)   == CACHE_DEBUG;
-#else
-        (mode & CACHE_RELEASE) == CACHE_RELEASE;
-#endif
-    if(enable_disk_cache && !path.empty())
-    {
-        binpath = path;
-    }
-}
-
-void setBinaryPath(const char *path)
-{
-    binpath = path;
-}
-
-static const int MAX_ENTRIES = 64;
-
-struct ProgramFileCache
-{
-    struct CV_DECL_ALIGNED(1) ProgramFileHeader
-    {
-        int hashLength;
-        //char hash[];
-    };
-
-    struct CV_DECL_ALIGNED(1) ProgramFileTable
-    {
-        int numberOfEntries;
-        //int firstEntryOffset[];
-    };
-
-    struct CV_DECL_ALIGNED(1) ProgramFileConfigurationEntry
-    {
-        int nextEntry;
-        int dataSize;
-        int optionsLength;
-        //char options[];
-        // char data[];
-    };
-
-    String fileName_;
-    const char* hash_;
-    std::fstream f;
-
-    ProgramFileCache(const String& fileName, const char* hash)
-        : fileName_(fileName), hash_(hash)
-    {
-        if (hash_ != NULL)
-        {
-            f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
-            if(f.is_open())
-            {
-                int hashLength = 0;
-                f.read((char*)&hashLength, sizeof(int));
-                std::vector<char> fhash(hashLength + 1);
-                f.read(&fhash[0], hashLength);
-                if (f.eof() || strncmp(hash_, &fhash[0], hashLength) != 0)
-                {
-                    f.close();
-                    remove(fileName_.c_str());
-                    return;
-                }
-            }
-        }
-    }
-
-    int getHash(const String& options)
-    {
-        int hash = 0;
-        for (size_t i = 0; i < options.length(); i++)
-        {
-            hash = (hash << 2) ^ (hash >> 17) ^ options[i];
-        }
-        return (hash + (hash >> 16)) & (MAX_ENTRIES - 1);
-    }
-
-    bool readConfigurationFromFile(const String& options, std::vector<char>& buf)
-    {
-        if (hash_ == NULL)
-            return false;
-
-        if (!f.is_open())
-            return false;
-
-        f.seekg(0, std::fstream::end);
-        size_t fileSize = (size_t)f.tellg();
-        if (fileSize == 0)
-        {
-            std::cerr << "Invalid file (empty): " << fileName_ << std::endl;
-            f.close();
-            remove(fileName_.c_str());
-            return false;
-        }
-        f.seekg(0, std::fstream::beg);
-
-        int hashLength = 0;
-        f.read((char*)&hashLength, sizeof(int));
-        CV_Assert(hashLength > 0);
-        f.seekg(sizeof(hashLength) + hashLength, std::fstream::beg);
-
-        int numberOfEntries = 0;
-        f.read((char*)&numberOfEntries, sizeof(int));
-        CV_Assert(numberOfEntries > 0);
-        if (numberOfEntries != MAX_ENTRIES)
-        {
-            std::cerr << "Invalid file: " << fileName_ << std::endl;
-            f.close();
-            remove(fileName_.c_str());
-            return false;
-        }
-
-        std::vector<int> firstEntryOffset(numberOfEntries);
-        f.read((char*)&firstEntryOffset[0], sizeof(int)*numberOfEntries);
-
-        int entryNum = getHash(options);
-
-        int entryOffset = firstEntryOffset[entryNum];
-        ProgramFileConfigurationEntry entry;
-        while (entryOffset > 0)
-        {
-            f.seekg(entryOffset, std::fstream::beg);
-            assert(sizeof(entry) == sizeof(int)*3);
-            f.read((char*)&entry, sizeof(entry));
-            std::vector<char> foptions(entry.optionsLength);
-            if ((int)options.length() == entry.optionsLength)
-            {
-                if (entry.optionsLength > 0)
-                    f.read(&foptions[0], entry.optionsLength);
-                if (memcmp(&foptions[0], options.c_str(), entry.optionsLength) == 0)
-                {
-                    buf.resize(entry.dataSize);
-                    f.read(&buf[0], entry.dataSize);
-                    f.seekg(0, std::fstream::beg);
-                    return true;
-                }
-            }
-            if (entry.nextEntry <= 0)
-                break;
-            entryOffset = entry.nextEntry;
-        }
-        return false;
-    }
-
-    bool writeConfigurationToFile(const String& options, std::vector<char>& buf)
-    {
-        if (hash_ == NULL)
-            return true; // don't save programs without hash
-
-        if (!f.is_open())
-        {
-            f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
-            if (!f.is_open())
-            {
-                f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
-                if (!f.is_open())
-                    return false;
-            }
-        }
-
-        f.seekg(0, std::fstream::end);
-        size_t fileSize = (size_t)f.tellg();
-        if (fileSize == 0)
-        {
-            f.seekp(0, std::fstream::beg);
-            int hashLength = strlen(hash_);
-            f.write((char*)&hashLength, sizeof(int));
-            f.write(hash_, hashLength);
-
-            int numberOfEntries = MAX_ENTRIES;
-            f.write((char*)&numberOfEntries, sizeof(int));
-            std::vector<int> firstEntryOffset(MAX_ENTRIES, 0);
-            f.write((char*)&firstEntryOffset[0], sizeof(int)*numberOfEntries);
-            f.close();
-            f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
-            CV_Assert(f.is_open());
-            f.seekg(0, std::fstream::end);
-            fileSize = (size_t)f.tellg();
-        }
-        f.seekg(0, std::fstream::beg);
-
-        int hashLength = 0;
-        f.read((char*)&hashLength, sizeof(int));
-        CV_Assert(hashLength > 0);
-        f.seekg(sizeof(hashLength) + hashLength, std::fstream::beg);
-
-        int numberOfEntries = 0;
-        f.read((char*)&numberOfEntries, sizeof(int));
-        CV_Assert(numberOfEntries > 0);
-        if (numberOfEntries != MAX_ENTRIES)
-        {
-            std::cerr << "Invalid file: " << fileName_ << std::endl;
-            f.close();
-            remove(fileName_.c_str());
-            return false;
-        }
-
-        size_t tableEntriesOffset = (size_t)f.tellg();
-        std::vector<int> firstEntryOffset(numberOfEntries);
-        f.read((char*)&firstEntryOffset[0], sizeof(int)*numberOfEntries);
-
-        int entryNum = getHash(options);
-
-        int entryOffset = firstEntryOffset[entryNum];
-        ProgramFileConfigurationEntry entry;
-        while (entryOffset > 0)
-        {
-            f.seekg(entryOffset, std::fstream::beg);
-            assert(sizeof(entry) == sizeof(int)*3);
-            f.read((char*)&entry, sizeof(entry));
-            std::vector<char> foptions(entry.optionsLength);
-            if ((int)options.length() == entry.optionsLength)
-            {
-                if (entry.optionsLength > 0)
-                    f.read(&foptions[0], entry.optionsLength);
-                CV_Assert(memcmp(&foptions, options.c_str(), entry.optionsLength) != 0);
-            }
-            if (entry.nextEntry <= 0)
-                break;
-            entryOffset = entry.nextEntry;
-        }
-        if (entryOffset > 0)
-        {
-            f.seekp(entryOffset, std::fstream::beg);
-            entry.nextEntry = fileSize;
-            f.write((char*)&entry, sizeof(entry));
-        }
-        else
-        {
-            firstEntryOffset[entryNum] = fileSize;
-            f.seekp(tableEntriesOffset, std::fstream::beg);
-            f.write((char*)&firstEntryOffset[0], sizeof(int)*numberOfEntries);
-        }
-        f.seekp(fileSize, std::fstream::beg);
-        entry.nextEntry = 0;
-        entry.dataSize = buf.size();
-        entry.optionsLength = options.length();
-        f.write((char*)&entry, sizeof(entry));
-        f.write(options.c_str(), entry.optionsLength);
-        f.write(&buf[0], entry.dataSize);
-        return true;
-    }
-
-    cl_program getOrBuildProgram(const Context* ctx, const cv::ocl::ProgramEntry* source, const String& options)
-    {
-        cl_int status = 0;
-        cl_program program = NULL;
-        std::vector<char> binary;
-        if (!enable_disk_cache || !readConfigurationFromFile(options, binary))
-        {
-            program = clCreateProgramWithSource(getClContext(ctx), 1, (const char**)&source->programStr, NULL, &status);
-            openCLVerifyCall(status);
-            cl_device_id device = getClDeviceID(ctx);
-            status = clBuildProgram(program, 1, &device, options.c_str(), NULL, NULL);
-            if(status == CL_SUCCESS)
-            {
-                if (enable_disk_cache)
-                {
-                    size_t binarySize;
-                    openCLSafeCall(clGetProgramInfo(program,
-                                            CL_PROGRAM_BINARY_SIZES,
-                                            sizeof(size_t),
-                                            &binarySize, NULL));
-
-                    std::vector<char> binary(binarySize);
-
-                    char* ptr = &binary[0];
-                    openCLSafeCall(clGetProgramInfo(program,
-                                            CL_PROGRAM_BINARIES,
-                                            sizeof(char*),
-                                            &ptr,
-                                            NULL));
-
-                    if (!writeConfigurationToFile(options, binary))
-                    {
-                        std::cerr << "Can't write data to file: " << fileName_ << std::endl;
-                    }
-                }
-            }
-        }
-        else
-        {
-            cl_device_id device = getClDeviceID(ctx);
-            size_t size = binary.size();
-            const char* ptr = &binary[0];
-            program = clCreateProgramWithBinary(getClContext(ctx),
-                    1, &device,
-                    (const size_t *)&size, (const unsigned char **)&ptr,
-                    NULL, &status);
-            openCLVerifyCall(status);
-            status = clBuildProgram(program, 1, &device, options.c_str(), NULL, NULL);
-        }
-
-        if(status != CL_SUCCESS)
-        {
-            if (status == CL_BUILD_PROGRAM_FAILURE || status == CL_INVALID_BUILD_OPTIONS)
-            {
-                size_t buildLogSize = 0;
-                openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx),
-                        CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize));
-                std::vector<char> buildLog; buildLog.resize(buildLogSize);
-                memset(&buildLog[0], 0, buildLogSize);
-                openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx),
-                        CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[0], NULL));
-                std::cout << std::endl << "BUILD LOG: "
-                        << (source->name ? source->name : "dynamic program") << ": "
-                        << options << "\n";
-                std::cout << &buildLog[0] << std::endl;
-            }
-            openCLVerifyCall(status);
-        }
-        return program;
-    }
-};
-
-cl_program ProgramCache::getProgram(const Context *ctx, const cv::ocl::ProgramEntry* source,
-                                    const char *build_options)
-{
-    std::stringstream src_sign;
-
-    if (source->name)
-    {
-        src_sign << source->name;
-        src_sign << getClContext(ctx);
-        if (NULL != build_options)
-        {
-            src_sign << "_" << build_options;
-        }
-
-        {
-            cv::AutoLock lockCache(mutexCache);
-            cl_program program = ProgramCache::getProgramCache()->progLookup(src_sign.str());
-            if (!!program)
-            {
-                clRetainProgram(program);
-                return program;
-            }
-        }
-    }
-
-    cv::AutoLock lockCache(mutexFiles);
-
-    // second check
-    if (source->name)
-    {
-        cv::AutoLock lockCache(mutexCache);
-        cl_program program = ProgramCache::getProgramCache()->progLookup(src_sign.str());
-        if (!!program)
-        {
-            clRetainProgram(program);
-            return program;
-        }
-    }
-
-    String all_build_options;
-    if (!ctx->getDeviceInfo().compilationExtraOptions.empty())
-        all_build_options += ctx->getDeviceInfo().compilationExtraOptions;
-    if (build_options != NULL)
-    {
-        all_build_options += " ";
-        all_build_options += build_options;
-    }
-    const DeviceInfo& devInfo = ctx->getDeviceInfo();
-    String filename = binpath + (source->name ? source->name : "NULL") + "_" + devInfo.platform->platformName + "_" + devInfo.deviceName + ".clb";
-
-    ProgramFileCache programFileCache(filename, source->programHash);
-    cl_program program = programFileCache.getOrBuildProgram(ctx, source, all_build_options);
-
-    //Cache the binary for future use if build_options is null
-    if (source->name)
-    {
-        cv::AutoLock lockCache(mutexCache);
-        this->addProgram(src_sign.str(), program);
-    }
-    return program;
-}
-
-} // namespace ocl
-} // namespace cv
diff --git a/modules/ocl/src/color.cpp b/modules/ocl/src/color.cpp
deleted file mode 100644
index f71081d78..000000000
--- a/modules/ocl/src/color.cpp
+++ /dev/null
@@ -1,506 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Wang Weiyan, wangweiyanster@gmail.com
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
-                           const std::string & additionalOptions = std::string(),
-                           const oclMat & data1 = oclMat(), const oclMat & data2 = oclMat())
-{
-    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
-    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
-
-    String build_options = format("-D DEPTH_%d", src.depth());
-    if (!additionalOptions.empty())
-        build_options = build_options + additionalOptions;
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
-
-    if (!data1.empty())
-        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data1.data ));
-    if (!data2.empty())
-        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data2.data ));
-
-    size_t gt[3] = { dst.cols, dst.rows, 1 };
-#ifdef ANDROID
-    size_t lt[3] = { 16, 10, 1 };
-#else
-    size_t lt[3] = { 16, 16, 1 };
-#endif
-    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
-}
-
-static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
-                         const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
-{
-    String build_options = format("-D DEPTH_%d -D dcn=%d", src.depth(), dst.channels());
-    if (!additionalOptions.empty())
-        build_options = build_options + additionalOptions;
-
-    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
-    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
-
-    if (!data.empty())
-        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data ));
-
-    size_t gt[3] = {src.cols, src.rows, 1};
-#ifdef ANDROID
-    size_t lt[3] = {16, 10, 1};
-#else
-    size_t lt[3] = {16, 16, 1};
-#endif
-    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
-}
-
-static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
-{
-    String build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", src.depth(),
-                                  dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER");
-    int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
-    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
-
-    size_t gt[3] = { dst.cols, dst.rows, 1 };
-#ifdef ANDROID
-    size_t lt[3] = { 16, 10, 1 };
-#else
-    size_t lt[3] = { 16, 16, 1 };
-#endif
-    openCLExecuteKernel(src.clCxt, &cvt_color, "RGB", gt, lt, args, -1, -1, build_options.c_str());
-}
-
-static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName)
-{
-    String build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d",
-                                  src.depth(), greenbits, dst.channels());
-    int src_offset = src.offset >> 1, src_step = src.step >> 1;
-    int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step / dst.elemSize1();
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
-
-    size_t gt[3] = { dst.cols, dst.rows, 1 };
-#ifdef ANDROID
-    size_t lt[3] = { 16, 10, 1 };
-#else
-    size_t lt[3] = { 16, 16, 1 };
-#endif
-    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
-}
-
-static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName)
-{
-    String build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d",
-                                  src.depth(), greenbits, src.channels());
-    int src_offset = (int)src.offset, src_step = (int)src.step;
-    int dst_offset = dst.offset >> 1, dst_step = dst.step >> 1;
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
-
-    size_t gt[3] = { dst.cols, dst.rows, 1 };
-#ifdef ANDROID
-    size_t lt[3] = { 16, 10, 1 };
-#else
-    size_t lt[3] = { 16, 16, 1 };
-#endif
-    openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
-}
-
-static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
-{
-    Size sz = src.size();
-    int scn = src.channels(), depth = src.depth(), bidx;
-
-    CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32F);
-
-    switch (code)
-    {
-    case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
-    case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA:
-    {
-        CV_Assert(scn == 3 || scn == 4);
-        dcn = code == COLOR_BGR2BGRA || code == COLOR_RGB2BGRA || code == COLOR_BGRA2RGBA ? 4 : 3;
-        bool reverse = !(code == COLOR_BGR2BGRA || code == COLOR_BGRA2BGR);
-        dst.create(sz, CV_MAKE_TYPE(depth, dcn));
-        RGB_caller(src, dst, reverse);
-        break;
-    }
-    case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
-    case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555:
-    {
-        CV_Assert((scn == 3 || scn == 4) && depth == CV_8U );
-        bidx = code == COLOR_BGR2BGR565 || code == COLOR_BGR2BGR555 ||
-            code == COLOR_BGRA2BGR565 || code == COLOR_BGRA2BGR555 ? 0 : 2;
-        int greenbits = code == COLOR_BGR2BGR565 || code == COLOR_RGB2BGR565 ||
-            code == COLOR_BGRA2BGR565 || code == COLOR_RGBA2BGR565 ? 6 : 5;
-        dst.create(sz, CV_8UC2);
-        toRGB5x5_caller(src, dst, bidx, greenbits, "RGB2RGB5x5");
-        break;
-    }
-    case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
-    case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
-    {
-        dcn = code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA || code == COLOR_BGR5652RGBA || code == COLOR_BGR5552RGBA ? 4 : 3;
-        CV_Assert((dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U);
-        bidx = code == COLOR_BGR5652BGR || code == COLOR_BGR5552BGR ||
-            code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA ? 0 : 2;
-        int greenbits = code == COLOR_BGR5652BGR || code == COLOR_BGR5652RGB ||
-            code == COLOR_BGR5652BGRA || code == COLOR_BGR5652RGBA ? 6 : 5;
-        dst.create(sz, CV_MAKETYPE(depth, dcn));
-        fromRGB5x5_caller(src, dst, bidx, greenbits, "RGB5x52RGB");
-        break;
-    }
-    case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
-    {
-        CV_Assert(scn == 2 && depth == CV_8U);
-        dst.create(sz, CV_8UC1);
-        int greenbits = code == COLOR_BGR5652GRAY ? 6 : 5;
-        fromRGB5x5_caller(src, dst, -1, greenbits, "BGR5x52Gray");
-        break;
-    }
-    case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
-    {
-        CV_Assert(scn == 1 && depth == CV_8U);
-        dst.create(sz, CV_8UC2);
-        int greenbits = code == COLOR_GRAY2BGR565 ? 6 : 5;
-        toRGB5x5_caller(src, dst, -1, greenbits, "Gray2BGR5x5");
-        break;
-    }
-    case COLOR_RGB2GRAY: case COLOR_BGR2GRAY: case COLOR_RGBA2GRAY: case COLOR_BGRA2GRAY:
-    {
-        CV_Assert(scn == 3 || scn == 4);
-        bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
-        dst.create(sz, CV_MAKETYPE(depth, 1));
-        fromRGB_caller(src, dst, bidx, "RGB2Gray");
-        break;
-    }
-    case COLOR_GRAY2BGR: case COLOR_GRAY2BGRA:
-    {
-        CV_Assert(scn == 1);
-        dcn  = code == COLOR_GRAY2BGRA ? 4 : 3;
-        dst.create(sz, CV_MAKETYPE(depth, dcn));
-        toRGB_caller(src, dst, 0, "Gray2RGB");
-        break;
-    }
-    case COLOR_BGR2YUV: case COLOR_RGB2YUV:
-    {
-        CV_Assert(scn == 3 || scn == 4);
-        bidx = code == COLOR_BGR2YUV ? 0 : 2;
-        dst.create(sz, CV_MAKETYPE(depth, 3));
-        fromRGB_caller(src, dst, bidx, "RGB2YUV");
-        break;
-    }
-    case COLOR_YUV2BGR: case COLOR_YUV2RGB:
-    {
-        if( dcn <= 0 )
-            dcn = 3;
-        CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
-        bidx = code == COLOR_YUV2BGR ? 0 : 2;
-        dst.create(sz, CV_MAKETYPE(depth, dcn));
-        toRGB_caller(src, dst, bidx, "YUV2RGB");
-        break;
-    }
-    case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12:
-    case COLOR_YUV2RGBA_NV12: case COLOR_YUV2BGRA_NV12:
-    {
-        CV_Assert(scn == 1);
-        CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
-        dcn = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ? 4 : 3;
-        bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ? 0 : 2;
-
-        Size dstSz(sz.width, sz.height * 2 / 3);
-        dst.create(dstSz, CV_MAKETYPE(depth, dcn));
-        toRGB_caller(src, dst, bidx, "YUV2RGBA_NV12");
-        break;
-    }
-    case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb:
-    {
-        CV_Assert(scn == 3 || scn == 4);
-        bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
-        dst.create(sz, CV_MAKETYPE(depth, 3));
-        fromRGB_caller(src, dst, bidx, "RGB2YCrCb");
-        break;
-    }
-    case COLOR_YCrCb2BGR: case COLOR_YCrCb2RGB:
-    {
-        if( dcn <= 0 )
-            dcn = 3;
-        CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
-        bidx = code == COLOR_YCrCb2BGR ? 0 : 2;
-        dst.create(sz, CV_MAKETYPE(depth, dcn));
-        toRGB_caller(src, dst, bidx, "YCrCb2RGB");
-        break;
-    }
-    case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
-    {
-        CV_Assert(scn == 3 || scn == 4);
-        bidx = code == COLOR_BGR2XYZ ? 0 : 2;
-        dst.create(sz, CV_MAKE_TYPE(depth, 3));
-
-        Mat c;
-        if (depth == CV_32F)
-        {
-            float coeffs[] =
-            {
-                0.412453f, 0.357580f, 0.180423f,
-                0.212671f, 0.715160f, 0.072169f,
-                0.019334f, 0.119193f, 0.950227f
-            };
-            if (bidx == 0)
-            {
-                std::swap(coeffs[0], coeffs[2]);
-                std::swap(coeffs[3], coeffs[5]);
-                std::swap(coeffs[6], coeffs[8]);
-            }
-            Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
-        }
-        else
-        {
-            int coeffs[] =
-            {
-                1689,    1465,    739,
-                871,     2929,    296,
-                79,      488,     3892
-            };
-            if (bidx == 0)
-            {
-                std::swap(coeffs[0], coeffs[2]);
-                std::swap(coeffs[3], coeffs[5]);
-                std::swap(coeffs[6], coeffs[8]);
-            }
-            Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
-        }
-        oclMat oclCoeffs(c);
-
-        fromRGB_caller(src, dst, bidx, "RGB2XYZ", "", oclCoeffs);
-        break;
-    }
-    case COLOR_XYZ2BGR: case COLOR_XYZ2RGB:
-    {
-        if (dcn <= 0)
-            dcn = 3;
-        CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
-        bidx = code == COLOR_XYZ2BGR ? 0 : 2;
-        dst.create(sz, CV_MAKE_TYPE(depth, dcn));
-
-        Mat c;
-        if (depth == CV_32F)
-        {
-            float coeffs[] =
-            {
-                3.240479f, -1.53715f, -0.498535f,
-                -0.969256f, 1.875991f, 0.041556f,
-                0.055648f, -0.204043f, 1.057311f
-            };
-            if (bidx == 0)
-            {
-                std::swap(coeffs[0], coeffs[6]);
-                std::swap(coeffs[1], coeffs[7]);
-                std::swap(coeffs[2], coeffs[8]);
-            }
-            Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
-        }
-        else
-        {
-            int coeffs[] =
-            {
-                13273,  -6296,  -2042,
-                -3970,   7684,    170,
-                  228,   -836,   4331
-            };
-            if (bidx == 0)
-            {
-                std::swap(coeffs[0], coeffs[6]);
-                std::swap(coeffs[1], coeffs[7]);
-                std::swap(coeffs[2], coeffs[8]);
-            }
-            Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
-        }
-        oclMat oclCoeffs(c);
-
-        toRGB_caller(src, dst, bidx, "XYZ2RGB", "", oclCoeffs);
-        break;
-    }
-    case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL:
-    case COLOR_BGR2HLS: case COLOR_RGB2HLS: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL:
-    {
-        CV_Assert((scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F));
-        bidx = code == COLOR_BGR2HSV || code == COLOR_BGR2HLS ||
-            code == COLOR_BGR2HSV_FULL || code == COLOR_BGR2HLS_FULL ? 0 : 2;
-        int hrange = depth == CV_32F ? 360 : code == COLOR_BGR2HSV || code == COLOR_RGB2HSV ||
-            code == COLOR_BGR2HLS || code == COLOR_RGB2HLS ? 180 : 256;
-        bool is_hsv = code == COLOR_BGR2HSV || code == COLOR_RGB2HSV || code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL;
-        dst.create(sz, CV_MAKETYPE(depth, 3));
-        std::string kernelName = std::string("RGB2") + (is_hsv ? "HSV" : "HLS");
-
-        if (is_hsv && depth == CV_8U)
-        {
-            static oclMat sdiv_data;
-            static oclMat hdiv_data180;
-            static oclMat hdiv_data256;
-            static int sdiv_table[256];
-            static int hdiv_table180[256];
-            static int hdiv_table256[256];
-            static volatile bool initialized180 = false, initialized256 = false;
-            volatile bool & initialized = hrange == 180 ? initialized180 : initialized256;
-
-            if (!initialized)
-            {
-                int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12;
-                oclMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256;
-
-                sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
-
-                int v = 255 << hsv_shift;
-                if (!initialized180 && !initialized256)
-                {
-                    for(int i = 1; i < 256; i++ )
-                        sdiv_table[i] = saturate_cast<int>(v/(1.*i));
-                    sdiv_data.upload(Mat(1, 256, CV_32SC1, sdiv_table));
-                }
-
-                v = hrange << hsv_shift;
-                for (int i = 1; i < 256; i++ )
-                    hdiv_table[i] = saturate_cast<int>(v/(6.*i));
-
-                hdiv_data.upload(Mat(1, 256, CV_32SC1, hdiv_table));
-                initialized = true;
-            }
-
-            fromRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180);
-            return;
-        }
-
-        fromRGB_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f)));
-        break;
-    }
-    case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
-    case COLOR_HLS2BGR: case COLOR_HLS2RGB: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL:
-    {
-        if (dcn <= 0)
-            dcn = 3;
-        CV_Assert(scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F));
-        bidx = code == COLOR_HSV2BGR || code == COLOR_HLS2BGR ||
-            code == COLOR_HSV2BGR_FULL || code == COLOR_HLS2BGR_FULL ? 0 : 2;
-        int hrange = depth == CV_32F ? 360 : code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
-            code == COLOR_HLS2BGR || code == COLOR_HLS2RGB ? 180 : 255;
-        bool is_hsv = code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
-                code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL;
-
-        dst.create(sz, CV_MAKETYPE(depth, dcn));
-
-        std::string kernelName = std::string(is_hsv ? "HSV" : "HLS") + "2RGB";
-        toRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange));
-        break;
-    }
-    case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA:
-        {
-            CV_Assert(scn == 4 && depth == CV_8U);
-            dst.create(sz, CV_MAKETYPE(depth, 4));
-            std::string kernelName = code == COLOR_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA";
-
-            fromRGB_caller(src, dst, 0, kernelName);
-            break;
-        }
-    default:
-        CV_Error(Error::StsBadFlag, "Unknown/unsupported color conversion code" );
-    }
-}
-
-void cv::ocl::cvtColor(const oclMat &src, oclMat &dst, int code, int dcn)
-{
-    cvtColor_caller(src, dst, code, dcn);
-}
diff --git a/modules/ocl/src/columnsum.cpp b/modules/ocl/src/columnsum.cpp
deleted file mode 100644
index ccbd960bc..000000000
--- a/modules/ocl/src/columnsum.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Chunpeng Zhang, chunpeng@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-void cv::ocl::columnSum(const oclMat &src, oclMat &dst)
-{
-    CV_Assert(src.type() == CV_32FC1);
-    dst.create(src.size(), src.type());
-
-    int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize();
-    int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset));
-
-    size_t globalThreads[3] = {dst.cols, 1, 1};
-    size_t localThreads[3]  = {256, 1, 1};
-
-    openCLExecuteKernel(src.clCxt, &imgproc_columnsum, "columnSum", globalThreads, localThreads, args, src.oclchannels(), src.depth());
-
-}
diff --git a/modules/ocl/src/error.cpp b/modules/ocl/src/error.cpp
deleted file mode 100644
index a1e2d807d..000000000
--- a/modules/ocl/src/error.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-namespace cv
-{
-    namespace ocl
-    {
-
-        const char *getOpenCLErrorString( int err )
-        {
-            switch(err)
-            {
-            case CL_DEVICE_NOT_FOUND:
-                return "CL_DEVICE_NOT_FOUND";
-            case CL_DEVICE_NOT_AVAILABLE:
-                return "CL_DEVICE_NOT_AVAILABLE";
-            case CL_COMPILER_NOT_AVAILABLE:
-                return "CL_COMPILER_NOT_AVAILABLE";
-            case CL_MEM_OBJECT_ALLOCATION_FAILURE:
-                return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
-            case CL_OUT_OF_RESOURCES:
-                return "CL_OUT_OF_RESOURCES";
-            case CL_OUT_OF_HOST_MEMORY:
-                return "CL_OUT_OF_HOST_MEMORY";
-            case CL_PROFILING_INFO_NOT_AVAILABLE:
-                return "CL_PROFILING_INFO_NOT_AVAILABLE";
-            case CL_MEM_COPY_OVERLAP:
-                return "CL_MEM_COPY_OVERLAP";
-            case CL_IMAGE_FORMAT_MISMATCH:
-                return "CL_IMAGE_FORMAT_MISMATCH";
-            case CL_IMAGE_FORMAT_NOT_SUPPORTED:
-                return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
-            case CL_BUILD_PROGRAM_FAILURE:
-                return "CL_BUILD_PROGRAM_FAILURE";
-            case CL_MAP_FAILURE:
-                return "CL_MAP_FAILURE";
-            case CL_MISALIGNED_SUB_BUFFER_OFFSET:
-                return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
-            case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
-                return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
-            case CL_INVALID_VALUE:
-                return "CL_INVALID_VALUE";
-            case CL_INVALID_DEVICE_TYPE:
-                return "CL_INVALID_DEVICE_TYPE";
-            case CL_INVALID_PLATFORM:
-                return "CL_INVALID_PLATFORM";
-            case CL_INVALID_DEVICE:
-                return "CL_INVALID_DEVICE";
-            case CL_INVALID_CONTEXT:
-                return "CL_INVALID_CONTEXT";
-            case CL_INVALID_QUEUE_PROPERTIES:
-                return "CL_INVALID_QUEUE_PROPERTIES";
-            case CL_INVALID_COMMAND_QUEUE:
-                return "CL_INVALID_COMMAND_QUEUE";
-            case CL_INVALID_HOST_PTR:
-                return "CL_INVALID_HOST_PTR";
-            case CL_INVALID_MEM_OBJECT:
-                return "CL_INVALID_MEM_OBJECT";
-            case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
-                return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
-            case CL_INVALID_IMAGE_SIZE:
-                return "CL_INVALID_IMAGE_SIZE";
-            case CL_INVALID_SAMPLER:
-                return "CL_INVALID_SAMPLER";
-            case CL_INVALID_BINARY:
-                return "CL_INVALID_BINARY";
-            case CL_INVALID_BUILD_OPTIONS:
-                return "CL_INVALID_BUILD_OPTIONS";
-            case CL_INVALID_PROGRAM:
-                return "CL_INVALID_PROGRAM";
-            case CL_INVALID_PROGRAM_EXECUTABLE:
-                return "CL_INVALID_PROGRAM_EXECUTABLE";
-            case CL_INVALID_KERNEL_NAME:
-                return "CL_INVALID_KERNEL_NAME";
-            case CL_INVALID_KERNEL_DEFINITION:
-                return "CL_INVALID_KERNEL_DEFINITION";
-            case CL_INVALID_KERNEL:
-                return "CL_INVALID_KERNEL";
-            case CL_INVALID_ARG_INDEX:
-                return "CL_INVALID_ARG_INDEX";
-            case CL_INVALID_ARG_VALUE:
-                return "CL_INVALID_ARG_VALUE";
-            case CL_INVALID_ARG_SIZE:
-                return "CL_INVALID_ARG_SIZE";
-            case CL_INVALID_KERNEL_ARGS:
-                return "CL_INVALID_KERNEL_ARGS";
-            case CL_INVALID_WORK_DIMENSION:
-                return "CL_INVALID_WORK_DIMENSION";
-            case CL_INVALID_WORK_GROUP_SIZE:
-                return "CL_INVALID_WORK_GROUP_SIZE";
-            case CL_INVALID_WORK_ITEM_SIZE:
-                return "CL_INVALID_WORK_ITEM_SIZE";
-            case CL_INVALID_GLOBAL_OFFSET:
-                return "CL_INVALID_GLOBAL_OFFSET";
-            case CL_INVALID_EVENT_WAIT_LIST:
-                return "CL_INVALID_EVENT_WAIT_LIST";
-            case CL_INVALID_EVENT:
-                return "CL_INVALID_EVENT";
-            case CL_INVALID_OPERATION:
-                return "CL_INVALID_OPERATION";
-            case CL_INVALID_GL_OBJECT:
-                return "CL_INVALID_GL_OBJECT";
-            case CL_INVALID_BUFFER_SIZE:
-                return "CL_INVALID_BUFFER_SIZE";
-            case CL_INVALID_MIP_LEVEL:
-                return "CL_INVALID_MIP_LEVEL";
-            case CL_INVALID_GLOBAL_WORK_SIZE:
-                return "CL_INVALID_GLOBAL_WORK_SIZE";
-                //case CL_INVALID_PROPERTY:
-                //    return "CL_INVALID_PROPERTY";
-                //case CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR:
-                //    return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
-                //case CL_PLATFORM_NOT_FOUND_KHR:
-                //    return "CL_PLATFORM_NOT_FOUND_KHR";
-                //    //case CL_INVALID_PROPERTY_EXT:
-                //    //    return "CL_INVALID_PROPERTY_EXT";
-                //case CL_DEVICE_PARTITION_FAILED_EXT:
-                //    return "CL_DEVICE_PARTITION_FAILED_EXT";
-                //case CL_INVALID_PARTITION_COUNT_EXT:
-                //    return "CL_INVALID_PARTITION_COUNT_EXT";
-                //default:
-                //    return "unknown error code";
-            }
-            static char buf[256];
-            sprintf(buf, "%d", err);
-            return buf;
-        }
-    }
-}
diff --git a/modules/ocl/src/fast.cpp b/modules/ocl/src/fast.cpp
deleted file mode 100644
index b32ea287b..000000000
--- a/modules/ocl/src/fast.cpp
+++ /dev/null
@@ -1,229 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-// Authors:
-//  * Peter Andreas Entschev, peter@entschev.com
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-cv::ocl::FAST_OCL::FAST_OCL(int _threshold, bool _nonmaxSupression, double _keypointsRatio) :
-    nonmaxSupression(_nonmaxSupression), threshold(_threshold), keypointsRatio(_keypointsRatio), count_(0)
-{
-}
-
-void cv::ocl::FAST_OCL::operator ()(const oclMat& image, const oclMat& mask, std::vector<KeyPoint>& keypoints)
-{
-    if (image.empty())
-        return;
-
-    (*this)(image, mask, d_keypoints_);
-    downloadKeypoints(d_keypoints_, keypoints);
-}
-
-void cv::ocl::FAST_OCL::downloadKeypoints(const oclMat& d_keypoints, std::vector<KeyPoint>& keypoints)
-{
-    if (d_keypoints.empty())
-        return;
-
-    Mat h_keypoints(d_keypoints);
-    convertKeypoints(h_keypoints, keypoints);
-}
-
-void cv::ocl::FAST_OCL::convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints)
-{
-    if (h_keypoints.empty())
-        return;
-
-    CV_Assert(h_keypoints.rows == ROWS_COUNT && h_keypoints.elemSize() == 4);
-
-    int npoints = h_keypoints.cols;
-
-    keypoints.resize(npoints);
-
-    const float* loc_x = h_keypoints.ptr<float>(X_ROW);
-    const float* loc_y = h_keypoints.ptr<float>(Y_ROW);
-    const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
-
-    for (int i = 0; i < npoints; ++i)
-    {
-        KeyPoint kp(loc_x[i], loc_y[i], static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
-        keypoints[i] = kp;
-    }
-}
-
-void cv::ocl::FAST_OCL::operator ()(const oclMat& img, const oclMat& mask, oclMat& keypoints)
-{
-    calcKeyPointsLocation(img, mask);
-    keypoints.cols = getKeyPoints(keypoints);
-}
-
-int cv::ocl::FAST_OCL::calcKeyPointsLocation(const oclMat& img, const oclMat& mask)
-{
-    CV_Assert(img.type() == CV_8UC1);
-    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));
-
-    int maxKeypoints = static_cast<int>(keypointsRatio * img.size().area());
-
-    ensureSizeIsEnough(ROWS_COUNT, maxKeypoints, CV_32SC1, kpLoc_);
-    kpLoc_.setTo(Scalar::all(0));
-
-    if (nonmaxSupression)
-    {
-        ensureSizeIsEnough(img.size(), CV_32SC1, score_);
-        score_.setTo(Scalar::all(0));
-    }
-
-    count_ = calcKeypointsOCL(img, mask, maxKeypoints);
-    count_ = std::min(count_, maxKeypoints);
-
-    return count_;
-}
-
-int cv::ocl::FAST_OCL::calcKeypointsOCL(const oclMat& img, const oclMat& mask, int maxKeypoints)
-{
-    size_t localThreads[3] = {16, 16, 1};
-    size_t globalThreads[3] = {divUp(img.cols - 6, localThreads[0]) * localThreads[0],
-                               divUp(img.rows - 6, localThreads[1]) * localThreads[1],
-                               1};
-
-    Context *clCxt = Context::getContext();
-    String kernelName = (mask.empty()) ? "calcKeypoints" : "calcKeypointsWithMask";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    int counter = 0;
-    int err = CL_SUCCESS;
-    cl_mem counterCL = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(),
-                                    CL_MEM_COPY_HOST_PTR, sizeof(int),
-                                    &counter, &err);
-
-    int kpLocStep = kpLoc_.step / kpLoc_.elemSize();
-    int scoreStep = score_.step / score_.elemSize();
-    int nms = (nonmaxSupression) ? 1 : 0;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&img.data));
-    if (!mask.empty()) args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&kpLoc_.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&score_.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counterCL));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nms));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&maxKeypoints));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&threshold));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img.cols));
-    if (!mask.empty()) args.push_back( std::make_pair( sizeof(cl_int), (void *)&mask.step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&kpLocStep));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&scoreStep));
-
-    openCLExecuteKernel(clCxt, &featdetect_fast, kernelName, globalThreads, localThreads, args, -1, -1);
-
-    openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(),
-                                       counterCL, CL_TRUE, 0, sizeof(int), &counter, 0, NULL, NULL));
-    openCLSafeCall(clReleaseMemObject(counterCL));
-
-    return counter;
-}
-
-int cv::ocl::FAST_OCL::nonmaxSupressionOCL(oclMat& keypoints)
-{
-    size_t localThreads[3] = {256, 1, 1};
-    size_t globalThreads[3] = {count_, 1, 1};
-
-    Context *clCxt = Context::getContext();
-    String kernelName = "nonmaxSupression";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    int counter = 0;
-    int err = CL_SUCCESS;
-    cl_mem counterCL = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(),
-                                    CL_MEM_COPY_HOST_PTR, sizeof(int),
-                                    &counter, &err);
-
-    int kpLocStep = kpLoc_.step / kpLoc_.elemSize();
-    int sStep = score_.step / score_.elemSize();
-    int kStep = keypoints.step / keypoints.elemSize();
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&kpLoc_.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&score_.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counterCL));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&count_));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&kpLocStep));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&sStep));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&kStep));
-
-    openCLExecuteKernel(clCxt, &featdetect_fast, kernelName, globalThreads, localThreads, args, -1, -1);
-
-    openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(),
-                                       counterCL, CL_TRUE, 0, sizeof(int), &counter, 0, NULL, NULL));
-    openCLSafeCall(clReleaseMemObject(counterCL));
-
-    return counter;
-}
-
-int cv::ocl::FAST_OCL::getKeyPoints(oclMat& keypoints)
-{
-    if (count_ == 0)
-        return 0;
-
-    if (nonmaxSupression)
-    {
-        ensureSizeIsEnough(ROWS_COUNT, count_, CV_32FC1, keypoints);
-        return nonmaxSupressionOCL(keypoints);
-    }
-
-    kpLoc_.convertTo(keypoints, CV_32FC1);
-    Mat k = keypoints;
-
-    return count_;
-}
-
-void cv::ocl::FAST_OCL::release()
-{
-    kpLoc_.release();
-    score_.release();
-
-    d_keypoints_.release();
-}
diff --git a/modules/ocl/src/fft.cpp b/modules/ocl/src/fft.cpp
deleted file mode 100644
index 395f14fba..000000000
--- a/modules/ocl/src/fft.cpp
+++ /dev/null
@@ -1,382 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-#if !defined HAVE_CLAMDFFT
-
-void cv::ocl::dft(const oclMat&, oclMat&, Size, int)
-{
-    CV_Error(Error::OpenCLNoAMDBlasFft, "OpenCL DFT is not implemented");
-}
-
-namespace cv { namespace ocl {
-    void fft_teardown();
-} }
-
-void cv::ocl::fft_teardown() { }
-
-#else
-
-#include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
-
-namespace cv
-{
-    namespace ocl
-    {
-        void fft_setup();
-        void fft_teardown();
-
-        enum FftType
-        {
-            C2R = 1, // complex to complex
-            R2C = 2, // real to opencl HERMITIAN_INTERLEAVED
-            C2C = 3  // opencl HERMITIAN_INTERLEAVED to real
-        };
-
-        struct FftPlan
-        {
-        protected:
-            clAmdFftPlanHandle plHandle;
-            FftPlan& operator=(const FftPlan&);
-        public:
-            FftPlan(Size _dft_size, int _src_step, int _dst_step, int _depth, int _flags, FftType _type);
-            ~FftPlan();
-            inline clAmdFftPlanHandle getPlanHandle() { return plHandle; }
-
-            const Size dft_size;
-            const int src_step, dst_step;
-            const int depth;
-            const int flags;
-            const FftType type;
-        };
-
-        class PlanCache
-        {
-        protected:
-            PlanCache();
-            ~PlanCache();
-            static PlanCache* planCache;
-
-            bool started;
-            std::vector<FftPlan *> planStore;
-            clAmdFftSetupData *setupData;
-        public:
-            friend void fft_setup();
-            friend void fft_teardown();
-
-            static PlanCache* getPlanCache()
-            {
-                if (NULL == planCache)
-                    planCache = new PlanCache();
-                return planCache;
-            }
-
-            // return a baked plan->
-            // if there is one matched plan, return it
-            // if not, bake a new one, put it into the planStore and return it.
-            static FftPlan* getPlan(Size _dft_size, int _src_step, int _dst_step, int _depth, int _flags, FftType _type);
-
-            // remove a single plan from the store
-            // return true if the plan is successfully removed
-            // else
-            static bool removePlan(clAmdFftPlanHandle );
-        };
-    }
-}
-
-PlanCache* PlanCache::planCache = NULL;
-
-void cv::ocl::fft_setup()
-{
-    PlanCache& pCache = *PlanCache::getPlanCache();
-    if(pCache.started)
-    {
-        return;
-    }
-    if (pCache.setupData == NULL)
-        pCache.setupData = new clAmdFftSetupData;
-
-    openCLSafeCall(clAmdFftInitSetupData( pCache.setupData ));
-    pCache.started = true;
-}
-
-void cv::ocl::fft_teardown()
-{
-    PlanCache& pCache = *PlanCache::getPlanCache();
-
-    if(!pCache.started)
-        return;
-
-    for(size_t i = 0; i < pCache.planStore.size(); i ++)
-        delete pCache.planStore[i];
-    pCache.planStore.clear();
-
-    try
-    {
-        openCLSafeCall( clAmdFftTeardown( ) );
-    }
-    catch (const std::bad_alloc &)
-    { }
-
-    delete pCache.setupData; pCache.setupData = NULL;
-    pCache.started = false;
-}
-
-// bake a new plan
-cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _depth, int _flags, FftType _type)
-    : plHandle(0), dft_size(_dft_size), src_step(_src_step), depth(_depth), dst_step(_dst_step), flags(_flags), type(_type)
-{
-    fft_setup();
-
-    bool is_1d_input    = (_dft_size.height == 1);
-    int is_row_dft        = flags & DFT_ROWS;
-    int is_scaled_dft   = flags & DFT_SCALE;
-    int is_inverse        = flags & DFT_INVERSE;
-
-    //clAmdFftResultLocation    place;
-    clAmdFftLayout            inLayout;
-    clAmdFftLayout            outLayout;
-    clAmdFftDim                dim = is_1d_input || is_row_dft ? CLFFT_1D : CLFFT_2D;
-
-    size_t batchSize         = is_row_dft ? dft_size.height : 1;
-    size_t clLengthsIn[ 3 ]  = {1, 1, 1};
-    size_t clStridesIn[ 3 ]  = {1, 1, 1};
-    //size_t clLengthsOut[ 3 ] = {1, 1, 1};
-    size_t clStridesOut[ 3 ] = {1, 1, 1};
-    clLengthsIn[0]             = dft_size.width;
-    clLengthsIn[1]             = is_row_dft ? 1 : dft_size.height;
-    clStridesIn[0]             = 1;
-    clStridesOut[0]             = 1;
-
-    switch(_type)
-    {
-    case C2C:
-        inLayout        = CLFFT_COMPLEX_INTERLEAVED;
-        outLayout       = CLFFT_COMPLEX_INTERLEAVED;
-        clStridesIn[1]  = src_step / (2*CV_ELEM_SIZE(_depth));
-        clStridesOut[1] = dst_step / (2*CV_ELEM_SIZE(_depth));
-        break;
-    case R2C:
-        inLayout        = CLFFT_REAL;
-        outLayout       = CLFFT_HERMITIAN_INTERLEAVED;
-        clStridesIn[1]  = src_step / CV_ELEM_SIZE(_depth);
-        clStridesOut[1] = dst_step / (2*CV_ELEM_SIZE(_depth));
-        break;
-    case C2R:
-        inLayout        = CLFFT_HERMITIAN_INTERLEAVED;
-        outLayout       = CLFFT_REAL;
-        clStridesIn[1]  = src_step / (2*CV_ELEM_SIZE(_depth));
-        clStridesOut[1] = dst_step / CV_ELEM_SIZE(_depth);
-        break;
-    default:
-        //std::runtime_error("does not support this convertion!");
-        std::cout << "Does not support this convertion!" << std::endl;
-        throw std::exception();
-        break;
-    }
-
-    clStridesIn[2]  = is_row_dft ? clStridesIn[1]  : dft_size.width * clStridesIn[1];
-    clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1];
-
-    openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, *(cl_context*)getClContextPtr(), dim, clLengthsIn ) );
-
-    openCLSafeCall( clAmdFftSetPlanPrecision( plHandle, depth == CV_64F ? CLFFT_DOUBLE : CLFFT_SINGLE ) );
-    openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) );
-    openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) );
-    openCLSafeCall( clAmdFftSetPlanBatchSize( plHandle, batchSize ) );
-
-    openCLSafeCall( clAmdFftSetPlanInStride  ( plHandle, dim, clStridesIn ) );
-    openCLSafeCall( clAmdFftSetPlanOutStride ( plHandle, dim, clStridesOut ) );
-    openCLSafeCall( clAmdFftSetPlanDistance  ( plHandle, clStridesIn[ dim ], clStridesOut[ dim ]) );
-
-    float scale_ = is_scaled_dft ? 1.f / _dft_size.area() : 1.f;
-    openCLSafeCall( clAmdFftSetPlanScale  ( plHandle, is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale_ ) );
-
-    //ready to bake
-    openCLSafeCall( clAmdFftBakePlan( plHandle, 1, (cl_command_queue*)getClCommandQueuePtr(), NULL, NULL ) );
-}
-
-cv::ocl::FftPlan::~FftPlan()
-{
-    openCLSafeCall( clAmdFftDestroyPlan( &plHandle ) );
-}
-
-cv::ocl::PlanCache::PlanCache()
-    : started(false),
-      planStore(std::vector<cv::ocl::FftPlan *>()),
-      setupData(NULL)
-{
-}
-
-cv::ocl::PlanCache::~PlanCache()
-{
-    fft_teardown();
-}
-
-FftPlan* cv::ocl::PlanCache::getPlan(Size _dft_size, int _src_step, int _dst_step, int _depth, int _flags, FftType _type)
-{
-    PlanCache& pCache = *PlanCache::getPlanCache();
-    std::vector<FftPlan *>& pStore = pCache.planStore;
-    // go through search
-    for(size_t i = 0; i < pStore.size(); i ++)
-    {
-        FftPlan *plan = pStore[i];
-        if(
-            plan->dft_size.width == _dft_size.width &&
-            plan->dft_size.height == _dft_size.height &&
-            plan->flags == _flags &&
-            plan->src_step == _src_step &&
-            plan->dst_step == _dst_step &&
-            plan->depth == _depth &&
-            plan->type == _type
-            )
-        {
-            return plan;
-        }
-    }
-    // no baked plan is found
-    FftPlan *newPlan = new FftPlan(_dft_size, _src_step, _dst_step, _depth, _flags, _type);
-    pStore.push_back(newPlan);
-    return newPlan;
-}
-
-bool cv::ocl::PlanCache::removePlan(clAmdFftPlanHandle plHandle)
-{
-    PlanCache& pCache = *PlanCache::getPlanCache();
-    std::vector<FftPlan *>& pStore = pCache.planStore;
-    for(size_t i = 0; i < pStore.size(); i ++)
-    {
-        if(pStore[i]->getPlanHandle() == plHandle)
-        {
-            pStore.erase(pStore.begin() + i);
-            delete pStore[i];
-            return true;
-        }
-    }
-    return false;
-}
-
-void cv::ocl::dft(const oclMat &src, oclMat &dst, Size dft_size, int flags)
-{
-    CV_Assert(cv::ocl::haveAmdFft());
-
-    if(dft_size == Size(0, 0))
-    {
-        dft_size = src.size();
-    }
-    // check if the given dft size is of optimal dft size
-    CV_Assert(dft_size.area() == getOptimalDFTSize(dft_size.area()));
-
-    // the two flags are not compatible
-    CV_Assert( !((flags & DFT_SCALE) && (flags & DFT_ROWS)) );
-
-    //bool is_1d_input    = (src.rows == 1);
-    //int is_row_dft        = flags & DFT_ROWS;
-    //int is_scaled_dft        = flags & DFT_SCALE;
-    int is_inverse = flags & DFT_INVERSE;
-    bool is_complex_input = src.channels() == 2;
-    bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
-
-    int depth = src.depth();
-
-    // We don't support real-to-real transform
-    CV_Assert(is_complex_input || is_complex_output);
-    FftType type = (FftType)(is_complex_input << 0 | is_complex_output << 1);
-
-    switch(type)
-    {
-    case C2C:
-        dst.create(src.rows, src.cols, CV_MAKE_TYPE(depth, 2));
-        printf("C2C\n");
-        break;
-    case R2C:
-        dst.create(src.rows, src.cols / 2 + 1, CV_MAKE_TYPE(depth, 2));
-        printf("R2C\n");
-        break;
-    case C2R:
-        CV_Assert(dft_size.width / 2 + 1 == src.cols && dft_size.height == src.rows);
-        dst.create(src.rows, dft_size.width, CV_MAKE_TYPE(depth, 1));
-        printf("C2R\n");
-        break;
-    default:
-        //std::runtime_error("does not support this convertion!");
-        std::cout << "Does not support this convertion!" << std::endl;
-        throw std::exception();
-        break;
-    }
-    clAmdFftPlanHandle plHandle = PlanCache::getPlan(dft_size, src.step, dst.step, depth, flags, type)->getPlanHandle();
-
-    //get the buffersize
-    size_t buffersize = 0;
-    openCLSafeCall( clAmdFftGetTmpBufSize(plHandle, &buffersize ) );
-
-    //allocate the intermediate buffer
-    // TODO, bind this with the current FftPlan
-    cl_mem clMedBuffer = NULL;
-    if (buffersize)
-    {
-        cl_int medstatus;
-        clMedBuffer = clCreateBuffer ( *(cl_context*)(src.clCxt->getOpenCLContextPtr()), CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
-        openCLSafeCall( medstatus );
-    }
-    cl_command_queue clq = *(cl_command_queue*)(src.clCxt->getOpenCLCommandQueuePtr());
-    openCLSafeCall( clAmdFftEnqueueTransform( plHandle,
-        is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD,
-        1,
-        &clq,
-        0, NULL, NULL,
-        (cl_mem *)&src.data, (cl_mem *)&dst.data, clMedBuffer ) );
-    openCLSafeCall( clFinish(clq) );
-    if(clMedBuffer)
-    {
-        openCLFree(clMedBuffer);
-    }
-    fft_teardown();
-}
-
-#endif
diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp
deleted file mode 100644
index 8832b305d..000000000
--- a/modules/ocl/src/filtering.cpp
+++ /dev/null
@@ -1,1537 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Zero Lin, Zero.Lin@amd.com
-//    Zhang Ying, zhangying913@gmail.com
-//    Yao Wang, bitwangyaoyao@gmail.com
-//    Harris Gasparakis, harris.gasparakis@amd.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-namespace
-{
-inline void normalizeAnchor(int &anchor, int ksize)
-{
-    if (anchor < 0)
-        anchor = ksize >> 1;
-
-    CV_Assert(0 <= anchor && anchor < ksize);
-}
-
-inline void normalizeAnchor(Point &anchor, const Size &ksize)
-{
-    normalizeAnchor(anchor.x, ksize.width);
-    normalizeAnchor(anchor.y, ksize.height);
-}
-
-inline void normalizeROI(Rect &roi, const Size &ksize, const Point &/*anchor*/, const Size &src_size)
-{
-    if (roi == Rect(0, 0, -1, -1))
-        roi = Rect(0, 0, src_size.width, src_size.height);
-
-    CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1));
-    CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height);
-}
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
-namespace
-{
-class Filter2DEngine_GPU : public FilterEngine_GPU
-{
-public:
-    Filter2DEngine_GPU(const Ptr<BaseFilter_GPU> &filter2D_) : filter2D(filter2D_) {}
-
-    virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
-    {
-        Size src_size = src.size();
-
-        // Delete those two clause below which exist before, However, the result is also correct
-        // dst.create(src_size, src.type());
-        // dst = Scalar(0.0);
-
-        normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
-
-        oclMat srcROI = src(roi);
-        oclMat dstROI = dst(roi);
-
-        (*filter2D)(srcROI, dstROI);
-    }
-
-    Ptr<BaseFilter_GPU> filter2D;
-};
-}
-
-Ptr<FilterEngine_GPU> cv::ocl::createFilter2D_GPU(const Ptr<BaseFilter_GPU> filter2D)
-{
-    return makePtr<Filter2DEngine_GPU>(filter2D);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Box Filter
-namespace
-{
-typedef void (*FilterBox_t)(const oclMat & , oclMat & , Size &, const Point, const int);
-
-class GPUBoxFilter : public BaseFilter_GPU
-{
-public:
-    GPUBoxFilter(const Size &ksize_, const Point &anchor_, const int borderType_, FilterBox_t func_) :
-        BaseFilter_GPU(ksize_, anchor_, borderType_), func(func_) {}
-
-    virtual void operator()(const oclMat &src, oclMat &dst)
-    {
-        func(src, dst, ksize, anchor, borderType);
-    }
-
-    FilterBox_t func;
-
-};
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Morphology Filter
-
-namespace
-{
-typedef void (*GPUMorfFilter_t)(const oclMat & , oclMat & , oclMat & , Size &, const Point, bool rectKernel);
-
-class MorphFilter_GPU : public BaseFilter_GPU
-{
-public:
-    MorphFilter_GPU(const Size &ksize_, const Point &anchor_, const Mat &kernel_, GPUMorfFilter_t func_) :
-        BaseFilter_GPU(ksize_, anchor_, BORDER_CONSTANT), kernel(kernel_), func(func_), rectKernel(false) {}
-
-    virtual void operator()(const oclMat &src, oclMat &dst)
-    {
-        func(src, dst, kernel, ksize, anchor, rectKernel) ;
-    }
-
-    oclMat kernel;
-    GPUMorfFilter_t func;
-    bool rectKernel;
-};
-}
-
-/*
-**We should be able to support any data types here.
-**Extend this if necessary later.
-**Note that the kernel need to be further refined.
-*/
-static void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
-                         Size &ksize, const Point anchor, bool rectKernel)
-{
-    //Normalize the result by default
-    //float alpha = ksize.height * ksize.width;
-    CV_Assert(src.clCxt == dst.clCxt);
-    CV_Assert((src.cols == dst.cols) &&
-              (src.rows == dst.rows));
-    CV_Assert((src.oclchannels() == dst.oclchannels()));
-
-    int srcStep = src.step / src.elemSize();
-    int dstStep = dst.step / dst.elemSize();
-    int srcOffset = src.offset / src.elemSize();
-    int dstOffset = dst.offset / dst.elemSize();
-
-    int srcOffset_x = srcOffset % srcStep;
-    int srcOffset_y = srcOffset / srcStep;
-    Context *clCxt = src.clCxt;
-    String kernelName;
-#ifdef ANDROID
-    size_t localThreads[3] = {16, 8, 1};
-#else
-    size_t localThreads[3] = {16, 16, 1};
-#endif
-    size_t globalThreads[3] = {(src.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0], (src.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1};
-
-    if (src.type() == CV_8UC1)
-    {
-        kernelName = "morph_C1_D0";
-        globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-        CV_Assert(localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1));
-    }
-    else
-    {
-        kernelName = "morph";
-        CV_Assert(localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1));
-    }
-
-    char s[64];
-
-    switch (src.type())
-    {
-    case CV_8UC1:
-        sprintf(s, "-D VAL=255");
-        break;
-    case CV_8UC3:
-    case CV_8UC4:
-        sprintf(s, "-D VAL=255 -D GENTYPE=uchar4");
-        break;
-    case CV_32FC1:
-        sprintf(s, "-D VAL=FLT_MAX -D GENTYPE=float");
-        break;
-    case CV_32FC3:
-    case CV_32FC4:
-        sprintf(s, "-D VAL=FLT_MAX -D GENTYPE=float4");
-        break;
-    default:
-        CV_Error(Error::StsUnsupportedFormat, "unsupported type");
-    }
-
-    char compile_option[128];
-    sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D ERODE %s %s",
-        anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1],
-        rectKernel?"-D RECTKERNEL":"",
-        s);
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&srcOffset_x));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&srcOffset_y));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&srcStep));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dstStep));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dstOffset));
-
-    openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
-}
-
-
-//! data type supported: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4
-static void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
-                          Size &ksize, const Point anchor, bool rectKernel)
-{
-    //Normalize the result by default
-    //float alpha = ksize.height * ksize.width;
-    CV_Assert(src.clCxt == dst.clCxt);
-    CV_Assert((src.cols == dst.cols) &&
-              (src.rows == dst.rows));
-    CV_Assert((src.oclchannels() == dst.oclchannels()));
-
-    int srcStep = src.step1() / src.oclchannels();
-    int dstStep = dst.step1() / dst.oclchannels();
-    int srcOffset = src.offset /  src.elemSize();
-    int dstOffset = dst.offset /  dst.elemSize();
-
-    int srcOffset_x = srcOffset % srcStep;
-    int srcOffset_y = srcOffset / srcStep;
-    Context *clCxt = src.clCxt;
-    String kernelName;
-#ifdef ANDROID
-    size_t localThreads[3] = {16, 10, 1};
-#else
-    size_t localThreads[3] = {16, 16, 1};
-#endif
-    size_t globalThreads[3] = {(src.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0],
-                               (src.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1};
-
-    if (src.type() == CV_8UC1)
-    {
-        kernelName = "morph_C1_D0";
-        globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-        CV_Assert(localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1));
-    }
-    else
-    {
-        kernelName = "morph";
-        CV_Assert(localThreads[0]*localThreads[1] * 2 >= (localThreads[0] + ksize.width - 1) * (localThreads[1] + ksize.height - 1));
-    }
-
-    char s[64];
-
-    switch (src.type())
-    {
-    case CV_8UC1:
-        sprintf(s, "-D VAL=0");
-        break;
-    case CV_8UC3:
-    case CV_8UC4:
-        sprintf(s, "-D VAL=0 -D GENTYPE=uchar4");
-        break;
-    case CV_32FC1:
-        sprintf(s, "-D VAL=-FLT_MAX -D GENTYPE=float");
-        break;
-    case CV_32FC3:
-    case CV_32FC4:
-        sprintf(s, "-D VAL=-FLT_MAX -D GENTYPE=float4");
-        break;
-    default:
-        CV_Error(Error::StsUnsupportedFormat, "unsupported type");
-    }
-
-    char compile_option[128];
-    sprintf(compile_option, "-D RADIUSX=%d -D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D DILATE %s %s",
-        anchor.x, anchor.y, (int)localThreads[0], (int)localThreads[1],
-        s, rectKernel?"-D RECTKERNEL":"");
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&srcOffset_x));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&srcOffset_y));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&srcStep));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dstStep));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dstOffset));
-    openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
-}
-
-Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat &_kernel, const Size &ksize, Point anchor)
-{
-    CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
-    CV_Assert(type == CV_8UC1 || type == CV_8UC3 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC3 || type == CV_32FC4);
-
-    normalizeAnchor(anchor, ksize);
-    Mat kernel8U;
-    _kernel.convertTo(kernel8U, CV_8U);
-    Mat kernel = kernel8U.reshape(1, 1);
-
-    bool noZero = true;
-    for(int i = 0; i < kernel.rows * kernel.cols; ++i)
-        if(kernel.at<uchar>(i) != 1)
-            noZero = false;
-
-    MorphFilter_GPU* mfgpu = new MorphFilter_GPU(ksize, anchor, kernel, op == MORPH_ERODE ? GPUErode : GPUDilate);
-    if(noZero)
-        mfgpu->rectKernel = true;
-
-    return Ptr<BaseFilter_GPU>(mfgpu);
-}
-
-namespace
-{
-class MorphologyFilterEngine_GPU : public Filter2DEngine_GPU
-{
-public:
-    MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU> &filter2D_, int iters_) :
-        Filter2DEngine_GPU(filter2D_), iters(iters_) {}
-
-    virtual void apply(const oclMat &src, oclMat &dst)
-    {
-        Filter2DEngine_GPU::apply(src, dst);
-
-        for (int i = 1; i < iters; ++i)
-        {
-            Size wholesize;
-            Point ofs;
-            dst.locateROI(wholesize, ofs);
-            int rows = dst.rows, cols = dst.cols;
-            dst.adjustROI(ofs.y, -ofs.y - rows + dst.wholerows, ofs.x, -ofs.x - cols + dst.wholecols);
-            dst.copyTo(morfBuf);
-            dst.adjustROI(-ofs.y, ofs.y + rows - dst.wholerows, -ofs.x, ofs.x + cols - dst.wholecols);
-            morfBuf.adjustROI(-ofs.y, ofs.y + rows - dst.wholerows, -ofs.x, ofs.x + cols - dst.wholecols);
-            Filter2DEngine_GPU::apply(morfBuf, dst);
-        }
-    }
-
-    int iters;
-    oclMat morfBuf;
-};
-}
-
-Ptr<FilterEngine_GPU> cv::ocl::createMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Point &anchor, int iterations)
-{
-    CV_Assert(iterations > 0);
-
-    Size ksize = kernel.size();
-
-    Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
-
-    return makePtr<MorphologyFilterEngine_GPU>(filter2D, iterations);
-}
-
-namespace
-{
-void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point anchor, int iterations, int borderType, const Scalar &borderValue)
-{
-    if ((borderType != cv::BORDER_CONSTANT) || (borderValue != morphologyDefaultBorderValue()))
-    {
-        CV_Error(Error::StsBadArg, "unsupported border type");
-    }
-
-    Mat kernel;
-    Size ksize = _kernel.data ? _kernel.size() : Size(3, 3);
-
-    normalizeAnchor(anchor, ksize);
-
-    if (iterations == 0 || _kernel.rows *_kernel.cols == 1)
-    {
-        src.copyTo(dst);
-        return;
-    }
-
-    dst.create(src.size(), src.type());
-
-    if (!_kernel.data)
-    {
-        kernel = getStructuringElement(MORPH_RECT, Size(1 + iterations * 2, 1 + iterations * 2));
-        anchor = Point(iterations, iterations);
-        iterations = 1;
-    }
-    else if (iterations > 1 && countNonZero(_kernel) == _kernel.rows * _kernel.cols)
-    {
-        anchor = Point(anchor.x * iterations, anchor.y * iterations);
-        kernel = getStructuringElement(MORPH_RECT, Size(ksize.width + (iterations - 1) * (ksize.width - 1),
-                                       ksize.height + (iterations - 1) * (ksize.height - 1)), anchor);
-        iterations = 1;
-    }
-    else
-        kernel = _kernel;
-
-    Ptr<MorphologyFilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations)
-            .staticCast<MorphologyFilterEngine_GPU>();
-
-    f->apply(src, dst);
-}
-}
-
-void cv::ocl::erode(const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations,
-                    int borderType, const Scalar &borderValue)
-{
-    bool allZero = true;
-
-    for (int i = 0; i < kernel.rows * kernel.cols; ++i)
-        if (kernel.data[i] != 0)
-            allZero = false;
-
-    if (allZero)
-        kernel.data[0] = 1;
-
-    morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations, borderType, borderValue);
-}
-
-void cv::ocl::dilate(const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor, int iterations,
-                     int borderType, const Scalar &borderValue)
-{
-    morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations, borderType, borderValue);
-}
-
-void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor, int iterations,
-                           int borderType, const Scalar &borderValue)
-{
-    oclMat temp;
-
-    switch (op)
-    {
-    case MORPH_ERODE:
-        erode(src, dst, kernel, anchor, iterations, borderType, borderValue);
-        break;
-    case MORPH_DILATE:
-        dilate(src, dst, kernel, anchor, iterations, borderType, borderValue);
-        break;
-    case MORPH_OPEN:
-        erode(src, temp, kernel, anchor, iterations, borderType, borderValue);
-        dilate(temp, dst, kernel, anchor, iterations, borderType, borderValue);
-        break;
-    case MORPH_CLOSE:
-        dilate(src, temp, kernel, anchor, iterations, borderType, borderValue);
-        erode(temp, dst, kernel, anchor, iterations, borderType, borderValue);
-        break;
-    case MORPH_GRADIENT:
-        erode(src, temp, kernel, anchor, iterations, borderType, borderValue);
-        dilate(src, dst, kernel, anchor, iterations, borderType, borderValue);
-        subtract(dst, temp, dst);
-        break;
-    case MORPH_TOPHAT:
-        erode(src, dst, kernel, anchor, iterations, borderType, borderValue);
-        dilate(dst, temp, kernel, anchor, iterations, borderType, borderValue);
-        subtract(src, temp, dst);
-        break;
-    case MORPH_BLACKHAT:
-        dilate(src, dst, kernel, anchor, iterations, borderType, borderValue);
-        erode(dst, temp, kernel, anchor, iterations, borderType, borderValue);
-        subtract(temp, src, dst);
-        break;
-    default:
-        CV_Error(Error::StsBadArg, "unknown morphological operation");
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Linear Filter
-
-namespace
-{
-typedef void (*GPUFilter2D_t)(const oclMat & , oclMat & , const Mat & , const Size &, const Point&, const int);
-
-class LinearFilter_GPU : public BaseFilter_GPU
-{
-public:
-    LinearFilter_GPU(const Size &ksize_, const Point &anchor_, const Mat &kernel_, GPUFilter2D_t func_,
-                     int borderType_) :
-        BaseFilter_GPU(ksize_, anchor_, borderType_), kernel(kernel_), func(func_) {}
-
-    virtual void operator()(const oclMat &src, oclMat &dst)
-    {
-        func(src, dst, kernel, ksize, anchor, borderType) ;
-    }
-
-    Mat kernel;
-    GPUFilter2D_t func;
-};
-}
-
-// prepare kernel: transpose and make double rows (+align). Returns size of aligned row
-// Samples:
-//        a b c
-// Input: d e f
-//        g h i
-// Output, last two zeros is the alignment:
-// a d g a d g 0 0
-// b e h b e h 0 0
-// c f i c f i 0 0
-template <typename T>
-static int _prepareKernelFilter2D(std::vector<T>& data, const Mat &kernel)
-{
-    Mat _kernel; kernel.convertTo(_kernel, DataDepth<T>::value);
-    int size_y_aligned = roundUp(kernel.rows * 2, 4);
-    data.clear(); data.resize(size_y_aligned * kernel.cols, 0);
-    for (int x = 0; x < kernel.cols; x++)
-    {
-        for (int y = 0; y < kernel.rows; y++)
-        {
-            data[x * size_y_aligned + y] = _kernel.at<T>(y, x);
-            data[x * size_y_aligned + y + kernel.rows] = _kernel.at<T>(y, x);
-        }
-    }
-    return size_y_aligned;
-}
-
-static void GPUFilter2D(const oclMat &src, oclMat &dst, const Mat &kernel,
-    const Size &ksize, const Point& anchor, const int borderType)
-{
-    CV_Assert(src.clCxt == dst.clCxt);
-    CV_Assert((src.cols == dst.cols) &&
-              (src.rows == dst.rows));
-    CV_Assert(src.oclchannels() == dst.oclchannels());
-
-    CV_Assert(kernel.cols == ksize.width && kernel.rows == ksize.height);
-    CV_Assert(kernel.channels() == 1);
-
-    CV_Assert(anchor.x >= 0 && anchor.x < kernel.cols);
-    CV_Assert(anchor.y >= 0 && anchor.y < kernel.rows);
-
-    bool useDouble = src.depth() == CV_64F;
-
-    std::vector<float> kernelDataFloat;
-    std::vector<double> kernelDataDouble;
-    int kernel_size_y2_aligned = useDouble ?
-            _prepareKernelFilter2D<double>(kernelDataDouble, kernel)
-            : _prepareKernelFilter2D<float>(kernelDataFloat, kernel);
-    oclMat oclKernelParameter;
-    if (useDouble)
-    {
-        oclKernelParameter.createEx(1, kernelDataDouble.size(), CV_64FC1, DEVICE_MEM_R_ONLY, DEVICE_MEM_DEFAULT);
-        openCLMemcpy2D(src.clCxt, oclKernelParameter.data, kernelDataDouble.size()*sizeof(double),
-                &kernelDataDouble[0], kernelDataDouble.size()*sizeof(double),
-                kernelDataDouble.size()*sizeof(double), 1, clMemcpyHostToDevice);
-    }
-    else
-    {
-        oclKernelParameter.createEx(1, kernelDataFloat.size(), CV_32FC1, DEVICE_MEM_R_ONLY, DEVICE_MEM_DEFAULT);
-        openCLMemcpy2D(src.clCxt, oclKernelParameter.data, kernelDataFloat.size()*sizeof(float),
-                &kernelDataFloat[0], kernelDataFloat.size()*sizeof(float),
-                kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice);
-    }
-
-    size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
-    do {
-        size_t BLOCK_SIZE = tryWorkItems;
-        while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
-            BLOCK_SIZE /= 2;
-#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
-        size_t BLOCK_SIZE_Y = 1;
-#else
-        size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
-        while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
-            BLOCK_SIZE_Y *= 2;
-#endif
-
-        CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
-
-        bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
-
-        std::vector<std::pair<size_t , const void *> > args;
-
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
-        cl_uint stepBytes = src.step;
-        args.push_back( std::make_pair( sizeof(cl_uint), (void *)&stepBytes));
-        int offsetXBytes = src.offset % src.step;
-        int offsetX = offsetXBytes / src.elemSize();
-        CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
-        int offsetY = src.offset / src.step;
-        int endX = (offsetX + src.cols);
-        int endY = (offsetY + src.rows);
-        cl_int rect[4] = {offsetX, offsetY, endX, endY};
-        if (!isIsolatedBorder)
-        {
-            rect[2] = src.wholecols;
-            rect[3] = src.wholerows;
-        }
-        args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
-
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
-        cl_uint _stepBytes = dst.step;
-        args.push_back( std::make_pair( sizeof(cl_uint), (void *)&_stepBytes));
-        int _offsetXBytes = dst.offset % dst.step;
-        int _offsetX = _offsetXBytes / dst.elemSize();
-        CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
-        int _offsetY = dst.offset / dst.step;
-        int _endX = (_offsetX + dst.cols);
-        int _endY = (_offsetY + dst.rows);
-        cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
-        args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
-
-        float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
-        double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
-        if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
-        {
-            if (useDouble)
-                args.push_back( std::make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
-            else
-                args.push_back( std::make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
-        }
-
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data));
-
-        const char* btype = NULL;
-
-        switch (borderType & ~BORDER_ISOLATED)
-        {
-        case BORDER_CONSTANT:
-            btype = "BORDER_CONSTANT";
-            break;
-        case BORDER_REPLICATE:
-            btype = "BORDER_REPLICATE";
-            break;
-        case BORDER_REFLECT:
-            btype = "BORDER_REFLECT";
-            break;
-        case BORDER_WRAP:
-            CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
-            return;
-        case BORDER_REFLECT101:
-            btype = "BORDER_REFLECT_101";
-            break;
-        }
-
-        int requiredTop = anchor.y;
-        int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
-        int requiredBottom = ksize.height - 1 - anchor.y;
-        int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
-        int h = isIsolatedBorder ? src.rows : src.wholerows;
-        int w = isIsolatedBorder ? src.cols : src.wholecols;
-        bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
-
-        char build_options[1024];
-        sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
-                "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
-                "-D %s -D %s -D %s",
-                (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
-                src.depth(), src.oclchannels(), useDouble ? 1 : 0,
-                anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned,
-                btype,
-                extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
-                isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
-
-        size_t lt[3] = {BLOCK_SIZE, 1, 1};
-        size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
-
-        cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_filter2D, "filter2D", -1, -1, build_options);
-
-        size_t kernelWorkGroupSize;
-        openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
-                                                CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
-        if (lt[0] > kernelWorkGroupSize)
-        {
-            clReleaseKernel(kernel);
-            CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
-            tryWorkItems = kernelWorkGroupSize;
-            continue;
-        }
-
-        openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
-    } while (false);
-}
-
-Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize,
-        const Point &anchor, int borderType)
-{
-    Point norm_archor = anchor;
-    normalizeAnchor(norm_archor, ksize);
-
-    return Ptr<BaseFilter_GPU>(new LinearFilter_GPU(ksize, norm_archor, kernel, GPUFilter2D,
-                               borderType));
-}
-
-Ptr<FilterEngine_GPU> cv::ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor,
-        int borderType)
-{
-    Size ksize = kernel.size(); // TODO remove duplicated parameter
-    Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor, borderType);
-
-    return createFilter2D_GPU(linearFilter);
-}
-
-void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor, double delta, int borderType)
-{
-    CV_Assert(delta == 0);
-
-    if (ddepth < 0)
-        ddepth = src.depth();
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
-
-    Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst.type(), kernel, anchor, borderType);
-    f->apply(src, dst);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// SeparableFilter
-
-namespace
-{
-class SeparableFilterEngine_GPU : public FilterEngine_GPU
-{
-public:
-    SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter_,
-                              const Ptr<BaseColumnFilter_GPU> &columnFilter_) :
-        rowFilter(rowFilter_), columnFilter(columnFilter_)
-    {
-        ksize = Size(rowFilter->ksize, columnFilter->ksize);
-        anchor = Point(rowFilter->anchor, columnFilter->anchor);
-    }
-
-    virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
-    {
-        Size src_size = src.size();
-
-        int cn = src.oclchannels();
-        dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn));
-
-        normalizeROI(roi, ksize, anchor, src_size);
-
-        srcROI = src(roi);
-        dstROI = dst(roi);
-
-        (*rowFilter)(srcROI, dstBuf);
-        (*columnFilter)(dstBuf, dstROI);
-    }
-
-    Ptr<BaseRowFilter_GPU> rowFilter;
-    Ptr<BaseColumnFilter_GPU> columnFilter;
-    Size ksize;
-    Point anchor;
-    oclMat dstBuf;
-    oclMat srcROI;
-    oclMat dstROI;
-    oclMat dstBufROI;
-};
-}
-
-Ptr<FilterEngine_GPU> cv::ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter,
-        const Ptr<BaseColumnFilter_GPU> &columnFilter)
-{
-    return makePtr<SeparableFilterEngine_GPU>(rowFilter, columnFilter);
-}
-
-static void GPUFilterBox(const oclMat &src, oclMat &dst,
-                         Size &ksize, const Point anchor, const int borderType)
-{
-    //Normalize the result by default
-    float alpha = 1.0f / (ksize.height * ksize.width);
-
-    CV_Assert(src.clCxt == dst.clCxt);
-    CV_Assert((src.cols == dst.cols) &&
-              (src.rows == dst.rows));
-    CV_Assert(src.oclchannels() == dst.oclchannels());
-
-    size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
-    do {
-        size_t BLOCK_SIZE = tryWorkItems;
-        while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
-            BLOCK_SIZE /= 2;
-        size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
-        while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
-            BLOCK_SIZE_Y *= 2;
-
-        CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
-
-        bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
-
-        std::vector<std::pair<size_t , const void *> > args;
-
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
-        cl_uint stepBytes = src.step;
-        args.push_back( std::make_pair( sizeof(cl_uint), (void *)&stepBytes));
-        int offsetXBytes = src.offset % src.step;
-        int offsetX = offsetXBytes / src.elemSize();
-        CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
-        int offsetY = src.offset / src.step;
-        int endX = (offsetX + src.cols);
-        int endY = (offsetY + src.rows);
-        cl_int rect[4] = {offsetX, offsetY, endX, endY};
-        if (!isIsolatedBorder)
-        {
-            rect[2] = src.wholecols;
-            rect[3] = src.wholerows;
-        }
-        args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
-
-        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
-        cl_uint _stepBytes = dst.step;
-        args.push_back( std::make_pair( sizeof(cl_uint), (void *)&_stepBytes));
-        int _offsetXBytes = dst.offset % dst.step;
-        int _offsetX = _offsetXBytes / dst.elemSize();
-        CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
-        int _offsetY = dst.offset / dst.step;
-        int _endX = (_offsetX + dst.cols);
-        int _endY = (_offsetY + dst.rows);
-        cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
-        args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
-
-        bool useDouble = src.depth() == CV_64F;
-
-        float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
-        double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
-        if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
-        {
-            if (useDouble)
-                args.push_back( std::make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
-            else
-                args.push_back( std::make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
-        }
-
-        double alphaDouble = alpha; // DON'T move into 'if' body
-        if (useDouble)
-            args.push_back( std::make_pair( sizeof(double), (void *)&alphaDouble));
-        else
-            args.push_back( std::make_pair( sizeof(float), (void *)&alpha));
-
-        const char* btype = NULL;
-
-        switch (borderType & ~BORDER_ISOLATED)
-        {
-        case BORDER_CONSTANT:
-            btype = "BORDER_CONSTANT";
-            break;
-        case BORDER_REPLICATE:
-            btype = "BORDER_REPLICATE";
-            break;
-        case BORDER_REFLECT:
-            btype = "BORDER_REFLECT";
-            break;
-        case BORDER_WRAP:
-            CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
-            return;
-        case BORDER_REFLECT101:
-            btype = "BORDER_REFLECT_101";
-            break;
-        }
-
-        int requiredTop = anchor.y;
-        int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
-        int requiredBottom = ksize.height - 1 - anchor.y;
-        int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
-        int h = isIsolatedBorder ? src.rows : src.wholerows;
-        int w = isIsolatedBorder ? src.cols : src.wholecols;
-        bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
-
-        CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well
-
-        char build_options[1024];
-        sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s",
-                (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
-                src.depth(), src.oclchannels(), useDouble ? 1 : 0,
-                anchor.x, anchor.y, ksize.width, ksize.height,
-                btype,
-                extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
-                isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
-
-        size_t lt[3] = {BLOCK_SIZE, 1, 1};
-        size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
-
-        cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_boxFilter, "boxFilter", -1, -1, build_options);
-
-        size_t kernelWorkGroupSize;
-        openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
-                                                CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
-        if (lt[0] > kernelWorkGroupSize)
-        {
-            clReleaseKernel(kernel);
-            CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
-            tryWorkItems = kernelWorkGroupSize;
-            continue;
-        }
-
-        openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
-    } while (false);
-}
-
-Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/,
-        const Size &ksize, Point anchor, int borderType)
-{
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseFilter_GPU>(new GPUBoxFilter(ksize, anchor,
-                               borderType, GPUFilterBox));
-}
-
-Ptr<FilterEngine_GPU> cv::ocl::createBoxFilter_GPU(int srcType, int dstType,
-        const Size &ksize, const Point &anchor, int borderType)
-{
-    Ptr<BaseFilter_GPU> boxFilter = getBoxFilter_GPU(srcType, dstType, ksize, anchor, borderType);
-    return createFilter2D_GPU(boxFilter);
-}
-
-void cv::ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize,
-                        Point anchor, int borderType)
-{
-    int sdepth = src.depth(), cn = src.channels();
-
-    if (ddepth < 0)
-    {
-        ddepth = sdepth;
-    }
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, cn));
-
-    Ptr<FilterEngine_GPU> f = createBoxFilter_GPU(src.type(),
-                              dst.type(), ksize, anchor, borderType);
-    f->apply(src, dst);
-}
-
-namespace
-{
-typedef void (*gpuFilter1D_t)(const oclMat &src, const oclMat &dst, oclMat kernel, int ksize, int anchor, int bordertype);
-
-class GpuLinearRowFilter : public BaseRowFilter_GPU
-{
-public:
-    GpuLinearRowFilter(int ksize_, int anchor_, const oclMat &kernel_, gpuFilter1D_t func_, int bordertype_) :
-        BaseRowFilter_GPU(ksize_, anchor_, bordertype_), kernel(kernel_), func(func_) {}
-
-    virtual void operator()(const oclMat &src, oclMat &dst)
-    {
-        func(src, dst, kernel, ksize, anchor, bordertype);
-    }
-
-    oclMat kernel;
-    gpuFilter1D_t func;
-};
-}
-
-template <typename T> struct index_and_sizeof;
-template <> struct index_and_sizeof<uchar>
-{
-    enum { index = 1 };
-};
-template <> struct index_and_sizeof<char>
-{
-    enum { index = 2 };
-};
-template <> struct index_and_sizeof<ushort>
-{
-    enum { index = 3 };
-};
-template <> struct index_and_sizeof<short>
-{
-    enum { index = 4 };
-};
-template <> struct index_and_sizeof<int>
-{
-    enum { index = 5 };
-};
-template <> struct index_and_sizeof<float>
-{
-    enum { index = 6 };
-};
-
-template <typename T>
-void linearRowFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel, int ksize, int anchor, int bordertype)
-{
-    CV_Assert(bordertype <= BORDER_REFLECT_101);
-    CV_Assert(ksize == (anchor << 1) + 1);
-    int channels = src.oclchannels();
-
-#ifdef ANDROID
-    size_t localThreads[3] = { 16, 10, 1 };
-#else
-    size_t localThreads[3] = { 16, 16, 1 };
-#endif
-    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-
-    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" };
-    std::string buildOptions = format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s",
-            anchor, (int)localThreads[0], (int)localThreads[1], channels, borderMap[bordertype]);
-
-    if (src.depth() == CV_8U)
-    {
-        switch (channels)
-        {
-        case 1:
-            globalThreads[0] = (dst.cols + 3) >> 2;
-            break;
-        case 2:
-            globalThreads[0] = (dst.cols + 1) >> 1;
-            break;
-        case 4:
-            globalThreads[0] = dst.cols;
-            break;
-        }
-    }
-
-    int src_pix_per_row = src.step / src.elemSize();
-    int src_offset_x = (src.offset % src.step) / src.elemSize();
-    int src_offset_y = src.offset / src.step;
-    int dst_pix_per_row = dst.step / dst.elemSize();
-    int ridusy = (dst.rows - src.rows) >> 1;
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), &src.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), &dst.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_pix_per_row));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset_x));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset_y));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_pix_per_row));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&ridusy));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
-
-    openCLExecuteKernel(src.clCxt, &filter_sep_row, "row_filter", globalThreads, localThreads,
-                        args, channels, src.depth(), buildOptions.c_str());
-}
-
-Ptr<BaseRowFilter_GPU> cv::ocl::getLinearRowFilter_GPU(int srcType, int /*bufType*/, const Mat &rowKernel, int anchor, int bordertype)
-{
-    static const gpuFilter1D_t gpuFilter1D_callers[6] =
-    {
-        linearRowFilter_gpu<uchar>,
-        linearRowFilter_gpu<char>,
-        linearRowFilter_gpu<ushort>,
-        linearRowFilter_gpu<short>,
-        linearRowFilter_gpu<int>,
-        linearRowFilter_gpu<float>
-    };
-
-    Mat temp = rowKernel.reshape(1, 1);
-    oclMat mat_kernel(temp);
-
-
-    int ksize = temp.cols;
-
-    //CV_Assert(ksize < 16);
-
-    normalizeAnchor(anchor, ksize);
-
-    return makePtr<GpuLinearRowFilter>(ksize, anchor, mat_kernel,
-        gpuFilter1D_callers[CV_MAT_DEPTH(srcType)], bordertype);
-}
-
-namespace
-{
-class GpuLinearColumnFilter : public BaseColumnFilter_GPU
-{
-public:
-    GpuLinearColumnFilter(int ksize_, int anchor_, const oclMat &kernel_, gpuFilter1D_t func_, int bordertype_) :
-        BaseColumnFilter_GPU(ksize_, anchor_, bordertype_), kernel(kernel_), func(func_) {}
-
-    virtual void operator()(const oclMat &src, oclMat &dst)
-    {
-        func(src, dst, kernel, ksize, anchor, bordertype);
-    }
-
-    oclMat kernel;
-    gpuFilter1D_t func;
-};
-}
-
-template <typename T>
-void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_kernel, int ksize, int anchor, int bordertype)
-{
-    Context *clCxt = src.clCxt;
-    int channels = src.oclchannels();
-
-#ifdef ANDROID
-    size_t localThreads[3] = {16, 10, 1};
-#else
-    size_t localThreads[3] = {16, 16, 1};
-#endif
-    String kernelName = "col_filter";
-
-    char btype[30];
-
-    switch (bordertype)
-    {
-    case 0:
-        sprintf(btype, "BORDER_CONSTANT");
-        break;
-    case 1:
-        sprintf(btype, "BORDER_REPLICATE");
-        break;
-    case 2:
-        sprintf(btype, "BORDER_REFLECT");
-        break;
-    case 3:
-        sprintf(btype, "BORDER_WRAP");
-        break;
-    case 4:
-        sprintf(btype, "BORDER_REFLECT_101");
-        break;
-    }
-
-    char compile_option[256];
-
-
-    size_t globalThreads[3];
-    globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
-    globalThreads[2] = (1 + localThreads[2] - 1) / localThreads[2] * localThreads[2];
-
-    if (dst.depth() == CV_8U)
-    {
-        switch (channels)
-        {
-        case 1:
-            globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-            sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
-                    anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float", "uchar", "convert_uchar_sat");
-            break;
-        case 2:
-            globalThreads[0] = ((dst.cols + 1) / 2 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-            sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
-                    anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float2", "uchar2", "convert_uchar2_sat");
-            break;
-        case 3:
-        case 4:
-            globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-            sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
-                    anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float4", "uchar4", "convert_uchar4_sat");
-            break;
-        }
-    }
-    else
-    {
-        globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-
-        switch (dst.type())
-        {
-        case CV_32SC1:
-            sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
-                    anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float", "int", "convert_int_sat");
-            break;
-        case CV_32SC3:
-        case CV_32SC4:
-            sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
-                    anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float4", "int4", "convert_int4_sat");
-            break;
-        case CV_32FC1:
-            sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
-                    anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float", "float", "");
-            break;
-        case CV_32FC3:
-        case CV_32FC4:
-            sprintf(compile_option, "-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D GENTYPE_SRC=%s -D GENTYPE_DST=%s -D convert_to_DST=%s",
-                    anchor, (int)localThreads[0], (int)localThreads[1], channels, btype, "float4", "float4", "");
-            break;
-        }
-    }
-
-    //sanity checks
-    CV_Assert(clCxt == dst.clCxt);
-    CV_Assert(src.cols == dst.cols);
-    CV_Assert(src.oclchannels() == dst.oclchannels());
-    CV_Assert(ksize == (anchor << 1) + 1);
-    int src_pix_per_row, dst_pix_per_row;
-    int dst_offset_in_pixel;
-    src_pix_per_row = src.step / src.elemSize();
-    dst_pix_per_row = dst.step / dst.elemSize();
-    dst_offset_in_pixel = dst.offset / dst.elemSize();
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), &src.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), &dst.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_pix_per_row));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_pix_per_row));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_offset_in_pixel));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
-
-    openCLExecuteKernel(clCxt, &filter_sep_col, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
-}
-
-Ptr<BaseColumnFilter_GPU> cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, int dstType, const Mat &columnKernel, int anchor, int bordertype, double /*delta*/)
-{
-    static const gpuFilter1D_t gpuFilter1D_callers[6] =
-    {
-        linearColumnFilter_gpu<uchar>,
-        linearColumnFilter_gpu<char>,
-        linearColumnFilter_gpu<ushort>,
-        linearColumnFilter_gpu<short>,
-        linearColumnFilter_gpu<int>,
-        linearColumnFilter_gpu<float>
-    };
-
-    Mat temp = columnKernel.reshape(1, 1);
-    oclMat mat_kernel(temp);
-
-    int ksize = temp.cols;
-    normalizeAnchor(anchor, ksize);
-
-    return makePtr<GpuLinearColumnFilter>(ksize, anchor, mat_kernel,
-        gpuFilter1D_callers[CV_MAT_DEPTH(dstType)], bordertype);
-}
-
-Ptr<FilterEngine_GPU> cv::ocl::createSeparableLinearFilter_GPU(int srcType, int dstType,
-        const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype)
-{
-    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
-    int cn = CV_MAT_CN(srcType);
-    int bdepth = std::max(std::max(sdepth, ddepth), CV_32F);
-    int bufType = CV_MAKETYPE(bdepth, cn);
-
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype);
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta);
-
-    return createSeparableFilter_GPU(rowFilter, columnFilter);
-}
-
-void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype)
-{
-    if ((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
-    {
-        if ((bordertype & cv::BORDER_ISOLATED) != 0)
-        {
-            bordertype &= ~cv::BORDER_ISOLATED;
-
-            if ((bordertype != cv::BORDER_CONSTANT) &&
-                    (bordertype != cv::BORDER_REPLICATE))
-            {
-                CV_Error(Error::StsBadArg, "unsupported border type");
-            }
-        }
-    }
-
-    if (ddepth < 0)
-        ddepth = src.depth();
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
-
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype);
-    f->apply(src, dst);
-}
-
-Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType,
-                                           kx, ky, Point(-1, -1), 0, borderType);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Deriv Filter
-void cv::ocl::Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, int ksize, double scale, double delta, int borderType)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-
-    if (scale != 1)
-    {
-        // usually the smoothing part is the slowest to compute,
-        // so try to scale it instead of the faster differenciating part
-        if (dx == 0)
-            kx *= scale;
-        else
-            ky *= scale;
-    }
-
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType);
-}
-
-void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, double scale, double delta , int bordertype)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);
-
-    if (scale != 1)
-    {
-        // usually the smoothing part is the slowest to compute,
-        // so try to scale it instead of the faster differenciating part
-        if (dx == 0)
-            kx *= scale;
-        else
-            ky *= scale;
-    }
-
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, bordertype);
-}
-
-void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale,
-        double delta, int borderType)
-{
-    CV_Assert(delta == 0);
-
-    if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    CV_Assert(ksize == 1 || ksize == 3);
-
-    double K[2][9] =
-    {
-        {0, 1, 0, 1, -4, 1, 0, 1, 0},
-        {2, 0, 2, 0, -8, 0, 2, 0, 2}
-    };
-    Mat kernel(3, 3, CV_64F, (void *)K[ksize == 3 ? 1 : 0]);
-
-    if (scale != 1)
-        kernel *= scale;
-
-    filter2D(src, dst, ddepth, kernel, Point(-1, -1), 0, borderType);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Gaussian Filter
-
-Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype)
-{
-    int depth = CV_MAT_DEPTH(type);
-
-    if (sigma2 <= 0)
-        sigma2 = sigma1;
-
-    // automatic detection of kernel size from sigma
-    if (ksize.width <= 0 && sigma1 > 0)
-        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
-
-    if (ksize.height <= 0 && sigma2 > 0)
-        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
-
-    CV_Assert(ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1);
-
-    sigma1 = std::max(sigma1, 0.0);
-    sigma2 = std::max(sigma2, 0.0);
-
-    Mat kx = getGaussianKernel(ksize.width, sigma1, std::max(depth, CV_32F));
-    Mat ky;
-
-    if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON)
-        ky = kx;
-    else
-        ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F));
-
-    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype);
-}
-
-void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype)
-{
-    if (bordertype != BORDER_CONSTANT)
-    {
-        if (src.rows == 1)
-            ksize.height = 1;
-
-        if (src.cols == 1)
-            ksize.width = 1;
-    }
-
-    if (ksize.width == 1 && ksize.height == 1)
-    {
-        src.copyTo(dst);
-        return;
-    }
-
-    if ((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
-    {
-        if ((bordertype & cv::BORDER_ISOLATED) != 0)
-        {
-            bordertype &= ~cv::BORDER_ISOLATED;
-
-            if ((bordertype != cv::BORDER_CONSTANT) &&
-                    (bordertype != cv::BORDER_REPLICATE))
-            {
-                CV_Error(Error::StsBadArg, "unsupported border type");
-            }
-        }
-    }
-
-    dst.create(src.size(), src.type());
-
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype);
-    f->apply(src, dst);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Adaptive Bilateral Filter
-
-void cv::ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, double maxSigmaColor, Point anchor, int borderType)
-{
-    CV_Assert((ksize.width & 1) && (ksize.height & 1));  // ksize must be odd
-    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3);  // source must be 8bit RGB image
-    if( sigmaSpace <= 0 )
-        sigmaSpace = 1;
-    Mat lut(Size(ksize.width, ksize.height), CV_32FC1);
-    double sigma2 = sigmaSpace * sigmaSpace;
-    int idx = 0;
-    int w = ksize.width / 2;
-    int h = ksize.height / 2;
-
-    int ABF_GAUSSIAN_ocl = 1;
-
-    if(ABF_GAUSSIAN_ocl)
-    {
-        for(int y=-h; y<=h; y++)
-            for(int x=-w; x<=w; x++)
-        {
-            lut.at<float>(idx++) = expf( (float)(-0.5 * (x * x + y * y)/sigma2));
-        }
-    }
-    else
-    {
-        for(int y=-h; y<=h; y++)
-            for(int x=-w; x<=w; x++)
-        {
-            lut.at<float>(idx++) = (float) (sigma2 / (sigma2 + x * x + y * y));
-        }
-    }
-
-    oclMat dlut(lut);
-    int depth = src.depth();
-    int cn = src.oclchannels();
-
-    normalizeAnchor(anchor, ksize);
-    const static String kernelName = "adaptiveBilateralFilter";
-
-    dst.create(src.size(), src.type());
-
-    char btype[30];
-    switch(borderType)
-    {
-    case BORDER_CONSTANT:
-        sprintf(btype, "BORDER_CONSTANT");
-        break;
-    case BORDER_REPLICATE:
-        sprintf(btype, "BORDER_REPLICATE");
-        break;
-    case BORDER_REFLECT:
-        sprintf(btype, "BORDER_REFLECT");
-        break;
-    case BORDER_WRAP:
-        sprintf(btype, "BORDER_WRAP");
-        break;
-    case BORDER_REFLECT101:
-        sprintf(btype, "BORDER_REFLECT_101");
-        break;
-    default:
-        CV_Error(Error::StsBadArg, "This border type is not supported");
-        break;
-    }
-
-    //the following constants may be adjusted for performance concerns
-    const static size_t blockSizeX = 64, blockSizeY = 1, EXTRA = ksize.height - 1;
-
-    //Normalize the result by default
-    const float alpha = ksize.height * ksize.width;
-
-    const size_t gSize = blockSizeX - ksize.width / 2 * 2;
-    const size_t globalSizeX = (src.cols) % gSize == 0 ?
-        src.cols / gSize * blockSizeX :
-        (src.cols / gSize + 1) * blockSizeX;
-    const size_t rows_per_thread = 1 + EXTRA;
-    const size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ?
-        ((src.rows + rows_per_thread - 1) / rows_per_thread) :
-        (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
-
-    size_t globalThreads[3] = { globalSizeX, globalSizeY, 1};
-    size_t localThreads[3]  = { blockSizeX, blockSizeY, 1};
-
-    char build_options[250];
-
-    //LDATATYPESIZE is sizeof local data store. This is to exemplify effect of LDS on kernel performance
-    sprintf(build_options,
-        "-D VAR_PER_CHANNEL=1 -D CALCVAR=1 -D FIXED_WEIGHT=0 -D EXTRA=%d -D MAX_VAR_VAL=%f -D ABF_GAUSSIAN=%d"
-        " -D THREADS=%d -D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s",
-        static_cast<int>(EXTRA), static_cast<float>(maxSigmaColor*maxSigmaColor), static_cast<int>(ABF_GAUSSIAN_ocl),
-        static_cast<int>(blockSizeX), anchor.x, anchor.y, ksize.width, ksize.height, btype);
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), &src.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), &dst.data));
-    args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
-    args.push_back(std::make_pair(sizeof(cl_mem), &dlut.data));
-    int lut_step = dlut.step1();
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&lut_step));
-
-    openCLExecuteKernel(Context::getContext(), &filtering_adaptive_bilateral, kernelName,
-        globalThreads, localThreads, args, cn, depth, build_options);
-}
diff --git a/modules/ocl/src/gemm.cpp b/modules/ocl/src/gemm.cpp
deleted file mode 100644
index 50a2fdcce..000000000
--- a/modules/ocl/src/gemm.cpp
+++ /dev/null
@@ -1,205 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-namespace cv { namespace ocl {
-
-// used for clAmdBlas library to avoid redundant setup/teardown
-void clBlasSetup();
-void clBlasTeardown();
-
-}} /* namespace cv { namespace ocl */
-
-
-#if !defined HAVE_CLAMDBLAS
-void cv::ocl::gemm(const oclMat&, const oclMat&, double,
-                   const oclMat&, double, oclMat&, int)
-{
-    CV_Error(Error::OpenCLNoAMDBlasFft, "OpenCL BLAS is not implemented");
-}
-
-void cv::ocl::clBlasSetup()
-{
-    CV_Error(Error::OpenCLNoAMDBlasFft, "OpenCL BLAS is not implemented");
-}
-
-void cv::ocl::clBlasTeardown()
-{
-    //intentionally do nothing
-}
-
-#else
-#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
-using namespace cv;
-
-static bool clBlasInitialized = false;
-
-void cv::ocl::clBlasSetup()
-{
-    if(!clBlasInitialized)
-    {
-        AutoLock lock(getInitializationMutex());
-        if(!clBlasInitialized)
-        {
-            openCLSafeCall(clAmdBlasSetup());
-            clBlasInitialized = true;
-        }
-    }
-}
-
-void cv::ocl::clBlasTeardown()
-{
-    AutoLock lock(getInitializationMutex());
-    if(clBlasInitialized)
-    {
-        clAmdBlasTeardown();
-        clBlasInitialized = false;
-    }
-}
-
-void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
-                   const oclMat &src3, double beta, oclMat &dst, int flags)
-{
-    CV_Assert(src1.cols == src2.rows &&
-              (src3.empty() || (src1.rows == src3.rows && src2.cols == src3.cols)));
-    CV_Assert(!(cv::GEMM_3_T & flags)); // cv::GEMM_3_T is not supported
-    if(!src3.empty())
-    {
-        src3.copyTo(dst);
-    }
-    else
-    {
-        dst.create(src1.rows, src2.cols, src1.type());
-        dst.setTo(Scalar::all(0));
-    }
-
-    clBlasSetup();
-
-    const clAmdBlasTranspose transA = (cv::GEMM_1_T & flags) ? clAmdBlasTrans : clAmdBlasNoTrans;
-    const clAmdBlasTranspose transB = (cv::GEMM_2_T & flags) ? clAmdBlasTrans : clAmdBlasNoTrans;
-    const clAmdBlasOrder     order  = clAmdBlasRowMajor;
-
-    const int M = src1.rows;
-    const int N = src2.cols;
-    const int K = src1.cols;
-    int lda     = src1.step;
-    int ldb     = src2.step;
-    int ldc     = dst.step;
-    int offa    = src1.offset;
-    int offb    = src2.offset;
-    int offc    = dst.offset;
-
-    cl_command_queue clq = *(cl_command_queue*)src1.clCxt->getOpenCLCommandQueuePtr();
-    switch(src1.type())
-    {
-    case CV_32FC1:
-        lda  /= sizeof(float);
-        ldb  /= sizeof(float);
-        ldc  /= sizeof(float);
-        offa /= sizeof(float);
-        offb /= sizeof(float);
-        offc /= sizeof(float);
-
-        openCLSafeCall
-        (
-            clAmdBlasSgemmEx(order, transA, transB, M, N, K,
-                             alpha, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb,
-                             beta, (cl_mem)dst.data, offc, ldc, 1, &clq, 0, NULL, NULL)
-        );
-        break;
-    case CV_64FC1:
-        lda  /= sizeof(double);
-        ldb  /= sizeof(double);
-        ldc  /= sizeof(double);
-        offa /= sizeof(double);
-        offb /= sizeof(double);
-        offc /= sizeof(double);
-        openCLSafeCall
-        (
-            clAmdBlasDgemmEx(order, transA, transB, M, N, K,
-                             alpha, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb,
-                             beta, (cl_mem)dst.data, offc, ldc, 1, &clq, 0, NULL, NULL)
-        );
-        break;
-    case CV_32FC2:
-    {
-        lda  /= (2*sizeof(float));
-        ldb  /= (2*sizeof(float));
-        ldc  /= (2*sizeof(float));
-        offa /= (2*sizeof(float));
-        offb /= (2*sizeof(float));
-        offc /= (2*sizeof(float));
-        cl_float2 alpha_2 = {{alpha, 0}};
-        cl_float2 beta_2  = {{beta, 0}};
-        openCLSafeCall
-        (
-            clAmdBlasCgemmEx(order, transA, transB, M, N, K,
-                             alpha_2, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb,
-                             beta_2, (cl_mem)dst.data, offc, ldc, 1, &clq, 0, NULL, NULL)
-        );
-    }
-    break;
-    case CV_64FC2:
-    {
-        lda  /= (2*sizeof(double));
-        ldb  /= (2*sizeof(double));
-        ldc  /= (2*sizeof(double));
-        offa /= (2*sizeof(double));
-        offb /= (2*sizeof(double));
-        offc /= (2*sizeof(double));
-        cl_double2 alpha_2 = {{alpha, 0}};
-        cl_double2 beta_2  = {{beta, 0}};
-        openCLSafeCall
-        (
-            clAmdBlasZgemmEx(order, transA, transB, M, N, K,
-                             alpha_2, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb,
-                             beta_2, (cl_mem)dst.data, offc, ldc, 1, &clq, 0, NULL, NULL)
-        );
-    }
-    break;
-    }
-}
-#endif
diff --git a/modules/ocl/src/gftt.cpp b/modules/ocl/src/gftt.cpp
deleted file mode 100644
index b07286553..000000000
--- a/modules/ocl/src/gftt.cpp
+++ /dev/null
@@ -1,339 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-static bool use_cpu_sorter = true;
-
-namespace
-{
-enum SortMethod
-{
-    CPU_STL,
-    BITONIC,
-    SELECTION
-};
-
-const int GROUP_SIZE = 256;
-
-template<SortMethod method>
-struct Sorter
-{
-    //typedef EigType;
-};
-
-//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
-template<>
-struct Sorter<CPU_STL>
-{
-    typedef oclMat EigType;
-    static cv::Mutex cs;
-    static Mat mat_eig;
-
-    //prototype
-    static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
-    {
-        float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
-        float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
-        return v1 > v2;
-    }
-    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
-    {
-        cv::AutoLock lock(cs);
-        //temporarily use STL's sort function
-        Mat mat_corners = corners;
-        mat_eig = eig_tex;
-        std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
-        corners = mat_corners;
-    }
-};
-cv::Mutex Sorter<CPU_STL>::cs;
-cv::Mat   Sorter<CPU_STL>::mat_eig;
-
-template<>
-struct Sorter<BITONIC>
-{
-    typedef TextureCL EigType;
-
-    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
-    {
-        Context * cxt = Context::getContext();
-        size_t globalThreads[3] = {count / 2, 1, 1};
-        size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
-
-        // 2^numStages should be equal to count or the output is invalid
-        int numStages = 0;
-        for(int i = count; i > 1; i >>= 1)
-        {
-            ++numStages;
-        }
-        const int argc = 5;
-        std::vector< std::pair<size_t, const void *> > args(argc);
-        String kernelname = "sortCorners_bitonicSort";
-        args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
-        args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
-        args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
-        for(int stage = 0; stage < numStages; ++stage)
-        {
-            args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
-            for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
-            {
-                args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
-                openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
-            }
-        }
-    }
-};
-
-template<>
-struct Sorter<SELECTION>
-{
-    typedef TextureCL EigType;
-
-    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
-    {
-        Context * cxt = Context::getContext();
-
-        size_t globalThreads[3] = {count, 1, 1};
-        size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
-
-        std::vector< std::pair<size_t, const void *> > args;
-        //local
-        String kernelname = "sortCorners_selectionSortLocal";
-        int lds_size = GROUP_SIZE * sizeof(cl_float2);
-        args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
-        args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
-        args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
-        args.push_back( std::make_pair( lds_size,       (void*)NULL) );
-
-        openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
-
-        //final
-        kernelname = "sortCorners_selectionSortFinal";
-        args.pop_back();
-        openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
-    }
-};
-
-int findCorners_caller(
-    const TextureCL& eig,
-    const float threshold,
-    const oclMat& mask,
-    oclMat& corners,
-    const int max_count)
-{
-    std::vector<int> k;
-    Context * cxt = Context::getContext();
-
-    std::vector< std::pair<size_t, const void*> > args;
-    String kernelname = "findCorners";
-
-    const int mask_strip = mask.step / mask.elemSize1();
-
-    oclMat g_counter(1, 1, CV_32SC1);
-    g_counter.setTo(0);
-
-    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&eig  ));
-    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&mask.data ));
-    args.push_back(std::make_pair( sizeof(cl_mem),   (void*)&corners.data ));
-    args.push_back(std::make_pair( sizeof(cl_int),   (void*)&mask_strip));
-    args.push_back(std::make_pair( sizeof(cl_float), (void*)&threshold ));
-    args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.rows ));
-    args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.cols ));
-    args.push_back(std::make_pair( sizeof(cl_int), (void*)&max_count ));
-    args.push_back(std::make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
-
-    size_t globalThreads[3] = {eig.cols, eig.rows, 1};
-    size_t localThreads[3]  = {16, 16, 1};
-
-    const char * opt = mask.empty() ? "" : "-D WITH_MASK";
-    openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1, opt);
-    return std::min(Mat(g_counter).at<int>(0), max_count);
-}
-}//unnamed namespace
-
-void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
-{
-    CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
-    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
-
-    ensureSizeIsEnough(image.size(), CV_32F, eig_);
-
-    if (useHarrisDetector)
-        cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
-    else
-        cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
-
-    double maxVal = 0;
-    minMax(eig_, NULL, &maxVal);
-
-    ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
-
-    Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
-    int total = findCorners_caller(
-        *eig_tex,
-        static_cast<float>(maxVal * qualityLevel),
-        mask,
-        tmpCorners_,
-        tmpCorners_.cols);
-
-    if (total == 0)
-    {
-        corners.release();
-        return;
-    }
-    if(use_cpu_sorter)
-    {
-        Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
-    }
-    else
-    {
-        //if total is power of 2
-        if(((total - 1) & (total)) == 0)
-        {
-            Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
-        }
-        else
-        {
-            Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
-        }
-    }
-
-    if (minDistance < 1)
-    {
-        Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1);
-        tmpCorners_(roi_range).copyTo(corners);
-    }
-    else
-    {
-        std::vector<Point2f> tmp(total);
-        downloadPoints(tmpCorners_, tmp);
-
-        std::vector<Point2f> tmp2;
-        tmp2.reserve(total);
-
-        const int cell_size = cvRound(minDistance);
-        const int grid_width = (image.cols + cell_size - 1) / cell_size;
-        const int grid_height = (image.rows + cell_size - 1) / cell_size;
-
-        std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
-
-        for (int i = 0; i < total; ++i)
-        {
-            Point2f p = tmp[i];
-
-            bool good = true;
-
-            int x_cell = static_cast<int>(p.x / cell_size);
-            int y_cell = static_cast<int>(p.y / cell_size);
-
-            int x1 = x_cell - 1;
-            int y1 = y_cell - 1;
-            int x2 = x_cell + 1;
-            int y2 = y_cell + 1;
-
-            // boundary check
-            x1 = std::max(0, x1);
-            y1 = std::max(0, y1);
-            x2 = std::min(grid_width - 1, x2);
-            y2 = std::min(grid_height - 1, y2);
-
-            for (int yy = y1; yy <= y2; yy++)
-            {
-                for (int xx = x1; xx <= x2; xx++)
-                {
-                    std::vector<Point2f>& m = grid[yy * grid_width + xx];
-
-                    if (!m.empty())
-                    {
-                        for(size_t j = 0; j < m.size(); j++)
-                        {
-                            float dx = p.x - m[j].x;
-                            float dy = p.y - m[j].y;
-
-                            if (dx * dx + dy * dy < minDistance * minDistance)
-                            {
-                                good = false;
-                                goto break_out;
-                            }
-                        }
-                    }
-                }
-            }
-
-            break_out:
-
-            if(good)
-            {
-                grid[y_cell * grid_width + x_cell].push_back(p);
-
-                tmp2.push_back(p);
-
-                if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
-                    break;
-            }
-        }
-
-        corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
-    }
-}
-void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, std::vector<Point2f> &points_v)
-{
-    CV_DbgAssert(points.type() == CV_32FC2);
-    points_v.resize(points.cols);
-    openCLSafeCall(clEnqueueReadBuffer(
-        *(cl_command_queue*)getClCommandQueuePtr(),
-        reinterpret_cast<cl_mem>(points.data),
-        CL_TRUE,
-        0,
-        points.cols * sizeof(Point2f),
-        &points_v[0],
-        0,
-        NULL,
-        NULL));
-}
diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp
deleted file mode 100644
index a023f8a04..000000000
--- a/modules/ocl/src/haar.cpp
+++ /dev/null
@@ -1,1195 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Wang Weiyan, wangweiyanster@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Wu Xinglong, wxl370@126.com
-//    Wang Yao, bitwangyaoyao@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-/* these settings affect the quality of detection: change with care */
-#define CV_ADJUST_FEATURES  1
-#define CV_ADJUST_WEIGHTS   0
-#define CV_HAAR_FEATURE_MAX 3
-typedef int sumtype;
-typedef double sqsumtype;
-
-typedef struct CvHidHaarFeature
-{
-    struct
-    {
-        sumtype *p0, *p1, *p2, *p3;
-        float weight;
-    }
-    rect[CV_HAAR_FEATURE_MAX];
-}
-CvHidHaarFeature;
-
-
-typedef struct CvHidHaarTreeNode
-{
-    CvHidHaarFeature feature;
-    float threshold;
-    int left;
-    int right;
-}
-CvHidHaarTreeNode;
-
-
-typedef struct CvHidHaarClassifier
-{
-    int count;
-    //CvHaarFeature* orig_feature;
-    CvHidHaarTreeNode *node;
-    float *alpha;
-}
-CvHidHaarClassifier;
-
-
-typedef struct CvHidHaarStageClassifier
-{
-    int  count;
-    float threshold;
-    CvHidHaarClassifier *classifier;
-    int two_rects;
-
-    struct CvHidHaarStageClassifier *next;
-    struct CvHidHaarStageClassifier *child;
-    struct CvHidHaarStageClassifier *parent;
-}
-CvHidHaarStageClassifier;
-
-
-struct CvHidHaarClassifierCascade
-{
-    int  count;
-    int  is_stump_based;
-    int  has_tilted_features;
-    int  is_tree;
-    double inv_window_area;
-    CvMat sum, sqsum, tilted;
-    CvHidHaarStageClassifier *stage_classifier;
-    sqsumtype *pq0, *pq1, *pq2, *pq3;
-    sumtype *p0, *p1, *p2, *p3;
-
-    void **ipp_stages;
-};
-typedef struct
-{
-    int width_height;
-    int grpnumperline_totalgrp;
-    int imgoff;
-    float factor;
-} detect_piramid_info;
-#ifdef _MSC_VER
-#define _ALIGNED_ON(_ALIGNMENT) __declspec(align(_ALIGNMENT))
-
-typedef _ALIGNED_ON(128) struct  GpuHidHaarTreeNode
-{
-    _ALIGNED_ON(64) int p[CV_HAAR_FEATURE_MAX][4];
-    float weight[CV_HAAR_FEATURE_MAX] ;
-    float threshold ;
-    _ALIGNED_ON(16) float alpha[3] ;
-    _ALIGNED_ON(4) int left ;
-    _ALIGNED_ON(4) int right ;
-}
-GpuHidHaarTreeNode;
-
-
-typedef  _ALIGNED_ON(32) struct  GpuHidHaarClassifier
-{
-    _ALIGNED_ON(4) int count;
-    _ALIGNED_ON(8) GpuHidHaarTreeNode *node ;
-    _ALIGNED_ON(8) float *alpha ;
-}
-GpuHidHaarClassifier;
-
-
-typedef _ALIGNED_ON(64) struct   GpuHidHaarStageClassifier
-{
-    _ALIGNED_ON(4) int  count ;
-    _ALIGNED_ON(4) float threshold ;
-    _ALIGNED_ON(4) int two_rects ;
-    _ALIGNED_ON(8) GpuHidHaarClassifier *classifier ;
-    _ALIGNED_ON(8) struct GpuHidHaarStageClassifier *next;
-    _ALIGNED_ON(8) struct GpuHidHaarStageClassifier *child ;
-    _ALIGNED_ON(8) struct GpuHidHaarStageClassifier *parent ;
-}
-GpuHidHaarStageClassifier;
-
-
-typedef _ALIGNED_ON(64) struct  GpuHidHaarClassifierCascade
-{
-    _ALIGNED_ON(4) int  count ;
-    _ALIGNED_ON(4) int  is_stump_based ;
-    _ALIGNED_ON(4) int  has_tilted_features ;
-    _ALIGNED_ON(4) int  is_tree ;
-    _ALIGNED_ON(4) int pq0 ;
-    _ALIGNED_ON(4) int pq1 ;
-    _ALIGNED_ON(4) int pq2 ;
-    _ALIGNED_ON(4) int pq3 ;
-    _ALIGNED_ON(4) int p0 ;
-    _ALIGNED_ON(4) int p1 ;
-    _ALIGNED_ON(4) int p2 ;
-    _ALIGNED_ON(4) int p3 ;
-    _ALIGNED_ON(4) float inv_window_area ;
-} GpuHidHaarClassifierCascade;
-#else
-#define _ALIGNED_ON(_ALIGNMENT) __attribute__((aligned(_ALIGNMENT) ))
-
-typedef struct _ALIGNED_ON(128) GpuHidHaarTreeNode
-{
-    int p[CV_HAAR_FEATURE_MAX][4] _ALIGNED_ON(64);
-    float weight[CV_HAAR_FEATURE_MAX];// _ALIGNED_ON(16);
-    float threshold;// _ALIGNED_ON(4);
-    float alpha[3] _ALIGNED_ON(16);
-    int left _ALIGNED_ON(4);
-    int right _ALIGNED_ON(4);
-}
-GpuHidHaarTreeNode;
-
-typedef struct _ALIGNED_ON(32) GpuHidHaarClassifier
-{
-    int count _ALIGNED_ON(4);
-    GpuHidHaarTreeNode *node _ALIGNED_ON(8);
-    float *alpha _ALIGNED_ON(8);
-}
-GpuHidHaarClassifier;
-
-
-typedef struct _ALIGNED_ON(64) GpuHidHaarStageClassifier
-{
-    int  count _ALIGNED_ON(4);
-    float threshold _ALIGNED_ON(4);
-    int two_rects _ALIGNED_ON(4);
-    GpuHidHaarClassifier *classifier _ALIGNED_ON(8);
-    struct GpuHidHaarStageClassifier *next _ALIGNED_ON(8);
-    struct GpuHidHaarStageClassifier *child _ALIGNED_ON(8);
-    struct GpuHidHaarStageClassifier *parent _ALIGNED_ON(8);
-}
-GpuHidHaarStageClassifier;
-
-
-typedef struct _ALIGNED_ON(64) GpuHidHaarClassifierCascade
-{
-    int  count _ALIGNED_ON(4);
-    int  is_stump_based _ALIGNED_ON(4);
-    int  has_tilted_features _ALIGNED_ON(4);
-    int  is_tree _ALIGNED_ON(4);
-    int pq0 _ALIGNED_ON(4);
-    int pq1 _ALIGNED_ON(4);
-    int pq2 _ALIGNED_ON(4);
-    int pq3 _ALIGNED_ON(4);
-    int p0 _ALIGNED_ON(4);
-    int p1 _ALIGNED_ON(4);
-    int p2 _ALIGNED_ON(4);
-    int p3 _ALIGNED_ON(4);
-    float inv_window_area _ALIGNED_ON(4);
-} GpuHidHaarClassifierCascade;
-#endif
-
-const int icv_object_win_border = 1;
-const float icv_stage_threshold_bias = 0.0001f;
-double globaltime = 0;
-
-/* create more efficient internal representation of haar classifier cascade */
-static GpuHidHaarClassifierCascade * gpuCreateHidHaarClassifierCascade( CvHaarClassifierCascade *cascade, int *size, int *totalclassifier)
-{
-    GpuHidHaarClassifierCascade *out = 0;
-
-    int i, j, k, l;
-    int datasize;
-    int total_classifiers = 0;
-    int total_nodes = 0;
-    char errorstr[256];
-
-    GpuHidHaarStageClassifier *stage_classifier_ptr;
-    GpuHidHaarClassifier *haar_classifier_ptr;
-    GpuHidHaarTreeNode *haar_node_ptr;
-
-    CvSize orig_window_size;
-    int has_tilted_features = 0;
-
-    if( !CV_IS_HAAR_CLASSIFIER(cascade) )
-        CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier pointer" );
-
-    if( cascade->hid_cascade )
-        CV_Error( CV_StsError, "hid_cascade has been already created" );
-
-    if( !cascade->stage_classifier )
-        CV_Error( CV_StsNullPtr, "" );
-
-    if( cascade->count <= 0 )
-        CV_Error( CV_StsOutOfRange, "Negative number of cascade stages" );
-
-    orig_window_size = cascade->orig_window_size;
-
-    /* check input structure correctness and calculate total memory size needed for
-    internal representation of the classifier cascade */
-    for( i = 0; i < cascade->count; i++ )
-    {
-        CvHaarStageClassifier *stage_classifier = cascade->stage_classifier + i;
-
-        if( !stage_classifier->classifier ||
-                stage_classifier->count <= 0 )
-        {
-            sprintf( errorstr, "header of the stage classifier #%d is invalid "
-                     "(has null pointers or non-positive classfier count)", i );
-            CV_Error( CV_StsError, errorstr );
-        }
-
-        total_classifiers += stage_classifier->count;
-
-        for( j = 0; j < stage_classifier->count; j++ )
-        {
-            CvHaarClassifier *classifier = stage_classifier->classifier + j;
-
-            total_nodes += classifier->count;
-            for( l = 0; l < classifier->count; l++ )
-            {
-                for( k = 0; k < CV_HAAR_FEATURE_MAX; k++ )
-                {
-                    if( classifier->haar_feature[l].rect[k].r.width )
-                    {
-                        CvRect r = classifier->haar_feature[l].rect[k].r;
-                        int tilted = classifier->haar_feature[l].tilted;
-                        has_tilted_features |= tilted != 0;
-                        if( r.width < 0 || r.height < 0 || r.y < 0 ||
-                                r.x + r.width > orig_window_size.width
-                                ||
-                                (!tilted &&
-                                 (r.x < 0 || r.y + r.height > orig_window_size.height))
-                                ||
-                                (tilted && (r.x - r.height < 0 ||
-                                            r.y + r.width + r.height > orig_window_size.height)))
-                        {
-                            sprintf( errorstr, "rectangle #%d of the classifier #%d of "
-                                     "the stage classifier #%d is not inside "
-                                     "the reference (original) cascade window", k, j, i );
-                            CV_Error( CV_StsNullPtr, errorstr );
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    // this is an upper boundary for the whole hidden cascade size
-    datasize = sizeof(GpuHidHaarClassifierCascade)                   +
-               sizeof(GpuHidHaarStageClassifier) * cascade->count    +
-               sizeof(GpuHidHaarClassifier)      * total_classifiers +
-               sizeof(GpuHidHaarTreeNode)        * total_nodes;
-
-    *totalclassifier = total_classifiers;
-    *size = datasize;
-    out = (GpuHidHaarClassifierCascade *)cvAlloc( datasize );
-    memset( out, 0, sizeof(*out) );
-
-    /* init header */
-    out->count = cascade->count;
-    stage_classifier_ptr = (GpuHidHaarStageClassifier *)(out + 1);
-    haar_classifier_ptr = (GpuHidHaarClassifier *)(stage_classifier_ptr + cascade->count);
-    haar_node_ptr = (GpuHidHaarTreeNode *)(haar_classifier_ptr + total_classifiers);
-
-    out->is_stump_based = 1;
-    out->has_tilted_features = has_tilted_features;
-    out->is_tree = 0;
-
-    /* initialize internal representation */
-    for( i = 0; i < cascade->count; i++ )
-    {
-        CvHaarStageClassifier *stage_classifier = cascade->stage_classifier + i;
-        GpuHidHaarStageClassifier *hid_stage_classifier = stage_classifier_ptr + i;
-
-        hid_stage_classifier->count = stage_classifier->count;
-        hid_stage_classifier->threshold = stage_classifier->threshold - icv_stage_threshold_bias;
-        hid_stage_classifier->classifier = haar_classifier_ptr;
-        hid_stage_classifier->two_rects = 1;
-        haar_classifier_ptr += stage_classifier->count;
-
-        for( j = 0; j < stage_classifier->count; j++ )
-        {
-            CvHaarClassifier *classifier         = stage_classifier->classifier + j;
-            GpuHidHaarClassifier *hid_classifier = hid_stage_classifier->classifier + j;
-            int node_count = classifier->count;
-
-            float *alpha_ptr = &haar_node_ptr->alpha[0];
-
-            hid_classifier->count = node_count;
-            hid_classifier->node = haar_node_ptr;
-            hid_classifier->alpha = alpha_ptr;
-
-            for( l = 0; l < node_count; l++ )
-            {
-                GpuHidHaarTreeNode *node     = hid_classifier->node + l;
-                CvHaarFeature      *feature = classifier->haar_feature + l;
-
-                memset( node, -1, sizeof(*node) );
-                node->threshold = classifier->threshold[l];
-                node->left      = classifier->left[l];
-                node->right     = classifier->right[l];
-
-                if( fabs(feature->rect[2].weight) < DBL_EPSILON ||
-                        feature->rect[2].r.width == 0 ||
-                        feature->rect[2].r.height == 0 )
-                {
-                    node->p[2][0] = 0;
-                    node->p[2][1] = 0;
-                    node->p[2][2] = 0;
-                    node->p[2][3] = 0;
-                    node->weight[2] = 0;
-                }
-                else
-                    hid_stage_classifier->two_rects = 0;
-
-                memcpy( node->alpha, classifier->alpha, (node_count + 1)*sizeof(alpha_ptr[0]));
-                haar_node_ptr = haar_node_ptr + 1;
-            }
-            out->is_stump_based &= node_count == 1;
-        }
-    }
-
-    cascade->hid_cascade = (CvHidHaarClassifierCascade *)out;
-    assert( (char *)haar_node_ptr - (char *)out <= datasize );
-
-    return out;
-}
-
-
-#define sum_elem_ptr(sum,row,col)  \
-    ((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))
-
-#define sqsum_elem_ptr(sqsum,row,col)  \
-    ((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))
-
-#define calc_sum(rect,offset) \
-    ((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
-
-
-static void gpuSetImagesForHaarClassifierCascade( CvHaarClassifierCascade *_cascade,
-                                      double scale,
-                                      int step)
-{
-    GpuHidHaarClassifierCascade *cascade;
-    int coi0 = 0, coi1 = 0;
-    int i;
-    int datasize;
-    int total;
-    CvRect equRect;
-    double weight_scale;
-    GpuHidHaarStageClassifier *stage_classifier;
-
-    if( !CV_IS_HAAR_CLASSIFIER(_cascade) )
-        CV_Error( !_cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier pointer" );
-
-    if( scale <= 0 )
-        CV_Error( CV_StsOutOfRange, "Scale must be positive" );
-
-    if( coi0 || coi1 )
-        CV_Error( CV_BadCOI, "COI is not supported" );
-
-    if( !_cascade->hid_cascade )
-        gpuCreateHidHaarClassifierCascade(_cascade, &datasize, &total);
-
-    cascade = (GpuHidHaarClassifierCascade *) _cascade->hid_cascade;
-    stage_classifier = (GpuHidHaarStageClassifier *) (cascade + 1);
-
-    _cascade->scale = scale;
-    _cascade->real_window_size.width = cvRound( _cascade->orig_window_size.width * scale );
-    _cascade->real_window_size.height = cvRound( _cascade->orig_window_size.height * scale );
-
-    equRect.x = equRect.y = cvRound(scale);
-    equRect.width = cvRound((_cascade->orig_window_size.width - 2) * scale);
-    equRect.height = cvRound((_cascade->orig_window_size.height - 2) * scale);
-    weight_scale = 1. / (equRect.width * equRect.height);
-    cascade->inv_window_area = weight_scale;
-
-    cascade->pq0 = equRect.x;
-    cascade->pq1 = equRect.y;
-    cascade->pq2 = equRect.x + equRect.width;
-    cascade->pq3 = equRect.y + equRect.height;
-
-    cascade->p0 = equRect.x;
-    cascade->p1 = equRect.y;
-    cascade->p2 = equRect.x + equRect.width;
-    cascade->p3 = equRect.y + equRect.height;
-
-
-    /* init pointers in haar features according to real window size and
-    given image pointers */
-    for( i = 0; i < _cascade->count; i++ )
-    {
-        int j, k, l;
-        for( j = 0; j < stage_classifier[i].count; j++ )
-        {
-            for( l = 0; l < stage_classifier[i].classifier[j].count; l++ )
-            {
-                CvHaarFeature *feature =
-                    &_cascade->stage_classifier[i].classifier[j].haar_feature[l];
-                GpuHidHaarTreeNode *hidnode = &stage_classifier[i].classifier[j].node[l];
-                double sum0 = 0, area0 = 0;
-                CvRect r[3];
-
-                int base_w = -1, base_h = -1;
-                int new_base_w = 0, new_base_h = 0;
-                int kx, ky;
-                int flagx = 0, flagy = 0;
-                int x0 = 0, y0 = 0;
-                int nr;
-
-                /* align blocks */
-                for( k = 0; k < CV_HAAR_FEATURE_MAX; k++ )
-                {
-                    if(!hidnode->p[k][0])
-                        break;
-                    r[k] = feature->rect[k].r;
-                    base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].width - 1) );
-                    base_w = (int)CV_IMIN( (unsigned)base_w, (unsigned)(r[k].x - r[0].x - 1) );
-                    base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].height - 1) );
-                    base_h = (int)CV_IMIN( (unsigned)base_h, (unsigned)(r[k].y - r[0].y - 1) );
-                }
-
-                nr = k;
-                base_w += 1;
-                base_h += 1;
-                if(base_w == 0)
-                    base_w = 1;
-                kx = r[0].width / base_w;
-                if(base_h == 0)
-                    base_h = 1;
-                ky = r[0].height / base_h;
-
-                if( kx <= 0 )
-                {
-                    flagx = 1;
-                    new_base_w = cvRound( r[0].width * scale ) / kx;
-                    x0 = cvRound( r[0].x * scale );
-                }
-
-                if( ky <= 0 )
-                {
-                    flagy = 1;
-                    new_base_h = cvRound( r[0].height * scale ) / ky;
-                    y0 = cvRound( r[0].y * scale );
-                }
-
-                for( k = 0; k < nr; k++ )
-                {
-                    CvRect tr;
-                    double correction_ratio;
-
-                    if( flagx )
-                    {
-                        tr.x = (r[k].x - r[0].x) * new_base_w / base_w + x0;
-                        tr.width = r[k].width * new_base_w / base_w;
-                    }
-                    else
-                    {
-                        tr.x = cvRound( r[k].x * scale );
-                        tr.width = cvRound( r[k].width * scale );
-                    }
-
-                    if( flagy )
-                    {
-                        tr.y = (r[k].y - r[0].y) * new_base_h / base_h + y0;
-                        tr.height = r[k].height * new_base_h / base_h;
-                    }
-                    else
-                    {
-                        tr.y = cvRound( r[k].y * scale );
-                        tr.height = cvRound( r[k].height * scale );
-                    }
-
-#if CV_ADJUST_WEIGHTS
-                    {
-                        // RAINER START
-                        const float orig_feature_size =  (float)(feature->rect[k].r.width) * feature->rect[k].r.height;
-                        const float orig_norm_size = (float)(_cascade->orig_window_size.width) * (_cascade->orig_window_size.height);
-                        const float feature_size = float(tr.width * tr.height);
-                        //const float normSize    = float(equRect.width*equRect.height);
-                        float target_ratio = orig_feature_size / orig_norm_size;
-                        //float isRatio = featureSize / normSize;
-                        //correctionRatio = targetRatio / isRatio / normSize;
-                        correction_ratio = target_ratio / feature_size;
-                        // RAINER END
-                    }
-#else
-                    correction_ratio = weight_scale * (!feature->tilted ? 1 : 0.5);
-#endif
-
-                    if( !feature->tilted )
-                    {
-                        hidnode->p[k][0] = tr.x;
-                        hidnode->p[k][1] = tr.y;
-                        hidnode->p[k][2] = tr.x + tr.width;
-                        hidnode->p[k][3] = tr.y + tr.height;
-                    }
-                    else
-                    {
-                        hidnode->p[k][2] = (tr.y + tr.width) * step + tr.x + tr.width;
-                        hidnode->p[k][3] = (tr.y + tr.width + tr.height) * step + tr.x + tr.width - tr.height;
-                        hidnode->p[k][0] = tr.y * step + tr.x;
-                        hidnode->p[k][1] = (tr.y + tr.height) * step + tr.x - tr.height;
-                    }
-                    hidnode->weight[k] = (float)(feature->rect[k].weight * correction_ratio);
-                    if( k == 0 )
-                        area0 = tr.width * tr.height;
-                    else
-                        sum0 += hidnode->weight[k] * tr.width * tr.height;
-                }
-                hidnode->weight[0] = (float)(-sum0 / area0);
-            } /* l */
-        } /* j */
-    }
-}
-
-static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade)
-{
-    GpuHidHaarClassifierCascade *cascade;
-    int i;
-    int datasize;
-    int total;
-    CvRect equRect;
-    double weight_scale;
-    GpuHidHaarStageClassifier *stage_classifier;
-
-    if( !CV_IS_HAAR_CLASSIFIER(_cascade) )
-        CV_Error( !_cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier pointer" );
-
-    if( !_cascade->hid_cascade )
-        gpuCreateHidHaarClassifierCascade(_cascade, &datasize, &total);
-
-    cascade = (GpuHidHaarClassifierCascade *) _cascade->hid_cascade;
-    stage_classifier = (GpuHidHaarStageClassifier *) cascade + 1;
-
-    _cascade->scale = 1.0;
-    _cascade->real_window_size.width =  _cascade->orig_window_size.width ;
-    _cascade->real_window_size.height = _cascade->orig_window_size.height;
-
-    equRect.x = equRect.y = 1;
-    equRect.width = _cascade->orig_window_size.width - 2;
-    equRect.height = _cascade->orig_window_size.height - 2;
-    weight_scale = 1;
-    cascade->inv_window_area = weight_scale;
-
-    cascade->p0 = equRect.x;
-    cascade->p1 = equRect.y;
-    cascade->p2 = equRect.height;
-    cascade->p3 = equRect.width ;
-    for( i = 0; i < _cascade->count; i++ )
-    {
-        int j, l;
-        for( j = 0; j < stage_classifier[i].count; j++ )
-        {
-            for( l = 0; l < stage_classifier[i].classifier[j].count; l++ )
-            {
-                const CvHaarFeature *feature =
-                    &_cascade->stage_classifier[i].classifier[j].haar_feature[l];
-                GpuHidHaarTreeNode *hidnode = &stage_classifier[i].classifier[j].node[l];
-
-                for( int k = 0; k < CV_HAAR_FEATURE_MAX; k++ )
-                {
-                    const CvRect tr = feature->rect[k].r;
-                    if (tr.width == 0)
-                        break;
-                    double correction_ratio = weight_scale * (!feature->tilted ? 1 : 0.5);
-                    hidnode->p[k][0] = tr.x;
-                    hidnode->p[k][1] = tr.y;
-                    hidnode->p[k][2] = tr.width;
-                    hidnode->p[k][3] = tr.height;
-                    hidnode->weight[k] = (float)(feature->rect[k].weight * correction_ratio);
-                }
-            } /* l */
-        } /* j */
-    }
-}
-void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv::Rect>& faces,
-                                            double scaleFactor, int minNeighbors, int flags,
-                                            Size minSize, Size maxSize)
-//CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemStorage *storage, double scaleFactor,
-//        int minNeighbors, int flags, CvSize minSize, CvSize maxSize)
-{
-    CvHaarClassifierCascade *cascade = (CvHaarClassifierCascade*)getOldCascade();
-
-    const double GROUP_EPS = 0.2;
-
-    cv::ConcurrentRectVector allCandidates;
-    std::vector<cv::Rect> rectList;
-    std::vector<int> rweights;
-    double factor;
-    int datasize=0;
-    int totalclassifier=0;
-
-    GpuHidHaarClassifierCascade *gcascade;
-    GpuHidHaarStageClassifier    *stage;
-    GpuHidHaarClassifier         *classifier;
-    GpuHidHaarTreeNode           *node;
-
-    int *candidate;
-    cl_int status;
-
-    bool findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
-
-    if( maxSize.height == 0 || maxSize.width == 0 )
-    {
-        maxSize.height = gimg.rows;
-        maxSize.width = gimg.cols;
-    }
-
-    if( !CV_IS_HAAR_CLASSIFIER(cascade) )
-        CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
-
-    //if( !storage )
-    //    CV_Error( CV_StsNullPtr, "Null storage pointer" );
-
-    if( CV_MAT_DEPTH(gimg.type()) != CV_8U )
-        CV_Error( CV_StsUnsupportedFormat, "Only 8-bit images are supported" );
-
-    if( scaleFactor <= 1 )
-        CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
-
-    if( findBiggestObject )
-        flags &= ~CV_HAAR_SCALE_IMAGE;
-
-    if( !cascade->hid_cascade )
-        gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
-
-    //result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), storage );
-
-    if( CV_MAT_CN(gimg.type()) > 1 )
-    {
-        oclMat gtemp;
-        cvtColor( gimg, gtemp, COLOR_BGR2GRAY );
-        gimg = gtemp;
-    }
-
-    if( findBiggestObject )
-        flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
-
-    if( gimg.cols < minSize.width || gimg.rows < minSize.height )
-        CV_Error(CV_StsError, "Image too small");
-
-    cl_command_queue qu = getClCommandQueue(Context::getContext());
-    if( (flags & CV_HAAR_SCALE_IMAGE) )
-    {
-        CvSize winSize0 = cascade->orig_window_size;
-        int totalheight = 0;
-        int indexy = 0;
-        CvSize sz;
-        std::vector<CvSize> sizev;
-        std::vector<float> scalev;
-        for(factor = 1.f;; factor *= scaleFactor)
-        {
-            CvSize winSize( cvRound(winSize0.width * factor), cvRound(winSize0.height * factor) );
-            sz.width     = cvRound( gimg.cols / factor ) + 1;
-            sz.height    = cvRound( gimg.rows / factor ) + 1;
-            CvSize sz1( sz.width - winSize0.width - 1,      sz.height - winSize0.height - 1 );
-
-            if( sz1.width <= 0 || sz1.height <= 0 )
-                break;
-            if( winSize.width > maxSize.width || winSize.height > maxSize.height )
-                break;
-            if( winSize.width < minSize.width || winSize.height < minSize.height )
-                continue;
-
-            totalheight += sz.height;
-            sizev.push_back(sz);
-            scalev.push_back(factor);
-        }
-
-        oclMat gimg1(gimg.rows, gimg.cols, CV_8UC1);
-        oclMat gsum(totalheight + 4, gimg.cols + 1, CV_32SC1);
-        oclMat gsqsum(totalheight + 4, gimg.cols + 1, CV_32FC1);
-
-        int sdepth = 0;
-        if(Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-            sdepth = CV_64FC1;
-        else
-            sdepth = CV_32FC1;
-        sdepth = CV_MAT_DEPTH(sdepth);
-        int type = CV_MAKE_TYPE(sdepth, 1);
-        oclMat gsqsum_t(totalheight + 4, gimg.cols + 1, type);
-
-        cl_mem stagebuffer;
-        cl_mem nodebuffer;
-        cl_mem candidatebuffer;
-        cl_mem scaleinfobuffer;
-        cv::Rect roi, roi2;
-        cv::Mat imgroi, imgroisq;
-        cv::ocl::oclMat resizeroi, gimgroi, gimgroisq;
-
-        int grp_per_CU = 12;
-
-        size_t blocksize = 8;
-        size_t localThreads[3] = { blocksize, blocksize , 1 };
-        size_t globalThreads[3] = { grp_per_CU *(gsum.clCxt->getDeviceInfo().maxComputeUnits) *localThreads[0],
-                                    localThreads[1], 1
-                                  };
-        int outputsz = 256 * globalThreads[0] / localThreads[0];
-        int loopcount = sizev.size();
-        detect_piramid_info *scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
-
-        for( int i = 0; i < loopcount; i++ )
-        {
-            sz = sizev[i];
-            factor = scalev[i];
-            roi = Rect(0, indexy, sz.width, sz.height);
-            roi2 = Rect(0, 0, sz.width - 1, sz.height - 1);
-            resizeroi = gimg1(roi2);
-            gimgroi = gsum(roi);
-            gimgroisq = gsqsum_t(roi);
-            int width = gimgroi.cols - 1 - cascade->orig_window_size.width;
-            int height = gimgroi.rows - 1 - cascade->orig_window_size.height;
-            scaleinfo[i].width_height = (width << 16) | height;
-
-
-            int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
-            int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
-
-            scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
-            scaleinfo[i].imgoff = gimgroi.offset >> 2;
-            scaleinfo[i].factor = factor;
-            cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR);
-            cv::ocl::integral(resizeroi, gimgroi, gimgroisq);
-
-            indexy += sz.height;
-        }
-        if(gsqsum_t.depth() == CV_64F)
-            gsqsum_t.convertTo(gsqsum, CV_32FC1);
-        else
-            gsqsum = gsqsum_t;
-
-        gcascade   = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
-        stage      = (GpuHidHaarStageClassifier *)(gcascade + 1);
-        classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
-        node       = (GpuHidHaarTreeNode *)(classifier->node);
-
-        int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) -
-                       sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
-
-        candidate = (int *)malloc(4 * sizeof(int) * outputsz);
-
-        gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
-
-        stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
-        openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
-
-        nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode));
-
-        openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0, nodenum * sizeof(GpuHidHaarTreeNode),
-                                            node, 0, NULL, NULL));
-        candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY, 4 * sizeof(int) * outputsz);
-
-        scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
-        openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
-
-        int startstage = 0;
-        int endstage = gcascade->count;
-        int startnode = 0;
-        int pixelstep = gsum.step / 4;
-        int splitstage = 3;
-        int splitnode = stage[0].count + stage[1].count + stage[2].count;
-        cl_int4 p, pq;
-        p.s[0] = gcascade->p0;
-        p.s[1] = gcascade->p1;
-        p.s[2] = gcascade->p2;
-        p.s[3] = gcascade->p3;
-        pq.s[0] = gcascade->pq0;
-        pq.s[1] = gcascade->pq1;
-        pq.s[2] = gcascade->pq2;
-        pq.s[3] = gcascade->pq3;
-        float correction = gcascade->inv_window_area;
-
-        std::vector<std::pair<size_t, const void *> > args;
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&nodebuffer ));
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&pixelstep ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&loopcount ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startstage ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitstage ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&endstage ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnode ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitnode ));
-        args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&p ));
-        args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&pq ));
-        args.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction ));
-
-        if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE))
-        {
-            //setup local group size
-            localThreads[0] = 8;
-            localThreads[1] = 16;
-            localThreads[2] = 1;
-
-            //init maximal number of workgroups
-            int WGNumX = 1+(sizev[0].width /(localThreads[0]));
-            int WGNumY = 1+(sizev[0].height/(localThreads[1]));
-            int WGNumZ = loopcount;
-            int WGNum = 0; //accurate number of non -empty workgroups
-            oclMat      oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U);
-            {
-                cl_int4*    pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status);
-                openCLVerifyCall(status);
-                for(int z=0;z<WGNumZ;++z)
-                {
-                    int     Width  = (scaleinfo[z].width_height >> 16)&0xFFFF;
-                    int     Height = (scaleinfo[z].width_height >> 0 )& 0xFFFF;
-                    for(int y=0;y<WGNumY;++y)
-                    {
-                        int     gy = y*localThreads[1];
-                        if(gy>=(Height-cascade->orig_window_size.height))
-                            continue; // no data to process
-                        for(int x=0;x<WGNumX;++x)
-                        {
-                            int     gx = x*localThreads[0];
-                            if(gx>=(Width-cascade->orig_window_size.width))
-                                continue; // no data to process
-
-                            // save no-empty workgroup info into array
-                            pWGInfo[WGNum].s[0] = scaleinfo[z].width_height;
-                            pWGInfo[WGNum].s[1] = (gx << 16) | gy;
-                            pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff;
-                            memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float));
-                            WGNum++;
-                        }
-                    }
-                }
-                openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,pWGInfo,0,0,0));
-                pWGInfo = NULL;
-            }
-
-            // setup global sizes to have linear array of workgroups with WGNum size
-            globalThreads[0] = localThreads[0]*WGNum;
-            globalThreads[1] = localThreads[1];
-            globalThreads[2] = 1;
-
-#define NODE_SIZE 12
-            // pack node info to have less memory loads
-            oclMat  oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U);
-            {
-                cl_int  status;
-                cl_int* pNodesPK = (cl_int*)clEnqueueMapBuffer(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,true,CL_MAP_WRITE, 0, oclNodesPK.step, 0,0,0,&status);
-                openCLVerifyCall(status);
-                //use known local data stride to precalulate indexes
-                int DATA_SIZE_X = (localThreads[0]+cascade->orig_window_size.width);
-                // check that maximal value is less than maximal unsigned short
-                assert(DATA_SIZE_X*cascade->orig_window_size.height+cascade->orig_window_size.width < (int)USHRT_MAX);
-                for(int i = 0;i<nodenum;++i)
-                {//process each node from classifier
-                    struct NodePK
-                    {
-                        unsigned short  slm_index[3][4];
-                        float           weight[3];
-                        float           threshold;
-                        float           alpha[2];
-                    };
-                    struct NodePK * pOut = (struct NodePK *)(pNodesPK + NODE_SIZE*i);
-                    for(int k=0;k<3;++k)
-                    {// calc 4 short indexes in shared local mem for each rectangle instead of 2 (x,y) pair.
-                        int* p = &(node[i].p[k][0]);
-                        pOut->slm_index[k][0] = (unsigned short)(p[1]*DATA_SIZE_X+p[0]);
-                        pOut->slm_index[k][1] = (unsigned short)(p[1]*DATA_SIZE_X+p[2]);
-                        pOut->slm_index[k][2] = (unsigned short)(p[3]*DATA_SIZE_X+p[0]);
-                        pOut->slm_index[k][3] = (unsigned short)(p[3]*DATA_SIZE_X+p[2]);
-                    }
-                    //store used float point values for each node
-                    pOut->weight[0] = node[i].weight[0];
-                    pOut->weight[1] = node[i].weight[1];
-                    pOut->weight[2] = node[i].weight[2];
-                    pOut->threshold = node[i].threshold;
-                    pOut->alpha[0] = node[i].alpha[0];
-                   pOut->alpha[1] = node[i].alpha[1];
-                }
-                openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,pNodesPK,0,0,0));
-                pNodesPK = NULL;
-            }
-            // add 2 additional buffers (WGinfo and packed nodes) as 2 last args
-            args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&oclNodesPK.datastart ));
-            args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&oclWGInfo.datastart ));
-
-            //form build options for kernel
-            String  options = "-D PACKED_CLASSIFIER";
-            options += format(" -D NODE_SIZE=%d",NODE_SIZE);
-            options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
-            options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
-            options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
-            options += format(" -D LSx=%d",localThreads[0]);
-            options += format(" -D LSy=%d",localThreads[1]);
-            options += format(" -D SPLITNODE=%d",splitnode);
-            options += format(" -D SPLITSTAGE=%d",splitstage);
-            options += format(" -D OUTPUTSZ=%d",outputsz);
-
-            // init candiate global count by 0
-            int pattern = 0;
-            openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL));
-            // execute face detector
-            openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str());
-            //read candidate buffer back and put it into host list
-            openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
-            assert(candidate[0]<outputsz);
-            //printf("candidate[0]=%d\n",candidate[0]);
-            for(int i = 1; i <= candidate[0]; i++)
-            {
-                allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],candidate[4 * i + 2], candidate[4 * i + 3]));
-            }
-        }
-        else
-        {
-            const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
-
-            openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
-
-            openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
-
-            for(int i = 0; i < outputsz; i++)
-                if(candidate[4 * i + 2] != 0)
-                    allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
-                    candidate[4 * i + 2], candidate[4 * i + 3]));
-        }
-
-        free(scaleinfo);
-        free(candidate);
-        openCLSafeCall(clReleaseMemObject(stagebuffer));
-        openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
-        openCLSafeCall(clReleaseMemObject(nodebuffer));
-        openCLSafeCall(clReleaseMemObject(candidatebuffer));
-
-    }
-    else
-    {
-        CvSize winsize0 = cascade->orig_window_size;
-        int n_factors = 0;
-        oclMat gsum;
-        oclMat gsqsum;
-        oclMat gsqsum_t;
-        cv::ocl::integral(gimg, gsum, gsqsum_t);
-        if(gsqsum_t.depth() == CV_64F)
-            gsqsum_t.convertTo(gsqsum, CV_32FC1);
-        else
-            gsqsum = gsqsum_t;
-        CvSize sz;
-        std::vector<CvSize> sizev;
-        std::vector<float> scalev;
-        gpuSetHaarClassifierCascade(cascade);
-        gcascade   = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
-        stage      = (GpuHidHaarStageClassifier *)(gcascade + 1);
-        classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
-        node       = (GpuHidHaarTreeNode *)(classifier->node);
-        cl_mem stagebuffer;
-        cl_mem nodebuffer;
-        cl_mem candidatebuffer;
-        cl_mem scaleinfobuffer;
-        cl_mem pbuffer;
-        cl_mem correctionbuffer;
-        for( n_factors = 0, factor = 1;
-                cvRound(factor * winsize0.width) < gimg.cols - 10 &&
-                cvRound(factor * winsize0.height) < gimg.rows - 10;
-                n_factors++, factor *= scaleFactor )
-        {
-            CvSize winSize( cvRound( winsize0.width * factor ), cvRound( winsize0.height * factor ) );
-            if( winSize.width < minSize.width || winSize.height < minSize.height )
-            {
-                continue;
-            }
-            sizev.push_back(winSize);
-            scalev.push_back(factor);
-        }
-        int loopcount = scalev.size();
-        if(loopcount == 0)
-        {
-            loopcount = 1;
-            n_factors = 1;
-            sizev.push_back(minSize);
-            scalev.push_back( std::min(cvRound(minSize.width / winsize0.width), cvRound(minSize.height / winsize0.height)) );
-        }
-        detect_piramid_info *scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
-        cl_int4 *p = (cl_int4 *)malloc(sizeof(cl_int4) * loopcount);
-        float *correction = (float *)malloc(sizeof(float) * loopcount);
-        int grp_per_CU = 12;
-        size_t blocksize = 8;
-        size_t localThreads[3] = { blocksize, blocksize , 1 };
-        size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->getDeviceInfo().maxComputeUnits *localThreads[0],
-                                    localThreads[1], 1 };
-        int outputsz = 256 * globalThreads[0] / localThreads[0];
-        int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) -
-                       sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
-        nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY,
-                                        nodenum * sizeof(GpuHidHaarTreeNode));
-        openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0,
-                                            nodenum * sizeof(GpuHidHaarTreeNode),
-                                            node, 0, NULL, NULL));
-        cl_mem newnodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_WRITE,
-                               loopcount * nodenum * sizeof(GpuHidHaarTreeNode));
-        int startstage = 0;
-        int endstage = gcascade->count;
-        for(int i = 0; i < loopcount; i++)
-        {
-            sz = sizev[i];
-            factor = scalev[i];
-            double ystep = std::max(2., factor);
-            int equRect_x = cvRound(factor * gcascade->p0);
-            int equRect_y = cvRound(factor * gcascade->p1);
-            int equRect_w = cvRound(factor * gcascade->p3);
-            int equRect_h = cvRound(factor * gcascade->p2);
-            p[i].s[0] = equRect_x;
-            p[i].s[1] = equRect_y;
-            p[i].s[2] = equRect_x + equRect_w;
-            p[i].s[3] = equRect_y + equRect_h;
-            correction[i] = 1. / (equRect_w * equRect_h);
-            int width = (gsum.cols - 1 - sz.width  + ystep - 1) / ystep;
-            int height = (gsum.rows - 1 - sz.height + ystep - 1) / ystep;
-            int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
-            int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
-
-            scaleinfo[i].width_height = (width << 16) | height;
-            scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
-            scaleinfo[i].imgoff = 0;
-            scaleinfo[i].factor = factor;
-            int startnodenum = nodenum * i;
-            float factor2 = (float)factor;
-
-            std::vector<std::pair<size_t, const void *> > args1;
-            args1.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&nodebuffer ));
-            args1.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&newnodebuffer ));
-            args1.push_back ( std::make_pair(sizeof(cl_float) , (void *)&factor2 ));
-            args1.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction[i] ));
-            args1.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnodenum ));
-
-            size_t globalThreads2[3] = {nodenum, 1, 1};
-            openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
-        }
-
-        int step = gsum.step / 4;
-        int startnode = 0;
-        int splitstage = 3;
-        stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
-        openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
-        candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, 4 * sizeof(int) * outputsz);
-        scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
-        openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
-        pbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_int4) * loopcount);
-        openCLSafeCall(clEnqueueWriteBuffer(qu, pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL));
-        correctionbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_float) * loopcount);
-        openCLSafeCall(clEnqueueWriteBuffer(qu, correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
-
-        std::vector<std::pair<size_t, const void *> > args;
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&newnodebuffer ));
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&step ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&loopcount ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startstage ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitstage ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&endstage ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnode ));
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&pbuffer ));
-        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&correctionbuffer ));
-        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&nodenum ));
-        const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
-        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1, build_options);
-
-        candidate = (int *)clEnqueueMapBuffer(qu, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, &status);
-
-        for(int i = 0; i < outputsz; i++)
-        {
-            if(candidate[4 * i + 2] != 0)
-                allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], candidate[4 * i + 2], candidate[4 * i + 3]));
-        }
-
-        free(scaleinfo);
-        free(p);
-        free(correction);
-        clEnqueueUnmapMemObject(qu, candidatebuffer, candidate, 0, 0, 0);
-        openCLSafeCall(clReleaseMemObject(stagebuffer));
-        openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
-        openCLSafeCall(clReleaseMemObject(nodebuffer));
-        openCLSafeCall(clReleaseMemObject(newnodebuffer));
-        openCLSafeCall(clReleaseMemObject(candidatebuffer));
-        openCLSafeCall(clReleaseMemObject(pbuffer));
-        openCLSafeCall(clReleaseMemObject(correctionbuffer));
-    }
-
-    cvFree(&cascade->hid_cascade);
-    rectList.resize(allCandidates.size());
-    if(!allCandidates.empty())
-        std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
-
-    if( minNeighbors != 0 || findBiggestObject )
-        groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
-    else
-        rweights.resize(rectList.size(), 0);
-
-    faces.clear();
-    if( findBiggestObject && rectList.size() )
-    {
-        Rect result_comp(0, 0, 0, 0);
-        for( size_t i = 0; i < rectList.size(); i++ )
-        {
-            cv::Rect r = rectList[i];
-            if( r.area() > result_comp.area() )
-            {
-                result_comp = r;
-            }
-        }
-        faces.push_back(result_comp);
-    }
-    else
-    {
-        faces = rectList;
-    }
-}
diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp
deleted file mode 100644
index 70fe99187..000000000
--- a/modules/ocl/src/hog.cpp
+++ /dev/null
@@ -1,1946 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//     Wenju He, wenju@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or bpied warranties, including, but not limited to, the bpied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-#define CELL_WIDTH 8
-#define CELL_HEIGHT 8
-#define CELLS_PER_BLOCK_X 2
-#define CELLS_PER_BLOCK_Y 2
-#define NTHREADS 256
-
-static oclMat gauss_w_lut;
-static bool hog_device_cpu;
-
-namespace cv
-{
-    namespace ocl
-    {
-        namespace device
-        {
-            namespace hog
-            {
-                int cnbins;
-                int cblock_stride_x;
-                int cblock_stride_y;
-                int cnblocks_win_x;
-                int cnblocks_win_y;
-                int cblock_hist_size;
-                int cdescr_size;
-                int cdescr_width;
-                int cdescr_height;
-
-                void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
-                                      int nblocks_win_x, int nblocks_win_y);
-
-                void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
-                                   int height, int width, const cv::ocl::oclMat &grad,
-                                   const cv::ocl::oclMat &qangle,
-                                   const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists);
-
-                void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
-                                     int height, int width, cv::ocl::oclMat &block_hists,
-                                     float threshold);
-
-                void classify_hists(int win_height, int win_width, int block_stride_y,
-                                    int block_stride_x, int win_stride_y, int win_stride_x,
-                                    int height, int width, const cv::ocl::oclMat &block_hists,
-                                    const cv::ocl::oclMat &coefs, float free_coef,
-                                    float threshold, cv::ocl::oclMat &labels);
-
-                void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y,
-                                            int block_stride_x, int win_stride_y, int win_stride_x,
-                                            int height, int width, const cv::ocl::oclMat &block_hists,
-                                            cv::ocl::oclMat &descriptors);
-                void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y,
-                                            int block_stride_x, int win_stride_y, int win_stride_x,
-                                            int height, int width, const cv::ocl::oclMat &block_hists,
-                                            cv::ocl::oclMat &descriptors);
-
-                void compute_gradients_8UC1(int height, int width, const cv::ocl::oclMat &img,
-                                            float angle_scale, cv::ocl::oclMat &grad,
-                                            cv::ocl::oclMat &qangle, bool correct_gamma);
-                void compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat &img,
-                                            float angle_scale, cv::ocl::oclMat &grad,
-                                            cv::ocl::oclMat &qangle, bool correct_gamma);
-            }
-        }
-    }
-}
-
-using namespace ::cv::ocl::device;
-
-cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_,
-                                      Size cell_size_, int nbins_, double win_sigma_,
-                                      double threshold_L2hys_, bool gamma_correction_, int nlevels_)
-    : win_size(win_size_),
-      block_size(block_size_),
-      block_stride(block_stride_),
-      cell_size(cell_size_),
-      nbins(nbins_),
-      win_sigma(win_sigma_),
-      threshold_L2hys(threshold_L2hys_),
-      gamma_correction(gamma_correction_),
-      nlevels(nlevels_)
-{
-    CV_Assert((win_size.width  - block_size.width ) % block_stride.width  == 0 &&
-              (win_size.height - block_size.height) % block_stride.height == 0);
-
-    CV_Assert(block_size.width % cell_size.width == 0 &&
-        block_size.height % cell_size.height == 0);
-
-    CV_Assert(block_stride == cell_size);
-
-    CV_Assert(cell_size == Size(8, 8));
-
-    Size cells_per_block(block_size.width / cell_size.width,
-        block_size.height / cell_size.height);
-    CV_Assert(cells_per_block == Size(2, 2));
-
-    cv::Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
-    hog::set_up_constants(nbins, block_stride.width, block_stride.height,
-        blocks_per_win.width, blocks_per_win.height);
-
-    effect_size = Size(0, 0);
-
-    if (isCpuDevice())
-        hog_device_cpu = true;
-    else
-        hog_device_cpu = false;
-}
-
-size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
-{
-    return numPartsWithin(win_size, block_size, block_stride).area() * getBlockHistogramSize();
-}
-
-size_t cv::ocl::HOGDescriptor::getBlockHistogramSize() const
-{
-    Size cells_per_block = Size(block_size.width / cell_size.width,
-        block_size.height / cell_size.height);
-    return (size_t)(nbins * cells_per_block.area());
-}
-
-double cv::ocl::HOGDescriptor::getWinSigma() const
-{
-    return win_sigma >= 0 ? win_sigma : (block_size.width + block_size.height) / 8.0;
-}
-
-bool cv::ocl::HOGDescriptor::checkDetectorSize() const
-{
-    size_t detector_size = detector.rows * detector.cols;
-    size_t descriptor_size = getDescriptorSize();
-    return detector_size == 0 || detector_size == descriptor_size ||
-        detector_size == descriptor_size + 1;
-}
-
-void cv::ocl::HOGDescriptor::setSVMDetector(const std::vector<float> &_detector)
-{
-    std::vector<float> detector_reordered(_detector.size());
-
-    size_t block_hist_size = getBlockHistogramSize();
-    cv::Size blocks_per_img = numPartsWithin(win_size, block_size, block_stride);
-
-    for (int i = 0; i < blocks_per_img.height; ++i)
-        for (int j = 0; j < blocks_per_img.width; ++j)
-        {
-            const float *src = &_detector[0] + (j * blocks_per_img.height + i) * block_hist_size;
-            float *dst = &detector_reordered[0] + (i * blocks_per_img.width + j) * block_hist_size;
-            for (size_t k = 0; k < block_hist_size; ++k)
-                dst[k] = src[k];
-        }
-
-    this->detector.upload(Mat(detector_reordered).reshape(1, 1));
-
-    size_t descriptor_size = getDescriptorSize();
-    free_coef = _detector.size() > descriptor_size ? _detector[descriptor_size] : 0;
-
-    CV_Assert(checkDetectorSize());
-}
-
-void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
-{
-    if (!image_scale.empty())
-        return;
-
-    if (effect_size == Size(0, 0))
-        effect_size = img.size();
-
-    grad.create(img.size(), CV_32FC2);
-    qangle.create(img.size(), CV_8UC2);
-
-    const size_t block_hist_size = getBlockHistogramSize();
-    const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
-    block_hists.create(1,
-        static_cast<int>(block_hist_size * blocks_per_img.area()) + 256, CV_32F);
-
-    Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
-    labels.create(1, wins_per_img.area(), CV_8U);
-
-    float sigma = getWinSigma();
-    float scale = 1.f / (2.f * sigma * sigma);
-    Mat gaussian_lut(1, 512, CV_32FC1);
-    int idx = 0;
-    for(int i=-8; i<8; i++)
-        for(int j=-8; j<8; j++)
-            gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
-    for(int i=-8; i<8; i++)
-        for(int j=-8; j<8; j++)
-            gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
-
-    gauss_w_lut.upload(gaussian_lut);
-}
-
-void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle)
-{
-    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
-
-    float angleScale = (float)(nbins / CV_PI);
-    switch (img.type())
-    {
-    case CV_8UC1:
-        hog::compute_gradients_8UC1(effect_size.height, effect_size.width, img,
-            angleScale, grad, qangle, gamma_correction);
-        break;
-    case CV_8UC4:
-        hog::compute_gradients_8UC4(effect_size.height, effect_size.width, img,
-            angleScale, grad, qangle, gamma_correction);
-        break;
-    }
-}
-
-
-void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img)
-{
-    computeGradient(img, this->grad, this->qangle);
-
-    hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
-        effect_size.width, grad, qangle, gauss_w_lut, block_hists);
-
-    hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
-        effect_size.width, block_hists, (float)threshold_L2hys);
-}
-
-
-void cv::ocl::HOGDescriptor::getDescriptors(const oclMat &img, Size win_stride,
-                                            oclMat &descriptors, int descr_format)
-{
-    CV_Assert(win_stride.width % block_stride.width == 0 &&
-        win_stride.height % block_stride.height == 0);
-
-    init_buffer(img, win_stride);
-
-    computeBlockHistograms(img);
-
-    const size_t block_hist_size = getBlockHistogramSize();
-    Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
-    Size wins_per_img   = numPartsWithin(effect_size, win_size, win_stride);
-
-    descriptors.create(wins_per_img.area(),
-        static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
-
-    switch (descr_format)
-    {
-    case DESCR_FORMAT_ROW_BY_ROW:
-        hog::extract_descrs_by_rows(win_size.height, win_size.width,
-            block_stride.height, block_stride.width, win_stride.height, win_stride.width,
-            effect_size.height, effect_size.width, block_hists, descriptors);
-        break;
-    case DESCR_FORMAT_COL_BY_COL:
-        hog::extract_descrs_by_cols(win_size.height, win_size.width,
-            block_stride.height, block_stride.width, win_stride.height, win_stride.width,
-            effect_size.height, effect_size.width, block_hists, descriptors);
-        break;
-    default:
-        CV_Error(Error::StsBadArg, "Unknown descriptor format");
-    }
-}
-
-
-void cv::ocl::HOGDescriptor::detect(const oclMat &img, std::vector<Point> &hits,
-                                    double hit_threshold, Size win_stride, Size padding)
-{
-    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
-    CV_Assert(padding == Size(0, 0));
-
-    hits.clear();
-    if (detector.empty())
-        return;
-
-    if (win_stride == Size())
-        win_stride = block_stride;
-    else
-        CV_Assert(win_stride.width % block_stride.width == 0 &&
-            win_stride.height % block_stride.height == 0);
-    init_buffer(img, win_stride);
-
-    computeBlockHistograms(img);
-
-    hog::classify_hists(win_size.height, win_size.width, block_stride.height,
-        block_stride.width, win_stride.height, win_stride.width,
-        effect_size.height, effect_size.width, block_hists, detector,
-        (float)free_coef, (float)hit_threshold, labels);
-
-    labels.download(labels_host);
-    unsigned char *vec = labels_host.ptr();
-    Size wins_per_img = numPartsWithin(effect_size, win_size, win_stride);
-    for (int i = 0; i < wins_per_img.area(); i++)
-    {
-        int y = i / wins_per_img.width;
-        int x = i - wins_per_img.width * y;
-        if (vec[i])
-            hits.push_back(Point(x * win_stride.width, y * win_stride.height));
-    }
-}
-
-
-
-void cv::ocl::HOGDescriptor::detectMultiScale(const oclMat &img, std::vector<Rect> &found_locations,
-                                              double hit_threshold, Size win_stride, Size padding,
-                                              double scale0, int group_threshold)
-{
-    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
-    CV_Assert(scale0 > 1);
-
-    std::vector<double> level_scale;
-    double scale = 1.;
-    int levels = 0;
-
-    for (levels = 0; levels < nlevels; levels++)
-    {
-        level_scale.push_back(scale);
-        if (cvRound(img.cols / scale) < win_size.width ||
-                cvRound(img.rows / scale) < win_size.height || scale0 <= 1)
-            break;
-        scale *= scale0;
-    }
-    levels = std::max(levels, 1);
-    level_scale.resize(levels);
-
-    std::vector<Rect> all_candidates;
-    std::vector<Point> locations;
-
-    if (win_stride == Size())
-        win_stride = block_stride;
-    else
-        CV_Assert(win_stride.width % block_stride.width == 0 &&
-            win_stride.height % block_stride.height == 0);
-    init_buffer(img, win_stride);
-    image_scale.create(img.size(), img.type());
-
-    for (size_t i = 0; i < level_scale.size(); i++)
-    {
-        scale = level_scale[i];
-        effect_size = Size(cvRound(img.cols / scale), cvRound(img.rows / scale));
-        if (effect_size == img.size())
-        {
-            detect(img, locations, hit_threshold, win_stride, padding);
-        }
-        else
-        {
-            resize(img, image_scale, effect_size);
-            detect(image_scale, locations, hit_threshold, win_stride, padding);
-        }
-        Size scaled_win_size(cvRound(win_size.width * scale),
-            cvRound(win_size.height * scale));
-        for (size_t j = 0; j < locations.size(); j++)
-            all_candidates.push_back(Rect(Point2d(locations[j]) * scale, scaled_win_size));
-    }
-
-    found_locations.assign(all_candidates.begin(), all_candidates.end());
-    groupRectangles(found_locations, group_threshold, 0.2);
-}
-
-int cv::ocl::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
-{
-    return (size - part_size + stride) / stride;
-}
-
-cv::Size cv::ocl::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size,
-                                                cv::Size stride)
-{
-    return Size(numPartsWithin(size.width, part_size.width, stride.width),
-        numPartsWithin(size.height, part_size.height, stride.height));
-}
-
-std::vector<float> cv::ocl::HOGDescriptor::getDefaultPeopleDetector()
-{
-    return getPeopleDetector64x128();
-}
-
-std::vector<float> cv::ocl::HOGDescriptor::getPeopleDetector48x96()
-{
-    static const float detector[] =
-    {
-        0.294350f, -0.098796f, -0.129522f, 0.078753f, 0.387527f, 0.261529f,
-        0.145939f, 0.061520f, 0.328699f, 0.227148f, -0.066467f, -0.086723f,
-        0.047559f, 0.106714f, 0.037897f, 0.111461f, -0.024406f, 0.304769f,
-        0.254676f, -0.069235f, 0.082566f, 0.147260f, 0.326969f, 0.148888f,
-        0.055270f, -0.087985f, 0.261720f, 0.143442f, 0.026812f, 0.238212f,
-        0.194020f, 0.056341f, -0.025854f, -0.034444f, -0.156631f, 0.205174f,
-        0.089008f, -0.139811f, -0.100147f, -0.037830f, -0.029230f, -0.055641f,
-        0.033248f, -0.016512f, 0.155244f, 0.247315f, -0.124694f, -0.048414f,
-        -0.062219f, 0.193683f, 0.004574f, 0.055089f, 0.093565f, 0.167712f,
-        0.167581f, 0.018895f, 0.215258f, 0.122609f, 0.090520f, -0.067219f,
-        -0.049029f, -0.099615f, 0.241804f, -0.094893f, -0.176248f, 0.001727f,
-        -0.134473f, 0.104442f, 0.050942f, 0.081165f, 0.072156f, 0.121646f,
-        0.002656f, -0.297974f, -0.133587f, -0.060121f, -0.092515f, -0.048974f,
-        -0.084754f, -0.180111f, -0.038590f, 0.086283f, -0.134636f, -0.107249f,
-        0.132890f, 0.141556f, 0.249425f, 0.130273f, -0.030031f, 0.073212f,
-        -0.008155f, 0.019931f, 0.071688f, 0.000300f, -0.019525f, -0.021725f,
-        -0.040993f, -0.086841f, 0.070124f, 0.240033f, 0.265350f, 0.043208f,
-        0.166754f, 0.091453f, 0.060916f, -0.036972f, -0.091043f, 0.079873f,
-        0.219781f, 0.158102f, -0.140618f, -0.043016f, 0.124802f, 0.093668f,
-        0.103208f, 0.094872f, 0.080541f, 0.137711f, 0.160566f, -0.169231f,
-        0.013983f, 0.309508f, -0.004217f, -0.057200f, -0.064489f, 0.014066f,
-        0.361009f, 0.251328f, -0.080983f, -0.044183f, 0.061436f, -0.037381f,
-        -0.078786f, 0.030993f, 0.066314f, 0.037683f, 0.152325f, -0.091683f,
-        0.070203f, 0.217856f, 0.036435f, -0.076462f, 0.006254f, -0.094431f,
-        0.154829f, -0.023038f, -0.196961f, -0.024594f, 0.178465f, -0.050139f,
-        -0.045932f, -0.000965f, 0.109112f, 0.046165f, -0.159373f, -0.008713f,
-        0.041307f, 0.097129f, -0.057211f, -0.064599f, 0.077165f, 0.176167f,
-        0.138322f, 0.065753f, -0.104950f, 0.017933f, 0.136255f, -0.011598f,
-        0.047007f, 0.080550f, 0.068619f, 0.084661f, -0.035493f, -0.091314f,
-        -0.041411f, 0.060971f, -0.101912f, -0.079870f, -0.085977f, -0.022686f,
-        0.079788f, -0.098064f, -0.054603f, 0.040383f, 0.300794f, 0.128603f,
-        0.094844f, 0.047407f, 0.101825f, 0.061832f, -0.162160f, -0.204553f,
-        -0.035165f, 0.101450f, -0.016641f, -0.027140f, -0.134392f, -0.008743f,
-        0.102331f, 0.114853f, 0.009644f, 0.062823f, 0.237339f, 0.167843f,
-        0.053066f, -0.012592f, 0.043158f, 0.002305f, 0.065001f, -0.038929f,
-        -0.020356f, 0.152343f, 0.043469f, -0.029967f, -0.042948f, 0.032481f,
-        0.068488f, -0.110840f, -0.111083f, 0.111980f, -0.002072f, -0.005562f,
-        0.082926f, 0.006635f, -0.108153f, 0.024242f, -0.086464f, -0.189884f,
-        -0.017492f, 0.191456f, -0.007683f, -0.128769f, -0.038017f, -0.132380f,
-        0.091926f, 0.079696f, -0.106728f, -0.007656f, 0.172744f, 0.011576f,
-        0.009883f, 0.083258f, -0.026516f, 0.145534f, 0.153924f, -0.130290f,
-        -0.108945f, 0.124490f, -0.003186f, -0.100485f, 0.015024f, -0.060512f,
-        0.026288f, -0.086713f, -0.169012f, 0.076517f, 0.215778f, 0.043701f,
-        -0.131642f, -0.012585f, -0.045181f, -0.118183f, -0.241544f, -0.167293f,
-        -0.020107f, -0.019917f, -0.101827f, -0.107096f, -0.010503f, 0.044938f,
-        0.189680f, 0.217119f, -0.046086f, 0.044508f, 0.199716f, -0.036004f,
-        -0.148927f, 0.013355f, -0.078279f, 0.030451f, 0.056301f, -0.024609f,
-        0.083224f, 0.099533f, -0.039432f, -0.138880f, 0.005482f, -0.024120f,
-        -0.140468f, -0.066381f, -0.017057f, 0.009260f, -0.058004f, -0.028486f,
-        -0.061610f, 0.007483f, -0.158309f, -0.150687f, -0.044595f, -0.105121f,
-        -0.045763f, -0.006618f, -0.024419f, -0.117713f, -0.119366f, -0.175941f,
-        -0.071542f, 0.119027f, 0.111362f, 0.043080f, 0.034889f, 0.093003f,
-        0.007842f, 0.057368f, -0.108834f, -0.079968f, 0.230959f, 0.020205f,
-        0.011470f, 0.098877f, 0.101310f, -0.030215f, -0.018018f, -0.059552f,
-        -0.106157f, 0.021866f, -0.036471f, 0.080051f, 0.041165f, -0.082101f,
-        0.117726f, 0.030961f, -0.054763f, -0.084102f, -0.185778f, -0.061305f,
-        -0.038089f, -0.110728f, -0.264010f, 0.076675f, -0.077111f, -0.137644f,
-        0.036232f, 0.277995f, 0.019116f, 0.107738f, 0.144003f, 0.080304f,
-        0.215036f, 0.228897f, 0.072713f, 0.077773f, 0.120168f, 0.075324f,
-        0.062730f, 0.122478f, -0.049008f, 0.164912f, 0.162450f, 0.041246f,
-        0.009891f, -0.097827f, -0.038700f, -0.023027f, -0.120020f, 0.203364f,
-        0.248474f, 0.149810f, -0.036276f, -0.082814f, -0.090343f, -0.027143f,
-        -0.075689f, -0.320310f, -0.000500f, -0.143334f, -0.065077f, -0.186936f,
-        0.129372f, 0.116431f, 0.181699f, 0.170436f, 0.418854f, 0.460045f,
-        0.333719f, 0.230515f, 0.047822f, -0.044954f, -0.068086f, 0.140179f,
-        -0.044821f, 0.085550f, 0.092483f, -0.107296f, -0.130670f, -0.206629f,
-        0.114601f, -0.317869f, -0.076663f, 0.038680f, 0.212753f, -0.016059f,
-        -0.126526f, -0.163602f, 0.210154f, 0.099887f, -0.126366f, 0.118453f,
-        0.019309f, -0.021611f, -0.096499f, -0.111809f, -0.200489f, 0.142854f,
-        0.228840f, -0.353346f, -0.179151f, 0.116834f, 0.252389f, -0.031728f,
-        -0.188135f, -0.158998f, 0.386523f, 0.122315f, 0.209944f, 0.394023f,
-        0.359030f, 0.260717f, 0.170335f, 0.013683f, -0.142596f, -0.026138f,
-        -0.011878f, -0.150519f, 0.047159f, -0.107062f, -0.147347f, -0.187689f,
-        -0.186027f, -0.208048f, 0.058468f, -0.073026f, -0.236556f, -0.079788f,
-        -0.146216f, -0.058563f, -0.101361f, -0.071294f, -0.071093f, 0.116919f,
-        0.234304f, 0.306781f, 0.321866f, 0.240000f, 0.073261f, -0.012173f,
-        0.026479f, 0.050173f, 0.166127f, 0.228955f, 0.061905f, 0.156460f,
-        0.205990f, 0.120672f, 0.037350f, 0.167884f, 0.290099f, 0.420900f,
-        -0.012601f, 0.189839f, 0.306378f, 0.118383f, -0.095598f, -0.072360f,
-        -0.132496f, -0.224259f, -0.126021f, 0.022714f, 0.284039f, 0.051369f,
-        -0.000927f, -0.058735f, -0.083354f, -0.141254f, -0.187578f, -0.202669f,
-        0.048902f, 0.246597f, 0.441863f, 0.342519f, 0.066979f, 0.215286f,
-        0.188191f, -0.072240f, -0.208142f, -0.030196f, 0.178141f, 0.136985f,
-        -0.043374f, -0.181098f, 0.091815f, 0.116177f, -0.126690f, -0.386625f,
-        0.368165f, 0.269149f, -0.088042f, -0.028823f, 0.092961f, 0.024099f,
-        0.046112f, 0.176756f, 0.135849f, 0.124955f, 0.195467f, -0.037218f,
-        0.167217f, 0.188938f, 0.053528f, -0.066561f, 0.133721f, -0.070565f,
-        0.115898f, 0.152435f, -0.116993f, -0.110592f, -0.179005f, 0.026668f,
-        0.080530f, 0.075084f, -0.070401f, 0.012497f, 0.021849f, -0.139764f,
-        -0.022020f, -0.096301f, -0.064954f, -0.127446f, -0.013806f, -0.108315f,
-        0.156285f, 0.149867f, -0.011382f, 0.064532f, 0.029168f, 0.027393f,
-        0.069716f, 0.153735f, 0.038459f, 0.230714f, 0.253840f, 0.059522f,
-        -0.045053f, 0.014083f, 0.071103f, 0.068747f, 0.095887f, 0.005832f,
-        0.144887f, 0.026357f, -0.067359f, -0.044151f, -0.123283f, -0.019911f,
-        0.005318f, 0.109208f, -0.003201f, -0.021734f, 0.142025f, -0.066907f,
-        -0.120070f, -0.188639f, 0.012472f, -0.048704f, -0.012366f, -0.184828f,
-        0.168591f, 0.267166f, 0.058208f, -0.044101f, 0.033500f, 0.178558f,
-        0.104550f, 0.122418f, 0.080177f, 0.173246f, 0.298537f, 0.064173f,
-        0.053397f, 0.174341f, 0.230984f, 0.117025f, 0.166242f, 0.227781f,
-        0.120623f, 0.176952f, -0.011393f, -0.086483f, -0.008270f, 0.051700f,
-        -0.153369f, -0.058837f, -0.057639f, -0.060115f, 0.026349f, -0.160745f,
-        -0.037894f, -0.048575f, 0.041052f, -0.022112f, 0.060365f, 0.051906f,
-        0.162657f, 0.138519f, -0.050185f, -0.005938f, 0.071301f, 0.127686f,
-        0.062342f, 0.144400f, 0.072600f, 0.198436f, 0.246219f, -0.078185f,
-        -0.036169f, 0.075934f, 0.047328f, -0.013601f, 0.087205f, 0.019900f,
-        0.022606f, -0.015365f, -0.092506f, 0.075275f, -0.116375f, 0.050500f,
-        0.045118f, 0.166567f, 0.072073f, 0.060371f, 0.131747f, -0.169863f,
-        -0.039352f, -0.047486f, -0.039797f, -0.204312f, 0.021710f, 0.129443f,
-        -0.021173f, 0.173416f, -0.070794f, -0.063986f, 0.069689f, -0.064099f,
-        -0.123201f, -0.017372f, -0.206870f, 0.065863f, 0.113226f, 0.024707f,
-        -0.071341f, -0.066964f, -0.098278f, -0.062927f, 0.075840f, 0.014716f,
-        0.019378f, 0.132699f, -0.074191f, -0.089557f, -0.078446f, -0.197488f,
-        -0.173665f, 0.052583f, 0.044361f, 0.113549f, 0.098492f, 0.077379f,
-        -0.011146f, -0.192593f, -0.164435f, 0.045568f, 0.205699f, 0.049187f,
-        -0.082281f, 0.134874f, 0.185499f, 0.034968f, -0.119561f, -0.112372f,
-        -0.115091f, -0.054042f, -0.183816f, -0.078100f, 0.190695f, 0.091617f,
-        0.004257f, -0.041135f, -0.061453f, -0.141592f, -0.194809f, -0.120638f,
-        0.020168f, 0.109672f, 0.067398f, -0.015238f, -0.239145f, -0.264671f,
-        -0.185176f, 0.050472f, 0.020793f, 0.035678f, 0.022839f, -0.052055f,
-        -0.127968f, -0.113049f, -0.228416f, -0.258281f, -0.053437f, 0.076424f,
-        0.061450f, 0.237478f, 0.003618f, -0.055865f, -0.108087f, -0.028937f,
-        0.045585f, 0.052829f, -0.001471f, 0.022826f, 0.059565f, -0.104430f,
-        -0.077266f, -0.211882f, -0.212078f, 0.028074f, 0.075846f, 0.016265f,
-        0.161879f, 0.134477f, 0.008935f, -0.048041f, 0.074692f, 0.004928f,
-        -0.025156f, 0.192874f, 0.074410f, 0.308732f, 0.267400f, 0.094208f,
-        -0.005251f, 0.042041f, -0.032148f, 0.015588f, 0.252869f, 0.175302f,
-        0.022892f, 0.081673f, 0.063208f, 0.162626f, 0.194426f, 0.233890f,
-        0.262292f, 0.186930f, 0.084079f, -0.286388f, -0.213034f, -0.048867f,
-        -0.207669f, -0.170050f, 0.011673f, -0.092958f, -0.192786f, -0.273536f,
-        0.230904f, 0.266732f, 0.320519f, 0.297155f, 0.548169f, 0.304922f,
-        0.132687f, 0.247333f, 0.212488f, -0.271472f, -0.142105f, -0.002627f,
-        -0.119215f, 0.128383f, 0.100079f, -0.057490f, -0.121902f, -0.228892f,
-        0.202292f, -0.399795f, -0.371326f, -0.095836f, -0.063626f, -0.161375f,
-        -0.311180f, -0.294797f, 0.242122f, 0.011788f, 0.095573f, 0.322523f,
-        0.511840f, 0.322880f, 0.313259f, 0.173331f, 0.002542f, -0.029802f,
-        0.324766f, -0.326170f, -0.340547f, -0.138288f, -0.002963f, -0.114060f,
-        -0.377312f, -0.442570f, 0.212446f, -0.007759f, -0.011576f, 0.169711f,
-        0.308689f, 0.317348f, 0.539390f, 0.332845f, 0.057331f, -0.068180f,
-        0.101994f, 0.266995f, 0.209570f, 0.355730f, 0.091635f, 0.170238f,
-        0.125215f, 0.274154f, 0.070223f, 0.025515f, 0.049946f, -0.000550f,
-        0.043715f, -0.141843f, 0.020844f, 0.129871f, 0.256588f, 0.105015f,
-        0.148339f, 0.170682f, 0.028792f, 0.074037f, 0.160042f, 0.405137f,
-        0.246187f, 0.352160f, 0.168951f, 0.222263f, 0.264439f, 0.065945f,
-        0.021963f, -0.075084f, 0.093105f, 0.027318f, 0.098864f, 0.057566f,
-        -0.080282f, 0.185032f, 0.314419f, 0.333727f, 0.125798f, 0.294919f,
-        0.386002f, 0.217619f, -0.183517f, -0.278622f, -0.002342f, -0.027821f,
-        -0.134266f, -0.331843f, -0.008296f, 0.124564f, 0.053712f, -0.369016f,
-        -0.095036f, 0.209381f, 0.423760f, 0.371760f, 0.106397f, 0.369408f,
-        0.485608f, 0.231201f, -0.138685f, -0.349208f, -0.070083f, 0.028991f,
-        -0.081630f, -0.395992f, -0.146791f, -0.027354f, 0.063396f, -0.272484f,
-        0.058299f, 0.338207f, 0.110767f, -0.052642f, -0.233848f, -0.027448f,
-        0.030328f, 0.155572f, -0.093826f, 0.019331f, 0.120638f, 0.006292f,
-        -0.106083f, -0.236290f, -0.140933f, -0.088067f, -0.025138f, -0.208395f,
-        -0.025502f, 0.144192f, -0.048353f, -0.106144f, -0.305121f, -0.114147f,
-        0.090963f, 0.327727f, 0.035606f, -0.093779f, 0.002651f, -0.171081f,
-        -0.188131f, -0.216571f, -0.209101f, -0.054402f, 0.157147f, -0.057127f,
-        0.066584f, 0.008988f, 0.041191f, 0.034456f, -0.078255f, 0.052099f,
-        -0.022239f, 0.066981f, -0.117520f, -0.072637f, 0.062512f, 0.037570f,
-        -0.057544f, -0.312359f, 0.034357f, -0.031549f, 0.002566f, -0.207375f,
-        -0.070654f, -0.018786f, -0.044815f, -0.012814f, -0.076320f, 0.078183f,
-        0.023877f, 0.117078f, 0.022292f, -0.205424f, -0.060430f, -0.017296f,
-        -0.004827f, -0.321036f, -0.092155f, 0.038837f, 0.073190f, -0.067513f,
-        0.026521f, 0.171945f, 0.087318f, 0.034495f, -0.034089f, 0.154410f,
-        -0.061431f, 0.007435f, -0.111094f, -0.095976f, 0.014741f, -0.132324f,
-        -0.029517f, -0.192160f, 0.098667f, 0.020762f, 0.177050f, -0.064510f,
-        -0.054437f, -0.058678f, -0.001858f, 0.167602f, 0.015735f, 0.054338f,
-        0.016477f, 0.186381f, -0.010667f, 0.054692f, 0.126742f, 0.013140f,
-        0.090353f, -0.133608f, -0.018017f, -0.152619f, 0.027600f, -0.138700f,
-        -0.050274f, 0.045141f, -0.118731f, 0.094797f, -0.167605f, 0.097461f,
-        -0.009131f, 0.199920f, -0.052976f, 0.158194f, 0.178568f, -0.107600f,
-        0.009671f, -0.084072f, -0.040258f, -0.205673f, 0.102891f, 0.223511f,
-        0.042699f, 0.118548f, -0.021274f, 0.110997f, -0.155121f, 0.027696f,
-        -0.149968f, 0.051552f, -0.129219f, 0.173524f, 0.073972f, -0.189045f,
-        -0.034523f, -0.106655f, -0.011843f, -0.197381f, 0.219413f, 0.183197f,
-        -0.054920f, 0.144955f, 0.036517f, -0.085412f, -0.229070f, -0.143710f,
-        -0.049486f, 0.156634f, -0.008673f, -0.064778f, 0.082344f, 0.145673f,
-        0.002912f, -0.210121f, -0.116564f, 0.078425f, 0.220908f, -0.067594f,
-        0.048610f, 0.084912f, -0.066202f, -0.112515f, -0.217767f, -0.082640f,
-        -0.017414f, 0.230265f, -0.070735f, 0.066073f, 0.215256f, 0.071157f,
-        -0.087220f, -0.202235f, -0.011918f, 0.099562f, 0.174716f, -0.063845f,
-        -0.121055f, 0.014367f, 0.132709f, -0.005060f, -0.244606f, -0.179693f,
-        -0.134690f, 0.023239f, -0.193116f, -0.076975f, -0.021164f, -0.001938f,
-        -0.163799f, -0.111437f, -0.210362f, -0.166376f, 0.034754f, 0.010036f,
-        -0.021917f, 0.068014f, -0.086893f, -0.251746f, -0.267171f, 0.037383f,
-        0.003966f, 0.033571f, -0.151506f, 0.025437f, -0.020626f, -0.308454f,
-        -0.343143f, -0.092263f, -0.026261f, -0.028345f, 0.036036f, 0.035169f,
-        0.129470f, 0.122205f, 0.015661f, -0.070612f, -0.094333f, -0.066055f,
-        -0.041083f, 0.159146f, 0.073184f, 0.110044f, 0.174471f, 0.078069f,
-        -0.014881f, 0.008116f, 0.013209f, 0.075857f, 0.195605f, 0.062714f,
-        0.067955f, 0.056544f, -0.153908f, -0.141749f, -0.072550f, 0.033523f,
-        -0.024665f, 0.134487f, 0.079076f, 0.133562f, 0.227130f, 0.018054f,
-        0.004928f, 0.169162f, 0.065152f, 0.072160f, 0.131631f, 0.096303f,
-        0.054288f, 0.106256f, 0.114632f, 0.119038f, 0.515200f, 0.247429f,
-        0.199134f, 0.211957f, 0.127558f, -0.294684f, -0.194890f, -0.049988f,
-        -0.112247f, -0.008122f, -0.006176f, 0.037035f, -0.110881f, -0.249989f,
-        0.152434f, 0.234621f, 0.153340f, 0.349283f, 0.683049f, 0.157174f,
-        0.124844f, 0.099136f, 0.064407f, -0.248400f, -0.155323f, -0.026498f,
-        -0.023450f, 0.049051f, -0.114187f, 0.007195f, -0.176825f, -0.376926f,
-        0.366159f, -0.179938f, -0.148508f, 0.006043f, 0.170048f, 0.097866f,
-        -0.102658f, -0.260430f, 0.248868f, 0.037019f, -0.118111f, 0.078176f,
-        0.194171f, 0.211328f, 0.368612f, 0.361213f, 0.130013f, 0.094650f,
-        0.227396f, -0.178058f, -0.114782f, -0.008093f, 0.231080f, -0.011843f,
-        -0.097917f, -0.325788f, 0.141879f, 0.119738f, -0.230427f, -0.117419f,
-        -0.114153f, 0.037903f, 0.116383f, 0.218773f, -0.101884f, 0.059466f,
-        0.119255f, 0.010874f, -0.031449f, 0.045996f, 0.119931f, 0.273760f,
-        0.311700f, 0.261794f, 0.194809f, 0.339829f, 0.239449f, 0.064140f,
-        0.077597f, 0.098996f, 0.143534f, 0.184602f, 0.037507f, 0.225494f,
-        0.096142f, -0.147370f, -0.207833f, -0.174742f, -0.086391f, -0.038942f,
-        0.159577f, -0.088492f, -0.000989f, 0.108154f, -0.025890f, -0.072713f,
-        0.025997f, -0.006803f, -0.086879f, -0.011290f, -0.269200f, -0.103450f,
-        -0.124910f, -0.116340f, 0.141459f, 0.208800f, 0.042268f, 0.265034f,
-        0.516474f, 0.217591f, -0.018843f, -0.313328f, -0.168363f, 0.047129f,
-        0.090480f, -0.109852f, -0.018761f, 0.210669f, 0.281269f, -0.043591f,
-        -0.034147f, -0.237772f, -0.134843f, -0.072481f, -0.103831f, 0.038355f,
-        0.308619f, 0.148023f, -0.045867f, -0.123950f, -0.210860f, -0.064973f,
-        -0.036308f, -0.046731f, -0.022099f, 0.095776f, 0.409423f, 0.060635f,
-        -0.065196f, 0.051828f, 0.027981f, -0.009609f, -0.137681f, -0.095011f,
-        -0.019045f, 0.177278f, 0.009759f, -0.092119f, -0.016958f, -0.133860f,
-        -0.118421f, -0.032039f, -0.006214f, -0.084541f, 0.063971f, -0.073642f,
-        0.165676f, 0.110443f, 0.044131f, 0.046568f, 0.053292f, -0.055466f,
-        0.015512f, 0.371947f, 0.232102f, -0.016923f, 0.103979f, -0.091758f,
-        0.005907f, 0.209100f, 0.157433f, 0.030518f, 0.250366f, 0.062322f,
-        0.036720f, 0.094676f, 0.017306f, -0.010328f, -0.079012f, 0.016781f,
-        -0.112435f, 0.061795f, 0.042543f, -0.126799f, -0.009975f, -0.056760f,
-        0.046424f, -0.194712f, -0.139399f, -0.037731f, 0.157989f, -0.016261f,
-        0.123345f, 0.230563f, 0.083300f, -0.016392f, 0.059567f, -0.016035f,
-        -0.064767f, 0.231945f, 0.156629f, 0.034602f, 0.145628f, 0.041315f,
-        0.034535f, 0.019967f, -0.089188f, -0.012091f, 0.307857f, 0.211405f,
-        -0.025091f, -0.148249f, -0.129384f, 0.063536f, -0.068603f, -0.067941f,
-        -0.035104f, 0.210832f, 0.063810f, 0.062764f, -0.089889f, -0.030554f,
-        0.014791f, -0.053362f, -0.037818f, -0.196640f, 0.008388f, -0.082654f,
-        0.143056f, 0.064221f, 0.069795f, 0.191040f, 0.097321f, -0.028679f,
-        0.075794f, 0.313154f, 0.086240f, 0.207643f, 0.017809f, 0.122867f,
-        0.224586f, 0.167403f, -0.023884f, 0.047434f, 0.344091f, 0.187745f,
-        0.136177f, 0.141738f, 0.063799f, 0.045233f, -0.077342f, -0.003525f,
-        -0.165041f, -0.025616f, -0.073745f, 0.164439f, 0.011200f, -0.145896f,
-        -0.027954f, -0.061987f, -0.039874f, -0.142775f, 0.151042f, -0.038238f,
-        0.053152f, 0.078615f, 0.086061f, 0.100593f, 0.128046f, -0.071006f,
-        -0.116558f, 0.208445f, 0.051086f, 0.076843f, 0.023191f, -0.084781f,
-        -0.011790f, 0.147807f, -0.048554f, -0.113932f, 0.283322f, 0.190934f,
-        0.092789f, 0.033018f, -0.142428f, -0.142480f, -0.099023f, -0.041020f,
-        -0.042760f, 0.203295f, -0.053475f, 0.042424f, 0.222839f, -0.019167f,
-        -0.133176f, -0.276216f, -0.031998f, 0.117290f, 0.177827f, -0.059973f,
-        -0.064744f, -0.117040f, -0.155482f, -0.099531f, 0.164121f, -0.026682f,
-        -0.093810f, 0.238993f, -0.006506f, 0.007830f, 0.065819f, -0.203643f,
-        -0.100925f, -0.053652f, -0.130770f, 0.026277f, 0.131796f, 0.032742f,
-        0.127186f, 0.116694f, -0.161122f, -0.279773f, -0.252515f, -0.002638f,
-        0.042812f, 0.096776f, -0.123280f, 0.064858f, -0.010455f, -0.219760f,
-        -0.239331f, -0.104363f, -0.058022f, -0.053584f, 0.025611f, 0.005129f,
-        -0.100418f, -0.045712f, -0.194418f, -0.126366f, -0.030530f, 0.051168f,
-        0.215959f, 0.172402f, -0.054700f, -0.185995f, -0.278360f, -0.193693f,
-        -0.040309f, 0.003735f, -0.007770f, 0.123556f, 0.190179f, -0.077315f,
-        0.117403f, 0.212942f, 0.012160f, 0.000113f, 0.027331f, 0.040202f,
-        0.033293f, 0.219438f, 0.184174f, 0.259349f, 0.311206f, 0.082547f,
-        -0.047875f, -0.078417f, 0.010746f, 0.082620f, 0.311931f, 0.307605f,
-        0.003863f, 0.021405f, -0.026388f, -0.019572f, 0.020582f, -0.059353f,
-        0.025199f, 0.261319f, 0.086316f, 0.143614f, 0.107780f, 0.003900f,
-        -0.188397f, -0.038563f, -0.106045f, -0.125154f, -0.010509f, 0.054021f,
-        0.242130f, 0.279152f, 0.215546f, 0.346995f, 0.440856f, 0.237452f,
-        0.234154f, 0.301646f, 0.168929f, -0.208358f, -0.126848f, 0.010260f,
-        0.121018f, -0.062975f, -0.052848f, 0.050341f, -0.061103f, -0.266482f,
-        0.107186f, 0.140221f, 0.280065f, 0.287889f, 0.373198f, 0.151596f,
-        0.013593f, 0.115616f, 0.014616f, -0.281710f, -0.237597f, -0.117305f,
-        -0.000034f, -0.136739f, -0.196275f, -0.095225f, -0.125310f, -0.250514f,
-        0.236804f, -0.071805f, -0.037421f, 0.048230f, 0.321596f, 0.063632f,
-        0.024039f, -0.029133f, 0.230983f, 0.160593f, -0.154355f, -0.013086f,
-        -0.079929f, 0.094692f, 0.160391f, 0.180239f, 0.053895f, 0.100759f,
-        0.288631f, 0.038191f, 0.181692f, 0.229682f, 0.440166f, 0.063401f,
-        0.006273f, 0.020865f, 0.338695f, 0.256244f, -0.043927f, 0.115617f,
-        0.003296f, 0.173965f, 0.021318f, -0.040936f, -0.118932f, 0.182380f,
-        0.235922f, -0.053233f, -0.015053f, -0.101057f, 0.095341f, 0.051111f,
-        0.161831f, 0.032614f, 0.159496f, 0.072375f, 0.025089f, 0.023748f,
-        0.029151f, 0.161284f, -0.117717f, -0.036191f, -0.176822f, -0.162006f,
-        0.226542f, -0.078329f, 0.043079f, -0.119172f, 0.054614f, -0.101365f,
-        -0.064541f, -0.115304f, 0.135170f, 0.298872f, 0.098060f, 0.089428f,
-        -0.007497f, 0.110391f, -0.028824f, 0.020835f, -0.036804f, 0.125411f,
-        0.192105f, -0.048931f, 0.003086f, -0.010681f, 0.074698f, -0.016263f,
-        0.096063f, 0.060267f, -0.007277f, 0.139139f, -0.080635f, 0.036628f,
-        0.086058f, 0.131979f, 0.085707f, 0.025301f, 0.226094f, 0.194759f,
-        0.042193f, -0.157846f, -0.068402f, -0.141450f, -0.112659f, -0.076305f,
-        -0.069085f, -0.114332f, -0.102005f, 0.132193f, -0.067042f, 0.106643f,
-        0.198964f, 0.171616f, 0.167237f, -0.033730f, -0.026755f, 0.083621f,
-        0.149459f, -0.002799f, -0.000318f, 0.011753f, 0.065889f, -0.089375f,
-        -0.049610f, 0.224579f, 0.216548f, -0.034908f, -0.017851f, -0.088144f,
-        0.007530f, 0.240268f, 0.073270f, 0.013263f, 0.175323f, 0.012082f,
-        0.093993f, 0.015282f, 0.105854f, 0.107990f, 0.077798f, -0.096166f,
-        -0.079607f, 0.177820f, 0.142392f, 0.033337f, -0.078100f, -0.081616f,
-        -0.046993f, 0.139459f, 0.020272f, -0.123161f, 0.175269f, 0.105217f,
-        0.057328f, 0.080909f, -0.012612f, -0.097081f, 0.082060f, -0.096716f,
-        -0.063921f, 0.201884f, 0.128166f, -0.035051f, -0.032227f, -0.068139f,
-        -0.115915f, 0.095080f, -0.086007f, -0.067543f, 0.030776f, 0.032712f,
-        0.088937f, 0.054336f, -0.039329f, -0.114022f, 0.171672f, -0.112321f,
-        -0.217646f, 0.065186f, 0.060223f, 0.192174f, 0.055580f, -0.131107f,
-        -0.144338f, 0.056730f, -0.034707f, -0.081616f, -0.135298f, -0.000614f,
-        0.087189f, 0.014614f, 0.067709f, 0.107689f, 0.225780f, 0.084361f,
-        -0.008544f, 0.051649f, -0.048369f, -0.037739f, -0.060710f, 0.002654f,
-        0.016935f, 0.085563f, -0.015961f, -0.019265f, 0.111788f, 0.062376f,
-        0.202019f, 0.047713f, 0.042261f, 0.069716f, 0.242913f, 0.021052f,
-        -0.072812f, -0.155920f, -0.026436f, 0.035621f, -0.079300f, -0.028787f,
-        -0.048329f, 0.084718f, -0.060565f, -0.083750f, -0.164075f, -0.040742f,
-        -0.086219f, 0.015271f, -0.005204f, -0.016038f, 0.045816f, -0.050433f,
-        -0.077652f, 0.117109f, 0.009611f, -0.009045f, -0.008634f, -0.055373f,
-        -0.085968f, 0.028527f, -0.054736f, -0.168089f, 0.175839f, 0.071205f,
-        -0.023603f, 0.037907f, -0.004561f, -0.022634f, 0.123831f, 0.094469f,
-        -0.072920f, -0.133642f, -0.014032f, -0.142754f, -0.026999f, -0.199409f,
-        0.013268f, 0.226989f, 0.048650f, -0.170988f, -0.050141f, 0.007880f,
-        0.061880f, 0.019078f, -0.043578f, -0.038139f, 0.134814f, 0.054097f,
-        -0.081670f, 0.176838f, 0.047920f, -0.038176f, 0.050406f, -0.107181f,
-        -0.036279f, 0.027060f, 0.081594f, -0.002820f, 0.090507f, -0.033338f,
-        -0.059571f, 0.013404f, -0.099860f, 0.073371f, 0.342805f, 0.098305f,
-        -0.150910f, -0.020822f, -0.056960f, 0.046262f, -0.043413f, -0.149405f,
-        -0.129105f, -0.010899f, -0.014229f, -0.179949f, -0.113044f, -0.049468f,
-        -0.065513f, 0.090269f, -0.011919f, 0.087846f, 0.095796f, 0.146127f,
-        0.101599f, 0.078066f, -0.084348f, -0.100002f, -0.020134f, -0.050169f,
-        0.062122f, 0.014640f, 0.019143f, 0.036543f, 0.180924f, -0.013976f,
-        -0.066768f, -0.001090f, -0.070419f, -0.004839f, -0.001504f, 0.034483f,
-        -0.044954f, -0.050336f, -0.088638f, -0.174782f, -0.116082f, -0.205507f,
-        0.015587f, -0.042839f, -0.096879f, -0.144097f, -0.050268f, -0.196796f,
-        0.109639f, 0.271411f, 0.173732f, 0.108070f, 0.156437f, 0.124255f,
-        0.097242f, 0.238693f, 0.083941f, 0.109105f, 0.223940f, 0.267188f,
-        0.027385f, 0.025819f, 0.125070f, 0.093738f, 0.040353f, 0.038645f,
-        -0.012730f, 0.144063f, 0.052931f, -0.009138f, 0.084193f, 0.160272f,
-        -0.041366f, 0.011951f, -0.121446f, -0.106713f, -0.047566f, 0.047984f,
-        -0.255224f, -0.076116f, 0.098685f, -0.150845f, -0.171513f, -0.156590f,
-        0.058331f, 0.187493f, 0.413018f, 0.554265f, 0.372242f, 0.237943f,
-        0.124571f, 0.110829f, 0.010322f, -0.174477f, -0.067627f, -0.001979f,
-        0.142913f, 0.040597f, 0.019907f, 0.025963f, -0.043585f, -0.120732f,
-        0.099937f, 0.091059f, 0.247307f, 0.204226f, -0.042753f, -0.068580f,
-        -0.119002f, 0.026722f, 0.034853f, -0.060934f, -0.025054f, -0.093026f,
-        -0.035372f, -0.233209f, -0.049869f, -0.039151f, -0.022279f, -0.065380f,
-        -9.063785f
-    };
-    return std::vector<float>(detector, detector + sizeof(detector) / sizeof(detector[0]));
-}
-
-
-
-
-std::vector<float> cv::ocl::HOGDescriptor::getPeopleDetector64x128()
-{
-    static const float detector[] =
-    {
-        0.05359386f, -0.14721455f, -0.05532170f, 0.05077307f,
-        0.11547081f, -0.04268804f, 0.04635834f, -0.05468199f, 0.08232084f,
-        0.10424068f, -0.02294518f, 0.01108519f, 0.01378693f, 0.11193510f,
-        0.01268418f, 0.08528346f, -0.06309239f, 0.13054633f, 0.08100729f,
-        -0.05209739f, -0.04315529f, 0.09341384f, 0.11035026f, -0.07596218f,
-        -0.05517511f, -0.04465296f, 0.02947334f, 0.04555536f,
-        -3.55954492e-003f, 0.07818956f, 0.07730991f, 0.07890715f, 0.06222893f,
-        0.09001380f, -0.03574381f, 0.03414327f, 0.05677258f, -0.04773581f,
-        0.03746637f, -0.03521175f, 0.06955440f, -0.03849038f, 0.01052293f,
-        0.01736112f, 0.10867710f, 0.08748853f, 3.29739624e-003f, 0.10907028f,
-        0.07913758f, 0.10393070f, 0.02091867f, 0.11594022f, 0.13182420f,
-        0.09879354f, 0.05362710f, -0.06745391f, -7.01260753e-003f,
-        5.24702156e-003f, 0.03236255f, 0.01407916f, 0.02207983f, 0.02537322f,
-        0.04547948f, 0.07200756f, 0.03129894f, -0.06274468f, 0.02107014f,
-        0.06035208f, 0.08636236f, 4.53164103e-003f, 0.02193363f, 0.02309801f,
-        0.05568166f, -0.02645093f, 0.04448695f, 0.02837519f, 0.08975694f,
-        0.04461516f, 0.08975355f, 0.07514391f, 0.02306982f, 0.10410084f,
-        0.06368385f, 0.05943464f, 4.58420580e-003f, 0.05220337f, 0.06675851f,
-        0.08358569f, 0.06712101f, 0.06559004f, -0.03930482f, -9.15936660e-003f,
-        -0.05897915f, 0.02816453f, 0.05032348f, 0.06780671f, 0.03377650f,
-        -6.09417039e-004f, -0.01795146f, -0.03083684f, -0.01302475f,
-        -0.02972313f, 7.88706727e-003f, -0.03525961f, -2.50397739e-003f,
-        0.05245084f, 0.11791293f, -0.02167498f, 0.05299332f, 0.06640524f,
-        0.05190265f, -8.27316567e-003f, 0.03033127f, 0.05842173f,
-        -4.01050318e-003f, -6.25105947e-003f, 0.05862958f, -0.02465461f,
-        0.05546781f, -0.08228195f, -0.07234028f, 0.04640540f, -0.01308254f,
-        -0.02506191f, 0.03100746f, -0.04665651f, -0.04591486f, 0.02949927f,
-        0.06035462f, 0.02244646f, -0.01698639f, 0.01040041f, 0.01131170f,
-        0.05419579f, -0.02130277f, -0.04321722f, -0.03665198f, 0.01126490f,
-        -0.02606488f, -0.02228328f, -0.02255680f, -0.03427236f,
-        -7.75165204e-003f, -0.06195229f, 8.21638294e-003f, 0.09535975f,
-        -0.03709979f, -0.06942501f, 0.14579427f, -0.05448192f, -0.02055904f,
-        0.05747357f, 0.02781788f, -0.07077577f, -0.05178314f, -0.10429011f,
-        -0.11235505f, 0.07529039f, -0.07559302f, -0.08786739f, 0.02983843f,
-        0.02667585f, 0.01382199f, -0.01797496f, -0.03141199f, -0.02098101f,
-        0.09029204f, 0.04955018f, 0.13718739f, 0.11379953f, 1.80019124e-003f,
-        -0.04577610f, -1.11108483e-003f, -0.09470536f, -0.11596080f,
-        0.04489342f, 0.01784211f, 3.06850672e-003f, 0.10781866f,
-        3.36498418e-003f, -0.10842580f, -0.07436839f, -0.10535070f,
-        -0.01866805f, 0.16057891f, -5.07316366e-003f, -0.04295658f,
-        -5.90488780e-003f, 8.82003549e-003f, -0.01492646f, -0.05029279f,
-        -0.12875880f, 8.78831954e-004f, -0.01297184f, -0.07592774f,
-        -0.02668831f, -6.93787413e-004f, 0.02406698f, -0.01773298f,
-        -0.03855745f, -0.05877856f, 0.03259695f, 0.12826584f, 0.06292590f,
-        -4.10733931e-003f, 0.10996531f, 0.01332991f, 0.02088735f, 0.04037504f,
-        -0.05210760f, 0.07760046f, 0.06399347f, -0.05751930f, -0.10053057f,
-        0.07505023f, -0.02139782f, 0.01796176f, 2.34400877e-003f, -0.04208319f,
-        0.07355055f, 0.05093350f, -0.02996780f, -0.02219072f, 0.03355330f,
-        0.04418742f, -0.05580705f, -0.05037573f, -0.04548179f, 0.01379514f,
-        0.02150671f, -0.02194211f, -0.13682702f, 0.05464972f, 0.01608082f,
-        0.05309116f, 0.04701022f, 1.33690401e-003f, 0.07575664f, 0.09625306f,
-        8.92647635e-003f, -0.02819123f, 0.10866830f, -0.03439325f,
-        -0.07092371f, -0.06004780f, -0.02712298f, -7.07467366e-003f,
-        -0.01637020f, 0.01336790f, -0.10313606f, 0.04906582f, -0.05732445f,
-        -0.02731079f, 0.01042235f, -0.08340668f, 0.03686501f, 0.06108340f,
-        0.01322748f, -0.07809529f, 0.03774724f, -0.03413248f, -0.06096525f,
-        -0.04212124f, -0.07982176f, -1.25973229e-003f, -0.03045501f,
-        -0.01236493f, -0.06312395f, 0.04789570f, -0.04602066f, 0.08576570f,
-        0.02521080f, 0.02988098f, 0.10314583f, 0.07060035f, 0.04520544f,
-        -0.04426654f, 0.13146530f, 0.08386490f, 0.02164590f, -2.12280243e-003f,
-        -0.03686353f, -0.02074944f, -0.03829959f, -0.01530596f, 0.02689708f,
-        0.11867401f, -0.06043470f, -0.02785023f, -0.04775074f, 0.04878745f,
-        0.06350956f, 0.03494788f, 0.01467400f, 1.17890188e-003f, 0.04379614f,
-        2.03681854e-003f, -0.03958609f, -0.01072688f, 6.43705716e-003f,
-        0.02996500f, -0.03418507f, -0.01960307f, -0.01219154f,
-        -4.37000440e-003f, -0.02549453f, 0.02646318f, -0.01632513f,
-        6.46516960e-003f, -0.01929734f, 4.78711911e-003f, 0.04962371f,
-        0.03809111f, 0.07265724f, 0.05758125f, -0.03741554f, 0.01648608f,
-        -8.45285598e-003f, 0.03996826f, -0.08185477f, 0.02638875f,
-        -0.04026615f, -0.02744674f, -0.04071517f, 1.05096330e-003f,
-        -0.04741232f, -0.06733172f, 8.70434940e-003f, -0.02192543f,
-        1.35350740e-003f, -0.03056974f, -0.02975521f, -0.02887780f,
-        -0.01210713f, -0.04828526f, -0.09066251f, -0.09969629f, -0.03665164f,
-        -8.88111943e-004f, -0.06826669f, -0.01866150f, -0.03627640f,
-        -0.01408288f, 0.01874239f, -0.02075835f, 0.09145175f, -0.03547291f,
-        0.05396780f, 0.04198981f, 0.01301925f, -0.03384354f, -0.12201976f,
-        0.06830920f, -0.03715654f, 9.55848210e-003f, 5.05685573e-003f,
-        0.05659294f, 3.90764466e-003f, 0.02808490f, -0.05518097f, -0.03711621f,
-        -0.02835565f, -0.04420464f, -0.01031947f, 0.01883466f,
-        -8.49525444e-003f, -0.09419250f, -0.01269387f, -0.02133371f,
-        -0.10190815f, -0.07844430f, 2.43644323e-003f, -4.09610150e-003f,
-        0.01202551f, -0.06452291f, -0.10593818f, -0.02464746f, -0.02199699f,
-        -0.07401930f, 0.07285886f, 8.87513801e-004f, 9.97662079e-003f,
-        8.46779719e-003f, 0.03730333f, -0.02905126f, 0.03573337f, -0.04393689f,
-        -0.12014472f, 0.03176554f, -2.76015815e-003f, 0.10824566f, 0.05090732f,
-        -3.30179278e-003f, -0.05123822f, 5.04784798e-003f, -0.05664124f,
-        -5.99415926e-003f, -0.05341901f, -0.01221393f, 0.01291318f,
-        9.91760660e-003f, -7.56987557e-003f, -0.06193124f, -2.24549137e-003f,
-        0.01987562f, -0.02018840f, -0.06975540f, -0.06601523f, -0.03349112f,
-        -0.08910118f, -0.03371435f, -0.07406893f, -0.02248047f, -0.06159951f,
-        2.77751544e-003f, -0.05723337f, -0.04792468f, 0.07518548f,
-        2.77279224e-003f, 0.04211938f, 0.03100502f, 0.05278448f, 0.03954679f,
-        -0.03006846f, -0.03851741f, -0.02792403f, -0.02875333f, 0.01531280f,
-        0.02186953f, -0.01989829f, 2.50679464e-003f, -0.10258728f,
-        -0.04785743f, -0.02887216f, 3.85063468e-003f, 0.01112236f,
-        8.29218887e-003f, -0.04822981f, -0.04503597f, -0.03713100f,
-        -0.06988008f, -0.11002295f, -2.69209221e-003f, 1.85383670e-003f,
-        -0.05921049f, -0.06105053f, -0.08458050f, -0.04527602f,
-        8.90329306e-004f, -0.05875023f, -2.68602883e-003f, -0.01591195f,
-        0.03631859f, 0.05493166f, 0.07300330f, 5.53333294e-003f, 0.06400407f,
-        0.01847740f, -5.76280477e-003f, -0.03210877f, 4.25160583e-003f,
-        0.01166520f, -1.44864211e-003f, 0.02253744f, -0.03367080f, 0.06983195f,
-        -4.22323542e-003f, -8.89401045e-003f, -0.07943393f, 0.05199728f,
-        0.06065201f, 0.04133492f, 1.44032843e-003f, -0.09585235f, -0.03964731f,
-        0.04232114f, 0.01750465f, -0.04487902f, -7.59733608e-003f, 0.02011171f,
-        0.04673622f, 0.09011173f, -0.07869188f, -0.04682482f, -0.05080139f,
-        -3.99383716e-003f, -0.05346331f, 0.01085723f, -0.03599333f,
-        -0.07097908f, 0.03551549f, 0.02680387f, 0.03471529f, 0.01790393f,
-        0.05471273f, 9.62048303e-003f, -0.03180215f, 0.05864431f, 0.02330614f,
-        0.01633144f, -0.05616681f, -0.10245429f, -0.08302189f, 0.07291322f,
-        -0.01972590f, -0.02619633f, -0.02485327f, -0.04627592f,
-        1.48853404e-003f, 0.05514185f, -0.01270860f, -0.01948900f, 0.06373586f,
-        0.05002292f, -0.03009798f, 8.76216311e-003f, -0.02474238f,
-        -0.05504891f, 1.74034527e-003f, -0.03333667f, 0.01524987f, 0.11663762f,
-        -1.32344989e-003f, -0.06608453f, 0.05687166f, -6.89525274e-004f,
-        -0.04402352f, 0.09450210f, -0.04222684f, -0.05360983f, 0.01779531f,
-        0.02561388f, -0.11075410f, -8.77790991e-003f, -0.01099504f,
-        -0.10380266f, 0.03103457f, -0.02105741f, -0.07371717f, 0.05146710f,
-        0.10581432f, -0.08617968f, -0.02892107f, 0.01092199f, 0.14551543f,
-        -2.24320893e-003f, -0.05818033f, -0.07390742f, 0.05701261f,
-        0.12937020f, -0.04986651f, 0.10182415f, 0.05028650f, 0.12515625f,
-        0.09175041f, 0.06404983f, 0.01523394f, 0.09460562f, 0.06106631f,
-        -0.14266998f, -0.02926703f, 0.02762171f, 0.02164151f,
-        -9.58488265e-004f, -0.04231362f, -0.09866509f, 0.04322244f,
-        0.05872034f, -0.04838847f, 0.06319253f, 0.02443798f, -0.03606876f,
-        9.38737206e-003f, 0.04289991f, -0.01027411f, 0.08156885f, 0.08751175f,
-        -0.13191354f, 8.16054735e-003f, -0.01452161f, 0.02952677f, 0.03615945f,
-        -2.09128903e-003f, 0.02246693f, 0.09623287f, 0.09412123f, -0.02924758f,
-        -0.07815186f, -0.02203079f, -2.02566991e-003f, 0.01094733f,
-        -0.01442332f, 0.02838561f, 0.11882371f, 7.28798332e-003f, -0.10345965f,
-        0.07561217f, -0.02049661f, 4.44177445e-003f, 0.01609347f, -0.04893158f,
-        -0.08758243f, -7.67420698e-003f, 0.08862378f, 0.06098121f, 0.06565887f,
-        7.32981879e-003f, 0.03558407f, -0.03874352f, -0.02490055f,
-        -0.06771075f, 0.09939223f, -0.01066077f, 0.01382995f, -0.07289080f,
-        7.47184316e-003f, 0.10621431f, -0.02878659f, 0.02383525f, -0.03274646f,
-        0.02137008f, 0.03837290f, 0.02450992f, -0.04296818f, -0.02895143f,
-        0.05327370f, 0.01499020f, 0.04998732f, 0.12938657f, 0.09391870f,
-        0.04292390f, -0.03359194f, -0.06809492f, 0.01125796f, 0.17290455f,
-        -0.03430733f, -0.06255233f, -0.01813114f, 0.11726857f, -0.06127599f,
-        -0.08677909f, -0.03429872f, 0.04684938f, 0.08161420f, 0.03538774f,
-        0.01833884f, 0.11321855f, 0.03261845f, -0.04826299f, 0.01752407f,
-        -0.01796414f, -0.10464549f, -3.30041884e-003f, 2.29343961e-004f,
-        0.01457292f, -0.02132982f, -0.02602923f, -9.87351313e-003f,
-        0.04273872f, -0.02103316f, -0.07994065f, 0.02614958f, -0.02111666f,
-        -0.06964913f, -0.13453490f, -0.06861878f, -6.09341264e-003f,
-        0.08251446f, 0.15612499f, 2.46531400e-003f, 8.88424646e-003f,
-        -0.04152999f, 0.02054853f, 0.05277953f, -0.03087788f, 0.02817579f,
-        0.13939077f, 0.07641046f, -0.03627627f, -0.03015098f, -0.04041540f,
-        -0.01360690f, -0.06227205f, -0.02738223f, 0.13577610f, 0.15235767f,
-        -0.05392922f, -0.11175954f, 0.02157129f, 0.01146481f, -0.05264937f,
-        -0.06595174f, -0.02749175f, 0.11812254f, 0.17404149f, -0.06137035f,
-        -0.11003478f, -0.01351621f, -0.01745916f, -0.08577441f, -0.04469909f,
-        -0.06106115f, 0.10559758f, 0.20806813f, -0.09174948f, 7.09621934e-004f,
-        0.03579374f, 0.07215115f, 0.02221742f, 0.01827742f, -7.90785067e-003f,
-        0.01489554f, 0.14519960f, -0.06425831f, 0.02990399f, -1.80181325e-003f,
-        -0.01401528f, -0.04171134f, -3.70530109e-003f, -0.09090481f,
-        0.09520713f, 0.08845516f, -0.02651753f, -0.03016730f, 0.02562448f,
-        0.03563816f, -0.03817881f, 0.01433385f, 0.02256983f, 0.02872120f,
-        0.01001934f, -0.06332260f, 0.04338406f, 0.07001807f, -0.04705722f,
-        -0.07318907f, 0.02630457f, 0.03106382f, 0.06648342f, 0.10913180f,
-        -0.01630815f, 0.02910308f, 0.02895109f, 0.08040254f, 0.06969310f,
-        0.06797734f, 6.08639978e-003f, 4.16588830e-003f, 0.08926726f,
-        -0.03123648f, 0.02700146f, 0.01168734f, -0.01631594f, 4.61015804e-003f,
-        8.51359498e-003f, -0.03544224f, 0.03571994f, 4.29766066e-003f,
-        -0.01970077f, -8.79793242e-003f, 0.09607988f, 0.01544222f,
-        -0.03923707f, 0.07308586f, 0.06061262f, 1.31683104e-004f,
-        -7.98222050e-003f, 0.02399261f, -0.06084389f, -0.02743429f,
-        -0.05475523f, -0.04131311f, 0.03559756f, 0.03055342f, 0.02981433f,
-        0.14860515f, 0.01766787f, 0.02945257f, 0.04898238f, 0.01026922f,
-        0.02811658f, 0.08267091f, 0.02732154f, -0.01237693f, 0.11760156f,
-        0.03802063f, -0.03309754f, 5.24957618e-003f, -0.02460510f, 0.02691451f,
-        0.05399988f, -0.10133506f, 0.06385437f, -0.01818005f, 0.02259503f,
-        0.03573135f, 0.01042848f, -0.04153402f, -0.04043029f, 0.01643575f,
-        0.08326677f, 4.61383024e-004f, -0.05308095f, -0.08536223f,
-        -1.61011645e-003f, -0.02163720f, -0.01783352f, 0.03859637f,
-        0.08498885f, -0.01725216f, 0.08625131f, 0.10995087f, 0.09177644f,
-        0.08498347f, 0.07646490f, 0.05580502f, 0.02693516f, 0.09996913f,
-        0.09070327f, 0.06667200f, 0.05873008f, -0.02247842f, 0.07772321f,
-        0.12408436f, 0.12629253f, -8.41997913e-004f, 0.01477783f, 0.09165990f,
-        -2.98401713e-003f, -0.06466447f, -0.07057302f, 2.09516948e-004f,
-        0.02210209f, -0.02158809f, -0.08602506f, -0.02284836f,
-        4.01876355e-003f, 9.56660323e-003f, -0.02073978f, -0.04635138f,
-        -7.59423291e-003f, -0.01377393f, -0.04559359f, -0.13284740f,
-        -0.08671406f, -0.03654395f, 0.01142869f, 0.03287891f, -0.04392983f,
-        0.06142959f, 0.17710890f, 0.10385257f, 0.01329137f, 0.10067633f,
-        0.12450829f, -0.04476709f, 0.09049144f, 0.04589312f, 0.11167907f,
-        0.08587538f, 0.04767583f, 1.67188141e-003f, 0.02359802f, -0.03808852f,
-        0.03126272f, -0.01919029f, -0.05698918f, -0.02365112f, -0.06519032f,
-        -0.05599358f, -0.07097308f, -0.03301812f, -0.04719102f, -0.02566297f,
-        0.01324074f, -0.09230672f, -0.05518232f, -0.04712864f, -0.03380903f,
-        -0.06719479f, 0.01183908f, -0.09326738f, 0.01642865f, 0.03789867f,
-        -6.61567831e-003f, 0.07796386f, 0.07246574f, 0.04706347f, -0.02523437f,
-        -0.01696830f, -0.08068866f, 0.06030888f, 0.10527060f, -0.06611756f,
-        0.02977346f, 0.02621830f, 0.01913855f, -0.08479366f, -0.06322418f,
-        -0.13570616f, -0.07644490f, 9.31900274e-003f, -0.08095149f,
-        -0.10197903f, -0.05204025f, 0.01413151f, -0.07800411f, -0.01885122f,
-        -0.07509381f, -0.10136326f, -0.05212355f, -0.09944065f,
-        -1.33606605e-003f, -0.06342617f, -0.04178550f, -0.12373723f,
-        -0.02832736f, -0.06057501f, 0.05830070f, 0.07604282f, -0.06462587f,
-        8.02447461e-003f, 0.11580125f, 0.12332212f, 0.01978462f,
-        -2.72378162e-003f, 0.05850752f, -0.04674481f, 0.05148062f,
-        -2.62542837e-003f, 0.11253355f, 0.09893716f, 0.09785093f, -0.04659257f,
-        -0.01102429f, -0.07002308f, 0.03088913f, -0.02565549f, -0.07671449f,
-        3.17443861e-003f, -0.10783514f, -0.02314270f, -0.11089555f,
-        -0.01024768f, 0.03116021f, -0.04964825f, 0.02281825f, 5.50005678e-003f,
-        -0.08427856f, -0.14685495f, -0.07719755f, -0.13342668f, -0.04525511f,
-        -0.09914210f, 0.02588859f, 0.03469279f, 0.04664020f, 0.11688190f,
-        0.09647275f, 0.10857815f, -0.01448726f, 0.04299758f, -0.06763151f,
-        1.33257592e-003f, 0.14331576f, 0.07574340f, 0.09166205f, 0.05674926f,
-        0.11325553f, -0.01106494f, 0.02062161f, -0.11484840f, -0.07492137f,
-        -0.02864293f, -0.01275638f, -0.06946032f, -0.10101652f, -0.04113498f,
-        -0.02214783f, -0.01273942f, -0.07480393f, -0.10556041f, -0.07622112f,
-        -0.09988393f, -0.11453961f, -0.12073903f, -0.09412795f, -0.07146588f,
-        -0.04054537f, -0.06127083f, 0.04221122f, 0.07688113f, 0.04099256f,
-        0.12663734f, 0.14683802f, 0.21761774f, 0.12525328f, 0.18431792f,
-        -1.66402373e-003f, 2.37777247e-003f, 0.01445475f, 0.03509416f,
-        0.02654697f, 0.01716739f, 0.05374011f, 0.02944174f, 0.11323927f,
-        -0.01485456f, -0.01611330f, -1.85554172e-003f, -0.01708549f,
-        -0.05435753f, -0.05302101f, 0.05260378f, -0.03582945f,
-        -3.42867890e-004f, 1.36076682e-003f, -0.04436073f, -0.04228432f,
-        0.03281291f, -0.05480836f, -0.10197772f, -0.07206279f, -0.10741059f,
-        -0.02366946f, 0.10278475f, -2.74783419e-003f, -0.03242477f,
-        0.02308955f, 0.02835869f, 0.10348799f, 0.19580358f, 0.10252027f,
-        0.08039929f, 0.05525554f, -0.13250865f, -0.14395352f, 3.13586881e-003f,
-        -0.03387071f, 8.94669443e-003f, 0.05406157f, -4.97324532e-003f,
-        -0.01189114f, 2.82919413e-004f, -0.03901557f, -0.04898705f,
-        0.02164520f, -0.01382906f, -0.01850416f, 0.01869347f, -0.02450060f,
-        0.02291678f, 0.08196463f, 0.03309153f, -0.10629974f, 0.02473924f,
-        0.05344394f, -0.02404823f, -0.03243643f, -5.55244600e-003f,
-        -0.08009996f, 0.02811539f, 0.04235742f, 0.01859004f, 0.04902123f,
-        -0.01438252f, -0.01526853f, 0.02044195f, -0.05008660f, 0.04244113f,
-        0.07611816f, 0.04950470f, -0.06020549f, -4.26026015e-003f, 0.13133512f,
-        -0.01438738f, -0.01958807f, -0.04044152f, -0.12425045f,
-        2.84353318e-003f, -0.05042776f, -0.09121484f, 7.34345755e-003f,
-        0.09388847f, 0.11800314f, 4.72295098e-003f, 4.44378285e-003f,
-        -0.07984917f, -0.03613737f, 0.04490915f, -0.02246483f, 0.04681071f,
-        0.05240871f, 0.02157206f, -0.04603431f, -0.01197929f, -0.02748779f,
-        0.13621049f, 0.08812155f, -0.07802048f, 4.86458559e-003f, -0.01598836f,
-        0.01024450f, -0.03463517f, -0.02304239f, -0.08692665f, 0.06655128f,
-        0.05785803f, -0.12640759f, 0.02307472f, 0.07337402f, 0.07525434f,
-        0.04943763f, -0.02241034f, -0.09978238f, 0.14487994f, -0.06570521f,
-        -0.07855482f, 0.02830222f, -5.29603509e-004f, -0.04669895f,
-        -0.11822784f, -0.12246452f, -0.15365660f, -0.02969127f, 0.08078201f,
-        0.13512598f, 0.11505685f, 0.04740673f, 0.01376022f, -0.05852978f,
-        -0.01537809f, -0.05541119f, 0.02491065f, -0.02870786f, 0.02760978f,
-        0.23836176f, 0.22347429f, 0.10306466f, -0.06919070f, -0.10132039f,
-        -0.20198342f, -0.05040560f, 0.27163076f, 0.36987007f, 0.34540465f,
-        0.29095781f, 0.05649706f, 0.04125737f, 0.07505883f, -0.02737836f,
-        -8.43431335e-003f, 0.07368195f, 0.01653876f, -0.09402955f,
-        -0.09574359f, 0.01474337f, -0.07128561f, -0.03460737f, 0.11438941f,
-        0.13752601f, -0.06385452f, -0.06310338f, 8.19548313e-003f, 0.11622470f,
-        5.05133113e-003f, -0.07602754f, 0.06695660f, 0.25723928f, 0.09037900f,
-        0.28826267f, 0.13165380f, -0.05312614f, -0.02137198f, -0.03442232f,
-        -0.06255679f, 0.03899667f, 0.18391028f, 0.26016650f, 0.03374462f,
-        0.01860465f, 0.19077586f, 0.18160543f, 3.43634398e-003f, -0.03036782f,
-        0.19683038f, 0.35378191f, 0.24968483f, -0.03222649f, 0.28972381f,
-        0.43091634f, 0.30778357f, 0.02335266f, -0.09877399f, -6.85245218e-003f,
-        0.08945240f, -0.08150686f, 0.02792493f, 0.24806842f, 0.17338486f,
-        0.06231801f, -0.10432383f, -0.16653322f, -0.13197899f, -0.08531576f,
-        -0.19271527f, -0.13536365f, 0.22240199f, 0.39219588f, 0.26597717f,
-        -0.01231649f, 0.01016179f, 0.13379875f, 0.12018334f, -0.04852953f,
-        -0.07915270f, 0.07036012f, 3.87723115e-003f, -0.06126805f,
-        -0.15015170f, -0.11406515f, -0.08556531f, -0.07429333f, -0.16115491f,
-        0.13214062f, 0.25691369f, 0.05697750f, 0.06861912f, -6.02903729e-003f,
-        -7.94562511e-003f, 0.04799571f, 0.06695165f, -0.01926842f, 0.06206308f,
-        0.13450983f, -0.06381495f, -2.98370165e-003f, -0.03482971f,
-        7.53991678e-003f, 0.03895611f, 0.11464261f, 0.01669971f,
-        8.27818643e-003f, -7.49160210e-003f, -0.11712562f, -0.10650621f,
-        -0.10353880f, -0.04994106f, -7.65618810e-004f, 0.03023767f,
-        -0.04759270f, -0.07302686f, -0.05825012f, -0.13156348f, -0.10639747f,
-        -0.19393684f, -0.09973683f, -0.07918908f, 4.63177625e-004f,
-        -6.61382044e-004f, 0.15853868f, 0.08561199f, -0.07660093f,
-        -0.08015265f, -0.06164073f, 0.01882577f, -7.29908410e-004f,
-        0.06840892f, 0.03843764f, 0.20274927f, 0.22028814f, -5.26101235e-003f,
-        0.01452435f, -0.06331623f, 0.02865064f, 0.05673740f, 0.12171564f,
-        0.03837196f, 0.03555467f, -0.02662914f, -0.10280123f, -0.06526285f,
-        -0.11066351f, -0.08988424f, -0.10103678f, 8.10526591e-003f,
-        5.95238712e-003f, 0.02617721f, -0.01705742f, -0.10897956f,
-        -0.08004991f, -0.11271993f, -0.06185647f, -0.06103712f, 0.01597041f,
-        -0.05923606f, 0.09410726f, 0.22858568f, 0.03263380f, 0.06772990f,
-        -0.09003516f, 0.01017870f, 0.01931688f, 0.08628357f, -0.01430009f,
-        0.10954945f, 0.16612452f, -0.02434544f, -0.03310068f, -0.04236627f,
-        0.01212392f, -6.15046406e-003f, 0.06954194f, 0.03015283f, 0.01787957f,
-        0.02781667f, -0.05561153f, -8.96244217e-003f, -0.04971489f,
-        0.07510284f, 0.01775282f, 0.05889897f, -0.07981427f, 0.03647643f,
-        -3.73833324e-003f, -0.08894575f, -0.06429435f, -0.08068276f,
-        0.03567704f, -0.07131936f, -7.21910037e-003f, -0.09566668f,
-        0.17886090f, 0.14911725f, 0.02070032f, -0.05017120f, -0.04992622f,
-        0.01570143f, -0.09906903f, 0.06456193f, 0.15329507f, 0.18820767f,
-        0.11689861f, -0.01178513f, -0.02225163f, -0.01905318f, 0.10271224f,
-        -7.27029052e-003f, 0.11664233f, 0.14796902f, 0.07771893f, 0.02400013f,
-        -0.05361797f, -0.01972888f, 0.01376177f, 0.06740040f, -0.06525395f,
-        0.05726178f, -0.02404981f, -0.14018567f, -0.02074987f, -0.04621970f,
-        -0.04688627f, -0.01842059f, 0.07722727f, -0.04852883f, 0.01529004f,
-        -0.19639495f, 0.10817073f, 0.03795860f, -0.09435206f, -0.07984378f,
-        -0.03383440f, 0.11081333f, 0.02237366f, 0.12703256f, 0.21613893f,
-        0.02918790f, 4.66472283e-003f, -0.10274266f, -0.04854131f,
-        -3.46305710e-003f, 0.08652268f, 0.02251546f, 0.09636052f, 0.17180754f,
-        -0.09272388f, 4.59174305e-004f, -0.11723048f, -0.12210111f,
-        -0.15547538f, 0.07218186f, -0.05297846f, 0.03779940f, 0.05150875f,
-        -0.03802310f, 0.03870645f, -0.15250699f, -0.08696499f, -0.02021560f,
-        0.04118926f, -0.15177974f, 0.01577647f, 0.10249301f, 7.50041893e-003f,
-        0.01721806f, -0.06828983f, -0.02397596f, -0.06598977f, -0.04317593f,
-        -0.08064980f, 6.66632550e-003f, 0.03333484f, 0.07093620f, 0.08231064f,
-        -0.06577903f, -0.06698844f, -0.06984019f, -0.06508023f, -0.14145090f,
-        -0.02393239f, 0.06485303f, 8.83263443e-003f, 0.09251080f, -0.07557579f,
-        -0.05067699f, -0.09798748f, -0.06703258f, -0.14056294f, 0.03245994f,
-        0.12554143f, 0.01761621f, 0.12980327f, -0.04081950f, -0.11906909f,
-        -0.14813015f, -0.08376863f, -0.12200681f, 0.04988137f, 0.05424247f,
-        -3.90952639e-003f, 0.03255733f, -0.12717837f, -0.07461493f,
-        -0.05703964f, -0.01736189f, -0.08026433f, -0.05433894f, -0.01719359f,
-        0.02886275f, 0.01772653f, -0.09163518f, 3.57789593e-003f, -0.10129993f,
-        -0.02653764f, -0.08131415f, -0.03847986f, -7.62157550e-004f,
-        0.06486648f, 0.19675669f, -0.04919156f, -0.07059129f, -0.04857785f,
-        -0.01042383f, -0.08328653f, 0.03660302f, -0.03696846f, 0.04969259f,
-        0.08241162f, -0.12514858f, -0.06122676f, -0.03750202f,
-        6.52989605e-003f, -0.10247213f, 0.02568346f, 4.51781414e-003f,
-        -0.03734229f, -0.01131264f, -0.05412074f, 8.89345480e-004f,
-        -0.12388977f, -0.05959237f, -0.12418608f, -0.06151643f, -0.07310260f,
-        0.02441575f, 0.07023528f, -0.07548289f, -7.57147965e-004f,
-        -0.09061348f, -0.08112976f, -0.06920306f, 9.54394229e-003f,
-        -0.01219902f, 1.21273217e-003f, -8.88989680e-003f, -0.08309301f,
-        -0.04552661f, -0.10739882f, -0.05691034f, -0.13928030f, 0.09027749f,
-        0.15123098f, 0.03175976f, 0.17763577f, 3.29913251e-004f, 0.05151888f,
-        -0.09844074f, -0.09475287f, -0.08571247f, 0.16241577f, 0.19336018f,
-        8.57454538e-003f, 0.11474732f, -0.01493934f, 0.03352379f, -0.08966240f,
-        -0.02322310f, 0.02663568f, 0.05448750f, -0.03536883f, -0.07210463f,
-        -0.06807277f, -0.03121621f, -0.05932408f, -0.17282860f, -0.15873498f,
-        -0.04956378f, 0.01603377f, -0.12385946f, 0.13878587f, 0.21468069f,
-        0.13510075f, 0.20992437f, 0.08845878f, 0.08104013f, 0.03754176f,
-        0.12173114f, 0.11103114f, 0.10643122f, 0.13941477f, 0.11640384f,
-        0.14786847f, 0.01218238f, 0.01160753f, 0.03547940f, 0.08794311f,
-        -0.01695384f, -0.07692261f, -0.08236158f, 6.79194089e-003f,
-        -0.02458403f, 0.13022894f, 0.10953187f, 0.09857773f, 0.04735930f,
-        -0.04353498f, -0.15173385f, -0.17904443f, -0.10450364f, -0.13418166f,
-        -0.06633098f, -0.03170381f, -0.06839000f, -0.11350126f, -0.06983913f,
-        0.19083543f, 0.17604128f, 0.07730632f, 0.10022651f, 0.36428109f,
-        0.28291923f, 0.12688625f, 0.15942036f, 0.14064661f, -0.11201853f,
-        -0.13969108f, -0.09088077f, -0.14107047f, 0.05117374f,
-        -2.63348082e-003f, -0.10794610f, -0.09715455f, -0.05284977f,
-        0.01565668f, 0.05031200f, 0.07021113f, -0.02963028f, 0.01766960f,
-        0.08333644f, -0.03211382f, 4.90096770e-003f, 0.05186674f, -0.05045737f,
-        -0.09624767f, -0.02525997f, 0.06916669f, 0.01213916f, 0.05333899f,
-        -0.03443280f, -0.10055527f, -0.06291115f, 5.42851724e-003f,
-        -6.30360236e-003f, 0.02270257f, -0.01769792f, 0.03273688f, 0.07746078f,
-        7.77099328e-003f, 0.05041346f, 0.01648103f, -0.02321534f, -0.09930186f,
-        -0.02293853f, 0.02034990f, -0.08324204f, 0.08510064f, -0.03732836f,
-        -0.06465405f, -0.06086946f, 0.13680504f, -0.11469388f, -0.03896406f,
-        -0.07142810f, 2.67581246e-003f, -0.03639632f, -0.09849060f,
-        -0.11014334f, 0.17489147f, 0.17610909f, -0.16091567f, -0.07248894f,
-        0.01567141f, 0.23742996f, 0.07552249f, -0.06270349f, -0.07303379f,
-        0.25442186f, 0.16903116f, -0.08168741f, -0.05913896f, -0.03954096f,
-        6.81776879e-003f, -0.05615319f, -0.07303037f, -0.12176382f,
-        0.12385108f, 0.22084464f, -0.05543206f, -0.03310431f, 0.05731593f,
-        0.19481890f, 0.04016430f, -0.06480758f, -0.12353460f, 0.18733442f,
-        -0.09631214f, -0.11192076f, 0.12404587f, 0.15671748f, 0.19256128f,
-        0.10895617f, 0.03391477f, -0.13032004f, -0.05626907f, -0.09025607f,
-        0.23485197f, 0.27812332f, 0.26725492f, 0.07255980f, 0.16565137f,
-        0.22388470f, 0.07441066f, -0.21003133f, -0.08075339f, -0.15031935f,
-        0.07023834f, 0.10872041f, 0.18156518f, 0.20037253f, 0.13571967f,
-        -0.11915682f, -0.11131983f, -0.18878011f, 0.06074620f, 0.20578890f,
-        0.12413109f, 0.03930207f, 0.29176015f, 0.29502738f, 0.27856228f,
-        -0.01803601f, 0.16646385f, 0.19268319f, 0.01900682f, 0.06026287f,
-        2.35868432e-003f, 0.01558199f, 0.02707230f, 0.11383014f, 0.12103992f,
-        0.03907350f, 0.04637353f, 0.09020995f, 0.11919726f, -3.63007211e-003f,
-        0.02220155f, 0.10336831f, 0.17351882f, 0.12259731f, 0.18983354f,
-        0.15736865f, 0.01160725f, -0.01690723f, -9.69582412e-004f, 0.07213813f,
-        0.01161613f, 0.17864859f, 0.24486147f, 0.18208991f, 0.20177495f,
-        0.05972528f, -8.93934630e-003f, -0.02316955f, 0.14436610f, 0.14114498f,
-        0.05520950f, 0.06353590f, -0.19124921f, 0.10174713f, 0.29414919f,
-        0.26448128f, 0.09344960f, 0.15284036f, 0.19797507f, 0.11369792f,
-        -0.12722753f, -0.21396367f, -0.02008235f, -0.06566695f, -0.01662150f,
-        -0.03937003f, 0.04778343f, 0.05017274f, -0.02299062f, -0.20208496f,
-        -0.06395898f, 0.13721776f, 0.22544557f, 0.14888357f, 0.08687132f,
-        0.27088094f, 0.32206613f, 0.09782200f, -0.18523243f, -0.17232181f,
-        -0.01041531f, 0.04008654f, 0.04199702f, -0.08081299f, -0.03755421f,
-        -0.04809646f, -0.05222081f, -0.21709201f, -0.06622940f, 0.02945281f,
-        -0.04600435f, -0.05256077f, -0.08432942f, 0.02848100f, 0.03490564f,
-        8.28621630e-003f, -0.11051246f, -0.11210597f, -0.01998289f,
-        -0.05369405f, -0.08869293f, -0.18799506f, -0.05436598f, -0.05011634f,
-        -0.05419716f, -0.06151857f, -0.10827805f, 0.04346735f, 0.04016083f,
-        0.01520820f, -0.12173316f, -0.04880285f, -0.01101406f, 0.03250847f,
-        -0.06009551f, -0.03082932f, -0.02295134f, -0.06856834f, -0.08775249f,
-        -0.23793389f, -0.09174541f, -0.05538322f, -0.04321031f, -0.11874759f,
-        -0.04221844f, -0.06070468f, 0.01194489f, 0.02608565f, -0.03892140f,
-        -0.01643151f, -0.02602034f, -0.01305472f, 0.03920100f, -0.06514261f,
-        0.01126918f, -6.27710763e-003f, -0.02720047f, -0.11133634f,
-        0.03300330f, 0.02398472f, 0.04079665f, -0.10564448f, 0.05966159f,
-        0.01195221f, -0.03179441f, -0.01692590f, -0.06177841f, 0.01841576f,
-        -5.51078189e-003f, -0.06821765f, -0.03191888f, -0.09545476f,
-        0.03030550f, -0.04896152f, -0.02914624f, -0.13283344f, -0.04783419f,
-        6.07836898e-003f, -0.01449538f, -0.13358212f, -0.09687774f,
-        -0.02813793f, 0.01213498f, 0.06650011f, -0.02039067f, 0.13356198f,
-        0.05986415f, -9.12760664e-003f, -0.18780160f, -0.11992817f,
-        -0.06342237f, 0.01229534f, 0.07143231f, 0.10713009f, 0.11085765f,
-        0.06569190f, -0.02956399f, -0.16288325f, -0.13993549f, -0.01292515f,
-        0.03833013f, 0.09130384f, -0.05086257f, 0.05617329f, -0.03896667f,
-        -0.06282311f, -0.11490010f, -0.14264110f, -0.04530499f, 0.01598189f,
-        0.09167797f, 0.08663294f, 0.04885277f, -0.05741219f, -0.07565769f,
-        -0.17136464f, -0.02619422f, -0.02477579f, 0.02679587f, 0.11621952f,
-        0.08788391f, 0.15520640f, 0.04709549f, 0.04504483f, -0.10214074f,
-        -0.12293372f, -0.04820546f, -0.05484834f, 0.05473754f, 0.07346445f,
-        0.05577277f, -0.08209965f, 0.03462975f, -0.20962234f, -0.09324598f,
-        3.79481679e-003f, 0.03617633f, 0.16742408f, 0.07058107f, 0.10204960f,
-        -0.06795346f, 3.22807301e-003f, -0.12589309f, -0.17496960f,
-        0.02078314f, -0.07694324f, 0.12184640f, 0.08997164f, 0.04793497f,
-        -0.11383379f, -0.08046359f, -0.25716835f, -0.08080962f,
-        6.80711539e-003f, -0.02930280f, -3.04938294e-003f, -0.11106286f,
-        -0.04628860f, -0.07821649f, 7.70127494e-003f, -0.10247706f,
-        1.21042714e-003f, 0.20573859f, -0.03241005f, 8.42972286e-003f,
-        0.01946464f, -0.01197973f, -0.14579976f, 0.04233614f,
-        -4.14096704e-003f, -0.06866436f, -0.02431862f, -0.13529138f,
-        1.25891645e-003f, -0.11425111f, -0.04303651f, -0.01694815f,
-        0.05720210f, -0.16040207f, 0.02772896f, 0.05498345f, -0.15010567f,
-        0.01450866f, 0.02350303f, -0.04301004f, -0.04951802f, 0.21702233f,
-        -0.03159155f, -0.01963303f, 0.18232647f, -0.03263875f,
-        -2.88476888e-003f, 0.01587562f, -1.94303901e-003f, -0.07789494f,
-        0.04674156f, -6.25576358e-003f, 0.08925962f, 0.21353747f, 0.01254677f,
-        -0.06999976f, -0.05931328f, -0.01884327f, -0.04306272f, 0.11794136f,
-        0.03842728f, -0.03907030f, 0.05636114f, -0.09766009f, -0.02104000f,
-        8.72711372e-003f, -0.02736877f, -0.05112274f, 0.16996814f, 0.02955785f,
-        0.02094014f, 0.08414304f, -0.03335762f, -0.03617457f, -0.05808248f,
-        -0.08872101f, 0.02927705f, 0.27077839f, 0.06075108f, 0.07478261f,
-        0.15282831f, -0.03908454f, -0.05101782f, -9.51998029e-003f,
-        -0.03272416f, -0.08735625f, 0.07633440f, -0.07185312f, 0.13841286f,
-        0.07812646f, -0.12901451f, -0.05488589f, -0.05644578f, -0.03290703f,
-        -0.11184757f, 0.03751570f, -0.05978153f, -0.09155276f, 0.05657315f,
-        -0.04328186f, -0.03047933f, -0.01413135f, -0.10181040f, -0.01384013f,
-        0.20132534f, -0.01536873f, -0.07641169f, 0.05906778f, -0.07833145f,
-        -0.01523801f, -0.07502609f, -0.09461885f, -0.15013233f, 0.16050665f,
-        0.09021381f, 0.08473236f, 0.03386267f, -0.09147339f, -0.09170618f,
-        -0.08498498f, -0.05119187f, -0.10431040f, 0.01041618f, -0.03064913f,
-        0.09340212f, 0.06448522f, -0.03881054f, -0.04985436f, -0.14794017f,
-        -0.05200112f, -0.02144495f, 0.04000821f, 0.12420804f, -0.01851651f,
-        -0.04116732f, -0.11951703f, -0.04879033f, -0.08722515f, -0.08454733f,
-        -0.10549165f, 0.11251976f, 0.10766345f, 0.19201984f, 0.06128913f,
-        -0.02734615f, -0.08834923f, -0.16999826f, -0.03548348f,
-        -5.36092324e-003f, 0.08297954f, 0.07226378f, 0.04194529f, 0.04668673f,
-        8.73902347e-003f, 0.06980139f, 0.05652480f, 0.05879445f, 0.02477076f,
-        0.02451423f, 0.12433673f, 0.05600227f, 0.06886370f, 0.03863076f,
-        0.07459056f, 0.02264139f, 0.01495469f, 0.06344220f, 0.06945208f,
-        0.02931899f, 0.11719371f, 0.04527427f, 0.03248192f, 2.08271481e-003f,
-        0.02044626f, 0.11403449f, 0.04303892f, 0.06444661f, 0.04959024f,
-        0.08174094f, 0.09240247f, 0.04894639f, 0.02252937f, -0.01652530f,
-        0.07587013f, 0.06064249f, 0.13954395f, 0.02772832f, 0.07093039f,
-        0.08501238f, 0.01701301f, 0.09055722f, 0.33421436f, 0.20163782f,
-        0.09821030f, 0.07951369f, 0.08695120f, -0.12757730f, -0.13865978f,
-        -0.06610068f, -0.10985506f, 0.03406816f, -0.01116336f, -0.07281768f,
-        -0.13525715f, -0.12844718f, 0.08956250f, 0.09171610f, 0.10092317f,
-        0.23385370f, 0.34489515f, 0.09901748f, 0.02002922f, 0.12335990f,
-        0.07606190f, -0.14899330f, -0.15634622f, -0.06494618f, -0.01760547f,
-        0.03404277f, -0.13208845f, -0.12101169f, -0.18294574f, -0.16560709f,
-        0.02183887f, -0.02752613f, 0.01813638f, 0.02000757f, 0.01319924f,
-        0.08030242f, 0.01220535f, 2.98233377e-003f, -0.01307070f, 0.05970297f,
-        -0.05345284f, -0.03381982f, -9.87543724e-003f, -0.06869387f,
-        0.03956730f, -0.03108176f, -0.05732809f, 0.02172386f, 0.04159765f,
-        2.62783933e-003f, 0.04813229f, 0.09358983f, -8.18389002e-003f,
-        0.01724574f, -0.02547474f, -0.04967288f, -0.02390376f, 0.06640504f,
-        -0.06306566f, 0.01137518f, 0.05589378f, -0.08237787f, 0.02455001f,
-        -0.03059422f, -0.08953978f, 0.06851497f, 0.07190268f, -0.07610799f,
-        7.87237938e-003f, -7.85830803e-003f, 0.06006952f, -0.01126728f,
-        -2.85743061e-003f, -0.04772895f, 0.01884944f, 0.15005857f,
-        -0.06268821f, -0.01989072f, 0.01138399f, 0.08760451f, 0.03879007f,
-        -9.66926850e-003f, -0.08012961f, 0.06414555f, -0.01362950f,
-        -0.09135523f, 0.01755159f, 0.04459474f, 0.09650917f, 0.05219948f,
-        -2.19440833e-003f, -0.07037939f, -0.01599054f, 0.13103317f,
-        -0.02492603f, -0.01032540f, -0.02903307f, 0.04489160f, 0.05148086f,
-        0.01858173f, -0.02919228f, 0.08299296f, -0.04590359f, -0.15745632f,
-        -0.09068198f, -0.02972453f, 0.12985018f, 0.22320485f, 0.24261914f,
-        0.03642650f, -0.05506422f, 2.67413049e-003f, -0.03834032f, 0.06449424f,
-        0.03834866f, 0.03816991f, 0.25039271f, 0.34212017f, 0.32433882f,
-        0.18824573f, -0.08599839f, -0.17599408f, -0.15317015f, -0.09913155f,
-        -0.02856072f, -0.05304699f, -1.06437842e-003f, -0.06641813f,
-        -0.07509298f, 0.01463361f, -0.07551918f, -0.04510373f,
-        -8.44620075e-003f, 0.01772176f, 0.04068235f, 0.20295307f, 0.15719447f,
-        0.05712103f, 0.26296997f, 0.14657754f, 0.01547317f, -0.05052776f,
-        -0.03881342f, -0.01437883f, -0.04930177f, 0.11719568f, 0.24098417f,
-        0.26468599f, 0.31698579f, 0.10103608f, -0.01096375f, -0.01367013f,
-        0.17104232f, 0.20065314f, 2.67622480e-003f, -0.01190034f, 0.18301608f,
-        0.09459770f, -0.06357619f, -0.06473801f, 0.01377906f, -0.10032775f,
-        -0.06388740f, 3.80393048e-003f, 0.06206078f, 0.10349120f, 0.26804337f,
-        8.17918684e-003f, -0.02314351f, 9.34422202e-003f, 0.09198381f,
-        0.03681326f, -8.77339672e-003f, -0.09662418f, -0.02715708f,
-        0.13503517f, 0.08962728f, -6.57071499e-003f, -0.03201199f, 0.28510824f,
-        0.32095715f, 0.18512695f, -0.14230858f, -0.14048551f, -0.07181299f,
-        -0.08575408f, -0.08661680f, -0.17416079f, 7.54326640e-004f,
-        0.05601677f, 0.13585392f, -0.04960437f, -0.07708392f, 0.10676333f,
-        -0.04407546f, -0.07209078f, 0.03663663f, 0.28949317f, 0.41127121f,
-        0.27431169f, -0.06900328f, -0.21474190f, -0.15578632f, -0.19555484f,
-        -0.15209621f, -0.11269179f, 0.07416003f, 0.18991330f, 0.26858172f,
-        0.01952259f, 0.01017922f, 0.02159843f, -4.95165400e-003f, -0.04368168f,
-        -0.12721671f, -0.06673957f, -0.11275250f, 0.04413409f, 0.05578312f,
-        0.03896771f, 0.03566417f, -0.05871816f, -0.07388090f, -0.17965563f,
-        -0.08570268f, -0.15273231f, -0.06022318f, -0.06999847f,
-        -6.81510568e-003f, 0.06294262f, -6.54901436e-004f, -0.01128654f,
-        -0.02289657f, 0.04849290f, 0.04140804f, 0.23681939f, 0.14545733f,
-        0.01989965f, 0.12032662f, 3.87463090e-003f, -6.02597650e-003f,
-        -0.05919775f, -0.03067224f, -0.07787777f, 0.10834727f, 0.02153730f,
-        0.02765649f, 0.03975543f, -0.12182906f, -0.04900113f, -0.09940100f,
-        -0.06453611f, -0.13757215f, -0.03721382f, 0.02827376f, -0.04351249f,
-        0.01907038f, -0.10284120f, -0.05671160f, -0.10760647f, -0.09624009f,
-        -0.09565596f, -0.01303654f, 0.03080539f, 0.01416511f, 0.05846142f,
-        -5.42971538e-003f, 0.06221476f, -0.03320325f, -0.06791797f,
-        -0.05791342f, 0.12851369f, 0.14990346f, 0.03634374f, 0.14262885f,
-        0.04330391f, 0.05032569f, -0.05631914f, 0.01606137f, 0.04387223f,
-        0.22344995f, 0.15722635f, -0.04693628f, 0.03006579f, -2.52882647e-003f,
-        0.05717621f, -0.07529724f, -0.02848588f, -0.06868757f,
-        -4.51729307e-003f, 0.06466042f, -0.05935378f, -0.04704857f,
-        -0.07363959f, 0.04843248f, -0.13421375f, -0.09789340f, -0.10255270f,
-        0.03509852f, 0.04751543f, -0.03822323f, 0.09740467f, 0.04762916f,
-        0.03940146f, -0.08283259f, 0.09552965f, 0.05038739f, 0.21258622f,
-        0.09646992f, 0.03241193f, 0.05167701f, 0.04614570f, 0.04330090f,
-        -0.02671840f, -0.06259909f, -0.02301898f, 0.18829170f, 0.10522786f,
-        0.04313190f, 0.01670948f, -0.08421925f, 0.05911417f, -0.10582602f,
-        -0.04855484f, -0.08373898f, 0.07775915f, 0.03723533f, -0.12047344f,
-        4.86345543e-003f, -0.10520902f, 0.06571782f, -0.07528137f,
-        -0.03245651f, -0.09869066f, -0.02917477f, -0.18293270f, 0.14810945f,
-        9.24033765e-003f, -0.04354914f, 0.02266885f, -0.11872729f,
-        -0.04016589f, 0.02830229f, 0.22539048f, 0.20565644f, 0.16701797f,
-        0.09019924f, 0.01300652f, 0.09760600f, -0.03675831f, -0.01935448f,
-        -0.06894835f, 0.08077277f, 0.19047537f, 0.11312226f, 0.04106043f,
-        -0.11187182f, 0.04312806f, -0.18548580f, -0.11287174f, -0.08794551f,
-        0.02078281f, -0.15295486f, 0.11806386f, -0.01103218f, -0.15971117f,
-        0.02153538f, -0.05232147f, -0.10835317f, -0.13910367f, 0.05920752f,
-        -0.10122602f, 0.20174250f, 0.09105796f, -0.01881348f, 0.09559010f,
-        -0.03725745f, -0.09442931f, -0.09763174f, 0.05854454f, 0.08287182f,
-        0.12919849f, 0.08594352f, -2.49806582e-003f, 0.02398440f,
-        5.67950122e-003f, -0.06296340f, -0.12993270f, 0.03855852f, 0.05186560f,
-        0.10839908f, -0.03380463f, -0.12654832f, -0.05399339f, -0.07456800f,
-        -0.04736232f, -0.10164231f, 0.07496139f, 0.08125214f, 0.07656177f,
-        -0.04999603f, -0.12823077f, -0.07692395f, -0.11317524f, -0.09118655f,
-        -0.05695669f, 0.10477209f, 0.07468581f, 0.01630048f, -8.00961629e-003f,
-        -0.06582128f, -0.04019095f, -0.04682907f, -0.01907842f, -0.10997720f,
-        0.04911406f, 0.02931030f, 0.04197735f, -0.05773980f, -0.09670641f,
-        -0.03594951f, -0.03402121f, -0.07149299f, -0.10566200f, 0.10601286f,
-        0.06340689f, -0.01518632f, -5.96402306e-003f, -0.07628012f,
-        -3.52779147e-003f, -0.02683854f, -0.10265494f, -0.02680815f,
-        0.16338381f, 0.03103515f, 0.02296976f, 0.01624348f, -0.10831620f,
-        -0.02314233f, -0.04789969f, -0.05530700f, -0.06461314f, 0.10494506f,
-        0.04642856f, -0.07592955f, -0.06197905f, -0.09042154f, -0.01445521f,
-        -0.04297818f, -0.11262015f, -0.11430512f, 0.03174541f, -0.03677487f,
-        -0.02963996f, -0.06610169f, -0.13292049f, -0.07059067f, -0.08444111f,
-        -0.02640536f, -0.07136250f, 0.04559967f, 0.01459980f, 0.17989251f,
-        0.04435328f, -0.12464730f, -0.02871115f, -0.10752209f, -0.03393742f,
-        -0.03791408f, 0.02548251f, 0.01956050f, 0.19245651f, 0.13963254f,
-        -0.05904696f, -0.07424626f, -0.10411884f, 1.54176133e-003f,
-        0.01797429f, 0.13025844f, 0.04547642f, -0.05710349f, -0.10697161f,
-        -0.13489437f, -0.06515755f, -0.06406886f, -4.08572936e-003f,
-        -0.01336483f, 0.04368737f, -0.11259720f, -0.05701635f, -0.06469971f,
-        -0.08346602f, -0.04166770f, -0.05795543f, -0.08247511f, -0.05742628f,
-        0.08452254f, -0.03350224f, 0.13980860f, 0.13252275f, 0.07589617f,
-        0.07539988f, 0.12155797f, 0.19087289f, 0.15050751f, 0.21250245f,
-        0.14206800f, 0.01298489f, 0.07450245f, 0.06559097f, 0.01700557f,
-        0.04512971f, 0.16950700f, 0.10261577f, 0.16389982f, 0.05505059f,
-        -0.03453077f, 0.08622462f, 0.07935954f, 0.03976260f, 0.02036091f,
-        3.95744899e-003f, 0.03267065f, 0.15235919f, 0.01297494f, -0.08109194f,
-        0.01407558f, 4.40693414e-003f, -0.15157418f, -0.11390478f,
-        -0.07487597f, -7.81322457e-003f, -0.02749545f, -0.10181408f,
-        0.13755716f, 0.14007211f, 0.13482562f, 0.27517235f, 0.34251109f,
-        0.07639657f, 0.07268607f, 0.19823882f, 0.16135791f, -0.04186463f,
-        -0.12784107f, -0.09846287f, 0.03169041f, 0.10974082f, -0.15051922f,
-        -0.08916726f, -0.07138767f, -0.04153349f, 6.25418453e-003f,
-        0.01266654f, 0.10533249f, 0.12749144f, 0.15148053f, 0.01498513f,
-        0.06305949f, -0.01247123f, -0.08778401f, -0.08551880f, -0.11955146f,
-        -0.08493572f, -0.02901620f, -0.02394859f, -0.13427313f, -0.11053200f,
-        -0.14413260f, -0.15203285f, 0.03972760f, -3.72127310e-004f,
-        -0.04200919f, 0.06105104f, 0.01904975f, -0.01106191f,
-        -7.27445772e-003f, -0.01520341f, 1.10228511e-003f, -0.04949187f,
-        -0.08013099f, 5.72071038e-003f, 0.08415454f, -0.06523152f, 0.03664081f,
-        -0.02673042f, -0.12066154f, -0.03702074f, 0.06006580f, 0.01628682f,
-        -6.17772620e-003f, 0.08192339f, -3.41629819e-003f, 0.02870512f,
-        0.05807141f, 0.04959986f, 0.04618251f, -0.04901629f, -0.10579574f,
-        0.02274442f, 0.12070961f, 2.23597488e-003f, 0.09831765f, -0.03019848f,
-        -0.11181970f, -0.04961075f, 0.02498928f, -0.03714991f, -0.01619653f,
-        0.02643486f, -7.62964319e-003f, -0.02882290f, -0.06242594f,
-        -0.08439861f, 0.07220893f, 0.07263952f, 0.01561574f, 0.03091968f,
-        0.01708712f, -0.03797151f, -3.18561122e-003f, 0.01624021f,
-        -0.02828573f, 0.11284444f, -1.32280716e-003f, -0.07784860f,
-        -0.07209100f, 0.03372242f, 0.12154529f, 0.02278104f, -0.05275500f,
-        -0.01918484f, 0.12989293f, 0.05424401f, 0.02333086f, 0.04029022f,
-        0.12392918f, 0.09495489f, 0.09190340f, 0.07935889f, 8.76816828e-003f,
-        0.17148446f, -8.51302687e-003f, -0.08011249f, -0.06796283f,
-        0.04884845f, 0.01112272f, -0.07835306f, -1.14811445e-003f,
-        -0.03440760f, 0.02845243f, 0.07695542f, -0.07069533f, -0.01151784f,
-        -8.53884313e-003f, -0.01662786f, -0.04163864f, 0.05400505f,
-        0.02859163f, 0.02921852f, 0.05003135f, -6.85718050e-003f, -0.01632611f,
-        0.07780217f, 0.04042810f, -0.01216440f, 3.60914599e-003f, -0.06322435f,
-        0.09516726f, 0.12877031f, -9.69162490e-003f, 0.01031179f, 0.05180895f,
-        -9.34659224e-003f, -0.01644533f, -0.04849347f, -0.04343236f,
-        0.10514783f, 0.08046635f, -0.04615205f, -0.03975486f, -0.01485525f,
-        0.13096830f, -0.01517950f, -0.06571898f, -0.04016372f, 0.01849786f,
-        0.02439670f, 0.08067258f, 1.74824719e-003f, 0.07053747f, 0.08819518f,
-        -5.08352555e-003f, -0.06550863f, -0.08266170f, -0.07780605f,
-        0.01453450f, -0.08756890f, 0.01096501f, -8.71319138e-003f, 0.10110464f,
-        0.02420769f, -0.06708383f, 0.02007811f, 5.93133038e-003f, 0.05398923f,
-        0.07538138f, 0.02049227f, 0.02242589f, 0.04011070f, -1.44875818e-003f,
-        -4.19115182e-003f, 0.06367654f, 0.02506934f, 0.02434536f, 0.05879405f,
-        -8.22952855e-003f, -0.01242441f, 0.04224926f, -0.01754923f,
-        0.05958161f, 0.03818886f, -0.01830363f, -0.04308917f, -0.04422197f,
-        -0.02432721f, 0.02264866f, 2.03751423e-003f, 0.01197031f, 0.04439203f,
-        0.12169247f, 0.03602713f, -0.02599251f, -1.98226492e-003f, 0.02046336f,
-        -0.02639058f, -1.91242550e-003f, -0.09334669f, -0.03595153f,
-        -9.88179818e-003f, -0.06848445f, -0.04666303f, -0.09955736f,
-        -0.04206430f, 0.02609075f, 9.09005292e-003f, -0.07138551f,
-        -4.22313227e-004f, 0.01766645f, 0.02756404f, 0.01308276f, 0.04052891f,
-        0.02387515f, 0.05337298f, 0.02500631f, -0.04970853f, -0.12467445f,
-        0.17604403f, 0.12256411f, -0.07512254f, 8.70451052e-003f, -0.05697548f,
-        -0.03626474f, -8.76623299e-003f, -0.01210897f, -0.09451522f,
-        0.07490732f, -0.02008001f, -0.02681278f, -0.06463405f, -0.01517507f,
-        7.33757764e-003f, 6.07147906e-003f, -0.09316964f, -0.04575328f,
-        0.13261597f, 0.15424870f, -0.01655918f, -0.02772390f, -0.05243644f,
-        -0.02356456f, -0.02351753f, -0.10211615f, -0.12873036f, 0.14549787f,
-        0.12519856f, 4.38762689e-003f, 0.02795992f, 0.05170322f, 0.09223596f,
-        0.05890015f, 0.02376701f, -0.02777346f, 0.09506908f, 0.02328936f,
-        -0.02319928f, -0.03218696f, -0.01527841f, -0.01016694f, -0.02674719f,
-        0.05137179f, 0.01980666f, 0.06544447f, -0.01746171f, 0.01026380f,
-        0.01561806f, 7.97004555e-004f, 0.07601810f, 0.01907250f, -0.03083035f,
-        -0.05987392f, 0.09242783f, 0.14555025f, 0.01035827f, 0.03092401f,
-        -0.09562709f, -0.03802354f, 0.02531144f, 0.03079449f, -0.07100715f,
-        0.03330721f, -2.69116857e-003f, 0.03167490f, 0.05744999f, 0.03259895f,
-        1.91266940e-003f, 0.03194578f, 0.07389776f, 0.02198060f, 0.07633314f,
-        0.03293105f, -0.09103648f, 0.04718142f, 0.06102672f, -0.01003063f,
-        5.85481385e-003f, -0.01522574f, 0.02323526f, 0.10584345f,
-        4.35879454e-003f, 0.06107873f, 0.05868603f, -0.03115531f, 0.01214679f,
-        0.08567052f, 3.93926632e-003f, -0.02521488f, -1.88425183e-003f,
-        0.02038053f, -6.26854831e-004f, 0.04897438f, -0.04280585f,
-        -0.04819689f, -0.04812867f, -0.01451186f, 0.05101469f,
-        -9.01125465e-003f, -0.03333859f, 0.03917955f, 0.04196448f, 0.04292135f,
-        0.02809529f, 0.02999715f, 0.04081348f, 9.10039060e-003f, 0.09703232f,
-        0.10379741f, 0.02348725f, -4.72756615e-003f, 0.01027325f, 0.10402658f,
-        0.12071823f, 0.09817299f, -0.02612033f, 0.03638414f, 0.05896405f,
-        0.04865025f, 0.04793910f, -0.03882321f, -0.02962117f, -0.01222268f,
-        0.04071597f, 0.01922777f, -0.02287866f, 0.03328381f, 0.01859092f,
-        0.09024994f, 0.03804455f, -0.01424510f, 0.01953739f, 0.02509617f,
-        -0.03390914f, -0.05663941f, -0.01641979f, 0.05848591f, 0.04639670f,
-        0.02092116f, 0.12911791f, 0.19918139f, 0.07739855f, -7.25806039e-003f,
-        0.04074838f, 0.03183993f, 1.39251316e-003f, -0.01428625f, 0.01865480f,
-        0.08529541f, 0.13547510f, 0.11189661f, 0.03998901f, 0.09575938f,
-        -0.02631102f, -0.03458253f, -0.04749985f, -0.06070716f,
-        4.71884012e-003f, 0.06445789f, -0.02450038f, -0.05483776f,
-        -0.04657237f, -0.02030717f, -0.03480766f, -0.09397731f, -0.06399718f,
-        -0.01804585f, 5.62348310e-003f, -6.64811488e-003f, -0.06517869f,
-        6.96210237e-003f, -0.01860148f, -0.04245830f, -0.05850367f,
-        -3.24417115e-003f, 0.07700698f, 0.11290991f, 0.09923030f, -0.02970599f,
-        0.05592411f, 0.04813979f, -0.09811195f, -0.09357996f, -0.03276114f,
-        0.05218338f, 0.04141375f, 3.92977800e-003f, -0.05047480f, 0.15960084f,
-        0.04612800f, -0.03114098f, -0.04650044f, -0.03249795f, -0.02425641f,
-        -0.04311355f, 0.04307659f, -0.09401883f, -0.04742785f, -0.01254499f,
-        -0.06598741f, 3.41369561e-003f, -0.05620445f, -7.28127593e-003f,
-        -0.05998361f, -0.03274450f, -0.07376868f, 3.19015374e-003f,
-        -0.07733069f, 0.05815864f, -0.02471071f, 0.03850617f, 0.13838784f,
-        0.15399861f, 0.01731321f, -0.01477586f, 0.10393341f, 0.05159833f,
-        -0.01945555f, -0.03427503f, -0.04867341f, 0.09237480f, 0.10732719f,
-        0.06071450f, -0.01355071f, 0.01844356f, -0.03480803f, -0.03796671f,
-        2.15628621e-004f, -0.05440186f, 0.01889855f, -0.01443413f,
-        -0.02607902f, -0.02938001f, 0.02720689f, -0.06228397f, -0.02970936f,
-        -0.03426210f, -0.10280876f, -0.06739304f, -0.05227850f, 0.03360292f,
-        -0.11278441f, -0.06966180f, -0.13937433f, 9.10932291e-003f,
-        2.52020749e-004f, -4.07359656e-003f, 0.12310639f, 0.09343060f,
-        0.07302511f, 0.03222093f, 0.07532879f, 0.03792387f, -0.04985180f,
-        0.01804602f, 0.02694195f, 0.13481498f, 0.04601225f, 0.04106982f,
-        0.08511057f, 0.12314661f, 0.01320830f, 0.05044121f, -5.52943908e-003f,
-        -0.08992624f, -0.02249301f, -0.08181777f, 0.06165213f, -0.03256603f,
-        -0.01068920f, -0.01323473f, -0.11970232f, -0.04616347f, -0.12088681f,
-        -0.06762606f, -0.08676834f, -0.06434575f, 0.01772529f, 0.03469615f,
-        -0.10926618f, 0.03013873f, 0.14030397f, 0.16130108f, 0.17985588f,
-        0.11281928f, 0.10530639f, 0.08905948f, 0.07733764f, 0.06695238f,
-        0.02142088f, 0.06438877f, 0.09794453f, 0.05745072f, 0.02788557f,
-        0.02632830f, 0.07985807f, 4.24902979e-003f, 8.47890321e-003f,
-        -0.02679466f, -5.28812688e-003f, -0.02162580f, -0.07490715f,
-        -0.08251337f, -0.02056576f, -0.01026194f, -1.15492963e-003f,
-        -5.75720915e-004f, -0.07210591f, -0.07320981f, -0.04883312f,
-        -0.10897151f, -0.07477258f, -0.08867134f, -0.09222437f, -0.10924666f,
-        -0.10430276f, 0.07953499f, 0.02767959f, 0.11393359f, 0.18779543f,
-        0.03313421f, 0.02143700f, 0.05852016f, -2.12067598e-003f,
-        -3.76984011e-003f, 0.02774167f, -0.03124610f, 0.01465141f, 0.01616004f,
-        -0.01391913f, -0.04404102f, -0.05444227f, -0.14684731f, -0.15016587f,
-        0.04509468f, 1.29563001e-003f, 0.01398350f, 0.05610404f, -0.04868806f,
-        -0.04776716f, -8.16873740e-003f, -2.30126386e-003f, -0.02286313f,
-        0.11983398f, -0.04703261f, -0.08814441f, -0.07585249f, -0.10799607f,
-        -0.03232087f, 0.01509786f, -0.04843464f, -0.03967846f, 0.09589416f,
-        0.01352560f, -0.01458119f, 0.01050829f, -0.03038946f, 0.01608388f,
-        1.11975556e-003f, -0.01250656f, 2.86211423e-003f, 0.04333691f,
-        -0.14603497f, -0.01946543f, -0.02327525f, -0.01973944f, 0.07944400f,
-        -0.02224544f, -0.06701808f, 0.03476532f, 0.11505594f, -0.02712801f,
-        -0.01665113f, 0.06315716f, -0.08205860f, 0.07431999f, 0.04915778f,
-        -0.04468752f, -0.01490402f, 0.07400476f, -0.11650901f, 0.05102430f,
-        0.04559118f, -0.05916039f, 0.08840760f, -0.01587902f, -0.14890194f,
-        0.07857784f, 0.04710254f, -0.05381983f, -0.07331945f, -0.03604643f,
-        0.15611970f, 0.07649943f, -0.05959348f, -0.02776607f, 0.11098688f,
-        0.03758875f, -0.04446875f, 0.04933187f, 0.01345535f, 0.06921103f,
-        0.07364785f, 0.05518956f, 0.02899585f, 0.09375840f, 0.10518434f,
-        -0.04420241f, 0.01915282f, -3.56386811e-003f, 0.14586878f, 0.10286101f,
-        -0.04360626f, -0.12723237f, 0.09076386f, 0.11119842f, -0.06035013f,
-        0.09674817f, 0.08938243f, 0.07065924f, 0.02603180f, 5.84815582e-003f,
-        -0.05922065f, 0.12360309f, 3.59695964e-003f, 2.99844006e-003f,
-        0.03697936f, 0.02043072f, 0.04168725f, 0.01025975f, -0.01359980f,
-        -0.01600920f, 0.02581056f, 0.02329250f, 2.98100687e-003f, 0.01629762f,
-        0.06652115f, 0.05855627f, 0.01237463f, -0.01297135f, 0.01761587f,
-        0.05090865f, 0.06549342f, -0.04425945f, 2.43203156e-003f,
-        3.07327788e-003f, 0.06678630f, -0.04303836f, 0.01082393f, -0.06476044f,
-        0.04077786f, 0.12441979f, 0.08237778f, 0.07424165f, 0.04065890f,
-        0.06905543f, 0.09556347f, 0.12724875f, -0.02132082f, 0.08514154f,
-        -0.04175328f, -0.02666954f, 0.01897836f, 0.03317382f, 9.45465732e-003f,
-        -0.01238974f, -0.04242500f, -0.01419479f, -0.03545213f, -0.02440874f,
-        0.08684119f, 0.04212951f, 0.02462858f, -0.01104825f, -5.01706870e-003f,
-        0.02968982f, 0.02597476f, -0.01568939f, 0.04514892f, 0.06974549f,
-        0.08670278f, 0.06828108f, 0.10238872f, 0.05405957f, 0.06548470f,
-        -0.03763957f, 0.01366090f, 0.07069602f, 0.05363748f, 0.04798120f,
-        0.11706422f, 0.05466456f, -0.01869259f, 0.06344382f, 0.03106543f,
-        0.08432506f, -0.02061096f, 0.03821088f, -6.92190882e-003f,
-        6.40467042e-003f, -0.01271779f, 6.89014705e-005f, 0.04541415f,
-        -0.01899539f, -0.05020239f, 0.03000903f, 0.01090422f, 4.52452758e-003f,
-        0.02573632f, -0.02388454f, -0.04200457f, 1.72783900e-003f,
-        -0.05978370f, -0.02720562f, 0.06573715f, 0.01154317f, 0.01265615f,
-        0.07375994f, -9.19828378e-003f, -0.04914120f, 0.02124831f, 0.06455322f,
-        0.04372910f, -0.03310043f, 0.03605788f, -6.78055827e-003f,
-        9.36202332e-003f, 0.01747596f, -0.06406314f, -0.06812935f, 0.08080816f,
-        -0.02778088f, 0.02735260f, 0.06393493f, 0.06652229f, 0.05676993f,
-        0.08640018f, -7.59188086e-003f, -0.02012847f, -0.04741159f,
-        -0.01657069f, -0.01624399f, 0.05547778f, -2.33309763e-003f,
-        0.01120033f, 0.06141156f, -0.06285004f, -0.08732341f, -0.09313398f,
-        -0.04267832f, 5.57443965e-003f, 0.04809862f, 0.01773641f,
-        5.37361018e-003f, 0.14842421f, -0.06298012f, -0.02935147f, 0.11443478f,
-        -0.05034208f, 5.65494271e-003f, 0.02076526f, -0.04577984f,
-        -0.04735741f, 0.02961071f, -0.09307127f, -0.04417921f, -0.04990027f,
-        -0.03940028f, 0.01306016f, 0.06267900f, 0.03758737f, 0.08460117f,
-        0.13858789f, 0.04862388f, -0.06319809f, -0.05655516f, 0.01885816f,
-        -0.03285607f, 0.03371567f, -0.07040928f, -0.04514049f, 0.01392166f,
-        0.08184422f, -0.07230316f, 0.02386871f, 0.02184591f, 0.02605764f,
-        -0.01033954f, 9.29878280e-003f, 7.67351175e-003f, 0.15189242f,
-        0.02069071f, -0.09738296f, -0.08894105f, -0.07768748f, 0.02332268f,
-        -0.01778995f, -0.03258888f, -0.08180822f, -0.08492987f, 0.02290156f,
-        -0.11368170f, -0.03554465f, -0.04533844f, -0.02861580f, 0.06782424f,
-        0.01113123f, 0.02453644f, 0.12721945f, 0.08084814f, -0.03607795f,
-        0.01109122f, 0.04803548f, -0.03489929f, 0.03399536f, -0.05682014f,
-        8.59533902e-003f, -4.27904585e-003f, 0.03230887f, -0.01300198f,
-        -0.01038137f, -0.07930113f, 8.33097473e-003f, 0.02296994f,
-        -0.01306500f, -0.01881626f, 0.04413369f, 0.05729880f, -0.03761553f,
-        0.01942326f, 1.64540811e-003f, -0.03811319f, 0.04190650f, -0.14978096f,
-        -0.04514487f, 0.01209545f, -5.46460645e-003f, -0.01647195f,
-        7.63064111e-003f, -0.07494587f, 0.08415288f, 0.10020141f, -0.01228561f,
-        0.06553826f, 0.04554005f, 0.07890417f, 0.03041138f, 0.01752007f,
-        0.09208256f, -3.74419295e-004f, 0.10549527f, 0.04686913f, 0.01894833f,
-        -0.02651412f, -4.34682379e-003f, 5.44942822e-003f, 0.01444484f,
-        0.05882156f, -0.03336544f, 0.04603891f, -0.10432546f, 0.01923928f,
-        0.01842845f, -0.01712168f, -0.02222766f, 0.04693324f, -0.06202956f,
-        -0.01422159f, 0.08732220f, -0.07706107f, 0.02661049f, -0.04300238f,
-        -0.03092422f, -0.03552184f, -0.01886088f, -0.04979934f, 0.03906401f,
-        0.04608644f, 0.04966111f, 0.04275464f, -0.04621769f, -0.02653212f,
-        8.57011229e-003f, 0.03839684f, 0.05818764f, 0.03880796f,
-        -2.76100676e-004f, 0.03076511f, -0.03266929f, -0.05374557f,
-        0.04986527f, -9.45429131e-003f, 0.03582499f, -2.64564669e-003f,
-        -1.07461517e-003f, 0.02962313f, -0.01483363f, 0.03060869f, 0.02448327f,
-        0.01845641f, 0.03282966f, -0.03534438f, -0.01084059f, -0.01119136f,
-        -1.85360224e-003f, -5.94652840e-004f, -0.04451817f, 2.98327743e-003f,
-        0.06272484f, -0.02152076f, -3.05971340e-003f, -0.05070828f,
-        0.01531762f, 0.01282815f, 0.05167150f, 9.46266949e-003f,
-        -3.34558333e-003f, 0.11442288f, -0.03906701f, -2.67325155e-003f,
-        0.03069184f, -0.01134165f, 0.02949462f, 0.02879886f, 0.03855566f,
-        -0.03450781f, 0.09142872f, -0.02156654f, 0.06075062f, -0.06220816f,
-        0.01944680f, 6.68372354e-003f, -0.06656796f, 8.70784000e-003f,
-        0.03456013f, 0.02434320f, -0.13236357f, -0.04177035f, -0.02069627f,
-        0.01068112f, 0.01505432f, -0.07517391f, -3.83571628e-003f,
-        -0.06298508f, -0.02881260f, -0.13101046f, -0.07221562f,
-        -5.79945277e-003f, -8.57300125e-003f, 0.03782469f, 0.02762164f,
-        0.04942456f, -0.02936396f, 0.09597211f, 0.01921411f, 0.06101191f,
-        -0.04787507f, -0.01379578f, -7.40224449e-003f, -0.02220136f,
-        -0.01313756f, 7.77558051e-003f, 0.12296968f, 0.02939998f, 0.03594062f,
-        -0.07788624f, -0.01133144f, 3.99316690e-004f, -0.06090347f,
-        -0.01122066f, -4.68682544e-003f, 0.07633100f, -0.06748922f,
-        -0.05640298f, -0.05265681f, -0.01139122f, -0.01624347f, -0.04715714f,
-        -0.01099092f, 0.01048561f, 3.28499987e-003f, -0.05810167f,
-        -0.07699911f, -0.03330683f, 0.04185145f, 0.03478536f, 0.02275165f,
-        0.02304766f, 6.66040834e-003f, 0.10968148f, -5.93013782e-003f,
-        -0.04858336f, -0.04203213f, -0.09316786f, -6.13074889e-003f,
-        -0.02544625f, 0.01366201f, 9.18555818e-003f, -0.01846578f,
-        -0.05622401f, -0.03989377f, -0.07810296f, 6.91275718e-003f,
-        0.05957597f, -0.03901334f, 0.01572002f, -0.01193903f,
-        -6.89400872e-003f, -0.03093356f, -0.04136098f, -0.01562869f,
-        -0.04604580f, 0.02865234f, -0.08678447f, -0.03232484f, -0.05364593f,
-        -0.01445016f, -0.07003860f, -0.08669746f, -0.04520775f, 0.04274122f,
-        0.03117515f, 0.08175703f, 0.01081109f, 0.06379741f, 0.06199206f,
-        0.02865988f, 0.02360346f, 0.06725410f, -0.03248780f, -9.37702879e-003f,
-        0.08265898f, -0.02245839f, 0.05125763f, -0.01862395f, 0.01973453f,
-        -0.01994494f, -0.10770868f, 0.03180375f, 3.23935156e-003f,
-        -0.02142080f, -0.04256190f, 0.04760900f, 0.04282863f, 0.05635953f,
-        -0.01870849f, 0.05540622f, -0.03042666f, 0.01455277f, -0.06630179f,
-        -0.05843807f, -0.03739681f, -0.09739155f, -0.03220233f, -0.05620182f,
-        -0.10381401f, 0.07400211f, 4.20676917e-003f, 0.03258535f,
-        2.14308966e-003f, 0.05121966f, -0.01274337f, 0.02384761f, 0.06335578f,
-        -0.07905591f, 0.08375625f, -0.07898903f, -0.06508528f, -0.02498444f,
-        0.06535810f, 0.03970535f, 0.04895468f, -0.01169566f, -0.03980601f,
-        0.05682293f, 0.05925463f, -0.01165808f, -0.07936699f, -0.04208954f,
-        0.01333987f, 0.09051196f, 0.10098671f, -0.03974256f, 0.01238771f,
-        -0.07501741f, -0.03655440f, -0.04301528f, 0.09216860f,
-        4.63579083e-004f, 0.02851115f, 0.02142735f, 1.28244064e-004f,
-        0.02879687f, -0.08554889f, -0.04838862f, 0.08135369f, -0.05756533f,
-        0.01413900f, 0.03451880f, -0.06619488f, -0.03053130f, 0.02961676f,
-        -0.07384635f, 0.01135692f, 0.05283910f, -0.07778034f, -0.02107482f,
-        -0.05511716f, -0.13473752f, 0.03030157f, 0.06722020f, -0.06218817f,
-        -0.05826827f, 0.06254654f, 0.02895772f, -0.01664000f, -0.03620280f,
-        -0.01612278f, -1.46097376e-003f, 0.14013411f, -8.96181818e-003f,
-        -0.03250246f, 3.38630192e-003f, 2.64779478e-003f, 0.03359732f,
-        -0.02411991f, -0.04229729f, 0.10666174f, -6.66579151f
-    };
-    return std::vector<float>(detector, detector + sizeof(detector) / sizeof(detector[0]));
-}
-
-/* Returns the nearest upper power of two, works only for
-the typical GPU thread count (pert block) values */
-static int power_2up(unsigned int n)
-{
-    if (n < 1) return 1;
-    else if (n < 2) return 2;
-    else if (n < 4) return 4;
-    else if (n < 8) return 8;
-    else if (n < 16) return 16;
-    else if (n < 32) return 32;
-    else if (n < 64) return 64;
-    else if (n < 128) return 128;
-    else if (n < 256) return 256;
-    else if (n < 512) return 512;
-    else if (n < 1024) return 1024;
-    return -1; // Input is too big
-}
-
-void cv::ocl::device::hog::set_up_constants(int nbins,
-                                            int block_stride_x, int block_stride_y,
-                                            int nblocks_win_x, int nblocks_win_y)
-{
-    cnbins = nbins;
-    cblock_stride_x = block_stride_x;
-    cblock_stride_y = block_stride_y;
-    cnblocks_win_x = nblocks_win_x;
-    cnblocks_win_y = nblocks_win_y;
-
-    int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
-    cblock_hist_size = block_hist_size;
-
-    int descr_width = nblocks_win_x * block_hist_size;
-    cdescr_width = descr_width;
-    cdescr_height = nblocks_win_y;
-
-    int descr_size = descr_width * nblocks_win_y;
-    cdescr_size = descr_size;
-}
-
-void cv::ocl::device::hog::compute_hists(int nbins,
-                                         int block_stride_x, int block_stride_y,
-                                         int height, int width,
-                                         const cv::ocl::oclMat &grad,
-                                         const cv::ocl::oclMat &qangle,
-                                         const cv::ocl::oclMat &gauss_w_lut,
-                                         cv::ocl::oclMat &block_hists)
-{
-    Context *clCxt = Context::getContext();
-    std::vector< std::pair<size_t, const void *> > args;
-    String kernelName = "compute_hists_lut_kernel";
-
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
-        / block_stride_x;
-    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
-        / block_stride_y;
-    int blocks_total = img_block_width * img_block_height;
-
-    int grad_quadstep = grad.step >> 2;
-    int qangle_step = qangle.step;
-
-    int blocks_in_group = 4;
-    size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
-    size_t globalThreads[3] = {
-        divUp(img_block_width * img_block_height, blocks_in_group) * localThreads[0], 2, 1 };
-
-    int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12) * sizeof(float);
-    int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y) * sizeof(float);
-
-    int smem = (hists_size + final_hists_size) * blocks_in_group;
-
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cblock_stride_x));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cblock_stride_y));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cnbins));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cblock_hist_size));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_block_width));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&blocks_in_group));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&blocks_total));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&grad_quadstep));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&qangle_step));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&grad.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&qangle.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&block_hists.data));
-    args.push_back( std::make_pair( smem, (void *)NULL));
-
-    if(hog_device_cpu)
-    {
-        openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
-            localThreads, args, -1, -1, "-D CPU");
-    }
-    else
-    {
-        cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
-        size_t wave_size = queryWaveFrontSize(kernel);
-        char opt[32] = {0};
-        sprintf(opt, "-D WAVE_SIZE=%d", (int)wave_size);
-        openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
-            localThreads, args, -1, -1, opt);
-    }
-}
-
-void cv::ocl::device::hog::normalize_hists(int nbins,
-                                           int block_stride_x, int block_stride_y,
-                                           int height, int width,
-                                           cv::ocl::oclMat &block_hists,
-                                           float threshold)
-{
-    Context *clCxt = Context::getContext();
-    std::vector< std::pair<size_t, const void *> > args;
-    String kernelName;
-
-    int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
-        / block_stride_x;
-    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y)
-        / block_stride_y;
-    int nthreads;
-    size_t globalThreads[3] = { 1, 1, 1  };
-    size_t localThreads[3] = { 1, 1, 1  };
-
-    if ( nbins == 9 )
-    {
-        /* optimized for the case of 9 bins */
-        kernelName = "normalize_hists_36_kernel";
-        int blocks_in_group = NTHREADS / block_hist_size;
-        nthreads = blocks_in_group * block_hist_size;
-        int num_groups = divUp( img_block_width * img_block_height, blocks_in_group);
-        globalThreads[0] = nthreads * num_groups;
-        localThreads[0] = nthreads;
-    }
-    else
-    {
-        kernelName = "normalize_hists_kernel";
-        nthreads = power_2up(block_hist_size);
-        globalThreads[0] = img_block_width * nthreads;
-        globalThreads[1] = img_block_height;
-        localThreads[0] = nthreads;
-
-        if ((nthreads < 32) || (nthreads > 512) )
-            cv::error(Error::StsBadArg, "normalize_hists: histogram's size is too small or too big",
-                "normalize_hists", __FILE__, __LINE__);
-
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&nthreads));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_hist_size));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_block_width));
-    }
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&block_hists.data));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&threshold));
-    args.push_back( std::make_pair( nthreads * sizeof(float), (void *)NULL));
-
-    if(hog_device_cpu)
-        openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
-                             localThreads, args, -1, -1, "-D CPU");
-    else
-    {
-        cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
-        size_t wave_size = queryWaveFrontSize(kernel);
-        char opt[32] = {0};
-        sprintf(opt, "-D WAVE_SIZE=%d", (int)wave_size);
-        openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
-                             localThreads, args, -1, -1, opt);
-    }
-}
-
-void cv::ocl::device::hog::classify_hists(int win_height, int win_width,
-                                          int block_stride_y, int block_stride_x,
-                                          int win_stride_y, int win_stride_x,
-                                          int height, int width,
-                                          const cv::ocl::oclMat &block_hists,
-                                          const cv::ocl::oclMat &coefs,
-                                          float free_coef, float threshold,
-                                          cv::ocl::oclMat &labels)
-{
-    Context *clCxt = Context::getContext();
-    std::vector< std::pair<size_t, const void *> > args;
-
-    int nthreads;
-    String kernelName;
-    switch (cdescr_width)
-    {
-    case 180:
-        nthreads = 180;
-        kernelName = "classify_hists_180_kernel";
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_width));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_height));
-        break;
-    case 252:
-        nthreads = 256;
-        kernelName = "classify_hists_252_kernel";
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_width));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_height));
-        break;
-    default:
-        nthreads = 256;
-        kernelName = "classify_hists_kernel";
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_size));
-        args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_width));
-    }
-
-    int win_block_stride_x = win_stride_x / block_stride_x;
-    int win_block_stride_y = win_stride_y / block_stride_y;
-    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
-    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
-        block_stride_x;
-
-    size_t globalThreads[3] = { img_win_width * nthreads, img_win_height, 1 };
-    size_t localThreads[3] = { nthreads, 1, 1 };
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cblock_hist_size));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_win_width));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_block_width));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&win_block_stride_x));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&win_block_stride_y));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&block_hists.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&coefs.data));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&free_coef));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&threshold));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&labels.data));
-
-    if(hog_device_cpu)
-        openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
-                             localThreads, args, -1, -1, "-D CPU");
-    else
-    {
-        cl_kernel kernel = openCLGetKernelFromSource(clCxt, &objdetect_hog, kernelName);
-        size_t wave_size = queryWaveFrontSize(kernel);
-        char opt[32] = {0};
-        sprintf(opt, "-D WAVE_SIZE=%d", (int)wave_size);
-        openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
-                             localThreads, args, -1, -1, opt);
-    }
-}
-
-void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
-                                                  int block_stride_y, int block_stride_x,
-                                                  int win_stride_y, int win_stride_x,
-                                                  int height, int width,
-                                                  const cv::ocl::oclMat &block_hists,
-                                                  cv::ocl::oclMat &descriptors)
-{
-    Context *clCxt = Context::getContext();
-    String kernelName = "extract_descrs_by_rows_kernel";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    int win_block_stride_x = win_stride_x / block_stride_x;
-    int win_block_stride_y = win_stride_y / block_stride_y;
-    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
-    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
-        block_stride_x;
-    int descriptors_quadstep = descriptors.step >> 2;
-
-    size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
-    size_t localThreads[3] = { NTHREADS, 1, 1 };
-
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cblock_hist_size));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors_quadstep));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_size));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_width));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_block_width));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&win_block_stride_x));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&win_block_stride_y));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&block_hists.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
-
-    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
-        localThreads, args, -1, -1);
-}
-
-void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
-                                                  int block_stride_y, int block_stride_x,
-                                                  int win_stride_y, int win_stride_x,
-                                                  int height, int width,
-                                                  const cv::ocl::oclMat &block_hists,
-                                                  cv::ocl::oclMat &descriptors)
-{
-    Context *clCxt = Context::getContext();
-    String kernelName = "extract_descrs_by_cols_kernel";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    int win_block_stride_x = win_stride_x / block_stride_x;
-    int win_block_stride_y = win_stride_y / block_stride_y;
-    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
-    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
-    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
-        block_stride_x;
-    int descriptors_quadstep = descriptors.step >> 2;
-
-    size_t globalThreads[3] = { img_win_width * NTHREADS, img_win_height, 1 };
-    size_t localThreads[3] = { NTHREADS, 1, 1 };
-
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cblock_hist_size));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors_quadstep));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cdescr_size));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cnblocks_win_x));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cnblocks_win_y));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_block_width));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&win_block_stride_x));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&win_block_stride_y));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&block_hists.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
-
-    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
-        localThreads, args, -1, -1);
-}
-
-void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width,
-                                                  const cv::ocl::oclMat &img,
-                                                  float angle_scale,
-                                                  cv::ocl::oclMat &grad,
-                                                  cv::ocl::oclMat &qangle,
-                                                  bool correct_gamma)
-{
-    Context *clCxt = Context::getContext();
-    String kernelName = "compute_gradients_8UC1_kernel";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    size_t localThreads[3] = { NTHREADS, 1, 1 };
-    size_t globalThreads[3] = { width, height, 1 };
-    char correctGamma = (correct_gamma) ? 1 : 0;
-    int img_step = img.step;
-    int grad_quadstep = grad.step >> 3;
-    int qangle_step = qangle.step >> 1;
-
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&height));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&width));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&grad_quadstep));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&qangle_step));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&img.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&grad.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&qangle.data));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&angle_scale));
-    args.push_back( std::make_pair( sizeof(cl_char), (void *)&correctGamma));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cnbins));
-
-    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
-        localThreads, args, -1, -1);
-}
-
-void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width,
-                                                  const cv::ocl::oclMat &img,
-                                                  float angle_scale,
-                                                  cv::ocl::oclMat &grad,
-                                                  cv::ocl::oclMat &qangle,
-                                                  bool correct_gamma)
-{
-    Context *clCxt = Context::getContext();
-    String kernelName = "compute_gradients_8UC4_kernel";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    size_t localThreads[3] = { NTHREADS, 1, 1 };
-    size_t globalThreads[3] = { width, height, 1 };
-
-    char correctGamma = (correct_gamma) ? 1 : 0;
-    int img_step = img.step >> 2;
-    int grad_quadstep = grad.step >> 3;
-    int qangle_step = qangle.step >> 1;
-
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&height));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&width));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&grad_quadstep));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&qangle_step));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&img.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&grad.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&qangle.data));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&angle_scale));
-    args.push_back( std::make_pair( sizeof(cl_char), (void *)&correctGamma));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cnbins));
-
-    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads,
-        localThreads, args, -1, -1);
-}
diff --git a/modules/ocl/src/hough.cpp b/modules/ocl/src/hough.cpp
deleted file mode 100644
index dca1d8b4c..000000000
--- a/modules/ocl/src/hough.cpp
+++ /dev/null
@@ -1,398 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-#if !defined (HAVE_OPENCL)
-
-void cv::ocl::HoughCircles(const oclMat&, oclMat&, int, float, float, int, int, int, int, int) { throw_nogpu(); }
-void cv::ocl::HoughCircles(const oclMat&, oclMat&, HoughCirclesBuf&, int, float, float, int, int, int, int, int) { throw_nogpu(); }
-void cv::ocl::HoughCirclesDownload(const oclMat&, OutputArray) { throw_nogpu(); }
-
-#else /* !defined (HAVE_OPENCL) */
-
-#define MUL_UP(a, b) ((a)/(b)+1)*(b)
-
-//////////////////////////////////////////////////////////
-// common functions
-
-namespace
-{
-    int buildPointList_gpu(const oclMat& src, oclMat& list)
-    {
-        const int PIXELS_PER_THREAD = 16;
-
-        int totalCount = 0;
-        int err = CL_SUCCESS;
-        cl_mem counter = clCreateBuffer(*(cl_context*)src.clCxt->getOpenCLContextPtr(),
-                                        CL_MEM_COPY_HOST_PTR,
-                                        sizeof(int),
-                                        &totalCount,
-                                        &err);
-        openCLSafeCall(err);
-
-        const size_t blkSizeX = 32;
-        const size_t blkSizeY = 4;
-        size_t localThreads[3] = { blkSizeX, blkSizeY, 1 };
-
-        const int PIXELS_PER_BLOCK = blkSizeX * PIXELS_PER_THREAD;
-        const size_t glbSizeX = src.cols % (PIXELS_PER_BLOCK) == 0 ? src.cols : MUL_UP(src.cols, PIXELS_PER_BLOCK);
-        const size_t glbSizeY = src.rows % blkSizeY == 0 ? src.rows : MUL_UP(src.rows, blkSizeY);
-        size_t globalThreads[3] = { glbSizeX, glbSizeY, 1 };
-
-        std::vector<std::pair<size_t , const void *> > args;
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&src.data ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&src.step ));
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&list.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&counter ));
-
-        // WARNING: disabled until
-        openCLExecuteKernel(src.clCxt, &imgproc_hough, "buildPointList", globalThreads, localThreads, args, -1, -1);
-        openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)src.clCxt->getOpenCLCommandQueuePtr(), counter, CL_TRUE, 0, sizeof(int), &totalCount, 0, NULL, NULL));
-        openCLSafeCall(clReleaseMemObject(counter));
-
-        return totalCount;
-    }
-}
-
-//////////////////////////////////////////////////////////
-// HoughCircles
-
-namespace
-{
-    void circlesAccumCenters_gpu(const oclMat& list, int count, const oclMat& dx, const oclMat& dy, oclMat& accum, int minRadius, int maxRadius, float idp)
-    {
-        const size_t blkSizeX = 256;
-        size_t localThreads[3] = { 256, 1, 1 };
-
-        const size_t glbSizeX = count % blkSizeX == 0 ? count : MUL_UP(count, blkSizeX);
-        size_t globalThreads[3] = { glbSizeX, 1, 1 };
-
-        const int width  = accum.cols - 2;
-        const int height = accum.rows - 2;
-
-        std::vector<std::pair<size_t , const void *> > args;
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&list.data ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&count ));
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&dx.data ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&dx.step ));
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&dy.data ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&dy.step ));
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&accum.data ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&accum.step ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&width ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&height ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&minRadius));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&maxRadius));
-        args.push_back( std::make_pair( sizeof(cl_float), (void *)&idp));
-
-        openCLExecuteKernel(accum.clCxt, &imgproc_hough, "circlesAccumCenters", globalThreads, localThreads, args, -1, -1);
-    }
-
-    int buildCentersList_gpu(const oclMat& accum, oclMat& centers, int threshold)
-    {
-        int totalCount = 0;
-        int err = CL_SUCCESS;
-        cl_mem counter = clCreateBuffer(*(cl_context*)accum.clCxt->getOpenCLContextPtr(),
-                                        CL_MEM_COPY_HOST_PTR,
-                                        sizeof(int),
-                                        &totalCount,
-                                        &err);
-        openCLSafeCall(err);
-
-        const size_t blkSizeX = 32;
-        const size_t blkSizeY = 8;
-        size_t localThreads[3] = { blkSizeX, blkSizeY, 1 };
-
-        const size_t glbSizeX = (accum.cols - 2) % blkSizeX == 0 ? accum.cols - 2 : MUL_UP(accum.cols - 2, blkSizeX);
-        const size_t glbSizeY = (accum.rows - 2) % blkSizeY == 0 ? accum.rows - 2 : MUL_UP(accum.rows - 2, blkSizeY);
-        size_t globalThreads[3] = { glbSizeX, glbSizeY, 1 };
-
-        std::vector<std::pair<size_t , const void *> > args;
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&accum.data ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&accum.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&accum.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&accum.step ));
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&centers.data ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&threshold ));
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&counter ));
-
-        openCLExecuteKernel(accum.clCxt, &imgproc_hough, "buildCentersList", globalThreads, localThreads, args, -1, -1);
-
-        openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)accum.clCxt->getOpenCLCommandQueuePtr(), counter, CL_TRUE, 0, sizeof(int), &totalCount, 0, NULL, NULL));
-        openCLSafeCall(clReleaseMemObject(counter));
-
-        return totalCount;
-    }
-
-    int circlesAccumRadius_gpu(const oclMat& centers, int centersCount,
-                               const oclMat& list, int count,
-                               oclMat& circles, int maxCircles,
-                               float dp, int minRadius, int maxRadius, int threshold)
-    {
-        int totalCount = 0;
-        int err = CL_SUCCESS;
-        cl_mem counter = clCreateBuffer(*(cl_context*)circles.clCxt->getOpenCLContextPtr(),
-                                        CL_MEM_COPY_HOST_PTR,
-                                        sizeof(int),
-                                        &totalCount,
-                                        &err);
-        openCLSafeCall(err);
-
-        const size_t blkSizeX = circles.clCxt->getDeviceInfo().maxWorkGroupSize;
-        size_t localThreads[3] = { blkSizeX, 1, 1 };
-
-        const size_t glbSizeX = centersCount * blkSizeX;
-        size_t globalThreads[3] = { glbSizeX, 1, 1 };
-
-        const int histSize = maxRadius - minRadius + 1;
-        size_t smemSize = (histSize + 2) * sizeof(int);
-
-        std::vector<std::pair<size_t , const void *> > args;
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&centers.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&list.data ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&count ));
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&circles.data ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&maxCircles ));
-        args.push_back( std::make_pair( sizeof(cl_float), (void *)&dp ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&minRadius ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&maxRadius ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&histSize ));
-        args.push_back( std::make_pair( sizeof(cl_int)  , (void *)&threshold ));
-        args.push_back( std::make_pair( smemSize        , (void *)NULL ));
-        args.push_back( std::make_pair( sizeof(cl_mem)  , (void *)&counter ));
-
-        CV_Assert(circles.offset == 0);
-
-        openCLExecuteKernel(circles.clCxt, &imgproc_hough, "circlesAccumRadius", globalThreads, localThreads, args, -1, -1);
-
-        openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)circles.clCxt->getOpenCLCommandQueuePtr(), counter, CL_TRUE, 0, sizeof(int), &totalCount, 0, NULL, NULL));
-
-        openCLSafeCall(clReleaseMemObject(counter));
-
-        totalCount = std::min(totalCount, maxCircles);
-
-        return totalCount;
-    }
-
-
-} // namespace
-
-
-
-void cv::ocl::HoughCircles(const oclMat& src, oclMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles)
-{
-    HoughCirclesBuf buf;
-    HoughCircles(src, circles, buf, method, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius, maxCircles);
-}
-
-void cv::ocl::HoughCircles(const oclMat& src, oclMat& circles, HoughCirclesBuf& buf, int method,
-                           float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles)
-{
-    CV_Assert(src.type() == CV_8UC1);
-    CV_Assert(src.cols < std::numeric_limits<unsigned short>::max());
-    CV_Assert(src.rows < std::numeric_limits<unsigned short>::max());
-    CV_Assert(method == HOUGH_GRADIENT);
-    CV_Assert(dp > 0);
-    CV_Assert(minRadius > 0 && maxRadius > minRadius);
-    CV_Assert(cannyThreshold > 0);
-    CV_Assert(votesThreshold > 0);
-    CV_Assert(maxCircles > 0);
-
-    const float idp = 1.0f / dp;
-
-    cv::ocl::Canny(src, buf.cannyBuf, buf.edges, std::max(cannyThreshold / 2, 1), cannyThreshold);
-
-    ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf.srcPoints);
-    const int pointsCount = buildPointList_gpu(buf.edges, buf.srcPoints);
-    if (pointsCount == 0)
-    {
-        circles.release();
-        return;
-    }
-
-    ensureSizeIsEnough(cvCeil(src.rows * idp) + 2, cvCeil(src.cols * idp) + 2, CV_32SC1, buf.accum);
-    buf.accum.setTo(Scalar::all(0));
-
-    circlesAccumCenters_gpu(buf.srcPoints, pointsCount, buf.cannyBuf.dx, buf.cannyBuf.dy, buf.accum, minRadius, maxRadius, idp);
-
-    ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf.centers);
-    int centersCount = buildCentersList_gpu(buf.accum, buf.centers, votesThreshold);
-    if (centersCount == 0)
-    {
-        circles.release();
-        return;
-    }
-
-    if (minDist > 1)
-    {
-        cv::AutoBuffer<unsigned int> oldBuf_(centersCount);
-        cv::AutoBuffer<unsigned int> newBuf_(centersCount);
-        int newCount = 0;
-
-        unsigned int* oldBuf = oldBuf_;
-        unsigned int* newBuf = newBuf_;
-
-        openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)buf.centers.clCxt->getOpenCLCommandQueuePtr(),
-                                           (cl_mem)buf.centers.data,
-                                           CL_TRUE,
-                                           0,
-                                           centersCount * sizeof(unsigned int),
-                                           oldBuf,
-                                           0,
-                                           NULL,
-                                           NULL));
-
-
-        const int cellSize = cvRound(minDist);
-        const int gridWidth = (src.cols + cellSize - 1) / cellSize;
-        const int gridHeight = (src.rows + cellSize - 1) / cellSize;
-
-        std::vector< std::vector<unsigned int> > grid(gridWidth * gridHeight);
-
-        const float minDist2 = minDist * minDist;
-
-        for (int i = 0; i < centersCount; ++i)
-        {
-            unsigned int p = oldBuf[i];
-            const int px = p & 0xFFFF;
-            const int py = (p >> 16) & 0xFFFF;
-
-            bool good = true;
-
-            int xCell = static_cast<int>(px / cellSize);
-            int yCell = static_cast<int>(py / cellSize);
-
-            int x1 = xCell - 1;
-            int y1 = yCell - 1;
-            int x2 = xCell + 1;
-            int y2 = yCell + 1;
-
-            // boundary check
-            x1 = std::max(0, x1);
-            y1 = std::max(0, y1);
-            x2 = std::min(gridWidth - 1, x2);
-            y2 = std::min(gridHeight - 1, y2);
-
-            for (int yy = y1; yy <= y2; ++yy)
-            {
-                for (int xx = x1; xx <= x2; ++xx)
-                {
-                    std::vector<unsigned int>& m = grid[yy * gridWidth + xx];
-
-                    for(size_t j = 0; j < m.size(); ++j)
-                    {
-                        const int val = m[j];
-                        const int jx = val & 0xFFFF;
-                        const int jy = (val >> 16) & 0xFFFF;
-
-                        float dx = (float)(px - jx);
-                        float dy = (float)(py - jy);
-
-                        if (dx * dx + dy * dy < minDist2)
-                        {
-                            good = false;
-                            goto break_out;
-                        }
-                    }
-                }
-            }
-
-            break_out:
-
-            if(good)
-            {
-                grid[yCell * gridWidth + xCell].push_back(p);
-                newBuf[newCount++] = p;
-            }
-        }
-
-        openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)buf.centers.clCxt->getOpenCLCommandQueuePtr(),
-                                            (cl_mem)buf.centers.data,
-                                            CL_TRUE,
-                                            0,
-                                            newCount * sizeof(unsigned int),
-                                            newBuf,
-                                            0,
-                                            0,
-                                            0));
-        centersCount = newCount;
-    }
-
-    ensureSizeIsEnough(1, maxCircles, CV_32FC3, circles);
-
-    const int circlesCount = circlesAccumRadius_gpu(buf.centers, centersCount,
-                                                           buf.srcPoints, pointsCount,
-                                                           circles, maxCircles,
-                                                           dp, minRadius, maxRadius, votesThreshold);
-
-    if (circlesCount > 0)
-        circles.cols = circlesCount;
-    else
-        circles.release();
-}
-
-void cv::ocl::HoughCirclesDownload(const oclMat& d_circles, cv::OutputArray h_circles_)
-{
-    // FIX ME: garbage values are copied!
-    CV_Error(Error::StsNotImplemented, "HoughCirclesDownload is not implemented");
-
-    if (d_circles.empty())
-    {
-        h_circles_.release();
-        return;
-    }
-
-    CV_Assert(d_circles.rows == 1 && d_circles.type() == CV_32FC3);
-
-    h_circles_.create(1, d_circles.cols, CV_32FC3);
-    Mat h_circles = h_circles_.getMat();
-    d_circles.download(h_circles);
-}
-
-#endif /* !defined (HAVE_OPENCL) */
diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp
deleted file mode 100644
index f730df10f..000000000
--- a/modules/ocl/src/imgproc.cpp
+++ /dev/null
@@ -1,1964 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Shengen Yan, yanshengen@gmail.com
-//    Rock Li, Rock.Li@amd.com
-//    Zero Lin, Zero.Lin@amd.com
-//    Zhang Ying, zhangying913@gmail.com
-//    Xu Pang, pangxu010@163.com
-//    Wu Zailong, bullet@yeah.net
-//    Wenju He, wenju@multicorewareinc.com
-//    Peng Xiao, pengxiao@outlook.com
-//    Sen Liu, swjtuls1987@126.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-namespace cv
-{
-    namespace ocl
-    {
-        ////////////////////////////////////OpenCL call wrappers////////////////////////////
-
-        template <typename T> struct index_and_sizeof;
-        template <> struct index_and_sizeof<char>
-        {
-            enum { index = 1 };
-        };
-        template <> struct index_and_sizeof<unsigned char>
-        {
-            enum { index = 2 };
-        };
-        template <> struct index_and_sizeof<short>
-        {
-            enum { index = 3 };
-        };
-        template <> struct index_and_sizeof<unsigned short>
-        {
-            enum { index = 4 };
-        };
-        template <> struct index_and_sizeof<int>
-        {
-            enum { index = 5 };
-        };
-        template <> struct index_and_sizeof<float>
-        {
-            enum { index = 6 };
-        };
-        template <> struct index_and_sizeof<double>
-        {
-            enum { index = 7 };
-        };
-
-        /////////////////////////////////////////////////////////////////////////////////////
-        // threshold
-
-        static std::vector<uchar> scalarToVector(const cv::Scalar & sc, int depth, int ocn, int cn)
-        {
-            CV_Assert(ocn == cn || (ocn == 4 && cn == 3));
-
-            static const int sizeMap[] = { sizeof(uchar), sizeof(char), sizeof(ushort),
-                                       sizeof(short), sizeof(int), sizeof(float), sizeof(double) };
-
-            int elemSize1 = sizeMap[depth];
-            int bufSize = elemSize1 * ocn;
-            std::vector<uchar> _buf(bufSize);
-            uchar * buf = &_buf[0];
-            scalarToRawData(sc, buf, CV_MAKE_TYPE(depth, cn));
-            memset(buf + elemSize1 * cn, 0, (ocn - cn) * elemSize1);
-
-            return _buf;
-        }
-
-        static void threshold_runner(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType)
-        {
-            bool ival = src.depth() < CV_32F;
-            int cn = src.channels(), vecSize = 4, depth = src.depth();
-            std::vector<uchar> thresholdValue = scalarToVector(cv::Scalar::all(ival ? cvFloor(thresh) : thresh), dst.depth(),
-                                                               dst.oclchannels(), dst.channels());
-            std::vector<uchar> maxValue = scalarToVector(cv::Scalar::all(maxVal), dst.depth(), dst.oclchannels(), dst.channels());
-
-            const char * const thresholdMap[] = { "THRESH_BINARY", "THRESH_BINARY_INV", "THRESH_TRUNC",
-                                                  "THRESH_TOZERO", "THRESH_TOZERO_INV" };
-            const char * const channelMap[] = { "", "", "2", "4", "4" };
-            const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-            std::string buildOptions = format("-D T=%s%s -D %s", typeMap[depth], channelMap[cn], thresholdMap[thresholdType]);
-
-            int elemSize = src.elemSize();
-            int src_step = src.step / elemSize, src_offset = src.offset / elemSize;
-            int dst_step = dst.step / elemSize, dst_offset = dst.offset / elemSize;
-
-            std::vector< std::pair<size_t, const void *> > args;
-            args.push_back( std::make_pair(sizeof(cl_mem), (void *)&src.data));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_offset));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_step));
-            args.push_back( std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_offset));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_step));
-            args.push_back( std::make_pair(thresholdValue.size(), (void *)&thresholdValue[0]));
-            args.push_back( std::make_pair(maxValue.size(), (void *)&maxValue[0]));
-
-            int max_index = dst.cols, cols = dst.cols;
-            if (cn == 1 && vecSize > 1)
-            {
-                CV_Assert(((vecSize - 1) & vecSize) == 0 && vecSize <= 16);
-                cols = divUp(cols, vecSize);
-                buildOptions += format(" -D VECTORIZED -D VT=%s%d -D VLOADN=vload%d -D VECSIZE=%d -D VSTOREN=vstore%d",
-                                       typeMap[depth], vecSize, vecSize, vecSize, vecSize);
-
-                int vecSizeBytes = vecSize * dst.elemSize1();
-                if ((dst.offset % dst.step) % vecSizeBytes == 0 && dst.step % vecSizeBytes == 0)
-                    buildOptions += " -D DST_ALIGNED";
-                if ((src.offset % src.step) % vecSizeBytes == 0 && src.step % vecSizeBytes == 0)
-                    buildOptions += " -D SRC_ALIGNED";
-
-                args.push_back( std::make_pair(sizeof(cl_int), (void *)&max_index));
-            }
-
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&cols));
-
-            size_t localThreads[3] = { 16, 16, 1 };
-            size_t globalThreads[3] = { cols, dst.rows, 1 };
-
-            openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args,
-                                -1, -1, buildOptions.c_str());
-        }
-
-        double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType)
-        {
-            CV_Assert(thresholdType == THRESH_BINARY || thresholdType == THRESH_BINARY_INV || thresholdType == THRESH_TRUNC
-                      || thresholdType == THRESH_TOZERO || thresholdType == THRESH_TOZERO_INV);
-
-            dst.create(src.size(), src.type());
-            threshold_runner(src, dst, thresh, maxVal, thresholdType);
-
-            return thresh;
-        }
-
-        ////////////////////////////////////////////////////////////////////////////////////////////
-        ///////////////////////////////   remap   //////////////////////////////////////////////////
-        ////////////////////////////////////////////////////////////////////////////////////////////
-
-        void remap( const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int borderType, const Scalar &borderValue )
-        {
-            Context *clCxt = src.clCxt;
-            bool supportsDouble = clCxt->supportsFeature(FEATURE_CL_DOUBLE);
-            if (!supportsDouble && src.depth() == CV_64F)
-            {
-                CV_Error(CV_OpenCLDoubleNotSupported, "Selected device does not support double");
-                return;
-            }
-
-            if (map1.empty())
-                map1.swap(map2);
-
-            CV_Assert(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST);
-            CV_Assert((map1.type() == CV_16SC2 && (map2.empty() || (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ) ||
-                      (map1.type() == CV_32FC2 && !map2.data) ||
-                      (map1.type() == CV_32FC1 && map2.type() == CV_32FC1));
-            CV_Assert(!map2.data || map2.size() == map1.size());
-            CV_Assert(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE || borderType == BORDER_WRAP
-                      || borderType == BORDER_REFLECT_101 || borderType == BORDER_REFLECT);
-
-            dst.create(map1.size(), src.type());
-
-            const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-            const char * const channelMap[] = { "", "", "2", "4", "4" };
-            const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
-            const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
-                                   "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
-
-            String kernelName = "remap";
-            if (map1.type() == CV_32FC2 && map2.empty())
-                kernelName += "_32FC2";
-            else if (map1.type() == CV_16SC2)
-            {
-                kernelName += "_16SC2";
-                if (!map2.empty())
-                    kernelName += "_16UC1";
-            }
-            else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
-                kernelName += "_2_32FC1";
-            else
-                CV_Error(Error::StsBadArg, "Unsupported map types");
-
-            int ocn = dst.oclchannels();
-            size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-
-            Mat scalar(1, 1, CV_MAKE_TYPE(dst.depth(), ocn), borderValue);
-            String buildOptions = format("-D %s -D %s -D T=%s%s", interMap[interpolation],
-                                         borderMap[borderType], typeMap[src.depth()], channelMap[ocn]);
-
-            if (interpolation != INTER_NEAREST)
-            {
-                int wdepth = std::max(CV_32F, dst.depth());
-                buildOptions = buildOptions
-                              + format(" -D WT=%s%s -D convertToT=convert_%s%s%s -D convertToWT=convert_%s%s"
-                                       " -D convertToWT2=convert_%s2 -D WT2=%s2",
-                                       typeMap[wdepth], channelMap[ocn],
-                                       typeMap[src.depth()], channelMap[ocn], src.depth() < CV_32F ? "_sat_rte" : "",
-                                       typeMap[wdepth], channelMap[ocn],
-                                       typeMap[wdepth], typeMap[wdepth]);
-            }
-
-            int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize();
-            int map1_step = map1.step / map1.elemSize(), map1_offset = map1.offset / map1.elemSize();
-            int map2_step = map2.step / map2.elemSize(), map2_offset = map2.offset / map2.elemSize();
-            int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
-
-            std::vector< std::pair<size_t, const void *> > args;
-            args.push_back( std::make_pair(sizeof(cl_mem), (void *)&src.data));
-            args.push_back( std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-            args.push_back( std::make_pair(sizeof(cl_mem), (void *)&map1.data));
-            if (!map2.empty())
-                args.push_back( std::make_pair(sizeof(cl_mem), (void *)&map2.data));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_offset));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_offset));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&map1_offset));
-            if (!map2.empty())
-                args.push_back( std::make_pair(sizeof(cl_int), (void *)&map2_offset));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_step));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_step));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&map1_step));
-            if (!map2.empty())
-                args.push_back( std::make_pair(sizeof(cl_int), (void *)&map2_step));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.cols));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.rows));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-            args.push_back( std::make_pair(scalar.elemSize(), (void *)scalar.data));
-
-#ifdef ANDROID
-            openCLExecuteKernel(clCxt, &imgproc_remap, kernelName, globalThreads, NULL, args, -1, -1, buildOptions.c_str());
-#else
-            size_t localThreads[3] = { 256, 1, 1 };
-            openCLExecuteKernel(clCxt, &imgproc_remap, kernelName, globalThreads, localThreads, args, -1, -1, buildOptions.c_str());
-#endif
-        }
-
-        ////////////////////////////////////////////////////////////////////////////////////////////
-        // resize
-
-        static void computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
-                                          float * const alpha_tab, int * const ofs_tab)
-        {
-            int k = 0, dx = 0;
-            for ( ; dx < dsize; dx++)
-            {
-                ofs_tab[dx] = k;
-
-                double fsx1 = dx * scale;
-                double fsx2 = fsx1 + scale;
-                double cellWidth = std::min(scale, ssize - fsx1);
-
-                int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
-
-                sx2 = std::min(sx2, ssize - 1);
-                sx1 = std::min(sx1, sx2);
-
-                if (sx1 - fsx1 > 1e-3)
-                {
-                    map_tab[k] = sx1 - 1;
-                    alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
-                }
-
-                for (int sx = sx1; sx < sx2; sx++)
-                {
-                    map_tab[k] = sx;
-                    alpha_tab[k++] = float(1.0 / cellWidth);
-                }
-
-                if (fsx2 - sx2 > 1e-3)
-                {
-                    map_tab[k] = sx2;
-                    alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
-                }
-            }
-            ofs_tab[dx] = k;
-        }
-
-        static void computeResizeAreaFastTabs(int * dmap_tab, int * smap_tab, int scale, int dcols, int scol)
-        {
-            for (int i = 0; i < dcols; ++i)
-                dmap_tab[i] = scale * i;
-
-            for (int i = 0, size = dcols * scale; i < size; ++i)
-                smap_tab[i] = std::min(scol - 1, i);
-        }
-
-        static void resize_gpu( const oclMat &src, oclMat &dst, double ifx, double ify, int interpolation)
-        {
-            float ifxf = (float)ifx, ifyf = (float)ify;
-            int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize();
-            int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
-            int ocn = dst.oclchannels(), depth = dst.depth();
-
-            const char * const interMap[] = { "NN", "LN", "CUBIC", "AREA", "LAN4" };
-            std::string kernelName = std::string("resize") + interMap[interpolation];
-
-            const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-            const char * const channelMap[] = { "" , "", "2", "4", "4" };
-            std::string buildOption = format("-D %s -D T=%s%s", interMap[interpolation], typeMap[depth], channelMap[ocn]);
-
-            int wdepth = std::max(src.depth(), CV_32F);
-
-            // check if fx, fy is integer and then we have inter area fast mode
-            int iscale_x = saturate_cast<int>(ifx);
-            int iscale_y = saturate_cast<int>(ify);
-
-            bool is_area_fast = std::abs(ifx - iscale_x) < DBL_EPSILON &&
-                std::abs(ify - iscale_y) < DBL_EPSILON;
-            if (is_area_fast)
-                wdepth = std::max(src.depth(), CV_32S);
-
-            if (interpolation != INTER_NEAREST)
-            {
-                buildOption += format(" -D WT=%s -D WTV=%s%s -D convertToWTV=convert_%s%s -D convertToT=convert_%s%s%s",
-                                      typeMap[wdepth], typeMap[wdepth], channelMap[ocn],
-                                      typeMap[wdepth], channelMap[ocn],
-                                      typeMap[src.depth()], channelMap[ocn], src.depth() <= CV_32S ? "_sat_rte" : "");
-            }
-
-#ifdef ANDROID
-            size_t blkSizeX = 16, blkSizeY = 8;
-#else
-            size_t blkSizeX = 16, blkSizeY = 16;
-#endif
-            size_t glbSizeX;
-            if (src.type() == CV_8UC1 && interpolation == INTER_LINEAR)
-            {
-                size_t cols = (dst.cols + dst.offset % 4 + 3) / 4;
-                glbSizeX = cols % blkSizeX == 0 && cols != 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
-            }
-            else
-                glbSizeX = dst.cols;
-
-            oclMat alphaOcl, mapOcl, tabofsOcl;
-            if (interpolation == INTER_AREA)
-            {
-                if (is_area_fast)
-                {
-                    kernelName += "_FAST";
-                    int wdepth2 = std::max(CV_32F, src.depth());
-                    buildOption += format(" -D WT2V=%s%s -D convertToWT2V=convert_%s%s -D AREA_FAST -D XSCALE=%d -D YSCALE=%d -D SCALE=%f",
-                                          typeMap[wdepth2], channelMap[ocn], typeMap[wdepth2], channelMap[ocn],
-                                          iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
-
-                    int smap_tab_size = dst.cols * iscale_x + dst.rows * iscale_y;
-                    AutoBuffer<int> dmap_tab(dst.cols + dst.rows), smap_tab(smap_tab_size);
-                    int * dxmap_tab = dmap_tab, * dymap_tab = dxmap_tab + dst.cols;
-                    int * sxmap_tab = smap_tab, * symap_tab = smap_tab + dst.cols * iscale_y;
-
-                    computeResizeAreaFastTabs(dxmap_tab, sxmap_tab, iscale_x, dst.cols, src.cols);
-                    computeResizeAreaFastTabs(dymap_tab, symap_tab, iscale_y, dst.rows, src.rows);
-
-                    tabofsOcl = oclMat(1, dst.cols + dst.rows, CV_32SC1, (void *)dmap_tab);
-                    mapOcl = oclMat(1, smap_tab_size, CV_32SC1, (void *)smap_tab);
-                }
-                else
-                {
-                    Size ssize = src.size(), dsize = dst.size();
-                    int xytab_size = (ssize.width + ssize.height) << 1;
-                    int tabofs_size = dsize.height + dsize.width + 2;
-
-                    AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
-                    AutoBuffer<float> _xyalpha_tab(xytab_size);
-                    int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
-                    float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
-                    int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
-
-                    computeResizeAreaTabs(ssize.width, dsize.width, ifx, xmap_tab, xalpha_tab, xofs_tab);
-                    computeResizeAreaTabs(ssize.height, dsize.height, ify, ymap_tab, yalpha_tab, yofs_tab);
-
-                    // loading precomputed arrays to GPU
-                    alphaOcl = oclMat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab);
-                    mapOcl = oclMat(1, xytab_size, CV_32SC1, (void *)_xymap_tab);
-                    tabofsOcl = oclMat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab);
-                }
-            }
-
-            size_t globalThreads[3] = { glbSizeX, dst.rows, 1 };
-            size_t localThreads[3] = { blkSizeX, blkSizeY, 1 };
-
-            std::vector< std::pair<size_t, const void *> > args;
-            args.push_back( std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-            args.push_back( std::make_pair(sizeof(cl_mem), (void *)&src.data));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_offset));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_offset));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_step));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_step));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.cols));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.rows));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-
-            if (wdepth == CV_64F)
-            {
-                args.push_back( std::make_pair(sizeof(cl_double), (void *)&ifx));
-                args.push_back( std::make_pair(sizeof(cl_double), (void *)&ify));
-            }
-            else
-            {
-                args.push_back( std::make_pair(sizeof(cl_float), (void *)&ifxf));
-                args.push_back( std::make_pair(sizeof(cl_float), (void *)&ifyf));
-            }
-
-            // precomputed tabs
-            if (!tabofsOcl.empty())
-                args.push_back( std::make_pair(sizeof(cl_mem), (void *)&tabofsOcl.data));
-
-            if (!mapOcl.empty())
-                args.push_back( std::make_pair(sizeof(cl_mem), (void *)&mapOcl.data));
-
-            if (!alphaOcl.empty())
-                args.push_back( std::make_pair(sizeof(cl_mem), (void *)&alphaOcl.data));
-
-            ocn = interpolation == INTER_LINEAR ? ocn : -1;
-            depth = interpolation == INTER_LINEAR ? depth : -1;
-
-            openCLExecuteKernel(src.clCxt, &imgproc_resize, kernelName, globalThreads, localThreads, args,
-                                ocn, depth, buildOption.c_str());
-        }
-
-        void resize(const oclMat &src, oclMat &dst, Size dsize, double fx, double fy, int interpolation)
-        {
-            if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-            {
-                CV_Error(CV_OpenCLDoubleNotSupported, "Selected device does not support double");
-                return;
-            }
-
-            CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3 || src.type() == CV_8UC4
-                      || src.type() == CV_32FC1 || src.type() == CV_32FC3 || src.type() == CV_32FC4);
-            CV_Assert(dsize.area() > 0 || (fx > 0 && fy > 0));
-
-            if (dsize.area() == 0)
-            {
-                dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
-                CV_Assert(dsize.area() > 0);
-            }
-            else
-            {
-                fx = (double)dsize.width / src.cols;
-                fy = (double)dsize.height / src.rows;
-            }
-
-            double inv_fy = 1 / fy, inv_fx = 1 / fx;
-            CV_Assert(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST ||
-                      (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1));
-
-            dst.create(dsize, src.type());
-
-            resize_gpu( src, dst, inv_fx, inv_fy, interpolation);
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // medianFilter
-
-        void medianFilter(const oclMat &src, oclMat &dst, int m)
-        {
-            CV_Assert( m % 2 == 1 && m > 1 );
-            CV_Assert( (src.depth() == CV_8U || src.depth() == CV_32F) && (src.channels() == 1 || src.channels() == 4));
-            dst.create(src.size(), src.type());
-
-            int srcStep = src.step / src.elemSize(), dstStep = dst.step / dst.elemSize();
-            int srcOffset = src.offset /  src.elemSize(), dstOffset = dst.offset / dst.elemSize();
-
-            Context *clCxt = src.clCxt;
-
-            std::vector< std::pair<size_t, const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcOffset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstOffset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcStep));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstStep));
-
-            size_t globalThreads[3] = {(src.cols + 18) / 16 * 16, (src.rows + 15) / 16 * 16, 1};
-            size_t localThreads[3] = {16, 16, 1};
-
-            if (m == 3)
-            {
-                String kernelName = "medianFilter3";
-                openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
-            }
-            else if (m == 5)
-            {
-                String kernelName = "medianFilter5";
-                openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
-            }
-            else
-                CV_Error(Error::StsBadArg, "Non-supported filter length");
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // copyMakeBorder
-
-        void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int bordertype, const Scalar &scalar)
-        {
-            if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-            {
-                CV_Error(Error::OpenCLDoubleNotSupported, "Selected device does not support double");
-                return;
-            }
-
-            oclMat _src = src;
-
-            CV_Assert(top >= 0 && bottom >= 0 && left >= 0 && right >= 0);
-
-            if( (_src.wholecols != _src.cols || _src.wholerows != _src.rows) && (bordertype & BORDER_ISOLATED) == 0 )
-            {
-                Size wholeSize;
-                Point ofs;
-                _src.locateROI(wholeSize, ofs);
-                int dtop = std::min(ofs.y, top);
-                int dbottom = std::min(wholeSize.height - _src.rows - ofs.y, bottom);
-                int dleft = std::min(ofs.x, left);
-                int dright = std::min(wholeSize.width - _src.cols - ofs.x, right);
-                _src.adjustROI(dtop, dbottom, dleft, dright);
-                top -= dtop;
-                left -= dleft;
-                bottom -= dbottom;
-                right -= dright;
-            }
-            bordertype &= ~cv::BORDER_ISOLATED;
-
-            dst.create(_src.rows + top + bottom, _src.cols + left + right, _src.type());
-            int srcStep = _src.step / _src.elemSize(),  dstStep = dst.step / dst.elemSize();
-            int srcOffset = _src.offset / _src.elemSize(), dstOffset = dst.offset / dst.elemSize();
-            int depth = _src.depth(), ochannels = _src.oclchannels();
-
-            int __bordertype[] = { BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_WRAP, BORDER_REFLECT_101 };
-            const char *borderstr[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" };
-
-            int bordertype_index = -1;
-            for (int i = 0, end = sizeof(__bordertype) / sizeof(int); i < end; i++)
-                if (__bordertype[i] == bordertype)
-                {
-                    bordertype_index = i;
-                    break;
-                }
-            if (bordertype_index < 0)
-                CV_Error(Error::StsBadArg, "Unsupported border type");
-
-            size_t localThreads[3] = { 16, 16, 1 };
-            size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-
-            std::vector< std::pair<size_t, const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&_src.data));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&_src.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&_src.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcStep));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcOffset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstStep));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstOffset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&top));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&left));
-
-            const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-            const char * const channelMap[] = { "", "", "2", "4", "4" };
-            std::string buildOptions = format("-D GENTYPE=%s%s -D %s",
-                                              typeMap[depth], channelMap[ochannels],
-                                              borderstr[bordertype_index]);
-
-            int cn = src.channels(), ocn = src.oclchannels();
-            int bufSize = src.elemSize1() * ocn;
-            AutoBuffer<uchar> _buf(bufSize);
-            uchar * buf = (uchar *)_buf;
-            scalarToRawData(scalar, buf, dst.type());
-            memset(buf + src.elemSize1() * cn, 0, (ocn - cn) * src.elemSize1());
-
-            args.push_back( std::make_pair( bufSize , (void *)buf ));
-
-            openCLExecuteKernel(src.clCxt, &imgproc_copymakeboder, "copymakeborder", globalThreads,
-                                localThreads, args, -1, -1, buildOptions.c_str());
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // warp
-
-        namespace
-        {
-#define F double
-
-            void convert_coeffs(F *M)
-            {
-                double D = M[0] * M[4] - M[1] * M[3];
-                D = D != 0 ? 1. / D : 0;
-                double A11 = M[4] * D, A22 = M[0] * D;
-                M[0] = A11;
-                M[1] *= -D;
-                M[3] *= -D;
-                M[4] = A22;
-                double b1 = -M[0] * M[2] - M[1] * M[5];
-                double b2 = -M[3] * M[2] - M[4] * M[5];
-                M[2] = b1;
-                M[5] = b2;
-            }
-
-            double invert(double *M)
-            {
-#define Sd(y,x) (Sd[y*3+x])
-#define Dd(y,x) (Dd[y*3+x])
-#define det3(m)    (m(0,0)*(m(1,1)*m(2,2) - m(1,2)*m(2,1)) -  \
-                    m(0,1)*(m(1,0)*m(2,2) - m(1,2)*m(2,0)) +  \
-                    m(0,2)*(m(1,0)*m(2,1) - m(1,1)*m(2,0)))
-                double *Sd = M;
-                double *Dd = M;
-                double d = det3(Sd);
-                double result = 0;
-                if ( d != 0)
-                {
-                    double t[9];
-                    result = d;
-                    d = 1. / d;
-
-                    t[0] = (Sd(1, 1) * Sd(2, 2) - Sd(1, 2) * Sd(2, 1)) * d;
-                    t[1] = (Sd(0, 2) * Sd(2, 1) - Sd(0, 1) * Sd(2, 2)) * d;
-                    t[2] = (Sd(0, 1) * Sd(1, 2) - Sd(0, 2) * Sd(1, 1)) * d;
-
-                    t[3] = (Sd(1, 2) * Sd(2, 0) - Sd(1, 0) * Sd(2, 2)) * d;
-                    t[4] = (Sd(0, 0) * Sd(2, 2) - Sd(0, 2) * Sd(2, 0)) * d;
-                    t[5] = (Sd(0, 2) * Sd(1, 0) - Sd(0, 0) * Sd(1, 2)) * d;
-
-                    t[6] = (Sd(1, 0) * Sd(2, 1) - Sd(1, 1) * Sd(2, 0)) * d;
-                    t[7] = (Sd(0, 1) * Sd(2, 0) - Sd(0, 0) * Sd(2, 1)) * d;
-                    t[8] = (Sd(0, 0) * Sd(1, 1) - Sd(0, 1) * Sd(1, 0)) * d;
-
-                    Dd(0, 0) = t[0];
-                    Dd(0, 1) = t[1];
-                    Dd(0, 2) = t[2];
-                    Dd(1, 0) = t[3];
-                    Dd(1, 1) = t[4];
-                    Dd(1, 2) = t[5];
-                    Dd(2, 0) = t[6];
-                    Dd(2, 1) = t[7];
-                    Dd(2, 2) = t[8];
-                }
-                return result;
-            }
-
-            void warpAffine_gpu(const oclMat &src, oclMat &dst, F coeffs[2][3], int interpolation)
-            {
-                CV_Assert( (src.oclchannels() == dst.oclchannels()) );
-                int srcStep = src.step1();
-                int dstStep = dst.step1();
-                float float_coeffs[2][3];
-                cl_mem coeffs_cm;
-
-                Context *clCxt = src.clCxt;
-                String s[3] = {"NN", "Linear", "Cubic"};
-                String kernelName = "warpAffine" + s[interpolation];
-
-                if (src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
-                {
-                    cl_int st;
-                    coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st );
-                    openCLVerifyCall(st);
-                    openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0,
-                                                        sizeof(F) * 2 * 3, coeffs, 0, 0, 0));
-                }
-                else
-                {
-                    cl_int st;
-                    for(int m = 0; m < 2; m++)
-                        for(int n = 0; n < 3; n++)
-                            float_coeffs[m][n] = coeffs[m][n];
-
-                    coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st );
-                    openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm,
-                                                        1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0));
-
-                }
-
-                //TODO: improve this kernel
-#ifdef ANDROID
-                size_t blkSizeX = 16, blkSizeY = 4;
-#else
-                size_t blkSizeX = 16, blkSizeY = 16;
-#endif
-                size_t glbSizeX;
-                size_t cols;
-
-                if (src.type() == CV_8UC1 && interpolation != 2)
-                {
-                    cols = (dst.cols + dst.offset % 4 + 3) / 4;
-                    glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
-                }
-                else
-                {
-                    cols = dst.cols;
-                    glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
-                }
-
-                size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
-                size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
-                size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
-
-                std::vector< std::pair<size_t, const void *> > args;
-
-                args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
-                args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&srcStep));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&dstStep));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset));
-                args.push_back(std::make_pair(sizeof(cl_mem), (void *)&coeffs_cm));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&cols));
-
-                openCLExecuteKernel(clCxt, &imgproc_warpAffine, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
-                openCLSafeCall(clReleaseMemObject(coeffs_cm));
-            }
-
-            void warpPerspective_gpu(const oclMat &src, oclMat &dst, double coeffs[3][3], int interpolation)
-            {
-                CV_Assert( (src.oclchannels() == dst.oclchannels()) );
-                int srcStep = src.step1();
-                int dstStep = dst.step1();
-                float float_coeffs[3][3];
-                cl_mem coeffs_cm;
-
-                Context *clCxt = src.clCxt;
-                String s[3] = {"NN", "Linear", "Cubic"};
-                String kernelName = "warpPerspective" + s[interpolation];
-
-                if (src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
-                {
-                    cl_int st;
-                    coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st );
-                    openCLVerifyCall(st);
-                    openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0,
-                                                        sizeof(double) * 3 * 3, coeffs, 0, 0, 0));
-                }
-                else
-                {
-                    cl_int st;
-                    for(int m = 0; m < 3; m++)
-                        for(int n = 0; n < 3; n++)
-                            float_coeffs[m][n] = coeffs[m][n];
-
-                    coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st );
-                    openCLVerifyCall(st);
-                    openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0,
-                                                        sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0));
-                }
-
-                //TODO: improve this kernel
-#ifdef ANDROID
-                size_t blkSizeX = 16, blkSizeY = 8;
-#else
-                size_t blkSizeX = 16, blkSizeY = 16;
-#endif
-                size_t glbSizeX;
-                size_t cols;
-                if (src.type() == CV_8UC1 && interpolation == 0)
-                {
-                    cols = (dst.cols + dst.offset % 4 + 3) / 4;
-                    glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
-                }
-                else
-                {
-                    cols = dst.cols;
-                    glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
-                }
-
-                size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
-                size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
-                size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
-
-                std::vector< std::pair<size_t, const void *> > args;
-
-                args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
-                args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&srcStep));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&dstStep));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset));
-                args.push_back(std::make_pair(sizeof(cl_mem), (void *)&coeffs_cm));
-                args.push_back(std::make_pair(sizeof(cl_int), (void *)&cols));
-
-                openCLExecuteKernel(clCxt, &imgproc_warpPerspective, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
-                openCLSafeCall(clReleaseMemObject(coeffs_cm));
-            }
-        }
-
-        void warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags)
-        {
-            int interpolation = flags & INTER_MAX;
-
-            CV_Assert((src.depth() == CV_8U  || src.depth() == CV_32F) && src.oclchannels() != 2 && src.oclchannels() != 3);
-            CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
-
-            dst.create(dsize, src.type());
-
-            CV_Assert(M.rows == 2 && M.cols == 3);
-
-            int warpInd = (flags & WARP_INVERSE_MAP) >> 4;
-            F coeffs[2][3];
-
-            double coeffsM[2*3];
-            Mat coeffsMat(2, 3, CV_64F, (void *)coeffsM);
-            M.convertTo(coeffsMat, coeffsMat.type());
-            if (!warpInd)
-                convert_coeffs(coeffsM);
-
-            for(int i = 0; i < 2; ++i)
-                for(int j = 0; j < 3; ++j)
-                    coeffs[i][j] = coeffsM[i*3+j];
-
-            warpAffine_gpu(src, dst, coeffs, interpolation);
-        }
-
-        void warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags)
-        {
-            int interpolation = flags & INTER_MAX;
-
-            CV_Assert((src.depth() == CV_8U  || src.depth() == CV_32F) && src.oclchannels() != 2 && src.oclchannels() != 3);
-            CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
-
-            dst.create(dsize, src.type());
-
-
-            CV_Assert(M.rows == 3 && M.cols == 3);
-
-            int warpInd = (flags & WARP_INVERSE_MAP) >> 4;
-            double coeffs[3][3];
-
-            double coeffsM[3*3];
-            Mat coeffsMat(3, 3, CV_64F, (void *)coeffsM);
-            M.convertTo(coeffsMat, coeffsMat.type());
-            if (!warpInd)
-                invert(coeffsM);
-
-            for(int i = 0; i < 3; ++i)
-                for(int j = 0; j < 3; ++j)
-                    coeffs[i][j] = coeffsM[i*3+j];
-
-            warpPerspective_gpu(src, dst, coeffs, interpolation);
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // integral
-
-        void integral(const oclMat &src, oclMat &sum, oclMat &sqsum, int sdepth)
-        {
-            CV_Assert(src.type() == CV_8UC1);
-            if (!src.clCxt->supportsFeature(ocl::FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-            {
-                CV_Error(Error::OpenCLDoubleNotSupported, "Select device doesn't support double");
-                return;
-            }
-
-            if( sdepth <= 0 )
-                sdepth = CV_32S;
-            sdepth = CV_MAT_DEPTH(sdepth);
-            int type = CV_MAKE_TYPE(sdepth, 1);
-
-            int vlen = 4;
-            int offset = src.offset / vlen;
-            int pre_invalid = src.offset % vlen;
-            int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
-
-            oclMat t_sum , t_sqsum;
-            int w = src.cols + 1, h = src.rows + 1;
-
-            char build_option[250];
-            if(Context::getContext()->supportsFeature(ocl::FEATURE_CL_DOUBLE))
-            {
-                t_sqsum.create(src.cols, src.rows, CV_64FC1);
-                sqsum.create(h, w, CV_64FC1);
-                sprintf(build_option, "-D TYPE=double -D TYPE4=double4 -D convert_TYPE4=convert_double4");
-            }
-            else
-            {
-                t_sqsum.create(src.cols, src.rows, CV_32FC1);
-                sqsum.create(h, w, CV_32FC1);
-                sprintf(build_option, "-D TYPE=float -D TYPE4=float4 -D convert_TYPE4=convert_float4");
-            }
-
-            t_sum.create(src.cols, src.rows, type);
-            sum.create(h, w, type);
-
-            int sum_offset = sum.offset / sum.elemSize();
-            int sqsum_offset = sqsum.offset / sqsum.elemSize();
-
-            std::vector<std::pair<size_t , const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sqsum.step));
-            size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
-            openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, sdepth, build_option);
-
-            args.clear();
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sqsum.data ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sqsum.step));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum.step));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sqsum.step));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum_offset));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sqsum_offset));
-            size_t gt2[3] = {t_sum.cols  * 32, 1, 1}, lt2[3] = {256, 1, 1};
-            openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, sdepth, build_option);
-        }
-
-        void integral(const oclMat &src, oclMat &sum, int sdepth)
-        {
-            CV_Assert(src.type() == CV_8UC1);
-            int vlen = 4;
-            int offset = src.offset / vlen;
-            int pre_invalid = src.offset % vlen;
-            int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
-
-            if( sdepth <= 0 )
-                sdepth = CV_32S;
-            sdepth = CV_MAT_DEPTH(sdepth);
-            int type = CV_MAKE_TYPE(sdepth, 1);
-
-            oclMat t_sum;
-            int w = src.cols + 1, h = src.rows + 1;
-
-            t_sum.create(src.cols, src.rows, type);
-            sum.create(h, w, type);
-
-            int sum_offset = sum.offset / vlen;
-            std::vector<std::pair<size_t , const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step));
-            size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
-            openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, sdepth);
-
-            args.clear();
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum.data ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum.step));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum_offset));
-            size_t gt2[3] = {t_sum.cols  * 32, 1, 1}, lt2[3] = {256, 1, 1};
-            openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, sdepth);
-        }
-
-        /////////////////////// corner //////////////////////////////
-
-        static void extractCovData(const oclMat &src, oclMat &Dx, oclMat &Dy,
-                            int blockSize, int ksize, int borderType)
-        {
-            CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
-            double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
-            if (ksize < 0)
-                scale *= 2.;
-
-            if (src.depth() == CV_8U)
-            {
-                scale *= 255.;
-                scale = 1. / scale;
-            }
-            else
-                scale = 1. / scale;
-
-            if (ksize > 0)
-            {
-                Context* clCxt = Context::getContext();
-                if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 &&
-                    src.cols % 8 == 0 && src.rows % 8 == 0 &&
-                    ksize==3 &&
-                    (borderType ==cv::BORDER_REFLECT ||
-                     borderType == cv::BORDER_REPLICATE ||
-                     borderType ==cv::BORDER_REFLECT101 ||
-                     borderType ==cv::BORDER_WRAP))
-                {
-                    Dx.create(src.size(), CV_32FC1);
-                    Dy.create(src.size(), CV_32FC1);
-
-                    const unsigned int block_x = 8;
-                    const unsigned int block_y = 8;
-
-                    unsigned int src_pitch = src.step;
-                    unsigned int dst_pitch = Dx.cols;
-
-                    float _scale = scale;
-
-                    std::vector<std::pair<size_t , const void *> > args;
-                    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
-                    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
-                    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data ));
-                    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
-                    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
-                    args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
-                    args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch ));
-                    args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale ));
-                    size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1};
-
-                    String option = "-D BLK_X=8 -D BLK_Y=8";
-                    switch(borderType)
-                    {
-                    case cv::BORDER_REPLICATE:
-                        option += " -D BORDER_REPLICATE";
-                        break;
-                    case cv::BORDER_REFLECT:
-                        option += " -D BORDER_REFLECT";
-                        break;
-                    case cv::BORDER_REFLECT101:
-                        option += " -D BORDER_REFLECT101";
-                        break;
-                    case cv::BORDER_WRAP:
-                        option += " -D BORDER_WRAP";
-                        break;
-                    }
-                    openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() );
-                }
-                else
-                {
-                    Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
-                    Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
-                }
-            }
-            else
-            {
-                Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType);
-                Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType);
-            }
-            CV_Assert(Dx.offset == 0 && Dy.offset == 0);
-        }
-
-        static void corner_ocl(const cv::ocl::ProgramEntry* source, String kernelName, int block_size, float k, oclMat &Dx, oclMat &Dy,
-                        oclMat &dst, int border_type)
-        {
-            char borderType[30];
-            switch (border_type)
-            {
-            case cv::BORDER_CONSTANT:
-                sprintf(borderType, "BORDER_CONSTANT");
-                break;
-            case cv::BORDER_REFLECT101:
-                sprintf(borderType, "BORDER_REFLECT101");
-                break;
-            case cv::BORDER_REFLECT:
-                sprintf(borderType, "BORDER_REFLECT");
-                break;
-            case cv::BORDER_REPLICATE:
-                sprintf(borderType, "BORDER_REPLICATE");
-                break;
-            default:
-                CV_Error(Error::StsBadFlag, "BORDER type is not supported!");
-            }
-
-            std::string buildOptions = format("-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s",
-                    block_size / 2, block_size / 2, block_size, block_size, borderType);
-
-            size_t blockSizeX = 256, blockSizeY = 1;
-            size_t gSize = blockSizeX - block_size / 2 * 2;
-            size_t globalSizeX = (Dx.cols) % gSize == 0 ? Dx.cols / gSize * blockSizeX : (Dx.cols / gSize + 1) * blockSizeX;
-            size_t rows_per_thread = 2;
-            size_t globalSizeY = ((Dx.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ?
-                                 ((Dx.rows + rows_per_thread - 1) / rows_per_thread) :
-                                 (((Dx.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
-
-            size_t gt[3] = { globalSizeX, globalSizeY, 1 };
-            size_t lt[3]  = { blockSizeX, blockSizeY, 1 };
-            std::vector<std::pair<size_t , const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.offset ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.wholerows ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.wholecols ));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&Dx.step));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.offset ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.wholerows ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.wholecols ));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&Dy.step));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.offset));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-            args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.step));
-            args.push_back( std::make_pair( sizeof(cl_float) , (void *)&k));
-
-            openCLExecuteKernel(dst.clCxt, source, kernelName, gt, lt, args, -1, -1, buildOptions.c_str());
-        }
-
-        void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
-                          double k, int borderType)
-        {
-            oclMat dx, dy;
-            cornerHarris_dxdy(src, dst, dx, dy, blockSize, ksize, k, borderType);
-        }
-
-        void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
-                          double k, int borderType)
-        {
-            if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-            {
-                CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-                return;
-            }
-
-            CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE
-                      || borderType == cv::BORDER_REFLECT);
-
-            extractCovData(src, dx, dy, blockSize, ksize, borderType);
-            dst.create(src.size(), CV_32FC1);
-            corner_ocl(&imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
-        }
-
-        void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
-        {
-            oclMat dx, dy;
-            cornerMinEigenVal_dxdy(src, dst, dx, dy, blockSize, ksize, borderType);
-        }
-
-        void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
-        {
-            if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
-            {
-                CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-                return;
-            }
-
-            CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 ||
-                      borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
-
-            extractCovData(src, dx, dy, blockSize, ksize, borderType);
-            dst.create(src.size(), CV_32F);
-
-            corner_ocl(&imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
-        }
-
-        /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
-
-        static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)
-        {
-            CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
-            CV_Assert( !(dst.step & 0x3) );
-
-            //Arrange the NDRange
-            int col = src.cols, row = src.rows;
-            int ltx = 16, lty = 8;
-            if (src.cols % ltx != 0)
-                col = (col / ltx + 1) * ltx;
-            if (src.rows % lty != 0)
-                row = (row / lty + 1) * lty;
-
-            size_t globalThreads[3] = {col, row, 1};
-            size_t localThreads[3]  = {ltx, lty, 1};
-
-            //set args
-            std::vector<std::pair<size_t , const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.step ));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.offset ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.offset ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sp ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sr ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&maxIter ));
-            args.push_back( std::make_pair( sizeof(cl_float) , (void *)&eps ));
-
-            openCLExecuteKernel(src.clCxt, &meanShift, "meanshift_kernel", globalThreads, localThreads, args, -1, -1);
-        }
-
-        void meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr, TermCriteria criteria)
-        {
-            if (src.empty())
-                CV_Error(Error::StsBadArg, "The input image is empty");
-
-            if ( src.depth() != CV_8U || src.oclchannels() != 4 )
-                CV_Error(Error::StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported");
-
-            dst.create( src.size(), CV_8UC4 );
-
-            if ( !(criteria.type & TermCriteria::MAX_ITER) )
-                criteria.maxCount = 5;
-
-            int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
-
-            float eps;
-            if ( !(criteria.type & TermCriteria::EPS) )
-                eps = 1.f;
-            eps = (float)std::max(criteria.epsilon, 0.0);
-
-            meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);
-        }
-
-        static void meanShiftProc_gpu(const oclMat &src, oclMat dstr, oclMat dstsp, int sp, int sr, int maxIter, float eps)
-        {
-            //sanity checks
-            CV_Assert( (src.cols == dstr.cols) && (src.rows == dstr.rows) &&
-                       (src.rows == dstsp.rows) && (src.cols == dstsp.cols));
-            CV_Assert( !(dstsp.step & 0x3) );
-
-            //Arrange the NDRange
-            int col = src.cols, row = src.rows;
-            int ltx = 16, lty = 8;
-            if (src.cols % ltx != 0)
-                col = (col / ltx + 1) * ltx;
-            if (src.rows % lty != 0)
-                row = (row / lty + 1) * lty;
-
-            size_t globalThreads[3] = {col, row, 1};
-            size_t localThreads[3]  = {ltx, lty, 1};
-
-            //set args
-            std::vector<std::pair<size_t , const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dstr.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dstsp.data ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstr.step ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstsp.step ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.offset ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstr.offset ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstsp.offset ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstr.cols ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstr.rows ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sp ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sr ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&maxIter ));
-            args.push_back( std::make_pair( sizeof(cl_float) , (void *)&eps ));
-
-            openCLExecuteKernel(src.clCxt, &meanShift, "meanshiftproc_kernel", globalThreads, localThreads, args, -1, -1);
-        }
-
-        void meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr, TermCriteria criteria)
-        {
-            if (src.empty())
-                CV_Error(Error::StsBadArg, "The input image is empty");
-
-            if ( src.depth() != CV_8U || src.oclchannels() != 4 )
-                CV_Error(Error::StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported");
-
-//            if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
-//            {
-//                CV_Error(Error::OpenCLDoubleNotSupportedNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
-//                return;
-//            }
-
-            dstr.create( src.size(), CV_8UC4 );
-            dstsp.create( src.size(), CV_16SC2 );
-
-            if ( !(criteria.type & TermCriteria::MAX_ITER) )
-                criteria.maxCount = 5;
-
-            int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
-
-            float eps;
-            if ( !(criteria.type & TermCriteria::EPS) )
-                eps = 1.f;
-            eps = (float)std::max(criteria.epsilon, 0.0);
-
-            meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);
-        }
-
-        ///////////////////////////////////////////////////////////////////////////////////////////////////
-        ////////////////////////////////////////////////////hist///////////////////////////////////////////////
-        /////////////////////////////////////////////////////////////////////////////////////////////////////
-
-        namespace histograms
-        {
-            const int PARTIAL_HISTOGRAM256_COUNT = 256;
-            const int HISTOGRAM256_BIN_COUNT = 256;
-        }
-        ///////////////////////////////calcHist/////////////////////////////////////////////////////////////////
-        static void calc_sub_hist(const oclMat &mat_src, const oclMat &mat_sub_hist)
-        {
-            using namespace histograms;
-
-            int depth = mat_src.depth();
-
-            size_t localThreads[3]  = { HISTOGRAM256_BIN_COUNT, 1, 1 };
-            size_t globalThreads[3] = { PARTIAL_HISTOGRAM256_COUNT *localThreads[0], 1, 1};
-
-            int dataWidth = 16;
-            int dataWidth_bits = 4;
-            int mask = dataWidth - 1;
-
-            int cols = mat_src.cols * mat_src.oclchannels();
-            int src_offset = mat_src.offset;
-            int hist_step = mat_sub_hist.step >> 2;
-            int left_col = 0, right_col = 0;
-
-            if (cols >= dataWidth * 2 - 1)
-            {
-                left_col = dataWidth - (src_offset & mask);
-                left_col &= mask;
-                src_offset += left_col;
-                cols -= left_col;
-                right_col = cols & mask;
-                cols -= right_col;
-            }
-            else
-            {
-                left_col = cols;
-                right_col = 0;
-                cols = 0;
-                globalThreads[0] = 0;
-            }
-
-            std::vector<std::pair<size_t , const void *> > args;
-            if (globalThreads[0] != 0)
-            {
-                int tempcols = cols >> dataWidth_bits;
-                int inc_x = globalThreads[0] % tempcols;
-                int inc_y = globalThreads[0] / tempcols;
-                src_offset >>= dataWidth_bits;
-                int src_step = mat_src.step >> dataWidth_bits;
-                int datacount = tempcols * mat_src.rows;
-
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src.data));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset));
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_sub_hist.data));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&datacount));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&tempcols));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&inc_x));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&inc_y));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&hist_step));
-
-                openCLExecuteKernel(mat_src.clCxt, &imgproc_histogram, "calc_sub_hist", globalThreads, localThreads, args, -1, depth);
-            }
-
-            if (left_col != 0 || right_col != 0)
-            {
-                src_offset = mat_src.offset;
-                localThreads[0] = 1;
-                localThreads[1] = 256;
-                globalThreads[0] = left_col + right_col;
-                globalThreads[1] = mat_src.rows;
-
-                args.clear();
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src.data));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.step));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset));
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_sub_hist.data));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&left_col));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.rows));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&hist_step));
-
-                openCLExecuteKernel(mat_src.clCxt, &imgproc_histogram, "calc_sub_hist_border", globalThreads, localThreads, args, -1, depth);
-            }
-        }
-
-        static void merge_sub_hist(const oclMat &sub_hist, oclMat &mat_hist)
-        {
-            using namespace histograms;
-
-            size_t localThreads[3]  = { 256, 1, 1 };
-            size_t globalThreads[3] = { HISTOGRAM256_BIN_COUNT *localThreads[0], 1, 1};
-            int src_step = sub_hist.step >> 2;
-
-            std::vector<std::pair<size_t , const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sub_hist.data));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_hist.data));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step));
-
-            openCLExecuteKernel(sub_hist.clCxt, &imgproc_histogram, "merge_hist", globalThreads, localThreads, args, -1, -1);
-        }
-
-        void calcHist(const oclMat &mat_src, oclMat &mat_hist)
-        {
-            using namespace histograms;
-            CV_Assert(mat_src.type() == CV_8UC1);
-            mat_hist.create(1, 256, CV_32SC1);
-
-            oclMat buf(PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_BIN_COUNT, CV_32SC1);
-            buf.setTo(0);
-
-            calc_sub_hist(mat_src, buf);
-            merge_sub_hist(buf, mat_hist);
-        }
-
-        ///////////////////////////////////equalizeHist/////////////////////////////////////////////////////
-        void equalizeHist(const oclMat &mat_src, oclMat &mat_dst)
-        {
-            mat_dst.create(mat_src.rows, mat_src.cols, CV_8UC1);
-
-            oclMat mat_hist(1, 256, CV_32SC1);
-
-            calcHist(mat_src, mat_hist);
-
-            size_t localThreads[3] = { 256, 1, 1};
-            size_t globalThreads[3] = { 256, 1, 1};
-            oclMat lut(1, 256, CV_8UC1);
-            int total = mat_src.rows * mat_src.cols;
-
-            std::vector<std::pair<size_t , const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&lut.data));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_hist.data));
-            args.push_back( std::make_pair( sizeof(int), (void *)&total));
-
-            openCLExecuteKernel(mat_src.clCxt, &imgproc_histogram, "calLUT", globalThreads, localThreads, args, -1, -1);
-            LUT(mat_src, lut, mat_dst);
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // CLAHE
-        namespace clahe
-        {
-            static void calcLut(const oclMat &src, oclMat &dst,
-                const int tilesX, const int tilesY, const cv::Size tileSize,
-                const int clipLimit, const float lutScale)
-            {
-                cl_int2 tile_size;
-                tile_size.s[0] = tileSize.width;
-                tile_size.s[1] = tileSize.height;
-
-                std::vector<std::pair<size_t , const void *> > args;
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&clipLimit ));
-                args.push_back( std::make_pair( sizeof(cl_float), (void *)&lutScale ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
-
-                String kernelName = "calcLut";
-                size_t localThreads[3]  = { 32, 8, 1 };
-                size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
-                bool is_cpu = isCpuDevice();
-                if (is_cpu)
-                    openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)"-D CPU");
-                else
-                {
-                    cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName);
-                    int wave_size = (int)queryWaveFrontSize(kernel);
-                    openCLSafeCall(clReleaseKernel(kernel));
-
-                    std::string opt = format("-D WAVE_SIZE=%d", wave_size);
-                    openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, opt.c_str());
-                }
-            }
-
-            static void transform(const oclMat &src, oclMat &dst, const oclMat &lut,
-                const int tilesX, const int tilesY, const Size & tileSize)
-            {
-                cl_int2 tile_size;
-                tile_size.s[0] = tileSize.width;
-                tile_size.s[1] = tileSize.height;
-
-                std::vector<std::pair<size_t , const void *> > args;
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&lut.data ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut.step ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
-                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesY ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut.offset ));
-
-                size_t localThreads[3]  = { 32, 8, 1 };
-                size_t globalThreads[3] = { src.cols, src.rows, 1 };
-
-                openCLExecuteKernel(Context::getContext(), &imgproc_clahe, "transform", globalThreads, localThreads, args, -1, -1);
-            }
-        }
-
-        namespace
-        {
-            class CLAHE_Impl : public cv::CLAHE
-            {
-            public:
-                CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
-
-                cv::AlgorithmInfo* info() const;
-
-                void apply(cv::InputArray src, cv::OutputArray dst);
-
-                void setClipLimit(double clipLimit);
-                double getClipLimit() const;
-
-                void setTilesGridSize(cv::Size tileGridSize);
-                cv::Size getTilesGridSize() const;
-
-                void collectGarbage();
-
-            private:
-                double clipLimit_;
-                int tilesX_;
-                int tilesY_;
-
-                oclMat srcExt_;
-                oclMat lut_;
-            };
-
-            CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
-                clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
-            {
-            }
-
-            CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE_OCL",
-                obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
-                obj.info()->addParam(obj, "tilesX", obj.tilesX_);
-                obj.info()->addParam(obj, "tilesY", obj.tilesY_))
-
-            void CLAHE_Impl::apply(cv::InputArray src_raw, cv::OutputArray dst_raw)
-            {
-                oclMat& src = getOclMatRef(src_raw);
-                oclMat& dst = getOclMatRef(dst_raw);
-                CV_Assert( src.type() == CV_8UC1 );
-
-                dst.create( src.size(), src.type() );
-
-                const int histSize = 256;
-
-                ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_);
-
-                cv::Size tileSize;
-                oclMat srcForLut;
-
-                if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
-                {
-                    tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
-                    srcForLut = src;
-                }
-                else
-                {
-                    ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0,
-                                            tilesX_ - (src.cols % tilesX_), BORDER_REFLECT_101, Scalar::all(0));
-
-                    tileSize = Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
-                    srcForLut = srcExt_;
-                }
-
-                const int tileSizeTotal = tileSize.area();
-                const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
-
-                int clipLimit = 0;
-                if (clipLimit_ > 0.0)
-                {
-                    clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
-                    clipLimit = std::max(clipLimit, 1);
-                }
-
-                clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale);
-                clahe::transform(src, dst, lut_, tilesX_, tilesY_, tileSize);
-            }
-
-            void CLAHE_Impl::setClipLimit(double clipLimit)
-            {
-                clipLimit_ = clipLimit;
-            }
-
-            double CLAHE_Impl::getClipLimit() const
-            {
-                return clipLimit_;
-            }
-
-            void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
-            {
-                tilesX_ = tileGridSize.width;
-                tilesY_ = tileGridSize.height;
-            }
-
-            cv::Size CLAHE_Impl::getTilesGridSize() const
-            {
-                return cv::Size(tilesX_, tilesY_);
-            }
-
-            void CLAHE_Impl::collectGarbage()
-            {
-                srcExt_.release();
-                lut_.release();
-            }
-        }
-
-        cv::Ptr<cv::CLAHE> createCLAHE(double clipLimit, cv::Size tileGridSize)
-        {
-            return makePtr<CLAHE_Impl>(clipLimit, tileGridSize.width, tileGridSize.height);
-        }
-
-        //////////////////////////////////bilateralFilter////////////////////////////////////////////////////
-
-        static void oclbilateralFilter_8u( const oclMat &src, oclMat &dst, int d,
-                               double sigma_color, double sigma_space,
-                               int borderType )
-        {
-            int cn = src.channels();
-            int i, j, maxk, radius;
-
-            CV_Assert( (src.channels() == 1 || src.channels() == 3) &&
-                       src.type() == dst.type() && src.size() == dst.size() &&
-                       src.data != dst.data );
-
-            if ( sigma_color <= 0 )
-                sigma_color = 1;
-            if ( sigma_space <= 0 )
-                sigma_space = 1;
-
-            double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
-            double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
-
-            if ( d <= 0 )
-                radius = cvRound(sigma_space * 1.5);
-            else
-                radius = d / 2;
-            radius = MAX(radius, 1);
-            d = radius * 2 + 1;
-
-            oclMat temp;
-            copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
-
-            std::vector<float> _color_weight(cn * 256);
-            std::vector<float> _space_weight(d * d);
-            std::vector<int> _space_ofs(d * d);
-            float *color_weight = &_color_weight[0];
-            float *space_weight = &_space_weight[0];
-            int *space_ofs = &_space_ofs[0];
-
-            int dst_step_in_pixel = dst.step / dst.elemSize();
-            int dst_offset_in_pixel = dst.offset / dst.elemSize();
-            int temp_step_in_pixel = temp.step / temp.elemSize();
-
-            // initialize color-related bilateral filter coefficients
-            for( i = 0; i < 256 * cn; i++ )
-                color_weight[i] = (float)std::exp(i * i * gauss_color_coeff);
-
-            // initialize space-related bilateral filter coefficients
-            for( i = -radius, maxk = 0; i <= radius; i++ )
-                for( j = -radius; j <= radius; j++ )
-                {
-                    double r = std::sqrt((double)i * i + (double)j * j);
-                    if ( r > radius )
-                        continue;
-                    space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
-                    space_ofs[maxk++] = (int)(i * temp_step_in_pixel + j);
-                }
-
-            oclMat oclcolor_weight(1, cn * 256, CV_32FC1, color_weight);
-            oclMat oclspace_weight(1, d * d, CV_32FC1, space_weight);
-            oclMat oclspace_ofs(1, d * d, CV_32SC1, space_ofs);
-
-            String kernelName = "bilateral";
-#ifdef ANDROID
-            size_t localThreads[3]  = { 16, 8, 1 };
-#else
-            size_t localThreads[3]  = { 16, 16, 1 };
-#endif
-            size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-
-            if ((dst.type() == CV_8UC1) && ((dst.offset & 3) == 0) && ((dst.cols & 3) == 0))
-            {
-                kernelName = "bilateral2";
-                globalThreads[0] = dst.cols >> 2;
-            }
-
-            std::vector<std::pair<size_t , const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&temp.data ));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&maxk ));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&radius ));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step_in_pixel ));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset_in_pixel ));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&temp_step_in_pixel ));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&temp.rows ));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&temp.cols ));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&oclcolor_weight.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&oclspace_weight.data ));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&oclspace_ofs.data ));
-
-            openCLExecuteKernel(src.clCxt, &imgproc_bilateral, kernelName, globalThreads, localThreads, args, dst.oclchannels(), dst.depth());
-        }
-
-        void bilateralFilter(const oclMat &src, oclMat &dst, int radius, double sigmaclr, double sigmaspc, int borderType)
-        {
-            dst.create( src.size(), src.type() );
-            if ( src.depth() == CV_8U )
-                oclbilateralFilter_8u( src, dst, radius, sigmaclr, sigmaspc, borderType );
-            else
-                CV_Error(Error::StsUnsupportedFormat, "Bilateral filtering is only implemented for CV_8U images");
-        }
-
-    }
-}
-//////////////////////////////////mulSpectrums////////////////////////////////////////////////////
-void cv::ocl::mulSpectrums(const oclMat &a, const oclMat &b, oclMat &c, int /*flags*/, float scale, bool conjB)
-{
-    CV_Assert(a.type() == CV_32FC2);
-    CV_Assert(b.type() == CV_32FC2);
-
-    c.create(a.size(), CV_32FC2);
-
-    size_t lt[3]  = { 16, 16, 1 };
-    size_t gt[3]  = { a.cols, a.rows, 1 };
-
-    String kernelName = conjB ? "mulAndScaleSpectrumsKernel_CONJ":"mulAndScaleSpectrumsKernel";
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&a.data ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&b.data ));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&c.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&a.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&a.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&a.step ));
-
-    Context *clCxt = Context::getContext();
-    openCLExecuteKernel(clCxt, &imgproc_mulAndScaleSpectrums, kernelName, gt, lt, args, -1, -1);
-}
-//////////////////////////////////convolve////////////////////////////////////////////////////
-// ported from CUDA module
-void cv::ocl::ConvolveBuf::create(Size image_size, Size templ_size)
-{
-    result_size = Size(image_size.width - templ_size.width + 1,
-                       image_size.height - templ_size.height + 1);
-
-    block_size = user_block_size;
-    if (user_block_size.width == 0 || user_block_size.height == 0)
-        block_size = estimateBlockSize(result_size, templ_size);
-
-    dft_size.width  = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.)));
-    dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.)));
-
-    // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
-    // see CUDA Toolkit 4.1 CUFFT Library Programming Guide
-    //if (dft_size.width > 8192)
-    dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1.);
-    //if (dft_size.height > 8192)
-    dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1.);
-
-    // To avoid wasting time doing small DFTs
-    dft_size.width = std::max(dft_size.width, 512);
-    dft_size.height = std::max(dft_size.height, 512);
-
-    image_block.create(dft_size, CV_32F);
-    templ_block.create(dft_size, CV_32F);
-    result_data.create(dft_size, CV_32F);
-
-    //spect_len = dft_size.height * (dft_size.width / 2 + 1);
-    image_spect.create(dft_size.height, dft_size.width / 2 + 1, CV_32FC2);
-    templ_spect.create(dft_size.height, dft_size.width / 2 + 1, CV_32FC2);
-    result_spect.create(dft_size.height, dft_size.width / 2 + 1, CV_32FC2);
-
-    // Use maximum result matrix block size for the estimated DFT block size
-    block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
-    block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);
-}
-
-Size cv::ocl::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/)
-{
-    int width = (result_size.width + 2) / 3;
-    int height = (result_size.height + 2) / 3;
-    width = std::min(width, result_size.width);
-    height = std::min(height, result_size.height);
-    return Size(width, height);
-}
-
-static void convolve_run_fft(const oclMat &image, const oclMat &templ, oclMat &result, bool ccorr, ConvolveBuf& buf)
-{
-#if defined HAVE_CLAMDFFT
-    CV_Assert(image.type() == CV_32F);
-    CV_Assert(templ.type() == CV_32F);
-
-    buf.create(image.size(), templ.size());
-    result.create(buf.result_size, CV_32F);
-
-    Size& block_size = buf.block_size;
-    Size& dft_size = buf.dft_size;
-
-    oclMat& image_block = buf.image_block;
-    oclMat& templ_block = buf.templ_block;
-    oclMat& result_data = buf.result_data;
-
-    oclMat& image_spect = buf.image_spect;
-    oclMat& templ_spect = buf.templ_spect;
-    oclMat& result_spect = buf.result_spect;
-
-    oclMat templ_roi = templ;
-    copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
-                   templ_block.cols - templ_roi.cols, 0, Scalar());
-
-    cv::ocl::dft(templ_block, templ_spect, dft_size);
-
-    // Process all blocks of the result matrix
-    for (int y = 0; y < result.rows; y += block_size.height)
-    {
-        for (int x = 0; x < result.cols; x += block_size.width)
-        {
-            Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
-                                std::min(y + dft_size.height, image.rows) - y);
-            Rect roi0(x, y, image_roi_size.width, image_roi_size.height);
-
-            oclMat image_roi(image, roi0);
-
-            copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
-                           0, image_block.cols - image_roi.cols, 0, Scalar());
-
-            cv::ocl::dft(image_block, image_spect, dft_size);
-
-            mulSpectrums(image_spect, templ_spect, result_spect, 0,
-                                 1.f / dft_size.area(), ccorr);
-
-            cv::ocl::dft(result_spect, result_data, dft_size, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT);
-
-            Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
-                                 std::min(y + block_size.height, result.rows) - y);
-
-            Rect roi1(x, y, result_roi_size.width, result_roi_size.height);
-            Rect roi2(0, 0, result_roi_size.width, result_roi_size.height);
-
-            oclMat result_roi(result, roi1);
-            oclMat result_block(result_data, roi2);
-
-            result_block.copyTo(result_roi);
-        }
-    }
-
-#else
-    CV_Error(Error::OpenCLNoAMDBlasFft, "OpenCL DFT is not implemented");
-#define UNUSED(x) (void)(x);
-    UNUSED(image) UNUSED(templ) UNUSED(result) UNUSED(ccorr) UNUSED(buf)
-#undef UNUSED
-#endif
-}
-
-static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, String kernelName, const cv::ocl::ProgramEntry* source)
-{
-    CV_Assert(src.depth() == CV_32FC1);
-    CV_Assert(temp1.depth() == CV_32F);
-    CV_Assert(temp1.cols <= 17 && temp1.rows <= 17);
-
-    dst.create(src.size(), src.type());
-
-    CV_Assert(src.cols == dst.cols && src.rows == dst.rows);
-    CV_Assert(src.type() == dst.type());
-
-    size_t localThreads[3]  = { 16, 16, 1 };
-    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-
-    int src_step = src.step / src.elemSize(), src_offset = src.offset / src.elemSize();
-    int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
-    int temp1_step = temp1.step / temp1.elemSize(), temp1_offset = temp1.offset / temp1.elemSize();
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&temp1.data ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&temp1_step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&temp1.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&temp1.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&temp1_offset ));
-
-    openCLExecuteKernel(src.clCxt, source, kernelName, globalThreads, localThreads, args, -1, dst.depth());
-}
-
-void cv::ocl::convolve(const oclMat &x, const oclMat &t, oclMat &y, bool ccorr)
-{
-    CV_Assert(x.depth() == CV_32F);
-    CV_Assert(t.depth() == CV_32F);
-    y.create(x.size(), x.type());
-    String kernelName = "convolve";
-    if(t.cols > 17 || t.rows > 17)
-    {
-        ConvolveBuf buf;
-        convolve_run_fft(x, t, y, ccorr, buf);
-    }
-    else
-    {
-        CV_Assert(ccorr == false);
-        convolve_run(x, t, y, kernelName, &imgproc_convolve);
-    }
-}
-void cv::ocl::convolve(const oclMat &image, const oclMat &templ, oclMat &result, bool ccorr, ConvolveBuf& buf)
-{
-    result.create(image.size(), image.type());
-    convolve_run_fft(image, templ, result, ccorr, buf);
-}
diff --git a/modules/ocl/src/interpolate_frames.cpp b/modules/ocl/src/interpolate_frames.cpp
deleted file mode 100644
index 47d6c837a..000000000
--- a/modules/ocl/src/interpolate_frames.cpp
+++ /dev/null
@@ -1,235 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Comuter Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular urpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-namespace cv
-{
-    namespace ocl
-    {
-        namespace interpolate
-        {
-            //The following are ported from NPP_staging.cu
-            // As it is not valid to do pointer offset operations on host for default oclMat's native cl_mem pointer,
-            // we may have to do this on kernel
-            void memsetKernel(float val, oclMat &img, int height, int offset);
-            void normalizeKernel(oclMat &buffer, int height, int factor_offset, int dst_offset);
-            void forwardWarpKernel(const oclMat &src, oclMat &buffer, const oclMat &u, const oclMat &v, const float time_scale,
-                                   int b_offset, int d_offset); // buffer, dst offset
-
-            //OpenCL conversion of nppiStVectorWarp_PSF2x2_32f_C1
-            void vectorWarp(const oclMat &src, const oclMat &u, const oclMat &v,
-                            oclMat &buffer, int buf_offset, float timeScale, int dst_offset);
-            //OpenCL conversion of BlendFrames
-            void blendFrames(const oclMat &frame0, const oclMat &frame1, const oclMat &buffer,
-                             float pos, oclMat &newFrame, cl_mem &, cl_mem &);
-
-            // bind a buffer to an image
-            void bindImgTex(const oclMat &img, cl_mem &tex);
-        }
-    }
-}
-
-void cv::ocl::interpolateFrames(const oclMat &frame0, const oclMat &frame1,
-                                const oclMat &fu, const oclMat &fv,
-                                const oclMat &bu, const oclMat &bv,
-                                float pos, oclMat &newFrame, oclMat &buf)
-{
-    CV_Assert(frame0.type() == CV_32FC1);
-    CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
-    CV_Assert(fu.size() == frame0.size() && fu.type() == frame0.type());
-    CV_Assert(fv.size() == frame0.size() && fv.type() == frame0.type());
-    CV_Assert(bu.size() == frame0.size() && bu.type() == frame0.type());
-    CV_Assert(bv.size() == frame0.size() && bv.type() == frame0.type());
-
-    newFrame.create(frame0.size(), frame0.type());
-
-    buf.create(6 * frame0.rows, frame0.cols, CV_32FC1);
-    buf.setTo(Scalar::all(0));
-
-    size_t step = frame0.step;
-
-    CV_Assert(frame1.step == step && fu.step == step && fv.step == step && bu.step == step && bv.step == step && newFrame.step == step && buf.step == step);
-    cl_mem tex_src0 = 0, tex_src1 = 0;
-
-    // warp flow
-    using namespace interpolate;
-
-    bindImgTex(frame0, tex_src0);
-    bindImgTex(frame1, tex_src1);
-
-    // CUDA Offsets
-    enum
-    {
-        cov0 = 0,
-        cov1,
-        fwdU,
-        fwdV,
-        bwdU,
-        bwdV
-    };
-
-    vectorWarp(fu, fu, fv, buf, cov0, pos,        fwdU);
-    vectorWarp(fv, fu, fv, buf, cov0, pos,        fwdV);
-    vectorWarp(bu, bu, bv, buf, cov1, 1.0f - pos, bwdU);
-    vectorWarp(bv, bu, bv, buf, cov1, 1.0f - pos, bwdU);
-
-    blendFrames(frame0, frame1, buf, pos, newFrame, tex_src0, tex_src1);
-
-    openCLFree(tex_src0);
-    openCLFree(tex_src1);
-}
-
-void interpolate::memsetKernel(float val, oclMat &img, int height, int offset)
-{
-    Context *clCxt = Context::getContext();
-    String kernelName = "memsetKernel";
-    std::vector< std::pair<size_t, const void *> > args;
-    int step = img.step / sizeof(float);
-    offset = step * height * offset;
-
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&val));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&img.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&img.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&height));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&offset));
-
-    size_t globalThreads[3] = {img.cols, height, 1};
-    size_t localThreads[3]  = {16, 16, 1};
-    openCLExecuteKernel(clCxt, &interpolate_frames, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-void interpolate::normalizeKernel(oclMat &buffer, int height, int factor_offset, int dst_offset)
-{
-    Context *clCxt = Context::getContext();
-    String kernelName = "normalizeKernel";
-    std::vector< std::pair<size_t, const void *> > args;
-    int step   = buffer.step / sizeof(float);
-    factor_offset = step * height * factor_offset;
-    dst_offset    = step * height * dst_offset;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buffer.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&buffer.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&height));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&factor_offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset));
-
-    size_t globalThreads[3] = {buffer.cols, height, 1};
-    size_t localThreads[3]  = {16, 16, 1};
-    openCLExecuteKernel(clCxt, &interpolate_frames, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void interpolate::forwardWarpKernel(const oclMat &src, oclMat &buffer, const oclMat &u, const oclMat &v, const float time_scale,
-                                    int b_offset, int d_offset)
-{
-    Context *clCxt = Context::getContext();
-    String kernelName = "forwardWarpKernel";
-    std::vector< std::pair<size_t, const void *> > args;
-    int f_step  = u.step / sizeof(float); // flow step
-    int b_step  = buffer.step / sizeof(float);
-
-    b_offset  = b_step * src.rows * b_offset;
-    d_offset  = b_step * src.rows * d_offset;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buffer.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&u.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&v.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&f_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&b_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&b_offset));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&d_offset));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&time_scale));
-
-    size_t globalThreads[3] = {src.cols, src.rows, 1};
-    size_t localThreads[3]  = {16, 16, 1};
-    openCLExecuteKernel(clCxt, &interpolate_frames, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void interpolate::vectorWarp(const oclMat &src, const oclMat &u, const oclMat &v,
-                             oclMat &buffer, int b_offset, float timeScale, int d_offset)
-{
-    memsetKernel(0, buffer, src.rows, b_offset);
-    forwardWarpKernel(src, buffer, u, v, timeScale, b_offset, d_offset);
-    normalizeKernel(buffer, src.rows, b_offset, d_offset);
-}
-
-void interpolate::blendFrames(const oclMat &frame0, const oclMat &/*frame1*/, const oclMat &buffer, float pos, oclMat &newFrame, cl_mem &tex_src0, cl_mem &tex_src1)
-{
-    int step = buffer.step / sizeof(float);
-
-    Context *clCxt = Context::getContext();
-    String kernelName = "blendFramesKernel";
-    std::vector< std::pair<size_t, const void *> > args;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&tex_src0));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&tex_src1));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buffer.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&newFrame.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&frame0.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&frame0.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&step));
-    args.push_back( std::make_pair( sizeof(cl_float), (void *)&pos));
-
-    size_t globalThreads[3] = {frame0.cols, frame0.rows, 1};
-    size_t localThreads[3]  = {16, 16, 1};
-    openCLExecuteKernel(clCxt, &interpolate_frames, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-void interpolate::bindImgTex(const oclMat &img, cl_mem &texture)
-{
-    if(texture)
-    {
-        openCLFree(texture);
-    }
-    texture = bindTexture(img);
-}
diff --git a/modules/ocl/src/kalman.cpp b/modules/ocl/src/kalman.cpp
deleted file mode 100644
index 5a133a7b1..000000000
--- a/modules/ocl/src/kalman.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//     Jin Ma, jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-KalmanFilter::KalmanFilter()
-{
-
-}
-
-KalmanFilter::KalmanFilter(int dynamParams, int measureParams, int controlParams, int type)
-{
-    init(dynamParams, measureParams, controlParams, type);
-}
-
-void KalmanFilter::init(int DP, int MP, int CP, int type)
-{
-    CV_Assert( DP > 0 && MP > 0 );
-    CV_Assert( type == CV_32F || type == CV_64F );
-    CP = cv::max(CP, 0);
-
-    statePre.create(DP, 1, type);
-    statePre.setTo(Scalar::all(0));
-
-    statePost.create(DP, 1, type);
-    statePost.setTo(Scalar::all(0));
-
-    transitionMatrix.create(DP, DP, type);
-    setIdentity(transitionMatrix, 1);
-
-    processNoiseCov.create(DP, DP, type);
-    setIdentity(processNoiseCov, 1);
-
-    measurementNoiseCov.create(MP, MP, type);
-    setIdentity(measurementNoiseCov, 1);
-
-    measurementMatrix.create(MP, DP, type);
-    measurementMatrix.setTo(Scalar::all(0));
-
-    errorCovPre.create(DP, DP, type);
-    errorCovPre.setTo(Scalar::all(0));
-
-    errorCovPost.create(DP, DP, type);
-    errorCovPost.setTo(Scalar::all(0));
-
-    gain.create(DP, MP, type);
-    gain.setTo(Scalar::all(0));
-
-    if( CP > 0 )
-    {
-        controlMatrix.create(DP, CP, type);
-        controlMatrix.setTo(Scalar::all(0));
-    }
-    else
-        controlMatrix.release();
-
-    temp1.create(DP, DP, type);
-    temp2.create(MP, DP, type);
-    temp3.create(MP, MP, type);
-    temp4.create(MP, DP, type);
-    temp5.create(MP, 1, type);
-}
-
-CV_EXPORTS const oclMat& KalmanFilter::predict(const oclMat& control)
-{
-    gemm(transitionMatrix, statePost, 1, oclMat(), 0, statePre);
-    oclMat temp;
-
-    if(control.data)
-        gemm(controlMatrix, control, 1, statePre, 1, statePre);
-    gemm(transitionMatrix, errorCovPost, 1, oclMat(), 0, temp1);
-    gemm(temp1, transitionMatrix, 1, processNoiseCov, 1, errorCovPre, GEMM_2_T);
-    statePre.copyTo(statePost);
-    return statePre;
-}
-
-CV_EXPORTS const oclMat& KalmanFilter::correct(const oclMat& measurement)
-{
-    CV_Assert(measurement.empty() == false);
-    gemm(measurementMatrix, errorCovPre, 1, oclMat(), 0, temp2);
-    gemm(temp2, measurementMatrix, 1, measurementNoiseCov, 1, temp3, GEMM_2_T);
-    Mat temp;
-    solve(Mat(temp3), Mat(temp2), temp, DECOMP_SVD);
-    temp4.upload(temp);
-    gain = temp4.t();
-    gemm(measurementMatrix, statePre, -1, measurement, 1, temp5);
-    gemm(gain, temp5, 1, statePre, 1, statePost);
-    gemm(gain, temp2, -1, errorCovPre, 1, errorCovPost);
-    return statePost;
-}
diff --git a/modules/ocl/src/kmeans.cpp b/modules/ocl/src/kmeans.cpp
deleted file mode 100644
index 9a5b1931d..000000000
--- a/modules/ocl/src/kmeans.cpp
+++ /dev/null
@@ -1,451 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//     Xiaopeng Fu, fuxiaopeng2222@163.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-static void generateRandomCenter(const std::vector<Vec2f>& box, float* center, RNG& rng)
-{
-    size_t j, dims = box.size();
-    float margin = 1.f/dims;
-    for( j = 0; j < dims; j++ )
-        center[j] = ((float)rng*(1.f+margin*2.f)-margin)*(box[j][1] - box[j][0]) + box[j][0];
-}
-
-// This class is copied from matrix.cpp in core module.
-class KMeansPPDistanceComputer : public ParallelLoopBody
-{
-public:
-    KMeansPPDistanceComputer( float *_tdist2,
-                              const float *_data,
-                              const float *_dist,
-                              int _dims,
-                              size_t _step,
-                              size_t _stepci )
-        : tdist2(_tdist2),
-          data(_data),
-          dist(_dist),
-          dims(_dims),
-          step(_step),
-          stepci(_stepci) { }
-
-    void operator()( const cv::Range& range ) const
-    {
-        const int begin = range.start;
-        const int end = range.end;
-
-        for ( int i = begin; i<end; i++ )
-        {
-            tdist2[i] = std::min(normL2Sqr_(data + step*i, data + stepci, dims), dist[i]);
-        }
-    }
-
-private:
-    KMeansPPDistanceComputer& operator=(const KMeansPPDistanceComputer&); // to quiet MSVC
-
-    float *tdist2;
-    const float *data;
-    const float *dist;
-    const int dims;
-    const size_t step;
-    const size_t stepci;
-};
-/*
-k-means center initialization using the following algorithm:
-Arthur & Vassilvitskii (2007) k-means++: The Advantages of Careful Seeding
-*/
-static void generateCentersPP(const Mat& _data, Mat& _out_centers,
-                              int K, RNG& rng, int trials)
-{
-    int i, j, k, dims = _data.cols, N = _data.rows;
-    const float* data = (float*)_data.data;
-    size_t step = _data.step/sizeof(data[0]);
-    std::vector<int> _centers(K);
-    int* centers = &_centers[0];
-    std::vector<float> _dist(N*3);
-    float* dist = &_dist[0], *tdist = dist + N, *tdist2 = tdist + N;
-    double sum0 = 0;
-
-    centers[0] = (unsigned)rng % N;
-
-    for( i = 0; i < N; i++ )
-    {
-        dist[i] = normL2Sqr_(data + step*i, data + step*centers[0], dims);
-        sum0 += dist[i];
-    }
-
-    for( k = 1; k < K; k++ )
-    {
-        double bestSum = DBL_MAX;
-        int bestCenter = -1;
-
-        for( j = 0; j < trials; j++ )
-        {
-            double p = (double)rng*sum0, s = 0;
-            for( i = 0; i < N-1; i++ )
-                if( (p -= dist[i]) <= 0 )
-                    break;
-            int ci = i;
-
-            parallel_for_(Range(0, N),
-                          KMeansPPDistanceComputer(tdist2, data, dist, dims, step, step*ci));
-            for( i = 0; i < N; i++ )
-            {
-                s += tdist2[i];
-            }
-
-            if( s < bestSum )
-            {
-                bestSum = s;
-                bestCenter = ci;
-                std::swap(tdist, tdist2);
-            }
-        }
-        centers[k] = bestCenter;
-        sum0 = bestSum;
-        std::swap(dist, tdist);
-    }
-
-    for( k = 0; k < K; k++ )
-    {
-        const float* src = data + step*centers[k];
-        float* dst = _out_centers.ptr<float>(k);
-        for( j = 0; j < dims; j++ )
-            dst[j] = src[j];
-    }
-}
-
-void cv::ocl::distanceToCenters(const oclMat &src, const oclMat &centers, Mat &dists, Mat &labels, int distType)
-{
-    CV_Assert(src.cols * src.channels() == centers.cols * centers.channels());
-    CV_Assert(src.depth() == CV_32F && centers.depth() == CV_32F);
-    CV_Assert(distType == NORM_L1 || distType == NORM_L2SQR);
-
-    dists.create(src.rows, 1, CV_32FC1);
-    labels.create(src.rows, 1, CV_32SC1);
-
-    std::stringstream build_opt_ss;
-    build_opt_ss << (distType == NORM_L1 ? "-D L1_DIST" : "-D L2SQR_DIST");
-
-    int src_step = src.step / src.elemSize1();
-    int centers_step = centers.step / centers.elemSize1();
-    int feature_width = centers.cols * centers.oclchannels();
-    int src_offset = src.offset / src.elemSize1();
-    int centers_offset = centers.offset / centers.elemSize1();
-
-    int all_dist_count = src.rows * centers.rows;
-    oclMat all_dist(1, all_dist_count, CV_32FC1);
-
-    std::vector<std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&centers.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&all_dist.data));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&feature_width));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers.rows));
-
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers_offset));
-
-    size_t globalThreads[3] = { all_dist_count, 1, 1 };
-
-    openCLExecuteKernel(Context::getContext(), &kmeans_kernel,
-                        "distanceToCenters", globalThreads, NULL, args, -1, -1, build_opt_ss.str().c_str());
-
-    Mat all_dist_cpu;
-    all_dist.download(all_dist_cpu);
-
-    for (int i = 0; i < src.rows; ++i)
-    {
-        Point p;
-        double minVal;
-
-        Rect roi(i * centers.rows, 0, centers.rows, 1);
-        Mat hdr(all_dist_cpu, roi);
-
-        cv::minMaxLoc(hdr, &minVal, NULL, &p);
-
-        dists.at<float>(i, 0) = static_cast<float>(minVal);
-        labels.at<int>(i, 0) = p.x;
-    }
-}
-
-///////////////////////////////////k - means /////////////////////////////////////////////////////////
-
-double cv::ocl::kmeans(const oclMat &_src, int K, oclMat &_bestLabels,
-                       TermCriteria criteria, int attempts, int flags, oclMat &_centers)
-{
-    const int SPP_TRIALS = 3;
-    bool isrow = _src.rows == 1 && _src.oclchannels() > 1;
-    int N = !isrow ? _src.rows : _src.cols;
-    int dims = (!isrow ? _src.cols : 1) * _src.oclchannels();
-    int type = _src.depth();
-
-    attempts = std::max(attempts, 1);
-    CV_Assert(type == CV_32F && K > 0 );
-    CV_Assert( N >= K );
-
-    Mat _labels;
-    if( flags & KMEANS_USE_INITIAL_LABELS )
-    {
-        CV_Assert( (_bestLabels.cols == 1 || _bestLabels.rows == 1) &&
-                   _bestLabels.cols * _bestLabels.rows == N &&
-                   _bestLabels.type() == CV_32S );
-        _bestLabels.download(_labels);
-    }
-    else
-    {
-        if( !((_bestLabels.cols == 1 || _bestLabels.rows == 1) &&
-                _bestLabels.cols * _bestLabels.rows == N &&
-                _bestLabels.type() == CV_32S &&
-                _bestLabels.isContinuous()))
-            _bestLabels.create(N, 1, CV_32S);
-        _labels.create(_bestLabels.size(), _bestLabels.type());
-    }
-    int* labels = _labels.ptr<int>();
-
-    Mat data;
-    _src.download(data);
-    Mat centers(K, dims, type), old_centers(K, dims, type), temp(1, dims, type);
-    std::vector<int> counters(K);
-    std::vector<Vec2f> _box(dims);
-    Vec2f* box = &_box[0];
-    double best_compactness = DBL_MAX, compactness = 0;
-    RNG& rng = theRNG();
-    int a, iter, i, j, k;
-
-    if( criteria.type & TermCriteria::EPS )
-        criteria.epsilon = std::max(criteria.epsilon, 0.);
-    else
-        criteria.epsilon = FLT_EPSILON;
-    criteria.epsilon *= criteria.epsilon;
-
-    if( criteria.type & TermCriteria::COUNT )
-        criteria.maxCount = std::min(std::max(criteria.maxCount, 2), 100);
-    else
-        criteria.maxCount = 100;
-
-    if( K == 1 )
-    {
-        attempts = 1;
-        criteria.maxCount = 2;
-    }
-
-    const float* sample = data.ptr<float>();
-    for( j = 0; j < dims; j++ )
-        box[j] = Vec2f(sample[j], sample[j]);
-
-    for( i = 1; i < N; i++ )
-    {
-        sample = data.ptr<float>(i);
-        for( j = 0; j < dims; j++ )
-        {
-            float v = sample[j];
-            box[j][0] = std::min(box[j][0], v);
-            box[j][1] = std::max(box[j][1], v);
-        }
-    }
-
-    for( a = 0; a < attempts; a++ )
-    {
-        double max_center_shift = DBL_MAX;
-        for( iter = 0;; )
-        {
-            swap(centers, old_centers);
-
-            if( iter == 0 && (a > 0 || !(flags & KMEANS_USE_INITIAL_LABELS)) )
-            {
-                if( flags & KMEANS_PP_CENTERS )
-                    generateCentersPP(data, centers, K, rng, SPP_TRIALS);
-                else
-                {
-                    for( k = 0; k < K; k++ )
-                        generateRandomCenter(_box, centers.ptr<float>(k), rng);
-                }
-            }
-            else
-            {
-                if( iter == 0 && a == 0 && (flags & KMEANS_USE_INITIAL_LABELS) )
-                {
-                    for( i = 0; i < N; i++ )
-                        CV_Assert( (unsigned)labels[i] < (unsigned)K );
-                }
-
-                // compute centers
-                centers = Scalar(0);
-                for( k = 0; k < K; k++ )
-                    counters[k] = 0;
-
-                for( i = 0; i < N; i++ )
-                {
-                    sample = data.ptr<float>(i);
-                    k = labels[i];
-                    float* center = centers.ptr<float>(k);
-                    j=0;
-#if CV_ENABLE_UNROLLED
-                    for(; j <= dims - 4; j += 4 )
-                    {
-                        float t0 = center[j] + sample[j];
-                        float t1 = center[j+1] + sample[j+1];
-
-                        center[j] = t0;
-                        center[j+1] = t1;
-
-                        t0 = center[j+2] + sample[j+2];
-                        t1 = center[j+3] + sample[j+3];
-
-                        center[j+2] = t0;
-                        center[j+3] = t1;
-                    }
-#endif
-                    for( ; j < dims; j++ )
-                        center[j] += sample[j];
-                    counters[k]++;
-                }
-
-                if( iter > 0 )
-                    max_center_shift = 0;
-
-                for( k = 0; k < K; k++ )
-                {
-                    if( counters[k] != 0 )
-                        continue;
-
-                    // if some cluster appeared to be empty then:
-                    //   1. find the biggest cluster
-                    //   2. find the farthest from the center point in the biggest cluster
-                    //   3. exclude the farthest point from the biggest cluster and form a new 1-point cluster.
-                    int max_k = 0;
-                    for( int k1 = 1; k1 < K; k1++ )
-                    {
-                        if( counters[max_k] < counters[k1] )
-                            max_k = k1;
-                    }
-
-                    double max_dist = 0;
-                    int farthest_i = -1;
-                    float* new_center =  centers.ptr<float>(k);
-                    float* old_center =  centers.ptr<float>(max_k);
-                    float* _old_center = temp.ptr<float>(); // normalized
-                    float scale = 1.f/counters[max_k];
-                    for( j = 0; j < dims; j++ )
-                        _old_center[j] = old_center[j]*scale;
-
-                    for( i = 0; i < N; i++ )
-                    {
-                        if( labels[i] != max_k )
-                            continue;
-                        sample = data.ptr<float>(i);
-                        double dist = normL2Sqr_(sample, _old_center, dims);
-
-                        if( max_dist <= dist )
-                        {
-                            max_dist = dist;
-                            farthest_i = i;
-                        }
-                    }
-
-                    counters[max_k]--;
-                    counters[k]++;
-                    labels[farthest_i] = k;
-                    sample = data.ptr<float>(farthest_i);
-
-                    for( j = 0; j < dims; j++ )
-                    {
-                        old_center[j] -= sample[j];
-                        new_center[j] += sample[j];
-                    }
-                }
-
-                for( k = 0; k < K; k++ )
-                {
-                    float* center = centers.ptr<float>(k);
-                    CV_Assert( counters[k] != 0 );
-
-                    float scale = 1.f/counters[k];
-                    for( j = 0; j < dims; j++ )
-                        center[j] *= scale;
-
-                    if( iter > 0 )
-                    {
-                        double dist = 0;
-                        const float* old_center = old_centers.ptr<float>(k);
-                        for( j = 0; j < dims; j++ )
-                        {
-                            double t = center[j] - old_center[j];
-                            dist += t*t;
-                        }
-                        max_center_shift = std::max(max_center_shift, dist);
-                    }
-                }
-            }
-
-            if( ++iter == MAX(criteria.maxCount, 2) || max_center_shift <= criteria.epsilon )
-                break;
-
-            // assign labels
-            Mat dists(1, N, CV_64F);
-            _centers.upload(centers);
-            distanceToCenters(_src, _centers, dists, _labels);
-            _bestLabels.upload(_labels);
-
-            float* dist = dists.ptr<float>(0);
-            compactness = 0;
-            for( i = 0; i < N; i++ )
-                compactness += (double)dist[i];
-        }
-
-        if( compactness < best_compactness )
-            best_compactness = compactness;
-    }
-
-    return best_compactness;
-}
diff --git a/modules/ocl/src/knearest.cpp b/modules/ocl/src/knearest.cpp
deleted file mode 100644
index 143e7aa7a..000000000
--- a/modules/ocl/src/knearest.cpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jin Ma, jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-KNearestNeighbour::KNearestNeighbour()
-{
-    clear();
-}
-
-KNearestNeighbour::~KNearestNeighbour()
-{
-    clear();
-    samples_ocl.release();
-}
-
-void KNearestNeighbour::clear()
-{
-    CvKNearest::clear();
-}
-
-bool KNearestNeighbour::train(const Mat& trainData, Mat& labels, Mat& sampleIdx,
-                              bool isRegression, int _max_k, bool updateBase)
-{
-    max_k = _max_k;
-    bool cv_knn_train = CvKNearest::train(trainData, labels, sampleIdx, isRegression, max_k, updateBase);
-
-    CvVectors* s = CvKNearest::samples;
-
-    cv::Mat samples_mat(s->count, CvKNearest::var_count + 1, s->type);
-
-    float* s1 = (float*)(s + 1);
-    for(int i = 0; i < s->count; i++)
-    {
-        float* t1 = s->data.fl[i];
-        for(int j = 0; j < CvKNearest::var_count; j++)
-        {
-            Point pos(j, i);
-            samples_mat.at<float>(pos) = t1[j];
-        }
-
-        Point pos_label(CvKNearest::var_count, i);
-        samples_mat.at<float>(pos_label) = s1[i];
-    }
-
-    samples_ocl = samples_mat;
-    return cv_knn_train;
-}
-
-void KNearestNeighbour::find_nearest(const oclMat& samples, int k, oclMat& lables)
-{
-    CV_Assert(!samples_ocl.empty());
-    lables.create(samples.rows, 1, CV_32FC1);
-
-    CV_Assert(samples.cols == CvKNearest::var_count);
-    CV_Assert(samples.type() == CV_32FC1);
-    CV_Assert(k >= 1 && k <= max_k);
-
-    int k1 = KNearest::get_sample_count();
-    k1 = MIN( k1, k );
-
-    String kernel_name = "knn_find_nearest";
-    cl_ulong local_memory_size = (cl_ulong)Context::getContext()->getDeviceInfo().localMemorySize;
-    int nThreads = local_memory_size / (2 * k * 4);
-    if(nThreads >= 256)
-        nThreads = 256;
-
-    int smem_size = nThreads * k * 4 * 2;
-    size_t local_thread[] = {1, nThreads, 1};
-    size_t global_thread[] = {1, samples.rows, 1};
-
-    char build_option[50];
-    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-    {
-        sprintf(build_option, " ");
-    }else
-        sprintf(build_option, "-D DOUBLE_SUPPORT");
-
-    std::vector< std::pair<size_t, const void*> > args;
-
-    int samples_ocl_step = samples_ocl.step/samples_ocl.elemSize();
-    int samples_step = samples.step/samples.elemSize();
-    int lables_step = lables.step/lables.elemSize();
-
-    int _regression = 0;
-    if(CvKNearest::regression)
-        _regression = 1;
-
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&samples.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&k));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&samples_ocl.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples_ocl.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples_ocl_step));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void*)&lables.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&lables_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&_regression));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&k1));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&samples_ocl.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void*)&nThreads));
-    args.push_back(std::make_pair(smem_size, (void*)NULL));
-    openCLExecuteKernel(Context::getContext(), &knearest, kernel_name, global_thread, local_thread, args, -1, -1, build_option);
-}
\ No newline at end of file
diff --git a/modules/ocl/src/match_template.cpp b/modules/ocl/src/match_template.cpp
deleted file mode 100644
index c95ad8421..000000000
--- a/modules/ocl/src/match_template.cpp
+++ /dev/null
@@ -1,570 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-namespace cv
-{
-    namespace ocl
-    {
-        void matchTemplate_SQDIFF(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
-
-        void matchTemplate_SQDIFF_NORMED(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
-
-        void convolve_32F(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
-
-        void matchTemplate_CCORR(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
-
-        void matchTemplate_CCORR_NORMED(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
-
-        void matchTemplate_CCOFF(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
-
-        void matchTemplate_CCOFF_NORMED(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
-
-
-        void matchTemplateNaive_SQDIFF(
-            const oclMat &image, const oclMat &templ, oclMat &result, int cn);
-
-        void matchTemplateNaive_CCORR(
-            const oclMat &image, const oclMat &templ, oclMat &result, int cn);
-
-        void extractFirstChannel_32F(
-            const oclMat &image, oclMat &result);
-
-        // Evaluates optimal template's area threshold. If
-        // template's area is less  than the threshold, we use naive match
-        // template version, otherwise FFT-based (if available)
-        static bool useNaive(int method, int depth, Size size)
-        {
-#ifdef HAVE_CLAMDFFT
-            if (method == TM_SQDIFF && (depth == CV_32F || !Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)))
-            {
-                return true;
-            }
-            else if(method == TM_CCORR || (method == TM_SQDIFF && depth == CV_8U))
-            {
-                return size.height < 18 && size.width < 18;
-            }
-            else
-                return false;
-#else
-#define UNUSED(x) (void)(x);
-            UNUSED(method) UNUSED(depth) UNUSED(size)
-#undef  UNUSED
-            return true;
-#endif
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // SQDIFF
-        void matchTemplate_SQDIFF(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf)
-        {
-            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-            if (useNaive(TM_SQDIFF, image.depth(), templ.size()))
-            {
-                matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
-                return;
-            }
-            else
-            {
-                buf.image_sqsums.resize(1);
-
-                // TODO, add double support for ocl::integral
-                // use CPU integral temporarily
-                Mat sums, sqsums;
-                cv::integral(Mat(image.reshape(1)), sums, sqsums);
-                buf.image_sqsums[0] = sqsums;
-
-                unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
-                matchTemplate_CCORR(image, templ, result, buf);
-
-                //port CUDA's matchTemplatePrepared_SQDIFF_8U
-                Context *clCxt = image.clCxt;
-                String kernelName = "matchTemplate_Prepared_SQDIFF";
-                std::vector< std::pair<size_t, const void *> > args;
-
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
-                args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
-
-                size_t globalThreads[3] = {result.cols, result.rows, 1};
-                size_t localThreads[3]  = {16, 16, 1};
-
-                const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : "";
-                openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt);
-            }
-        }
-
-        void matchTemplate_SQDIFF_NORMED(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
-        {
-            matchTemplate_CCORR(image, templ, result, buf);
-            buf.image_sums.resize(1);
-
-            integral(image.reshape(1), buf.image_sums[0]);
-
-            unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
-
-            Context *clCxt = image.clCxt;
-            String kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
-            std::vector< std::pair<size_t, const void *> > args;
-
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
-            args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
-
-            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {16, 16, 1};
-            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
-        }
-
-        void matchTemplateNaive_SQDIFF(
-            const oclMat &image, const oclMat &templ, oclMat &result, int)
-        {
-            CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
-                      || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
-                     );
-            CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
-            CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
-
-            Context *clCxt = image.clCxt;
-            String kernelName = "matchTemplate_Naive_SQDIFF";
-
-            std::vector< std::pair<size_t, const void *> > args;
-
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&templ.data));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.step));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
-
-            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {16, 16, 1};
-            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // CCORR
-        void convolve_32F(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
-        {
-            ConvolveBuf convolve_buf;
-            convolve_buf.user_block_size = buf.user_block_size;
-            if (image.oclchannels() == 1)
-                convolve(image, templ, result, true, convolve_buf);
-            else
-            {
-                oclMat result_;
-                convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf);
-                extractFirstChannel_32F(result_, result);
-            }
-        }
-
-        void matchTemplate_CCORR(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
-        {
-            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
-            if (useNaive(TM_CCORR, image.depth(), templ.size()))
-            {
-                matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
-                return;
-            }
-            else
-            {
-                if(image.depth() == CV_8U && templ.depth() == CV_8U)
-                {
-                    image.convertTo(buf.imagef, CV_32F);
-                    templ.convertTo(buf.templf, CV_32F);
-                    convolve_32F(buf.imagef, buf.templf, result, buf);
-                }
-                else
-                {
-                    convolve_32F(image, templ, result, buf);
-                }
-            }
-        }
-
-        void matchTemplate_CCORR_NORMED(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
-        {
-            cv::ocl::oclMat temp;
-            matchTemplate_CCORR(image, templ, result, buf);
-            buf.image_sums.resize(1);
-            buf.image_sqsums.resize(1);
-            integral(image.reshape(1), buf.image_sums[0], temp);
-            if(temp.depth() == CV_64F)
-                temp.convertTo(buf.image_sqsums[0], CV_32FC1);
-            else
-                buf.image_sqsums[0] = temp;
-            unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
-
-            Context *clCxt = image.clCxt;
-            String kernelName = "normalizeKernel";
-            std::vector< std::pair<size_t, const void *> > args;
-
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
-            args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
-
-            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {16, 16, 1};
-            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
-        }
-
-        void matchTemplateNaive_CCORR(
-            const oclMat &image, const oclMat &templ, oclMat &result, int)
-        {
-            CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
-                      || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
-                     );
-            CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
-            CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
-
-            Context *clCxt = image.clCxt;
-            String kernelName = "matchTemplate_Naive_CCORR";
-
-            std::vector< std::pair<size_t, const void *> > args;
-
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&templ.data));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.step));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
-
-            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {16, 16, 1};
-            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
-        }
-        //////////////////////////////////////////////////////////////////////
-        // CCOFF
-        void matchTemplate_CCOFF(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
-        {
-            CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
-
-            matchTemplate_CCORR(image, templ, result, buf);
-
-            Context *clCxt = image.clCxt;
-            String kernelName;
-
-            kernelName = "matchTemplate_Prepared_CCOFF";
-            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {16, 16, 1};
-
-            std::vector< std::pair<size_t, const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
-            Vec4f templ_sum = Vec4f::all(0);
-            // to be continued in the following section
-            if(image.oclchannels() == 1)
-            {
-                buf.image_sums.resize(1);
-                integral(image, buf.image_sums[0]);
-
-                templ_sum[0] = (float)sum(templ)[0] / templ.size().area();
-                args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
-                args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
-                args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
-                args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
-            }
-            else
-            {
-
-                split(image, buf.images);
-                templ_sum = sum(templ) / templ.size().area();
-                buf.image_sums.resize(buf.images.size());
-
-
-                for(int i = 0; i < image.oclchannels(); i ++)
-                {
-                    integral(buf.images[i], buf.image_sums[i]);
-                }
-                switch(image.oclchannels())
-                {
-                case 4:
-                    args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
-                    args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
-                    args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
-                    args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
-                    args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
-                    args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
-                    args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
-                    args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
-                    args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
-                    args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
-                    break;
-                default:
-                    CV_Error(Error::StsBadArg, "matchTemplate: unsupported number of channels");
-                    break;
-                }
-            }
-            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
-        }
-
-        void matchTemplate_CCOFF_NORMED(
-            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
-        {
-            image.convertTo(buf.imagef, CV_32F);
-            templ.convertTo(buf.templf, CV_32F);
-
-            matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
-            float scale = 1.f / templ.size().area();
-
-            Context *clCxt = image.clCxt;
-            String kernelName;
-
-            kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
-            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {16, 16, 1};
-
-            std::vector< std::pair<size_t, const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
-            args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale) );
-
-            Vec4f templ_sum   = Vec4f::all(0);
-            Vec4f templ_sqsum = Vec4f::all(0);
-            // to be continued in the following section
-            if(image.oclchannels() == 1)
-            {
-                buf.image_sums.resize(1);
-                buf.image_sqsums.resize(1);
-                cv::ocl::oclMat temp;
-                integral(image, buf.image_sums[0], temp);
-                if(temp.depth() == CV_64F)
-                    temp.convertTo(buf.image_sqsums[0], CV_32FC1);
-                else
-                    buf.image_sqsums[0] = temp;
-
-                templ_sum[0]   = (float)sum(templ)[0];
-
-                templ_sqsum[0] = sqrSum(templ)[0];
-
-                templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0];
-                templ_sum[0]   *= scale;
-
-                args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
-                args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
-                args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
-                args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
-                args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
-                args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
-                args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
-                args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) );
-            }
-            else
-            {
-
-                split(image, buf.images);
-                templ_sum   = sum(templ);
-
-                templ_sqsum = sqrSum(templ);
-
-                templ_sqsum -= scale * templ_sum * templ_sum;
-
-                float templ_sqsum_sum = 0;
-                for(int i = 0; i < image.oclchannels(); i ++)
-                {
-                    templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
-                }
-                templ_sum   *= scale;
-                buf.image_sums.resize(buf.images.size());
-                buf.image_sqsums.resize(buf.images.size());
-                cv::ocl::oclMat temp;
-                for(int i = 0; i < image.oclchannels(); i ++)
-                {
-                    integral(buf.images[i], buf.image_sums[i], temp);
-                    if(temp.depth() == CV_64F)
-                        temp.convertTo(buf.image_sqsums[i], CV_32FC1);
-                    else
-                        buf.image_sqsums[i] = temp;
-                }
-
-                switch(image.oclchannels())
-                {
-                case 4:
-                    args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
-                    args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
-                    args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
-                    args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
-                    args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
-                    args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
-                    args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
-                    args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
-                    args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
-                    args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
-                    args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
-                    args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
-                    args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
-                    args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
-                    args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
-                    args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
-                    args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sqsum_sum) );
-                    break;
-                default:
-                    CV_Error(Error::StsBadArg, "matchTemplate: unsupported number of channels");
-                    break;
-                }
-            }
-            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
-        }
-        void extractFirstChannel_32F(const oclMat &image, oclMat &result)
-        {
-            Context *clCxt = image.clCxt;
-            String kernelName;
-
-            kernelName = "extractFirstChannel";
-            size_t globalThreads[3] = {result.cols, result.rows, 1};
-            size_t localThreads[3]  = {16, 16, 1};
-
-            std::vector< std::pair<size_t, const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data) );
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
-
-            openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1);
-        }
-    }/*ocl*/
-} /*cv*/
-
-void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method)
-{
-    MatchTemplateBuf buf;
-    matchTemplate(image, templ, result, method, buf);
-}
-void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf)
-{
-    CV_Assert(image.type() == templ.type());
-    CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
-
-    typedef void (*Caller)(const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &);
-
-    const Caller callers[] =
-    {
-        ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED,
-        ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED,
-        ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
-    };
-
-    Caller caller = callers[method];
-    CV_Assert(caller);
-    caller(image, templ, result, buf);
-}
diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp
deleted file mode 100644
index c028fb729..000000000
--- a/modules/ocl/src/matrix_operations.cpp
+++ /dev/null
@@ -1,632 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Yao Wang, bitwangyaoyao@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-#define ALIGN 32
-#define GPU_MATRIX_MALLOC_STEP(step) (((step) + ALIGN - 1) / ALIGN) * ALIGN
-
-// helper routines
-namespace cv
-{
-    namespace ocl
-    {
-        extern DevMemType gDeviceMemType;
-        extern DevMemRW gDeviceMemRW;
-    }
-}
-
-////////////////////////////////////////////////////////////////////////
-// convert_C3C4
-
-static void convert_C3C4(const cl_mem &src, oclMat &dst)
-{
-    Context *clCxt = dst.clCxt;
-    int pixel_end = dst.wholecols * dst.wholerows - 1;
-    int dstStep_in_pixel = dst.step1() / dst.oclchannels();
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[dst.depth()]);
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.wholecols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.wholerows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&pixel_end));
-
-    size_t globalThreads[3] = { divUp(dst.wholecols * dst.wholerows, 4), 1, 1 };
-
-#ifdef ANDROID
-    openCLExecuteKernel(clCxt, &convertC3C4, "convertC3C4", globalThreads, NULL,
-                        args, -1, -1, buildOptions.c_str());
-#else
-    size_t localThreads[3] = { 256, 1, 1 };
-    openCLExecuteKernel(clCxt, &convertC3C4, "convertC3C4", globalThreads, localThreads,
-                        args, -1, -1, buildOptions.c_str());
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////
-// convert_C4C3
-
-static void convert_C4C3(const oclMat &src, cl_mem &dst)
-{
-    int srcStep_in_pixel = src.step1() / src.oclchannels();
-    int pixel_end = src.wholecols * src.wholerows - 1;
-    Context *clCxt = src.clCxt;
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[src.depth()]);
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.wholecols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.wholerows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&pixel_end));
-
-    size_t globalThreads[3] = { divUp(src.wholecols * src.wholerows, 4), 1, 1};
-
-#ifdef ANDROID
-    openCLExecuteKernel(clCxt, &convertC3C4, "convertC4C3", globalThreads, NULL, args, -1, -1, buildOptions.c_str());
-#else
-    size_t localThreads[3] = { 256, 1, 1};
-    openCLExecuteKernel(clCxt, &convertC3C4, "convertC4C3", globalThreads, localThreads, args, -1, -1, buildOptions.c_str());
-#endif
-}
-
-void cv::ocl::oclMat::upload(const Mat &m)
-{
-    if (!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && m.depth() == CV_64F)
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    CV_DbgAssert(!m.empty());
-    Size wholeSize;
-    Point ofs;
-    m.locateROI(wholeSize, ofs);
-    create(wholeSize, m.type());
-
-    if (m.channels() == 3)
-    {
-        int pitch = wholeSize.width * 3 * m.elemSize1();
-        int tail_padding = m.elemSize1() * 3072;
-        int err;
-        cl_mem temp = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE,
-                                     (pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
-        openCLVerifyCall(err);
-
-        openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
-        convert_C3C4(temp, *this);
-        openCLSafeCall(clReleaseMemObject(temp));
-    }
-    else
-        openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
-
-    rows = m.rows;
-    cols = m.cols;
-    offset = ofs.y * step + ofs.x * elemSize();
-}
-
-cv::ocl::oclMat::operator cv::_InputArray()
-{
-    return _InputArray(cv::_InputArray::OCL_MAT, this);
-}
-
-cv::ocl::oclMat::operator cv::_OutputArray()
-{
-    return _OutputArray(cv::_InputArray::OCL_MAT, this);
-}
-
-cv::ocl::oclMat& cv::ocl::getOclMatRef(InputArray src)
-{
-    CV_Assert(src.kind() == cv::_InputArray::OCL_MAT);
-    return *(oclMat*)src.getObj();
-}
-
-cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src)
-{
-    CV_Assert(src.kind() == cv::_InputArray::OCL_MAT);
-    return *(oclMat*)src.getObj();
-}
-
-void cv::ocl::oclMat::download(cv::Mat &m) const
-{
-    CV_DbgAssert(!this->empty());
-    m.create(wholerows, wholecols, type());
-
-    if(m.channels() == 3)
-    {
-        int pitch = wholecols * 3 * m.elemSize1();
-        int tail_padding = m.elemSize1() * 3072;
-        int err;
-        cl_mem temp = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE,
-                                     (pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
-        openCLVerifyCall(err);
-
-        convert_C4C3(*this, temp);
-        openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
-        openCLSafeCall(clReleaseMemObject(temp));
-    }
-    else
-    {
-        openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost);
-    }
-
-    Size wholesize;
-    Point ofs;
-    locateROI(wholesize, ofs);
-    m.adjustROI(-ofs.y, ofs.y + rows - wholerows, -ofs.x, ofs.x + cols - wholecols);
-}
-
-///////////////////////////////////////////////////////////////////////////
-////////////////////////////////// CopyTo /////////////////////////////////
-///////////////////////////////////////////////////////////////////////////
-static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask, String kernelName)
-{
-    CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols &&
-                  src.rows == dst.rows && src.cols == dst.cols
-                  && mask.type() == CV_8UC1);
-
-    std::vector<std::pair<size_t , const void *> > args;
-
-    String string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"},
-        {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"},
-        {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
-        {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
-    };
-
-    char compile_option[32];
-    sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
-    size_t localThreads[3] = {16, 16, 1};
-    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-
-    int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
-    int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
-
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset ));
-
-    openCLExecuteKernel(dst.clCxt , &operator_copyToM, kernelName, globalThreads,
-                        localThreads, args, -1, -1, compile_option);
-}
-
-void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
-{
-    if (mask.empty())
-    {
-        CV_DbgAssert(!this->empty());
-        mat.create(size(), type());
-        openCLCopyBuffer2D(clCxt, mat.data, mat.step, mat.offset,
-                           data, step, cols * elemSize(), rows, offset);
-    }
-    else
-    {
-        mat.create(size(), type());
-        copy_to_with_mask(*this, mat, mask, "copy_to_with_mask");
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////
-//////////////////////////////// ConvertTo ////////////////////////////////
-///////////////////////////////////////////////////////////////////////////
-
-static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
-{
-    String kernelName = "convert_to";
-    float alpha_f = alpha, beta_f = beta;
-    int sdepth = src.depth(), ddepth = dst.depth();
-    int sstep1 = (int)src.step1(), dstep1 = (int)dst.step1();
-    int cols1 = src.cols * src.oclchannels();
-
-    char buildOptions[150], convertString[50];
-    const char * typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    sprintf(convertString, "convert_%s_sat_rte", typeMap[ddepth]);
-    sprintf(buildOptions, "-D srcT=%s -D dstT=%s -D convertToDstType=%s", typeMap[sdepth],
-            typeMap[ddepth], CV_32F == ddepth || ddepth == CV_64F ? "" : convertString);
-
-    CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
-    std::vector<std::pair<size_t , const void *> > args;
-
-    size_t localThreads[3] = { 16, 16, 1 };
-    size_t globalThreads[3] = { divUp(cols1, localThreads[0]) * localThreads[0],
-                                divUp(dst.rows, localThreads[1]) * localThreads[1], 1 };
-
-    int doffset1 = dst.offset / dst.elemSize1();
-    int soffset1 = src.offset / src.elemSize1();
-
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols1 ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sstep1 ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&soffset1 ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstep1 ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&doffset1 ));
-    args.push_back( std::make_pair( sizeof(cl_float) , (void *)&alpha_f ));
-    args.push_back( std::make_pair( sizeof(cl_float) , (void *)&beta_f ));
-
-    openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
-                        localThreads, args, -1, -1, buildOptions);
-}
-
-void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
-{
-    if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) &&
-            (depth() == CV_64F || dst.depth() == CV_64F))
-    {
-        CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-        return;
-    }
-
-    bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
-                   && fabs(beta) < std::numeric_limits<double>::epsilon();
-
-    if( rtype < 0 )
-        rtype = type();
-    else
-        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
-
-    int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
-    if( sdepth == ddepth && noScale )
-    {
-        copyTo(dst);
-        return;
-    }
-
-    oclMat temp;
-    const oclMat *psrc = this;
-    if( sdepth != ddepth && psrc == &dst )
-        psrc = &(temp = *this);
-
-    dst.create( size(), rtype );
-    convert_run(*psrc, dst, alpha, beta);
-}
-
-///////////////////////////////////////////////////////////////////////////
-//////////////////////////////// setTo ////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////
-
-oclMat &cv::ocl::oclMat::operator = (const Scalar &s)
-{
-    setTo(s);
-    return *this;
-}
-
-#ifdef CL_VERSION_1_2
-
-template <typename CLT, typename PT>
-static std::vector<uchar> cvt1(const cv::Scalar & s)
-{
-    std::vector<uchar> _buf(sizeof(CLT));
-    CLT * const buf = reinterpret_cast<CLT *>(&_buf[0]);
-    buf[0] = saturate_cast<PT>(s[0]);
-    return _buf;
-}
-
-template <typename CLT, typename PT>
-static std::vector<uchar> cvt2(const cv::Scalar & s)
-{
-    std::vector<uchar> _buf(sizeof(CLT));
-    CLT * const buf = reinterpret_cast<CLT *>(&_buf[0]);
-    buf->s[0] = saturate_cast<PT>(s[0]);
-    buf->s[1] = saturate_cast<PT>(s[1]);
-    return _buf;
-}
-
-template <typename CLT, typename PT>
-static std::vector<uchar> cvt4(const cv::Scalar & s)
-{
-    std::vector<uchar> _buf(sizeof(CLT));
-    CLT * const buf = reinterpret_cast<CLT *>(&_buf[0]);
-    buf->s[0] = saturate_cast<PT>(s[0]);
-    buf->s[1] = saturate_cast<PT>(s[1]);
-    buf->s[2] = saturate_cast<PT>(s[2]);
-    buf->s[3] = saturate_cast<PT>(s[3]);
-    return _buf;
-}
-
-typedef std::vector<uchar> (*ConvertFunc)(const cv::Scalar & s);
-
-static std::vector<uchar> scalarToCLVector(const cv::Scalar & s, int type)
-{
-    const int depth = CV_MAT_DEPTH(type);
-    const int channels = CV_MAT_CN(type);
-
-    static const ConvertFunc funcs[4][7] =
-    {
-        { cvt1<cl_uchar, uchar>, cvt1<cl_char, char>, cvt1<cl_ushort, ushort>, cvt1<cl_short, short>,
-          cvt1<cl_int, int>, cvt1<cl_float, float>, cvt1<cl_double, double> },
-
-        { cvt2<cl_uchar2, uchar>, cvt2<cl_char2, char>, cvt2<cl_ushort2, ushort>, cvt2<cl_short2, short>,
-          cvt2<cl_int2, int>, cvt2<cl_float2, float>, cvt2<cl_double2, double> },
-
-        { 0, 0, 0, 0, 0, 0, 0 },
-
-        { cvt4<cl_uchar4, uchar>, cvt4<cl_char4, char>, cvt4<cl_ushort4, ushort>, cvt4<cl_short4, short>,
-          cvt4<cl_int4, int>, cvt4<cl_float4, float>, cvt4<cl_double4, double> }
-    };
-
-    ConvertFunc func = funcs[channels - 1][depth];
-    return func(s);
-}
-
-#endif
-
-static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, String kernelName)
-{
-    std::vector<std::pair<size_t , const void *> > args;
-
-    size_t localThreads[3] = {16, 16, 1};
-    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-    int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
-
-    if (dst.type() == CV_8UC1)
-        globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    const char channelMap[] = { ' ', ' ', '2', '4', '4' };
-    std::string buildOptions = format("-D GENTYPE=%s%c", typeMap[dst.depth()], channelMap[dst.channels()]);
-
-    Mat mat(1, 1, dst.type(), scalar);
-
-#ifdef CL_VERSION_1_2
-    // this enables backwards portability to
-    // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
-    if (Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2) && dst.isContinuous())
-    {
-        std::vector<uchar> p = ::scalarToCLVector(scalar, CV_MAKE_TYPE(dst.depth(), dst.oclchannels()));
-        clEnqueueFillBuffer(getClCommandQueue(dst.clCxt),
-                (cl_mem)dst.data, (void*)&p[0], p.size(),
-                0, dst.step * dst.rows, 0, NULL, NULL);
-    }
-    else
-#endif
-    {
-        oclMat m(mat);
-        args.push_back( std::make_pair( sizeof(cl_mem) , (void*)&m.data ));
-        args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
-        args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel ));
-
-        openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
-            localThreads, args, -1, -1, buildOptions.c_str());
-    }
-}
-
-static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, String kernelName)
-{
-    CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols);
-    std::vector<std::pair<size_t , const void *> > args;
-    size_t localThreads[3] = { 16, 16, 1 };
-    size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
-    int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
-
-    const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
-    const char channelMap[] = { ' ', ' ', '2', '4', '4' };
-    std::string buildOptions = format("-D GENTYPE=%s%c", typeMap[dst.depth()], channelMap[dst.channels()]);
-
-    oclMat m(Mat(1, 1, dst.type(), scalar));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&m.data ));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel ));
-    args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
-    args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset ));
-    openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads,
-                        localThreads, args, -1, -1, buildOptions.c_str());
-}
-
-oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask)
-{
-    CV_Assert(mask.type() == CV_8UC1);
-    CV_Assert( this->depth() >= 0 && this->depth() <= 6 );
-    CV_DbgAssert( !this->empty());
-    if (mask.empty())
-    {
-        set_to_withoutmask_run(*this, scalar, type() == CV_8UC1 ?
-                                   "set_to_without_mask_C1_D0" : "set_to_without_mask");
-    }
-    else
-        set_to_withmask_run(*this, scalar, mask, "set_to_with_mask");
-
-    return *this;
-}
-
-oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
-{
-    if( new_rows != 0 && new_rows != rows)
-    {
-        CV_Error( Error::StsBadFunc, "oclMat's number of rows can not be changed for current version" );
-    }
-
-    oclMat hdr = *this;
-
-    int cn = oclchannels();
-    if (new_cn == 0)
-        new_cn = cn;
-
-    int total_width = cols * cn;
-    if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
-        new_rows = rows * total_width / new_cn;
-
-    if (new_rows != 0 && new_rows != rows)
-    {
-        int total_size = total_width * rows;
-
-        if (!isContinuous())
-            CV_Error(Error::BadStep, "The matrix is not continuous, thus its number of rows can not be changed");
-
-        if ((unsigned)new_rows > (unsigned)total_size)
-            CV_Error(Error::StsOutOfRange, "Bad new number of rows");
-
-        total_width = total_size / new_rows;
-        if (total_width * new_rows != total_size)
-            CV_Error(Error::StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");
-
-        hdr.rows = new_rows;
-        hdr.step = total_width * elemSize1();
-    }
-
-    int new_width = total_width / new_cn;
-    if (new_width * new_cn != total_width)
-        CV_Error(Error::BadNumChannels, "The total width is not divisible by the new number of channels");
-
-    hdr.cols = new_width;
-    hdr.wholecols = new_width;
-    hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);
-    return hdr;
-
-}
-
-void cv::ocl::oclMat::createEx(Size size, int type,
-                               DevMemRW rw_type, DevMemType mem_type)
-{
-    createEx(size.height, size.width, type, rw_type, mem_type);
-}
-
-void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
-{
-    createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType);
-}
-
-void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type,
-                               DevMemRW rw_type, DevMemType mem_type)
-{
-    clCxt = Context::getContext();
-    /* core logic */
-    _type &= Mat::TYPE_MASK;
-    if( rows == _rows && cols == _cols && type() == _type && data )
-        return;
-    if( data )
-        release();
-    CV_DbgAssert( _rows >= 0 && _cols >= 0 );
-    if( _rows > 0 && _cols > 0 )
-    {
-        flags = Mat::MAGIC_VAL + _type;
-        rows = _rows;
-        cols = _cols;
-        wholerows = _rows;
-        wholecols = _cols;
-        size_t esz = elemSize();
-
-        void *dev_ptr;
-        openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows, rw_type, mem_type);
-
-        if (esz * cols == step)
-            flags |= Mat::CONTINUOUS_FLAG;
-
-        int64 _nettosize = (int64)step * rows;
-        size_t nettosize = (size_t)_nettosize;
-
-        datastart = data = (uchar *)dev_ptr;
-        dataend = data + nettosize;
-
-        refcount = (int *)fastMalloc(sizeof(*refcount));
-        *refcount = 1;
-    }
-}
-
-void cv::ocl::oclMat::release()
-{
-    if( refcount && CV_XADD(refcount, -1) == 1 )
-    {
-        fastFree(refcount);
-        openCLFree(datastart);
-    }
-    data = datastart = dataend = 0;
-    step = rows = cols = 0;
-    offset = wholerows = wholecols = 0;
-    refcount = 0;
-}
-
-oclMat& cv::ocl::oclMat::operator+=( const oclMat& m )
-{
-    add(*this, m, *this);
-    return *this;
-}
-
-oclMat& cv::ocl::oclMat::operator-=( const oclMat& m )
-{
-    subtract(*this, m, *this);
-    return *this;
-}
-
-oclMat& cv::ocl::oclMat::operator*=( const oclMat& m )
-{
-    multiply(*this, m, *this);
-    return *this;
-}
-
-oclMat& cv::ocl::oclMat::operator/=( const oclMat& m )
-{
-    divide(*this, m, *this);
-    return *this;
-}
diff --git a/modules/ocl/src/mcwutil.cpp b/modules/ocl/src/mcwutil.cpp
deleted file mode 100644
index e5dfdd44a..000000000
--- a/modules/ocl/src/mcwutil.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace std;
-
-namespace cv
-{
-    namespace ocl
-    {
-        // provide additional methods for the user to interact with the command queue after a task is fired
-        static void openCLExecuteKernel_2(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
-                                   size_t localThreads[3],  std::vector< std::pair<size_t, const void *> > &args, int channels,
-                                   int depth, const char *build_options, FLUSH_MODE finish_mode)
-        {
-            //construct kernel name
-            //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
-            //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
-            std::stringstream idxStr;
-            if(channels != -1)
-                idxStr << "_C" << channels;
-            if(depth != -1)
-                idxStr << "_D" << depth;
-            kernelName += idxStr.str().c_str();
-
-            cl_kernel kernel;
-            kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
-
-            if ( localThreads != NULL)
-            {
-                globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
-                globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
-                globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
-
-                //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
-                cv::ocl::openCLVerifyKernel(clCxt, kernel,  localThreads);
-            }
-            for(size_t i = 0; i < args.size(); i ++)
-                openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
-
-            openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), kernel, 3, NULL, globalThreads,
-                                                  localThreads, 0, NULL, NULL));
-
-            switch(finish_mode)
-            {
-            case CLFINISH:
-                clFinish(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr());
-            case CLFLUSH:
-                clFlush(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr());
-                break;
-            case DISABLE:
-            default:
-                break;
-            }
-            openCLSafeCall(clReleaseKernel(kernel));
-        }
-
-        void openCLExecuteKernel2(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName,
-                                  size_t globalThreads[3], size_t localThreads[3],
-                                  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode)
-        {
-            openCLExecuteKernel2(clCxt, source, kernelName, globalThreads, localThreads, args,
-                                 channels, depth, NULL, finish_mode);
-        }
-        void openCLExecuteKernel2(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName,
-                                  size_t globalThreads[3], size_t localThreads[3],
-                                  std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options, FLUSH_MODE finish_mode)
-
-        {
-            openCLExecuteKernel_2(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
-                                  build_options, finish_mode);
-        }
-
-        cl_mem bindTexture(const oclMat &mat)
-        {
-            cl_mem texture;
-            cl_image_format format;
-            int err;
-            int depth    = mat.depth();
-            int channels = mat.oclchannels();
-
-            switch(depth)
-            {
-            case CV_8U:
-                format.image_channel_data_type = CL_UNSIGNED_INT8;
-                break;
-            case CV_32S:
-                format.image_channel_data_type = CL_UNSIGNED_INT32;
-                break;
-            case CV_32F:
-                format.image_channel_data_type = CL_FLOAT;
-                break;
-            default:
-                CV_Error(-1, "Image forma is not supported");
-                break;
-            }
-            switch(channels)
-            {
-            case 1:
-                format.image_channel_order     = CL_R;
-                break;
-            case 3:
-                format.image_channel_order     = CL_RGB;
-                break;
-            case 4:
-                format.image_channel_order     = CL_RGBA;
-                break;
-            default:
-                CV_Error(-1, "Image format is not supported");
-                break;
-            }
-#ifdef CL_VERSION_1_2
-            //this enables backwards portability to
-            //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
-            if(Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2))
-            {
-                cl_image_desc desc;
-                desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
-                desc.image_width      = mat.cols;
-                desc.image_height     = mat.rows;
-                desc.image_depth      = 0;
-                desc.image_array_size = 1;
-                desc.image_row_pitch  = 0;
-                desc.image_slice_pitch = 0;
-                desc.buffer           = NULL;
-                desc.num_mip_levels   = 0;
-                desc.num_samples      = 0;
-                texture = clCreateImage(*(cl_context*)mat.clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
-            }
-            else
-#endif
-            {
-                texture = clCreateImage2D(
-                    *(cl_context*)mat.clCxt->getOpenCLContextPtr(),
-                    CL_MEM_READ_WRITE,
-                    &format,
-                    mat.cols,
-                    mat.rows,
-                    0,
-                    NULL,
-                    &err);
-            }
-            size_t origin[] = { 0, 0, 0 };
-            size_t region[] = { mat.cols, mat.rows, 1 };
-
-            cl_mem devData;
-            if (mat.cols * mat.elemSize() != mat.step)
-            {
-                devData = clCreateBuffer(*(cl_context*)mat.clCxt->getOpenCLContextPtr(), CL_MEM_READ_ONLY, mat.cols * mat.rows
-                    * mat.elemSize(), NULL, NULL);
-                const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1};
-                clEnqueueCopyBufferRect(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr(), (cl_mem)mat.data, devData, origin, origin,
-                    regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
-                clFlush(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr());
-            }
-            else
-            {
-                devData = (cl_mem)mat.data;
-            }
-
-            clEnqueueCopyBufferToImage(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr(), devData, texture, 0, origin, region, 0, NULL, 0);
-            if ((mat.cols * mat.elemSize() != mat.step))
-            {
-                clFlush(*(cl_command_queue*)mat.clCxt->getOpenCLCommandQueuePtr());
-                clReleaseMemObject(devData);
-            }
-
-            openCLSafeCall(err);
-            return texture;
-        }
-
-        Ptr<TextureCL> bindTexturePtr(const oclMat &mat)
-        {
-            return makePtr<TextureCL>(bindTexture(mat), mat.rows, mat.cols, mat.type());
-        }
-
-        void releaseTexture(cl_mem& texture)
-        {
-            openCLFree(texture);
-        }
-    }//namespace ocl
-
-}//namespace cv
diff --git a/modules/ocl/src/moments.cpp b/modules/ocl/src/moments.cpp
deleted file mode 100644
index 0ba6e8ce0..000000000
--- a/modules/ocl/src/moments.cpp
+++ /dev/null
@@ -1,391 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jin Ma,  jin@multicorewareinc.com
-//    Sen Liu, swjtuls1987@126.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other Materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
-
-#include "opencv2/imgproc/types_c.h"
-#include "opencv2/imgproc/imgproc_c.h"
-
-#include "opencl_kernels.hpp"
-
-#if defined _MSC_VER
-#define snprintf sprintf_s
-#endif
-namespace cv
-{
-    namespace ocl
-    {
-        // The function calculates center of gravity and the central second order moments
-        static void icvCompleteMomentState( CvMoments* moments )
-        {
-            double cx = 0, cy = 0;
-            double mu20, mu11, mu02;
-
-            assert( moments != 0 );
-            moments->inv_sqrt_m00 = 0;
-
-            if( fabs(moments->m00) > DBL_EPSILON )
-            {
-                double inv_m00 = 1. / moments->m00;
-                cx = moments->m10 * inv_m00;
-                cy = moments->m01 * inv_m00;
-                moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) );
-            }
-
-            // mu20 = m20 - m10*cx
-            mu20 = moments->m20 - moments->m10 * cx;
-            // mu11 = m11 - m10*cy
-            mu11 = moments->m11 - moments->m10 * cy;
-            // mu02 = m02 - m01*cy
-            mu02 = moments->m02 - moments->m01 * cy;
-
-            moments->mu20 = mu20;
-            moments->mu11 = mu11;
-            moments->mu02 = mu02;
-
-            // mu30 = m30 - cx*(3*mu20 + cx*m10)
-            moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10);
-            mu11 += mu11;
-            // mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20
-            moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20;
-            // mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02
-            moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02;
-            // mu03 = m03 - cy*(3*mu02 + cy*m01)
-            moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01);
-        }
-
-
-        static void icvContourMoments( CvSeq* contour, CvMoments* mom )
-        {
-            if( contour->total )
-            {
-                CvSeqReader reader;
-                int lpt = contour->total;
-                double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03;
-
-                cvStartReadSeq( contour, &reader, 0 );
-
-                size_t reader_size = lpt << 1;
-                cv::Mat reader_mat(1,reader_size,CV_32FC1);
-
-                bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
-
-                if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float)
-                {
-                    CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
-                }
-
-                if( is_float )
-                {
-                    for(size_t i = 0; i < reader_size; ++i)
-                    {
-                        reader_mat.at<float>(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x;
-                        reader_mat.at<float>(0, i) = ((CvPoint2D32f*)(reader.ptr))->y;
-                        CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
-                    }
-                }
-                else
-                {
-                    for(size_t i = 0; i < reader_size; ++i)
-                    {
-                        reader_mat.at<float>(0, i++) = ((CvPoint*)(reader.ptr))->x;
-                        reader_mat.at<float>(0, i) = ((CvPoint*)(reader.ptr))->y;
-                        CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
-                    }
-                }
-
-                cv::ocl::oclMat dst_a(10, lpt, CV_64FC1);
-                cv::ocl::oclMat reader_oclmat(reader_mat);
-                int llength = std::min(lpt,128);
-                size_t localThreads[3]  = { llength, 1, 1};
-                size_t globalThreads[3] = { lpt, 1, 1};
-                std::vector<std::pair<size_t , const void *> > args;
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&contour->total ));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data ));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a.data ));
-                cl_int dst_step = (cl_int)dst_a.step;
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step ));
-
-                char builOption[128];
-                snprintf(builOption, 128, "-D CV_8UC1");
-
-                openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1, builOption);
-
-                cv::Mat dst(dst_a);
-                a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
-                if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-                {
-                    for (int i = 0; i < contour->total; ++i)
-                    {
-                        a00 += dst.at<cl_long>(0, i);
-                        a10 += dst.at<cl_long>(1, i);
-                        a01 += dst.at<cl_long>(2, i);
-                        a20 += dst.at<cl_long>(3, i);
-                        a11 += dst.at<cl_long>(4, i);
-                        a02 += dst.at<cl_long>(5, i);
-                        a30 += dst.at<cl_long>(6, i);
-                        a21 += dst.at<cl_long>(7, i);
-                        a12 += dst.at<cl_long>(8, i);
-                        a03 += dst.at<cl_long>(9, i);
-                    }
-                }
-                else
-                {
-                    a00 = cv::sum(dst.row(0))[0];
-                    a10 = cv::sum(dst.row(1))[0];
-                    a01 = cv::sum(dst.row(2))[0];
-                    a20 = cv::sum(dst.row(3))[0];
-                    a11 = cv::sum(dst.row(4))[0];
-                    a02 = cv::sum(dst.row(5))[0];
-                    a30 = cv::sum(dst.row(6))[0];
-                    a21 = cv::sum(dst.row(7))[0];
-                    a12 = cv::sum(dst.row(8))[0];
-                    a03 = cv::sum(dst.row(9))[0];
-                }
-
-                double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60;
-                if( fabs(a00) > FLT_EPSILON )
-                {
-                    if( a00 > 0 )
-                    {
-                        db1_2 = 0.5;
-                        db1_6 = 0.16666666666666666666666666666667;
-                        db1_12 = 0.083333333333333333333333333333333;
-                        db1_24 = 0.041666666666666666666666666666667;
-                        db1_20 = 0.05;
-                        db1_60 = 0.016666666666666666666666666666667;
-                    }
-                    else
-                    {
-                        db1_2 = -0.5;
-                        db1_6 = -0.16666666666666666666666666666667;
-                        db1_12 = -0.083333333333333333333333333333333;
-                        db1_24 = -0.041666666666666666666666666666667;
-                        db1_20 = -0.05;
-                        db1_60 = -0.016666666666666666666666666666667;
-                    }
-
-                    // spatial moments
-                    mom->m00 = a00 * db1_2;
-                    mom->m10 = a10 * db1_6;
-                    mom->m01 = a01 * db1_6;
-                    mom->m20 = a20 * db1_12;
-                    mom->m11 = a11 * db1_24;
-                    mom->m02 = a02 * db1_12;
-                    mom->m30 = a30 * db1_20;
-                    mom->m21 = a21 * db1_60;
-                    mom->m12 = a12 * db1_60;
-                    mom->m03 = a03 * db1_20;
-
-                    icvCompleteMomentState( mom );
-                }
-            }
-        }
-
-        Moments ocl_moments(oclMat& src, bool binary) //for image
-        {
-            CV_Assert(src.oclchannels() == 1);
-            if(src.type() == CV_64FC1 && !Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-            {
-                CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
-            }
-
-            if(binary)
-            {
-                oclMat mask;
-                if(src.type() != CV_8UC1)
-                {
-                    src.convertTo(mask, CV_8UC1);
-                }
-                oclMat src8u(src.size(), CV_8UC1);
-                src8u.setTo(Scalar(255), mask);
-                src = src8u;
-            }
-            const int TILE_SIZE = 256;
-
-            CvMoments mom;
-            memset(&mom, 0, sizeof(mom));
-
-            cv::Size size = src.size();
-            int blockx, blocky;
-            blockx = (size.width + TILE_SIZE - 1)/TILE_SIZE;
-            blocky = (size.height + TILE_SIZE - 1)/TILE_SIZE;
-
-            oclMat dst_m;
-            int tile_height = TILE_SIZE;
-
-            size_t localThreads[3]  = {1, tile_height, 1};
-            size_t globalThreads[3] = {blockx, size.height, 1};
-
-            if(Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-            {
-                dst_m.create(blocky * 10, blockx, CV_64FC1);
-            }else
-            {
-                dst_m.create(blocky * 10, blockx, CV_32FC1);
-            }
-
-            int src_step = (int)(src.step/src.elemSize());
-            int dstm_step = (int)(dst_m.step/dst_m.elemSize());
-
-            std::vector<std::pair<size_t , const void *> > args,args_sum;
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step ));
-            args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstm_step ));
-
-            int binary_;
-            if(binary)
-                binary_ = 1;
-            else
-                binary_ = 0;
-            args.push_back( std::make_pair( sizeof(cl_int) , (void *)&binary_));
-
-            char builOption[128];
-            if(binary || src.type() == CV_8UC1)
-            {
-                snprintf(builOption, 128, "-D CV_8UC1");
-            }else if(src.type() == CV_16UC1)
-            {
-                snprintf(builOption, 128, "-D CV_16UC1");
-            }else if(src.type() == CV_16SC1)
-            {
-                snprintf(builOption, 128, "-D CV_16SC1");
-            }else if(src.type() == CV_32FC1)
-            {
-                snprintf(builOption, 128, "-D CV_32FC1");
-            }else if(src.type() == CV_64FC1)
-            {
-                snprintf(builOption, 128, "-D CV_64FC1");
-            }else
-            {
-                CV_Error( CV_StsUnsupportedFormat, "" );
-            }
-
-            openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, -1, builOption);
-
-            Mat tmp(dst_m);
-            tmp.convertTo(tmp, CV_64FC1);
-
-            double tmp_m[10] = {0};
-
-            for(int j = 0; j < tmp.rows; j += 10)
-            {
-                for(int i = 0; i < tmp.cols; i++)
-                {
-                    tmp_m[0] += tmp.at<double>(j, i);
-                    tmp_m[1] += tmp.at<double>(j + 1, i);
-                    tmp_m[2] += tmp.at<double>(j + 2, i);
-                    tmp_m[3] += tmp.at<double>(j + 3, i);
-                    tmp_m[4] += tmp.at<double>(j + 4, i);
-                    tmp_m[5] += tmp.at<double>(j + 5, i);
-                    tmp_m[6] += tmp.at<double>(j + 6, i);
-                    tmp_m[7] += tmp.at<double>(j + 7, i);
-                    tmp_m[8] += tmp.at<double>(j + 8, i);
-                    tmp_m[9] += tmp.at<double>(j + 9, i);
-                }
-            }
-
-            mom.m00 = tmp_m[0];
-            mom.m10 = tmp_m[1];
-            mom.m01 = tmp_m[2];
-            mom.m20 = tmp_m[3];
-            mom.m11 = tmp_m[4];
-            mom.m02 = tmp_m[5];
-            mom.m30 = tmp_m[6];
-            mom.m21 = tmp_m[7];
-            mom.m12 = tmp_m[8];
-            mom.m03 = tmp_m[9];
-            icvCompleteMomentState( &mom );
-            return mom;
-        }
-
-        Moments ocl_moments(InputArray _contour) //for contour
-        {
-            CvMoments mom;
-            memset(&mom, 0, sizeof(mom));
-
-            Mat arr = _contour.getMat();
-            CvMat c_array = arr;
-
-            const void* array = &c_array;
-
-            CvSeq* contour = 0;
-            if( CV_IS_SEQ( array ))
-            {
-                contour = (CvSeq*)(array);
-                if( !CV_IS_SEQ_POINT_SET( contour ))
-                    CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" );
-            }
-
-            int type, coi = 0;
-
-            CvMat stub, *mat = (CvMat*)(array);
-            CvContour contourHeader;
-            CvSeqBlock block;
-
-            if( !contour )
-            {
-                mat = cvGetMat( mat, &stub, &coi );
-                type = CV_MAT_TYPE( mat->type );
-
-                if( type == CV_32SC2 || type == CV_32FC2 )
-                {
-                    contour = cvPointSeqFromMat(
-                        CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED,
-                        mat, &contourHeader, &block );
-                }
-            }
-
-            CV_Assert(contour);
-
-            icvContourMoments(contour, &mom);
-            return mom;
-        }
-    }
-}
diff --git a/modules/ocl/src/mssegmentation.cpp b/modules/ocl/src/mssegmentation.cpp
deleted file mode 100644
index a569c127d..000000000
--- a/modules/ocl/src/mssegmentation.cpp
+++ /dev/null
@@ -1,402 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 6)
-# pragma GCC diagnostic ignored "-Warray-bounds"
-#endif
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-// Auxiliray stuff
-namespace
-{
-
-    //
-    // Declarations
-    //
-
-    class DjSets
-    {
-    public:
-        DjSets(int n);
-        int find(int elem);
-        int merge(int set1, int set2);
-
-        std::vector<int> parent;
-        std::vector<int> rank;
-        std::vector<int> size;
-    private:
-        DjSets(const DjSets &) {}
-        DjSets operator =(const DjSets &);
-    };
-
-    template <typename T>
-    struct GraphEdge
-    {
-        GraphEdge() {}
-        GraphEdge(int to, int next, const T &val) : to(to), next(next), val(val) {}
-        int to;
-        int next;
-        T val;
-    };
-
-
-    template <typename T>
-    class Graph
-    {
-    public:
-        typedef GraphEdge<T> Edge;
-
-        Graph(int numv, int nume_max);
-
-        void addEdge(int from, int to, const T &val = T());
-
-        std::vector<int> start;
-        std::vector<Edge> edges;
-
-        int numv;
-        int nume_max;
-        int nume;
-    private:
-        Graph(const Graph &) {}
-        Graph operator =(const Graph &) {}
-    };
-
-
-    struct SegmLinkVal
-    {
-        SegmLinkVal() {}
-        SegmLinkVal(int dr, int dsp) : dr(dr), dsp(dsp) {}
-        bool operator <(const SegmLinkVal &other) const
-        {
-            return dr + dsp < other.dr + other.dsp;
-        }
-        int dr;
-        int dsp;
-    };
-
-
-    struct SegmLink
-    {
-        SegmLink() {}
-        SegmLink(int from, int to, const SegmLinkVal &val)
-            : from(from), to(to), val(val) {}
-        bool operator <(const SegmLink &other) const
-        {
-            return val < other.val;
-        }
-        int from;
-        int to;
-        SegmLinkVal val;
-    };
-
-    //
-    // Implementation
-    //
-
-    DjSets DjSets::operator = (const DjSets &/*obj*/)
-    {
-        //cout << "Invalid DjSets constructor\n";
-        CV_Error(-1, "Invalid DjSets constructor\n");
-        return *this;
-    }
-
-    DjSets::DjSets(int n) : parent(n), rank(n, 0), size(n, 1)
-    {
-        for (int i = 0; i < n; ++i)
-            parent[i] = i;
-    }
-
-
-    inline int DjSets::find(int elem)
-    {
-        int set = elem;
-        while (set != parent[set])
-            set = parent[set];
-        while (elem != parent[elem])
-        {
-            int next = parent[elem];
-            parent[elem] = set;
-            elem = next;
-        }
-        return set;
-    }
-
-
-    inline int DjSets::merge(int set1, int set2)
-    {
-        if (rank[set1] < rank[set2])
-        {
-            parent[set1] = set2;
-            size[set2] += size[set1];
-            return set2;
-        }
-        if (rank[set2] < rank[set1])
-        {
-            parent[set2] = set1;
-            size[set1] += size[set2];
-            return set1;
-        }
-        parent[set1] = set2;
-        rank[set2]++;
-        size[set2] += size[set1];
-        return set2;
-    }
-
-
-    template <typename T>
-    Graph<T>::Graph(int numv, int nume_max) : start(numv, -1), edges(nume_max)
-    {
-        this->numv = numv;
-        this->nume_max = nume_max;
-        nume = 0;
-    }
-
-
-    template <typename T>
-    inline void Graph<T>::addEdge(int from, int to, const T &val)
-    {
-        edges[nume] = Edge(to, start[from], val);
-        start[from] = nume;
-        nume++;
-    }
-
-
-    inline int pix(int y, int x, int ncols)
-    {
-        return y * ncols + x;
-    }
-
-
-    inline int sqr(int x)
-    {
-        return x * x;
-    }
-
-
-    inline int dist2(const cv::Vec4b &lhs, const cv::Vec4b &rhs)
-    {
-        return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]) + sqr(lhs[2] - rhs[2]);
-    }
-
-
-    inline int dist2(const cv::Vec2s &lhs, const cv::Vec2s &rhs)
-    {
-        return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]);
-    }
-
-} // anonymous namespace
-
-namespace cv
-{
-    namespace ocl
-    {
-
-        void meanShiftSegmentation(const oclMat &src, Mat &dst, int sp, int sr, int minsize, TermCriteria criteria)
-        {
-            CV_Assert(src.type() == CV_8UC4);
-            const int nrows = src.rows;
-            const int ncols = src.cols;
-            const int hr = sr;
-            const int hsp = sp;
-
-            // Perform mean shift procedure and obtain region and spatial maps
-            oclMat h_rmap, h_spmap;
-            meanShiftProc(src, h_rmap, h_spmap, sp, sr, criteria);
-            Mat rmap = h_rmap;
-            Mat spmap = h_spmap;
-
-            Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1)
-                                 + (nrows - 1) + (ncols - 1));
-
-            // Make region adjacent graph from image
-            Vec4b r1;
-            Vec4b r2[4];
-            Vec2s sp1;
-            Vec2s sp2[4];
-            int dr[4];
-            int dsp[4];
-            for (int y = 0; y < nrows - 1; ++y)
-            {
-                Vec4b *ry = rmap.ptr<Vec4b>(y);
-                Vec4b *ryp = rmap.ptr<Vec4b>(y + 1);
-                Vec2s *spy = spmap.ptr<Vec2s>(y);
-                Vec2s *spyp = spmap.ptr<Vec2s>(y + 1);
-                for (int x = 0; x < ncols - 1; ++x)
-                {
-                    r1 = ry[x];
-                    sp1 = spy[x];
-
-                    r2[0] = ry[x + 1];
-                    r2[1] = ryp[x];
-                    r2[2] = ryp[x + 1];
-                    r2[3] = ryp[x];
-
-                    sp2[0] = spy[x + 1];
-                    sp2[1] = spyp[x];
-                    sp2[2] = spyp[x + 1];
-                    sp2[3] = spyp[x];
-
-                    dr[0] = dist2(r1, r2[0]);
-                    dr[1] = dist2(r1, r2[1]);
-                    dr[2] = dist2(r1, r2[2]);
-                    dsp[0] = dist2(sp1, sp2[0]);
-                    dsp[1] = dist2(sp1, sp2[1]);
-                    dsp[2] = dist2(sp1, sp2[2]);
-
-                    r1 = ry[x + 1];
-                    sp1 = spy[x + 1];
-
-                    dr[3] = dist2(r1, r2[3]);
-                    dsp[3] = dist2(sp1, sp2[3]);
-
-                    g.addEdge(pix(y, x, ncols), pix(y, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
-                    g.addEdge(pix(y, x, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[1], dsp[1]));
-                    g.addEdge(pix(y, x, ncols), pix(y + 1, x + 1, ncols), SegmLinkVal(dr[2], dsp[2]));
-                    g.addEdge(pix(y, x + 1, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[3], dsp[3]));
-                }
-            }
-            for (int y = 0; y < nrows - 1; ++y)
-            {
-                r1 = rmap.at<Vec4b>(y, ncols - 1);
-                r2[0] = rmap.at<Vec4b>(y + 1, ncols - 1);
-                sp1 = spmap.at<Vec2s>(y, ncols - 1);
-                sp2[0] = spmap.at<Vec2s>(y + 1, ncols - 1);
-                dr[0] = dist2(r1, r2[0]);
-                dsp[0] = dist2(sp1, sp2[0]);
-                g.addEdge(pix(y, ncols - 1, ncols), pix(y + 1, ncols - 1, ncols), SegmLinkVal(dr[0], dsp[0]));
-            }
-            for (int x = 0; x < ncols - 1; ++x)
-            {
-                r1 = rmap.at<Vec4b>(nrows - 1, x);
-                r2[0] = rmap.at<Vec4b>(nrows - 1, x + 1);
-                sp1 = spmap.at<Vec2s>(nrows - 1, x);
-                sp2[0] = spmap.at<Vec2s>(nrows - 1, x + 1);
-                dr[0] = dist2(r1, r2[0]);
-                dsp[0] = dist2(sp1, sp2[0]);
-                g.addEdge(pix(nrows - 1, x, ncols), pix(nrows - 1, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
-            }
-
-            DjSets comps(g.numv);
-
-            // Find adjacent components
-            for (int v = 0; v < g.numv; ++v)
-            {
-                for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
-                {
-                    int c1 = comps.find(v);
-                    int c2 = comps.find(g.edges[e_it].to);
-                    if (c1 != c2 && g.edges[e_it].val.dr < hr && g.edges[e_it].val.dsp < hsp)
-                        comps.merge(c1, c2);
-                }
-            }
-
-            std::vector<SegmLink> edges;
-            edges.reserve(g.numv);
-
-            // Prepare edges connecting differnet components
-            for (int v = 0; v < g.numv; ++v)
-            {
-                int c1 = comps.find(v);
-                for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
-                {
-                    int c2 = comps.find(g.edges[e_it].to);
-                    if (c1 != c2)
-                        edges.push_back(SegmLink(c1, c2, g.edges[e_it].val));
-                }
-            }
-
-            // Sort all graph's edges connecting differnet components (in asceding order)
-            std::sort(edges.begin(), edges.end());
-
-            // Exclude small components (starting from the nearest couple)
-            for (size_t i = 0; i < edges.size(); ++i)
-            {
-                int c1 = comps.find(edges[i].from);
-                int c2 = comps.find(edges[i].to);
-                if (c1 != c2 && (comps.size[c1] < minsize || comps.size[c2] < minsize))
-                    comps.merge(c1, c2);
-            }
-
-            // Compute sum of the pixel's colors which are in the same segment
-            Mat h_src = src;
-            std::vector<Vec4i> sumcols(nrows * ncols, Vec4i(0, 0, 0, 0));
-            for (int y = 0; y < nrows; ++y)
-            {
-                Vec4b *h_srcy = h_src.ptr<Vec4b>(y);
-                for (int x = 0; x < ncols; ++x)
-                {
-                    int parent = comps.find(pix(y, x, ncols));
-                    Vec4b col = h_srcy[x];
-                    Vec4i &sumcol = sumcols[parent];
-                    sumcol[0] += col[0];
-                    sumcol[1] += col[1];
-                    sumcol[2] += col[2];
-                }
-            }
-
-            // Create final image, color of each segment is the average color of its pixels
-            dst.create(src.size(), src.type());
-
-            for (int y = 0; y < nrows; ++y)
-            {
-                Vec4b *dsty = dst.ptr<Vec4b>(y);
-                for (int x = 0; x < ncols; ++x)
-                {
-                    int parent = comps.find(pix(y, x, ncols));
-                    const Vec4i &sumcol = sumcols[parent];
-                    Vec4b &dstcol = dsty[x];
-                    dstcol[0] = static_cast<uchar>(sumcol[0] / comps.size[parent]);
-                    dstcol[1] = static_cast<uchar>(sumcol[1] / comps.size[parent]);
-                    dstcol[2] = static_cast<uchar>(sumcol[2] / comps.size[parent]);
-                }
-            }
-        }
-
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_LUT.cl b/modules/ocl/src/opencl/arithm_LUT.cl
deleted file mode 100644
index 30407bb88..000000000
--- a/modules/ocl/src/opencl/arithm_LUT.cl
+++ /dev/null
@@ -1,107 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Rock Li, Rock.li@amd.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-__kernel void LUT_C1( __global const srcT * src, __global const dstT *lut,
-      __global dstT *dst,
-      int cols1, int rows,
-      int src_offset1,
-      int lut_offset1,
-      int dst_offset1,
-      int src_step1, int dst_step1)
-{
-    int x1 = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x1 < cols1 && y < rows)
-    {
-        int src_index = mad24(y, src_step1, src_offset1 + x1);
-        int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
-
-        dst[dst_index] = lut[lut_offset1 + src[src_index]];
-    }
-}
-
-__kernel void LUT_C2( __global const srcT * src, __global const dstT *lut,
-      __global dstT *dst,
-      int cols1, int rows,
-      int src_offset1,
-      int lut_offset1,
-      int dst_offset1,
-      int src_step1, int dst_step1)
-{
-    int x1 = get_global_id(0) << 1;
-    int y = get_global_id(1);
-
-    if (x1 < cols1 && y < rows)
-    {
-        int src_index = mad24(y, src_step1, src_offset1 + x1);
-        int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
-
-        dst[dst_index    ] =                  lut[lut_offset1 + (src[src_index    ] << 1)    ];
-        dst[dst_index + 1] = x1 + 1 < cols1 ? lut[lut_offset1 + (src[src_index + 1] << 1) + 1] : dst[dst_index + 1];
-    }
-}
-
-__kernel void LUT_C4( __global const srcT * src, __global const dstT *lut,
-      __global dstT *dst,
-      int cols1, int rows,
-      int src_offset1,
-      int lut_offset1,
-      int dst_offset1,
-      int src_step1, int dst_step1)
-{
-    int x1 = get_global_id(0) << 2;
-    int y = get_global_id(1);
-
-    if (x1 < cols1 && y < rows)
-    {
-        int src_index = mad24(y, src_step1, src_offset1 + x1);
-        int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
-
-        dst[dst_index    ] =                  lut[lut_offset1 + (src[src_index    ] << 2)    ];
-        dst[dst_index + 1] = x1 + 1 < cols1 ? lut[lut_offset1 + (src[src_index + 1] << 2) + 1] : dst[dst_index + 1];
-        dst[dst_index + 2] = x1 + 2 < cols1 ? lut[lut_offset1 + (src[src_index + 2] << 2) + 2] : dst[dst_index + 2];
-        dst[dst_index + 3] = x1 + 3 < cols1 ? lut[lut_offset1 + (src[src_index + 3] << 2) + 3] : dst[dst_index + 3];
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_absdiff_nonsaturate.cl b/modules/ocl/src/opencl/arithm_absdiff_nonsaturate.cl
deleted file mode 100644
index e07f31413..000000000
--- a/modules/ocl/src/opencl/arithm_absdiff_nonsaturate.cl
+++ /dev/null
@@ -1,107 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#ifdef BINARY
-
-__kernel void arithm_absdiff_nonsaturate_binary(__global srcT *src1, int src1_step, int src1_offset,
-                         __global srcT *src2, int src2_step, int src2_offset,
-                         __global dstT *dst, int dst_step, int dst_offset,
-                         int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int src2_index = mad24(y, src2_step, x + src2_offset);
-        int dst_index  = mad24(y, dst_step, x + dst_offset);
-#ifdef INTEL_DEVICE //workaround for intel compiler bug
-        if(src1_index >= 0 && src2_index >= 0)
-#endif
-        {
-            dstT t0 = convertToDstT(src1[src1_index]);
-            dstT t1 = convertToDstT(src2[src2_index]);
-            dstT t2 = t0 - t1;
-
-            dst[dst_index] = t2 >= (dstT)(0) ? t2 : -t2;
-        }
-    }
-}
-
-#else
-
-__kernel void arithm_absdiff_nonsaturate(__global srcT *src1, int src1_step, int src1_offset,
-                         __global dstT *dst, int dst_step, int dst_offset,
-                         int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int dst_index  = mad24(y, dst_step, x + dst_offset);
-#ifdef INTEL_DEVICE //workaround for intel compiler bug
-        if(src1_index >= 0)
-#endif
-        {
-            dstT t0 = convertToDstT(src1[src1_index]);
-
-            dst[dst_index] = t0 >= (dstT)(0) ? t0 : -t0;
-        }
-    }
-}
-
-#endif
diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl
deleted file mode 100644
index 04262b872..000000000
--- a/modules/ocl/src/opencl/arithm_add.cl
+++ /dev/null
@@ -1,143 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#if defined (FUNC_ADD)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + convertToWT(src2[src2_index]));
-#endif
-
-#if defined (FUNC_SUB)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]));
-#endif
-
-#if defined (FUNC_MUL)
-#if defined (HAVE_SCALAR)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar * convertToWT(src2[src2_index]));
-#else
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * convertToWT(src2[src2_index]));
-#endif
-#endif
-
-#if defined (FUNC_DIV)
-#if defined (HAVE_SCALAR)
-#define EXPRESSION T zero = (T)(0); \
-    dst[dst_index] = src2[src2_index] == zero ? zero : \
-    convertToT(convertToWT(src1[src1_index]) * scalar / convertToWT(src2[src2_index]));
-#else
-#define EXPRESSION T zero = (T)(0); \
-    dst[dst_index] = src2[src2_index] == zero ? zero : \
-    convertToT(convertToWT(src1[src1_index]) / convertToWT(src2[src2_index]));
-#endif
-#endif
-
-#if defined (FUNC_ABS_DIFF)
-#define EXPRESSION WT value = convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]); \
-    value = value > (WT)(0) ? value : -value; \
-    dst[dst_index] = convertToT(value);
-#endif
-
-#if defined (FUNC_MIN)
-#define EXPRESSION dst[dst_index] = min( src1[src1_index], src2[src2_index] );
-#endif
-
-#if defined (FUNC_MAX)
-#define EXPRESSION dst[dst_index] = max( src1[src1_index], src2[src2_index] );
-#endif
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////// ADD ////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#ifndef HAVE_SCALAR
-
-__kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_offset,
-                                   __global T *src2, int src2_step, int src2_offset,
-                                   __global T *dst, int dst_step, int dst_offset,
-                                   int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int src2_index = mad24(y, src2_step, x + src2_offset);
-        int dst_index  = mad24(y, dst_step, x + dst_offset);
-
-        EXPRESSION
-    }
-}
-
-#else
-
-// add mat with scale
-__kernel void arithm_binary_op_mat_scalar(__global T *src1, int src1_step, int src1_offset,
-                                          __global T *src2, int src2_step, int src2_offset,
-                                          WT scalar,
-                                          __global T *dst, int dst_step,  int dst_offset,
-                                          int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int src2_index = mad24(y, src2_step, x + src2_offset);
-        int dst_index = mad24(y, dst_step, x + dst_offset);
-
-        EXPRESSION
-    }
-}
-
-#endif
diff --git a/modules/ocl/src/opencl/arithm_addWeighted.cl b/modules/ocl/src/opencl/arithm_addWeighted.cl
deleted file mode 100644
index 872ee8535..000000000
--- a/modules/ocl/src/opencl/arithm_addWeighted.cl
+++ /dev/null
@@ -1,75 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void addWeighted(__global T * src1, int src1_step1, int src1_offset1,
-                              __global T * src2, int src2_step1, int src2_offset1,
-                              __global T * dst, int dst_step1, int dst_offset1,
-                              WT alpha, WT beta, WT gama,
-                              int cols1, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols1 && y < rows)
-    {
-        int src1_index = mad24(y, src1_step1, x + src1_offset1);
-        int src2_index = mad24(y, src2_step1, x + src2_offset1);
-        int dst_index = mad24(y, dst_step1, x + dst_offset1);
-
-        dst[dst_index] = convertToT(src1[src1_index]*alpha + src2[src2_index]*beta + gama);
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_add_mask.cl b/modules/ocl/src/opencl/arithm_add_mask.cl
deleted file mode 100644
index b115d9b76..000000000
--- a/modules/ocl/src/opencl/arithm_add_mask.cl
+++ /dev/null
@@ -1,97 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#if defined (FUNC_ADD)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + convertToWT(src2[src2_index]));
-#endif
-
-#if defined (FUNC_SUB)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]));
-#endif
-
-#if defined (FUNC_MUL)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * convertToWT(src2[src2_index]));
-#endif
-
-#if defined (FUNC_DIV)
-#define EXPRESSION T zero = (T)(0); \
-    dst[dst_index] = src2[src2_index] == zero ? zero : \
-    convertToT(convertToWT(src1[src1_index]) / convertToWT(src2[src2_index]));
-#endif
-
-//////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////// add with mask //////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_binary_op_mat_mask(__global T * src1, int src1_step, int src1_offset,
-                              __global T * src2, int src2_step, int src2_offset,
-                              __global uchar * mask, int mask_step, int mask_offset,
-                              __global T * dst, int dst_step, int dst_offset,
-                              int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int mask_index = mad24(y, mask_step, x + mask_offset);
-        if (mask[mask_index])
-        {
-            int src1_index = mad24(y, src1_step, x + src1_offset);
-            int src2_index = mad24(y, src2_step, x + src2_offset);
-            int dst_index  = mad24(y, dst_step, dst_offset + x);
-
-            EXPRESSION
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl
deleted file mode 100644
index 05ea48da4..000000000
--- a/modules/ocl/src/opencl/arithm_add_scalar.cl
+++ /dev/null
@@ -1,103 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#if defined (FUNC_ADD)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar);
-#endif
-
-#if defined (FUNC_SUB)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar);
-#endif
-
-#if defined (FUNC_MUL)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar);
-#endif
-
-#if defined (FUNC_DIV)
-#define EXPRESSION T zero = (T)(0); \
-    dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar / convertToWT(src1[src1_index]));
-#endif
-
-#if defined (FUNC_ABS)
-#define EXPRESSION \
-    T value = src1[src1_index] > (T)(0) ? src1[src1_index] : -src1[src1_index]; \
-    dst[dst_index] = value;
-#endif
-
-#if defined (FUNC_ABS_DIFF)
-#define EXPRESSION WT value = convertToWT(src1[src1_index]) - scalar; \
-    value = value > (WT)(0) ? value : -value; \
-    dst[dst_index] = convertToT(value);
-#endif
-
-///////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////// Add with scalar /////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_binary_op_scalar (__global T *src1, int src1_step, int src1_offset,
-                                 WT scalar,
-                                 __global T *dst,  int dst_step,  int dst_offset,
-                                 int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int dst_index = mad24(y, dst_step, x + dst_offset);
-
-        EXPRESSION
-
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
deleted file mode 100644
index a8b965758..000000000
--- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+++ /dev/null
@@ -1,96 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#if defined (FUNC_ADD)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) + scalar);
-#endif
-
-#if defined (FUNC_SUB)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) - scalar);
-#endif
-
-#if defined (FUNC_MUL)
-#define EXPRESSION dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar);
-#endif
-
-#if defined (FUNC_DIV)
-#define EXPRESSION T zero = (T)(0); \
-    dst[dst_index] = src2[src2_index] == zero ? zero : \
-    convertToT(convertToWT(src1[src1_index]) / scalar[0]);
-#endif
-
-///////////////////////////////////////////////////////////////////////////////////
-//////////////////////////// Add with scalar with mask ////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_binary_op_scalar_mask(__global T *src1, int src1_step, int src1_offset,
-                                     WT scalar,
-                                     __global uchar *mask, int mask_step, int mask_offset,
-                                     __global T *dst,  int dst_step,  int dst_offset,
-                                     int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int mask_index = mad24(y, mask_step, x + mask_offset);
-        if (mask[mask_index])
-        {
-            int src1_index = mad24(y, src1_step, x + src1_offset);
-            int dst_index = mad24(y, dst_step, dst_offset + x);
-
-            EXPRESSION
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary.cl b/modules/ocl/src/opencl/arithm_bitwise_binary.cl
deleted file mode 100644
index 56cd745d2..000000000
--- a/modules/ocl/src/opencl/arithm_bitwise_binary.cl
+++ /dev/null
@@ -1,82 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jiang Liyuan, jlyuan001.good@163.com
-//    Peng Xiao,    pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////// bitwise_binary //////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_bitwise_binary(__global uchar * src1, int src1_step, int src1_offset,
-                                    __global uchar * src2, int src2_step, int src2_offset,
-                                    __global uchar * dst, int dst_step, int dst_offset,
-                                    int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-#if elemSize > 1
-        x *= elemSize;
-#endif
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int src2_index = mad24(y, src2_step, x + src2_offset);
-        int dst_index = mad24(y, dst_step, x + dst_offset);
-
-#if elemSize > 1
-        #pragma unroll
-        for (int i = 0; i < elemSize; i += vlen)
-        {
-            ucharv t0 = vloadn(0, src1 + src1_index + i);
-            ucharv t1 = vloadn(0, src2 + src2_index + i);
-            ucharv t2 = t0 Operation t1;
-
-            vstoren(t2, 0, dst + dst_index + i);
-        }
-#else
-        dst[dst_index] = src1[src1_index] Operation src2[src2_index];
-#endif
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
deleted file mode 100644
index 328ccd91a..000000000
--- a/modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jiang Liyuan, jlyuan001.good@163.com
-//    Peng Xiao,    pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, int src1_offset,
-                                    __global uchar * src2, int src2_step, int src2_offset,
-                                    __global uchar * mask, int mask_step, int mask_offset,
-                                    __global uchar * dst, int dst_step, int dst_offset,
-                                    int cols1, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols1 && y < rows)
-    {
-        int mask_index = mad24(y, mask_step, mask_offset + x);
-
-        if (mask[mask_index])
-        {
-#if elemSize > 1
-                x *= elemSize;
-#endif
-            int src1_index = mad24(y, src1_step, x + src1_offset);
-            int src2_index = mad24(y, src2_step, x + src2_offset);
-            int dst_index = mad24(y, dst_step, x + dst_offset);
-
-#if elemSize > 1
-            #pragma unroll
-            for (int i = 0; i < elemSize; i += vlen)
-            {
-                ucharv t0 = vloadn(0, src1 + src1_index + i);
-                ucharv t1 = vloadn(0, src2 + src2_index + i);
-                ucharv t2 = t0 Operation t1;
-
-                vstoren(t2, 0, dst + dst_index + i);
-            }
-#else
-            dst[dst_index] = src1[src1_index] Operation src2[src2_index];
-#endif
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
deleted file mode 100644
index 434bd5eca..000000000
--- a/modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
+++ /dev/null
@@ -1,82 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jiang Liyuan, jlyuan001.good@163.com
-//    Peng Xiao,    pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////bitwise_binary/////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_bitwise_binary_scalar(
-        __global uchar *src1, int src1_step, int src1_offset,
-        __global uchar *src2,
-        __global uchar *dst, int dst_step, int dst_offset,
-        int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-#if elemSize > 1
-        x *= elemSize;
-#endif
-        int src1_index = mad24(y, src1_step, src1_offset + x);
-        int dst_index  = mad24(y, dst_step, dst_offset + x);
-
-#if elemSize > 1
-        #pragma unroll
-        for (int i = 0; i < elemSize; i += vlen)
-        {
-            ucharv t0 = vloadn(0, src1 + src1_index + i);
-            ucharv t1 = vloadn(0, src2 + i);
-            ucharv t2 = t0 Operation t1;
-
-            vstoren(t2, 0, dst + dst_index + i);
-        }
-#else
-        dst[dst_index] = src1[src1_index] Operation src2[0];
-#endif
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_bitwise_not.cl b/modules/ocl/src/opencl/arithm_bitwise_not.cl
deleted file mode 100644
index b6f76d606..000000000
--- a/modules/ocl/src/opencl/arithm_bitwise_not.cl
+++ /dev/null
@@ -1,253 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jiang Liyuan, jlyuan001.good@163.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////BITWISE_NOT////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                     __global uchar *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x);
-
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 dst_data = vload4(0, dst + dst_index);
-        uchar4 tmp_data = ~src1_data;
-
-        dst_data.x = dst_index + 0 < dst_end ? tmp_data.x : dst_data.x;
-        dst_data.y = dst_index + 1 < dst_end ? tmp_data.y : dst_data.y;
-        dst_data.z = dst_index + 2 < dst_end ? tmp_data.z : dst_data.z;
-        dst_data.w = dst_index + 3 < dst_end ? tmp_data.w : dst_data.w;
-
-        vstore4(dst_data, 0, dst + dst_index);
-    }
-}
-
-
-__kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src1_offset,
-                                     __global char *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + x);
-
-        char4 src1_data = vload4(0, src1 + src1_index);
-        char4 dst_data = vload4(0, dst + dst_index);
-        char4 tmp_data = ~src1_data;
-
-        dst_data.x = dst_index + 0 < dst_end ? tmp_data.x : dst_data.x;
-        dst_data.y = dst_index + 1 < dst_end ? tmp_data.y : dst_data.y;
-        dst_data.z = dst_index + 2 < dst_end ? tmp_data.z : dst_data.z;
-        dst_data.w = dst_index + 3 < dst_end ? tmp_data.w : dst_data.w;
-
-        vstore4(dst_data, 0, dst + dst_index);
-    }
-}
-
-
-__kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                     __global ushort *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-
-        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-        ushort4 tmp_data = ~ src1_data;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int src1_offset,
-                                     __global short *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-
-        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-
-        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-        short4 tmp_data = ~ src1_data;
-
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
-
-        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
-    }
-}
-
-
-
-__kernel void arithm_bitwise_not_D4 (__global int *src1, int src1_step, int src1_offset,
-                                     __global int *dst,  int dst_step,  int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
-        int dst_index  = mad24(y, dst_step,  (x << 2) + dst_offset);
-
-        int data1 = *((__global int *)((__global char *)src1 + src1_index));
-        int tmp  = ~ data1;
-
-        *((__global int *)((__global char *)dst + dst_index)) = tmp;
-    }
-}
-
-__kernel void arithm_bitwise_not_D5 (__global char *src, int src_step, int src_offset,
-                                     __global char *dst, int dst_step, int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src_index = mad24(y, src_step, (x << 2) + src_offset);
-        int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
-
-        char4 data;
-
-        data = *((__global char4 *)((__global char *)src + src_index));
-        data = ~ data;
-
-        *((__global char4 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_offset,
-                                     __global char *dst, int dst_step, int dst_offset,
-                                     int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src_index = mad24(y, src_step, (x << 3) + src_offset);
-        int dst_index = mad24(y, dst_step,  (x << 3) + dst_offset);
-
-        char8 data;
-
-        data = *((__global char8 *)((__global char *)src + src_index));
-        data = ~ data;
-
-        *((__global char8 *)((__global char *)dst + dst_index)) = data;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_cartToPolar.cl b/modules/ocl/src/opencl/arithm_cartToPolar.cl
deleted file mode 100644
index c65f899b7..000000000
--- a/modules/ocl/src/opencl/arithm_cartToPolar.cl
+++ /dev/null
@@ -1,141 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define CV_PI M_PI
-#else
-#define CV_PI M_PI_F
-#endif
-
-__kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int src1_offset,
-                                     __global float *src2, int src2_step, int src2_offset,
-                                     __global float *dst1, int dst1_step, int dst1_offset, // magnitude
-                                     __global float *dst2, int dst2_step, int dst2_offset, // cartToPolar
-                                     int rows, int cols)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int src2_index = mad24(y, src2_step, x + src2_offset);
-
-        int dst1_index = mad24(y, dst1_step, x + dst1_offset);
-        int dst2_index = mad24(y, dst2_step, x + dst2_offset);
-
-        float x = src1[src1_index];
-        float y = src2[src2_index];
-
-        float x2 = x * x;
-        float y2 = y * y;
-
-        float magnitude = sqrt(x2 + y2);
-
-        float tmp = y >= 0 ? 0 : CV_PI*2;
-        tmp = x < 0 ? CV_PI : tmp;
-
-        float tmp1 = y >= 0 ? CV_PI*0.5f : CV_PI*1.5f;
-        float cartToPolar = y2 <= x2 ? x*y/(x2 + 0.28f*y2 + FLT_EPSILON) + tmp :
-                                 tmp1 - x*y/(y2 + 0.28f*x2 + FLT_EPSILON);
-
-#ifdef DEGREE
-        cartToPolar *= (180/CV_PI);
-#endif
-
-        dst1[dst1_index] = magnitude;
-        dst2[dst2_index] = cartToPolar;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-
-__kernel void arithm_cartToPolar_D6 (__global double *src1, int src1_step, int src1_offset,
-                                     __global double *src2, int src2_step, int src2_offset,
-                                     __global double *dst1, int dst1_step, int dst1_offset,
-                                     __global double *dst2, int dst2_step, int dst2_offset,
-                                     int rows, int cols)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int src2_index = mad24(y, src2_step, x + src2_offset);
-
-        int dst1_index = mad24(y, dst1_step, x + dst1_offset);
-        int dst2_index = mad24(y, dst2_step, x + dst2_offset);
-
-        double x = src1[src1_index];
-        double y = src2[src2_index];
-
-        double x2 = x * x;
-        double y2 = y * y;
-
-        double magnitude = sqrt(x2 + y2);
-
-        float tmp = y >= 0 ? 0 : CV_PI*2;
-        tmp = x < 0 ? CV_PI : tmp;
-
-        float tmp1 = y >= 0 ? CV_PI*0.5 : CV_PI*1.5;
-        double cartToPolar = y2 <= x2 ? x*y/(x2 + 0.28f*y2 + DBL_EPSILON)  + tmp :
-                                 tmp1 - x*y/(y2 + 0.28f*x2 + DBL_EPSILON);
-
-#ifdef DEGREE
-        cartToPolar *= (180/CV_PI);
-#endif
-
-        dst1[dst1_index] = magnitude;
-        dst2[dst2_index] = cartToPolar;
-    }
-}
-
-#endif
diff --git a/modules/ocl/src/opencl/arithm_exp.cl b/modules/ocl/src/opencl/arithm_exp.cl
deleted file mode 100644
index f0a189353..000000000
--- a/modules/ocl/src/opencl/arithm_exp.cl
+++ /dev/null
@@ -1,111 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Wu Zailong, bullet@yeah.net
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////EXP//////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_exp_C1(__global srcT *src, __global srcT *dst,
-    int cols1, int rows,
-    int srcOffset1, int dstOffset1,
-    int srcStep1, int dstStep1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < cols1 && y < rows)
-    {
-        int srcIdx = mad24(y, srcStep1, x + srcOffset1);
-        int dstIdx = mad24(y, dstStep1, x + dstOffset1);
-
-        dst[dstIdx] = exp(src[srcIdx]);
-    }
-}
-
-__kernel void arithm_exp_C2(__global srcT *src, __global srcT *dst,
-    int cols1, int rows,
-    int srcOffset1, int dstOffset1,
-    int srcStep1, int dstStep1)
-{
-    int x1 = get_global_id(0) << 1;
-    int y = get_global_id(1);
-
-    if(x1 < cols1 && y < rows)
-    {
-        int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
-        int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
-
-        dst[dstIdx] =                      exp(src[srcIdx]);
-        dst[dstIdx + 1] = x1 + 1 < cols1 ? exp(src[srcIdx + 1]) : dst[dstIdx + 1];
-    }
-}
-
-__kernel void arithm_exp_C4(__global srcT *src, __global srcT *dst,
-    int cols1, int rows,
-    int srcOffset1, int dstOffset1,
-    int srcStep1, int dstStep1)
-{
-    int x1 = get_global_id(0) << 2;
-    int y = get_global_id(1);
-
-    if(x1 < cols1 && y < rows)
-    {
-        int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
-        int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
-
-        dst[dstIdx] =                      exp(src[srcIdx]);
-        dst[dstIdx + 1] = x1 + 1 < cols1 ? exp(src[srcIdx + 1]) : dst[dstIdx + 1];
-        dst[dstIdx + 2] = x1 + 2 < cols1 ? exp(src[srcIdx + 2]) : dst[dstIdx + 2];
-        dst[dstIdx + 3] = x1 + 3 < cols1 ? exp(src[srcIdx + 3]) : dst[dstIdx + 3];
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_flip.cl b/modules/ocl/src/opencl/arithm_flip.cl
deleted file mode 100644
index b9bacd339..000000000
--- a/modules/ocl/src/opencl/arithm_flip.cl
+++ /dev/null
@@ -1,125 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////// flip rows ///////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_flip_rows(__global T * src, int src_step, int src_offset,
-                               __global T * dst, int dst_step, int dst_offset,
-                               int rows, int cols, int thread_rows, int thread_cols)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < thread_rows)
-    {
-        int src_index_0 = mad24(y,            src_step, x + src_offset);
-        int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset);
-
-        int dst_index_0 = mad24(y,            dst_step, x + dst_offset);
-        int dst_index_1 = mad24(rows - y - 1, dst_step, x + dst_offset);
-
-        T data0 = src[src_index_0], data1 = src[src_index_1];
-
-        dst[dst_index_0] = data1;
-        dst[dst_index_1] = data0;
-    }
-}
-
-__kernel void arithm_flip_rows_cols(__global T * src, int src_step, int src_offset,
-                                    __global T * dst, int dst_step, int dst_offset,
-                                    int rows, int cols, int thread_rows, int thread_cols)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < thread_rows)
-    {
-        int src_index_0 = mad24(y,            src_step, x            + src_offset);
-        int dst_index_0 = mad24(rows - y - 1, dst_step, cols - x - 1 + dst_offset);
-
-        int src_index_1 = mad24(rows - y - 1, src_step, cols - x - 1 + src_offset);
-        int dst_index_1 = mad24(y,            dst_step, x            + dst_offset);
-
-        T data0 = src[src_index_0], data1 = src[src_index_1];
-
-        dst[dst_index_0] = data0;
-        dst[dst_index_1] = data1;
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////// flip cols ///////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_flip_cols(__global T * src, int src_step, int src_offset,
-                               __global T * dst, int dst_step, int dst_offset,
-                               int rows, int cols, int thread_rows, int thread_cols)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < thread_cols && y < rows)
-    {
-        int src_index_0 = mad24(y, src_step, x            + src_offset);
-        int dst_index_0 = mad24(y, dst_step, cols - x - 1 + dst_offset);
-
-        int src_index_1 = mad24(y, src_step, cols - x - 1 + src_offset);
-        int dst_index_1 = mad24(y, dst_step, x            + dst_offset);
-
-        T data0 = src[src_index_0], data1 = src[src_index_1];
-        dst[dst_index_1] = data1;
-        dst[dst_index_0] = data0;
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_log.cl b/modules/ocl/src/opencl/arithm_log.cl
deleted file mode 100644
index ba5f32d6d..000000000
--- a/modules/ocl/src/opencl/arithm_log.cl
+++ /dev/null
@@ -1,111 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Wu Zailong, bullet@yeah.net
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////LOG/////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_log_C1(__global srcT *src, __global srcT *dst,
-    int cols1, int rows,
-    int srcOffset1, int dstOffset1,
-    int srcStep1, int dstStep1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < cols1 && y < rows)
-    {
-        int srcIdx = mad24(y, srcStep1, x + srcOffset1);
-        int dstIdx = mad24(y, dstStep1, x + dstOffset1);
-
-        dst[dstIdx] = log(src[srcIdx]);
-    }
-}
-
-__kernel void arithm_log_C2(__global srcT *src, __global srcT *dst,
-    int cols1, int rows,
-    int srcOffset1, int dstOffset1,
-    int srcStep1, int dstStep1)
-{
-    int x1 = get_global_id(0) << 1;
-    int y = get_global_id(1);
-
-    if(x1 < cols1 && y < rows)
-    {
-        int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
-        int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
-
-        dst[dstIdx] =                      log(src[srcIdx]);
-        dst[dstIdx + 1] = x1 + 1 < cols1 ? log(src[srcIdx + 1]) : dst[dstIdx + 1];
-    }
-}
-
-__kernel void arithm_log_C4(__global srcT *src, __global srcT *dst,
-    int cols1, int rows,
-    int srcOffset1, int dstOffset1,
-    int srcStep1, int dstStep1)
-{
-    int x1 = get_global_id(0) << 2;
-    int y = get_global_id(1);
-
-    if(x1 < cols1 && y < rows)
-    {
-        int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
-        int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
-
-        dst[dstIdx] =                      log(src[srcIdx]);
-        dst[dstIdx + 1] = x1 + 1 < cols1 ? log(src[srcIdx + 1]) : dst[dstIdx + 1];
-        dst[dstIdx + 2] = x1 + 2 < cols1 ? log(src[srcIdx + 2]) : dst[dstIdx + 2];
-        dst[dstIdx + 3] = x1 + 3 < cols1 ? log(src[srcIdx + 3]) : dst[dstIdx + 3];
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_minMax.cl b/modules/ocl/src/opencl/arithm_minMax.cl
deleted file mode 100644
index 01db7d064..000000000
--- a/modules/ocl/src/opencl/arithm_minMax.cl
+++ /dev/null
@@ -1,176 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-/**************************************PUBLICFUNC*************************************/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#ifdef DEPTH_5
-#define MIN_VAL (-FLT_MAX)
-#define MAX_VAL FLT_MAX
-#elif defined DEPTH_6
-#define MIN_VAL (-DBL_MAX)
-#define MAX_VAL DBL_MAX
-#endif
-
-/**************************************Array minMax**************************************/
-
-__kernel void arithm_op_minMax(__global const T * src, __global T * dst,
-    int cols, int invalid_cols, int offset, int elemnum, int groupnum)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    int id = get_global_id(0);
-
-    int idx = offset + id + (id / cols) * invalid_cols;
-
-    __local T localmem_max[128], localmem_min[128];
-    T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
-
-    for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
-    {
-        idx = offset + id + (id / cols) * invalid_cols;
-        temp = src[idx];
-        minval = min(minval, temp);
-        maxval = max(maxval, temp);
-    }
-
-    if (lid > 127)
-    {
-        localmem_min[lid - 128] = minval;
-        localmem_max[lid - 128] = maxval;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lid < 128)
-    {
-        localmem_min[lid] = min(minval, localmem_min[lid]);
-        localmem_max[lid] = max(maxval, localmem_max[lid]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for (int lsize = 64; lsize > 0; lsize >>= 1)
-    {
-        if (lid < lsize)
-        {
-            int lid2 = lsize + lid;
-            localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
-            localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (lid == 0)
-    {
-        dst[gid] = localmem_min[0];
-        dst[gid + groupnum] = localmem_max[0];
-    }
-}
-
-__kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst,
-    int cols, int invalid_cols, int offset,
-    int elemnum, int groupnum,
-    const __global uchar * mask, int minvalid_cols, int moffset)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    int id = get_global_id(0);
-
-    int idx = offset + id + (id / cols) * invalid_cols;
-    int midx = moffset + id + (id / cols) * minvalid_cols;
-
-    __local T localmem_max[128], localmem_min[128];
-    T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
-
-    for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
-    {
-        idx = offset + id + (id / cols) * invalid_cols;
-        midx = moffset + id + (id / cols) * minvalid_cols;
-
-        if (mask[midx])
-        {
-            temp = src[idx];
-            minval = min(minval, temp);
-            maxval = max(maxval, temp);
-        }
-    }
-
-    if (lid > 127)
-    {
-        localmem_min[lid - 128] = minval;
-        localmem_max[lid - 128] = maxval;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lid < 128)
-    {
-        localmem_min[lid] = min(minval, localmem_min[lid]);
-        localmem_max[lid] = max(maxval, localmem_max[lid]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for (int lsize = 64; lsize > 0; lsize >>= 1)
-    {
-        if (lid < lsize)
-        {
-            int lid2 = lsize + lid;
-            localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
-            localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (lid == 0)
-    {
-        dst[gid] = localmem_min[0];
-        dst[gid + groupnum] = localmem_max[0];
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_minMaxLoc.cl b/modules/ocl/src/opencl/arithm_minMaxLoc.cl
deleted file mode 100644
index 1aac6c1f6..000000000
--- a/modules/ocl/src/opencl/arithm_minMaxLoc.cl
+++ /dev/null
@@ -1,258 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan, yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-/**************************************PUBLICFUNC*************************************/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define RES_TYPE double4
-#define CONVERT_RES_TYPE convert_double4
-#else
-#define RES_TYPE float4
-#define CONVERT_RES_TYPE convert_float4
-#endif
-
-#if defined (DEPTH_0)
-#define VEC_TYPE uchar4
-#define VEC_TYPE_LOC int4
-#define CONVERT_TYPE convert_uchar4
-#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
-#define MIN_VAL 0
-#define MAX_VAL 255
-#endif
-#if defined (DEPTH_1)
-#define VEC_TYPE char4
-#define VEC_TYPE_LOC int4
-#define CONVERT_TYPE convert_char4
-#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
-#define MIN_VAL -128
-#define MAX_VAL 127
-#endif
-#if defined (DEPTH_2)
-#define VEC_TYPE ushort4
-#define VEC_TYPE_LOC int4
-#define CONVERT_TYPE convert_ushort4
-#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
-#define MIN_VAL 0
-#define MAX_VAL 65535
-#endif
-#if defined (DEPTH_3)
-#define VEC_TYPE short4
-#define VEC_TYPE_LOC int4
-#define CONVERT_TYPE convert_short4
-#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
-#define MIN_VAL -32768
-#define MAX_VAL 32767
-#endif
-#if defined (DEPTH_4)
-#define VEC_TYPE int4
-#define VEC_TYPE_LOC int4
-#define CONVERT_TYPE convert_int4
-#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
-#define MIN_VAL INT_MIN
-#define MAX_VAL INT_MAX
-#endif
-#if defined (DEPTH_5)
-#define VEC_TYPE float4
-#define VEC_TYPE_LOC float4
-#define CONVERT_TYPE convert_float4
-#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
-#define MIN_VAL (-FLT_MAX)
-#define MAX_VAL FLT_MAX
-#endif
-#if defined (DEPTH_6)
-#define VEC_TYPE double4
-#define VEC_TYPE_LOC double4
-#define CONVERT_TYPE convert_double4
-#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
-#define MIN_VAL (-DBL_MAX)
-#define MAX_VAL DBL_MAX
-#endif
-
-#if defined (REPEAT_S0)
-#define repeat_s(a) a=a;
-#endif
-#if defined (REPEAT_S1)
-#define repeat_s(a) a.s0 = a.s1;
-#endif
-#if defined (REPEAT_S2)
-#define repeat_s(a) a.s0 = a.s2;a.s1 = a.s2;
-#endif
-#if defined (REPEAT_S3)
-#define repeat_s(a) a.s0 = a.s3;a.s1 = a.s3;a.s2 = a.s3;
-#endif
-
-
-#if defined (REPEAT_E0)
-#define repeat_e(a) a=a;
-#endif
-#if defined (REPEAT_E1)
-#define repeat_e(a) a.s3 = a.s2;
-#endif
-#if defined (REPEAT_E2)
-#define repeat_e(a) a.s3 = a.s1;a.s2 = a.s1;
-#endif
-#if defined (REPEAT_E3)
-#define repeat_e(a) a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0;
-#endif
-
-/**************************************Array minMax**************************************/
-
-__kernel void arithm_op_minMaxLoc(int cols, int invalid_cols, int offset, int elemnum, int groupnum,
-                                  __global VEC_TYPE *src, __global RES_TYPE *dst)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    int  id = get_global_id(0);
-    int idx = offset + id + (id / cols) * invalid_cols;
-
-    __local VEC_TYPE localmem_max[128], localmem_min[128];
-    VEC_TYPE minval, maxval, temp;
-
-    __local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128];
-    VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1;
-
-    int idx_c;
-
-    if (id < elemnum)
-    {
-        temp = src[idx];
-        idx_c = idx << 2;
-        temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3);
-
-        if (id % cols == 0 )
-        {
-            repeat_s(temp);
-            repeat_s(temploc);
-        }
-        if (id % cols == cols - 1)
-        {
-            repeat_e(temp);
-            repeat_e(temploc);
-        }
-        minval = temp;
-        maxval = temp;
-        minloc = temploc;
-        maxloc = temploc;
-    }
-    else
-    {
-        minval = MAX_VAL;
-        maxval = MIN_VAL;
-        minloc = negative;
-        maxloc = negative;
-    }
-
-    int grainSize = (groupnum << 8);
-    for (id = id + grainSize; id < elemnum; id = id + grainSize)
-    {
-        idx = offset + id + (id / cols) * invalid_cols;
-        temp = src[idx];
-        idx_c = idx << 2;
-        temploc = (VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3);
-
-        if (id % cols == 0 )
-        {
-            repeat_s(temp);
-            repeat_s(temploc);
-        }
-        if (id % cols == cols - 1)
-        {
-            repeat_e(temp);
-            repeat_e(temploc);
-        }
-
-        minval = min(minval, temp);
-        maxval = max(maxval, temp);
-        minloc = CONDITION_FUNC(minval == temp, temploc, minloc);
-        maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc);
-    }
-
-    if (lid > 127)
-    {
-        localmem_min[lid - 128] = minval;
-        localmem_max[lid - 128] = maxval;
-        localmem_minloc[lid - 128] = minloc;
-        localmem_maxloc[lid - 128] = maxloc;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lid < 128)
-    {
-        localmem_min[lid] = min(minval,localmem_min[lid]);
-        localmem_max[lid] = max(maxval,localmem_max[lid]);
-        VEC_TYPE minVal = localmem_min[lid], maxVal = localmem_max[lid];
-        localmem_minloc[lid] = CONDITION_FUNC(minVal == minval, minloc, localmem_minloc[lid]);
-        localmem_maxloc[lid] = CONDITION_FUNC(maxVal == maxval, maxloc, localmem_maxloc[lid]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for (int lsize = 64; lsize > 0; lsize >>= 1)
-    {
-       if (lid < lsize)
-       {
-            int lid2 = lsize + lid;
-            localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
-            localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
-            VEC_TYPE min1 = localmem_min[lid], min2 = localmem_min[lid2];
-            localmem_minloc[lid] = CONDITION_FUNC(min1 == min2, localmem_minloc[lid2], localmem_minloc[lid]);
-            VEC_TYPE max1 = localmem_max[lid], max2 = localmem_max[lid2];
-            localmem_maxloc[lid] = CONDITION_FUNC(max1 == max2, localmem_maxloc[lid2], localmem_maxloc[lid]);
-       }
-       barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if ( lid == 0)
-    {
-        dst[gid] = CONVERT_RES_TYPE(localmem_min[0]);
-        dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]);
-        dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]);
-        dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]);
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl b/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl
deleted file mode 100644
index 84b26cae3..000000000
--- a/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl
+++ /dev/null
@@ -1,256 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan, yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-/**************************************PUBLICFUNC*************************************/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define RES_TYPE double4
-#define CONVERT_RES_TYPE convert_double4
-#else
-#define RES_TYPE float4
-#define CONVERT_RES_TYPE convert_float4
-#endif
-
-#if defined (DEPTH_0)
-#define TYPE uchar
-#define VEC_TYPE uchar4
-#define VEC_TYPE_LOC int4
-#define CONVERT_TYPE convert_uchar4
-#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
-#define MIN_VAL 0
-#define MAX_VAL 255
-#endif
-#if defined (DEPTH_1)
-#define TYPE char
-#define VEC_TYPE char4
-#define VEC_TYPE_LOC int4
-#define CONVERT_TYPE convert_char4
-#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
-#define MIN_VAL -128
-#define MAX_VAL 127
-#endif
-#if defined (DEPTH_2)
-#define TYPE ushort
-#define VEC_TYPE ushort4
-#define VEC_TYPE_LOC int4
-#define CONVERT_TYPE convert_ushort4
-#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
-#define MIN_VAL 0
-#define MAX_VAL 65535
-#endif
-#if defined (DEPTH_3)
-#define TYPE short
-#define VEC_TYPE short4
-#define VEC_TYPE_LOC int4
-#define CONVERT_TYPE convert_short4
-#define CONDITION_FUNC(a,b,c) (convert_int4(a) ? b : c)
-#define MIN_VAL -32768
-#define MAX_VAL 32767
-#endif
-#if defined (DEPTH_4)
-#define TYPE int
-#define VEC_TYPE int4
-#define VEC_TYPE_LOC int4
-#define CONVERT_TYPE convert_int4
-#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
-#define MIN_VAL INT_MIN
-#define MAX_VAL INT_MAX
-#endif
-#if defined (DEPTH_5)
-#define TYPE float
-#define VEC_TYPE float4
-#define VEC_TYPE_LOC float4
-#define CONVERT_TYPE convert_float4
-#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
-#define MIN_VAL (-FLT_MAX)
-#define MAX_VAL FLT_MAX
-#endif
-#if defined (DEPTH_6)
-#define TYPE double
-#define VEC_TYPE double4
-#define VEC_TYPE_LOC double4
-#define CONVERT_TYPE convert_double4
-#define CONDITION_FUNC(a,b,c) ((a) ? b : c)
-#define MIN_VAL (-DBL_MAX)
-#define MAX_VAL DBL_MAX
-#endif
-
-#if defined (REPEAT_E0)
-#define repeat_e(a) a=a;
-#endif
-#if defined (REPEAT_E1)
-#define repeat_e(a) a.s3 = a.s2;
-#endif
-#if defined (REPEAT_E2)
-#define repeat_e(a) a.s3 = a.s1;a.s2 = a.s1;
-#endif
-#if defined (REPEAT_E3)
-#define repeat_e(a) a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0;
-#endif
-
-#if defined (REPEAT_E0)
-#define repeat_me(a) a = a;
-#endif
-#if defined (REPEAT_E1)
-#define repeat_me(a) a.s3 = 0;
-#endif
-#if defined (REPEAT_E2)
-#define repeat_me(a) a.s3 = 0;a.s2 = 0;
-#endif
-#if defined (REPEAT_E3)
-#define repeat_me(a) a.s3 = 0;a.s2 = 0;a.s1 = 0;
-#endif
-
-/**************************************Array minMaxLoc mask**************************************/
-__kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global TYPE *src,
-                                        int minvalid_cols,int moffset,__global uchar *mask,__global RES_TYPE  *dst)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    int  id = get_global_id(0);
-    int idx = id + (id / cols) * invalid_cols;
-    int midx = id + (id / cols) * minvalid_cols;
-
-    __local VEC_TYPE lm_max[128],lm_min[128];
-    VEC_TYPE minval, maxval, temp, m_temp, zeroVal = (VEC_TYPE)(0);
-    __local VEC_TYPE_LOC lm_maxloc[128], lm_minloc[128];
-    VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1, one = 1, zero = 0;
-
-    if(id < elemnum)
-    {
-        temp = vload4(idx, &src[offset]);
-        m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset]));
-        int idx_c = (idx << 2) + offset;
-        temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
-        if (id % cols == cols - 1)
-        {
-            repeat_me(m_temp);
-            repeat_e(temploc);
-        }
-        minval = m_temp != zeroVal ? temp : (VEC_TYPE)MAX_VAL;
-        maxval = m_temp != zeroVal ? temp : (VEC_TYPE)MIN_VAL;
-        minloc = CONDITION_FUNC(m_temp != zeroVal, temploc , negative);
-        maxloc = minloc;
-    }
-    else
-    {
-        minval = MAX_VAL;
-        maxval = MIN_VAL;
-        minloc = negative;
-        maxloc = negative;
-    }
-
-    for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
-    {
-        idx = id + (id / cols) * invalid_cols;
-        midx = id + (id / cols) * minvalid_cols;
-        temp = vload4(idx, &src[offset]);
-        m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset]));
-        int idx_c = (idx << 2) + offset;
-        temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
-        if (id % cols == cols - 1)
-        {
-            repeat_me(m_temp);
-            repeat_e(temploc);
-        }
-        minval = min(minval, m_temp != zeroVal ? temp : minval);
-        maxval = max(maxval, m_temp != zeroVal ? temp : maxval);
-
-        minloc = CONDITION_FUNC(minval == temp && m_temp != zeroVal, temploc , minloc);
-        maxloc = CONDITION_FUNC(maxval == temp && m_temp != zeroVal, temploc , maxloc);
-    }
-
-    if(lid > 127)
-    {
-        lm_min[lid - 128] = minval;
-        lm_max[lid - 128] = maxval;
-        lm_minloc[lid - 128] = minloc;
-        lm_maxloc[lid - 128] = maxloc;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(lid < 128)
-    {
-        lm_min[lid] = min(minval, lm_min[lid]);
-        lm_max[lid] = max(maxval, lm_max[lid]);
-        VEC_TYPE con_min = CONVERT_TYPE(minloc != negative ? one : zero);
-        VEC_TYPE con_max = CONVERT_TYPE(maxloc != negative ? one : zero);
-        VEC_TYPE lmMinVal = lm_min[lid], lmMaxVal = lm_max[lid];
-        lm_minloc[lid] = CONDITION_FUNC(lmMinVal == minval && con_min != zeroVal, minloc , lm_minloc[lid]);
-        lm_maxloc[lid] = CONDITION_FUNC(lmMaxVal == maxval && con_max != zeroVal, maxloc , lm_maxloc[lid]);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for(int lsize = 64; lsize > 0; lsize >>= 1)
-    {
-        if(lid < lsize)
-        {
-            int lid2 = lsize + lid;
-            lm_min[lid] = min(lm_min[lid], lm_min[lid2]);
-            lm_max[lid] = max(lm_max[lid], lm_max[lid2]);
-            VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero);
-            VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? one : zero);
-
-            VEC_TYPE lmMinVal1 = lm_min[lid], lmMinVal2 = lm_min[lid2];
-            VEC_TYPE lmMaxVal1 = lm_max[lid], lmMaxVal2 = lm_max[lid2];
-            lm_minloc[lid] = CONDITION_FUNC(lmMinVal1 == lmMinVal2 && con_min != zeroVal, lm_minloc[lid2] , lm_minloc[lid]);
-            lm_maxloc[lid] = CONDITION_FUNC(lmMaxVal1 == lmMaxVal2 && con_max != zeroVal, lm_maxloc[lid2] , lm_maxloc[lid]);
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if( lid == 0)
-    {
-        dst[gid] = CONVERT_RES_TYPE(lm_min[0]);
-        dst[gid + groupnum] = CONVERT_RES_TYPE(lm_max[0]);
-        dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(lm_minloc[0]);
-        dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]);
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_nonzero.cl b/modules/ocl/src/opencl/arithm_nonzero.cl
deleted file mode 100644
index 3180c26e8..000000000
--- a/modules/ocl/src/opencl/arithm_nonzero.cl
+++ /dev/null
@@ -1,93 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-/**************************************Count NonZero**************************************/
-
-__kernel void arithm_op_nonzero(int cols, int invalid_cols, int offset, int elemnum, int groupnum,
-                                  __global srcT *src, __global dstT *dst)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    int  id = get_global_id(0);
-
-    int idx = offset + id + (id / cols) * invalid_cols;
-    __local dstT localmem_nonzero[128];
-    dstT nonzero = (dstT)(0);
-    srcT zero = (srcT)(0), one = (srcT)(1);
-
-    for (int grain = groupnum << 8; id < elemnum; id += grain)
-    {
-        idx = offset + id + (id / cols) * invalid_cols;
-        nonzero += src[idx] == zero ? zero : one;
-    }
-
-    if (lid > 127)
-        localmem_nonzero[lid - 128] = nonzero;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lid < 128)
-        localmem_nonzero[lid] = nonzero + localmem_nonzero[lid];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for (int lsize = 64; lsize > 0; lsize >>= 1)
-    {
-        if (lid < lsize)
-        {
-           int lid2 = lsize + lid;
-           localmem_nonzero[lid] = localmem_nonzero[lid] + localmem_nonzero[lid2];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (lid == 0)
-        dst[gid] = localmem_nonzero[0];
-}
diff --git a/modules/ocl/src/opencl/arithm_phase.cl b/modules/ocl/src/opencl/arithm_phase.cl
deleted file mode 100644
index 40346b2cd..000000000
--- a/modules/ocl/src/opencl/arithm_phase.cl
+++ /dev/null
@@ -1,171 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define CV_PI M_PI
-#define CV_2PI (2 * CV_PI)
-#else
-#define CV_PI M_PI_F
-#define CV_2PI (2 * CV_PI)
-#endif
-
-/**************************************phase inradians**************************************/
-
-__kernel void arithm_phase_inradians_D5(__global float *src1, int src1_step1, int src1_offset1,
-                                         __global float *src2, int src2_step1, int src2_offset1,
-                                         __global float *dst,  int dst_step1,  int dst_offset1,
-                                         int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step1, x + src1_offset1);
-        int src2_index = mad24(y, src2_step1, x + src2_offset1);
-        int dst_index  = mad24(y, dst_step1, x + dst_offset1);
-
-        float data1 = src1[src1_index];
-        float data2 = src2[src2_index];
-        float tmp = atan2(data2, data1);
-
-        if (tmp < 0)
-            tmp += CV_2PI;
-
-        dst[dst_index] = tmp;
-    }
-}
-
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_phase_inradians_D6(__global double *src1, int src1_step1, int src1_offset1,
-                                         __global double *src2, int src2_step1, int src2_offset1,
-                                         __global double *dst,  int dst_step1,  int dst_offset1,
-                                         int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step1, x + src1_offset1);
-        int src2_index = mad24(y, src2_step1, x + src2_offset1);
-        int dst_index  = mad24(y, dst_step1, x + dst_offset1);
-
-        double data1 = src1[src1_index];
-        double data2 = src2[src2_index];
-        double tmp = atan2(data2, data1);
-
-        if (tmp < 0)
-            tmp += CV_2PI;
-
-        dst[dst_index] = tmp;
-    }
-}
-
-#endif
-
-/**************************************phase indegrees**************************************/
-
-__kernel void arithm_phase_indegrees_D5(__global float *src1, int src1_step1, int src1_offset1,
-                                         __global float *src2, int src2_step1, int src2_offset1,
-                                         __global float *dst,  int dst_step1,  int dst_offset1,
-                                         int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step1, x + src1_offset1);
-        int src2_index = mad24(y, src2_step1, x + src2_offset1);
-        int dst_index  = mad24(y, dst_step1, x + dst_offset1);
-
-        float data1 = src1[src1_index];
-        float data2 = src2[src2_index];
-        float tmp = atan2(data2, data1);
-        tmp = 180 * tmp / CV_PI;
-
-        if (tmp < 0)
-            tmp += 360;
-
-        dst[dst_index] = tmp;
-    }
-}
-
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_phase_indegrees_D6 (__global double *src1, int src1_step1, int src1_offset1,
-                                         __global double *src2, int src2_step1, int src2_offset1,
-                                         __global double *dst,  int dst_step1,  int dst_offset1,
-                                         int cols, int rows)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step1, x + src1_offset1);
-        int src2_index = mad24(y, src2_step1, x + src2_offset1);
-        int dst_index  = mad24(y, dst_step1, x + dst_offset1);
-
-        double data1 = src1[src1_index];
-        double data2 = src2[src2_index];
-        double tmp = atan2(data2, data1);
-
-        tmp = 180 * tmp / CV_PI;
-        if (tmp < 0)
-            tmp += 360;
-
-        dst[dst_index] = tmp;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_polarToCart.cl b/modules/ocl/src/opencl/arithm_polarToCart.cl
deleted file mode 100644
index 024f1f0ee..000000000
--- a/modules/ocl/src/opencl/arithm_polarToCart.cl
+++ /dev/null
@@ -1,197 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define CV_PI M_PI
-#else
-#define CV_PI M_PI_F
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////polarToCart with magnitude//////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, int src1_offset,//magnitue
-                                         __global float *src2, int src2_step, int src2_offset,//angle
-                                         __global float *dst1, int dst1_step, int dst1_offset,
-                                         __global float *dst2, int dst2_step, int dst2_offset,
-                                         int rows, int cols)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int src2_index = mad24(y, src2_step, x + src2_offset);
-
-        int dst1_index = mad24(y, dst1_step, x + dst1_offset);
-        int dst2_index = mad24(y, dst2_step, x + dst2_offset);
-
-        float x = src1[src1_index];
-        float y = src2[src2_index];
-
-#ifdef DEGREE
-        float ascale = CV_PI/180.0f;
-        float alpha = y * ascale;
-#else
-        float alpha = y;
-#endif
-        float a = cos(alpha) * x;
-        float b = sin(alpha) * x;
-
-        dst1[dst1_index] = a;
-        dst2[dst2_index] = b;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_polarToCart_mag_D6 (__global double *src1, int src1_step, int src1_offset,//magnitue
-                                         __global double *src2, int src2_step, int src2_offset,//angle
-                                         __global double *dst1, int dst1_step, int dst1_offset,
-                                         __global double *dst2, int dst2_step, int dst2_offset,
-                                         int rows, int cols)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, x + src1_offset);
-        int src2_index = mad24(y, src2_step, x + src2_offset);
-
-        int dst1_index = mad24(y, dst1_step, x + dst1_offset);
-        int dst2_index = mad24(y, dst2_step, x + dst2_offset);
-
-        double x = src1[src1_index];
-        double y = src2[src2_index];
-
-#ifdef DEGREE
-        float ascale = CV_PI/180.0;
-        float alpha = y * ascale;
-#else
-        float alpha = y;
-#endif
-        double a = cos(alpha) * x;
-        double b = sin(alpha) * x;
-
-        dst1[dst1_index] = a;
-        dst2[dst2_index] = b;
-    }
-}
-#endif
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////polarToCart without magnitude//////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_polarToCart_D5 (__global float *src,  int src_step,  int src_offset,//angle
-                                     __global float *dst1, int dst1_step, int dst1_offset,
-                                     __global float *dst2, int dst2_step, int dst2_offset,
-                                     int rows, int cols)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src_index  = mad24(y, src_step,  x + src_offset);
-
-        int dst1_index = mad24(y, dst1_step, x + dst1_offset);
-        int dst2_index = mad24(y, dst2_step, x + dst2_offset);
-
-        float y = src[src_index];
-
-#ifdef DEGREE
-        float ascale = CV_PI/180.0f;
-        float alpha = y * ascale;
-#else
-        float alpha = y;
-#endif
-        float a = cos(alpha);
-        float b = sin(alpha);
-
-        dst1[dst1_index] = a;
-        dst2[dst2_index] = b;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_polarToCart_D6 (__global float *src,  int src_step,  int src_offset,//angle
-                                     __global float *dst1, int dst1_step, int dst1_offset,
-                                     __global float *dst2, int dst2_step, int dst2_offset,
-                                     int rows, int cols)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src_index  = mad24(y, src_step,  x + src_offset);
-
-        int dst1_index = mad24(y, dst1_step, x + dst1_offset);
-        int dst2_index = mad24(y, dst2_step, x + dst2_offset);
-
-        double y = src[src_index];
-
-#ifdef DEGREE
-        float ascale = CV_PI/180.0f;
-        float alpha = y * ascale;
-#else
-        float alpha = y;
-#endif
-        double a = cos(alpha);
-        double b = sin(alpha);
-
-        dst1[dst1_index] = a;
-        dst2[dst2_index] = b;
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/arithm_setidentity.cl b/modules/ocl/src/opencl/arithm_setidentity.cl
deleted file mode 100644
index 0ead5b003..000000000
--- a/modules/ocl/src/opencl/arithm_setidentity.cl
+++ /dev/null
@@ -1,69 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jin Ma jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-__kernel void setIdentity(__global T * src, int src_step, int src_offset,
-    int cols, int rows, __global const T * scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src_index = mad24(y, src_step, src_offset + x);
-
-        if (x == y)
-            src[src_index] = *scalar;
-        else
-            src[src_index] = 0;
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_sqrt.cl b/modules/ocl/src/opencl/arithm_sqrt.cl
deleted file mode 100644
index 142a52ada..000000000
--- a/modules/ocl/src/opencl/arithm_sqrt.cl
+++ /dev/null
@@ -1,111 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peter Andreas Entschev, peter@entschev.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////////LOG/////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void arithm_sqrt_C1(__global srcT *src, __global srcT *dst,
-    int cols1, int rows,
-    int srcOffset1, int dstOffset1,
-    int srcStep1, int dstStep1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < cols1 && y < rows)
-    {
-        int srcIdx = mad24(y, srcStep1, x + srcOffset1);
-        int dstIdx = mad24(y, dstStep1, x + dstOffset1);
-
-        dst[dstIdx] = sqrt(src[srcIdx]);
-    }
-}
-
-__kernel void arithm_sqrt_C2(__global srcT *src, __global srcT *dst,
-    int cols1, int rows,
-    int srcOffset1, int dstOffset1,
-    int srcStep1, int dstStep1)
-{
-    int x1 = get_global_id(0) << 1;
-    int y = get_global_id(1);
-
-    if(x1 < cols1 && y < rows)
-    {
-        int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
-        int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
-
-        dst[dstIdx] =                      sqrt(src[srcIdx]);
-        dst[dstIdx + 1] = x1 + 1 < cols1 ? sqrt(src[srcIdx + 1]) : dst[dstIdx + 1];
-    }
-}
-
-__kernel void arithm_sqrt_C4(__global srcT *src, __global srcT *dst,
-    int cols1, int rows,
-    int srcOffset1, int dstOffset1,
-    int srcStep1, int dstStep1)
-{
-    int x1 = get_global_id(0) << 2;
-    int y = get_global_id(1);
-
-    if(x1 < cols1 && y < rows)
-    {
-        int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
-        int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
-
-        dst[dstIdx] =                      sqrt(src[srcIdx]);
-        dst[dstIdx + 1] = x1 + 1 < cols1 ? sqrt(src[srcIdx + 1]) : dst[dstIdx + 1];
-        dst[dstIdx + 2] = x1 + 2 < cols1 ? sqrt(src[srcIdx + 2]) : dst[dstIdx + 2];
-        dst[dstIdx + 3] = x1 + 3 < cols1 ? sqrt(src[srcIdx + 3]) : dst[dstIdx + 3];
-    }
-}
diff --git a/modules/ocl/src/opencl/arithm_sum.cl b/modules/ocl/src/opencl/arithm_sum.cl
deleted file mode 100644
index 514cf2a7f..000000000
--- a/modules/ocl/src/opencl/arithm_sum.cl
+++ /dev/null
@@ -1,104 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#if FUNC_SUM
-#define FUNC(a, b) b += a;
-#elif FUNC_ABS_SUM
-#define FUNC(a, b) b += a >= (dstT)(0) ? a : -a;
-#elif FUNC_SQR_SUM
-#define FUNC(a, b) b += a * a;
-#else
-#error No sum function
-#endif
-
-/**************************************Array buffer SUM**************************************/
-
-__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum,
-                                __global srcT *src, __global dstT *dst)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    int id = get_global_id(0);
-    int idx = offset + id + (id / cols) * invalid_cols;
-
-    __local dstT localmem_sum[128];
-    dstT sum = (dstT)(0), temp;
-
-    for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
-    {
-        idx = offset + id + (id / cols) * invalid_cols;
-        temp = convertToDstT(src[idx]);
-        FUNC(temp, sum);
-    }
-
-    if (lid > 127)
-        localmem_sum[lid - 128] = sum;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (lid < 128)
-        localmem_sum[lid] = sum + localmem_sum[lid];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for (int lsize = 64; lsize > 0; lsize >>= 1)
-    {
-        if (lid < lsize)
-        {
-            int lid2 = lsize + lid;
-            localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (lid == 0)
-        dst[gid] = localmem_sum[0];
-}
diff --git a/modules/ocl/src/opencl/arithm_transpose.cl b/modules/ocl/src/opencl/arithm_transpose.cl
deleted file mode 100644
index 8cde6544e..000000000
--- a/modules/ocl/src/opencl/arithm_transpose.cl
+++ /dev/null
@@ -1,139 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#define TILE_DIM      32
-#define BLOCK_ROWS    8
-#define LDS_STEP      TILE_DIM
-
-__kernel void transpose(__global const T* src, __global T* dst,
-    int src_cols, int src_rows,
-    int src_step, int dst_step,
-    int src_offset, int dst_offset)
-{
-    int gp_x = get_group_id(0),   gp_y = get_group_id(1);
-    int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
-
-    int groupId_x, groupId_y;
-
-    if(src_rows == src_cols)
-    {
-        groupId_y = gp_x;
-        groupId_x = (gp_x + gp_y) % gs_x;
-    }
-    else
-    {
-        int bid = gp_x + gs_x * gp_y;
-        groupId_y =  bid % gs_y;
-        groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
-    }
-
-    int lx = get_local_id(0);
-    int ly = get_local_id(1);
-
-    int x = groupId_x * TILE_DIM + lx;
-    int y = groupId_y * TILE_DIM + ly;
-
-    int x_index = groupId_y * TILE_DIM + lx;
-    int y_index = groupId_x * TILE_DIM + ly;
-
-    __local T title[TILE_DIM * LDS_STEP];
-
-    if (x < src_cols && y < src_rows)
-    {
-        int index_src = mad24(y, src_step, x);
-
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if (y + i < src_rows)
-            {
-                title[(ly + i) * LDS_STEP + lx] = src[src_offset + index_src];
-                index_src = mad24(BLOCK_ROWS, src_step, index_src);
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (x_index < src_rows && y_index < src_cols)
-    {
-        int index_dst = mad24(y_index, dst_step, x_index);
-
-        for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        {
-            if ((y_index + i) < src_cols)
-            {
-                dst[dst_offset + index_dst] = title[lx * LDS_STEP + ly + i];
-                index_dst +=  dst_step * BLOCK_ROWS;
-            }
-        }
-    }
-}
-
-__kernel void transpose_inplace(__global T* src, __global T* dst,
-    int src_cols, int src_rows,
-    int src_step, int dst_step,
-    int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < src_rows && x < y)
-    {
-        int srcIdx = mad24(y, src_step, src_offset + x);
-        int dstIdx = mad24(x, dst_step, dst_offset + y);
-
-        T tmp = dst[dstIdx];
-        dst[dstIdx] = src[srcIdx];
-        src[srcIdx] = tmp;
-    }
-}
diff --git a/modules/ocl/src/opencl/bgfg_mog.cl b/modules/ocl/src/opencl/bgfg_mog.cl
deleted file mode 100644
index 06e18c213..000000000
--- a/modules/ocl/src/opencl/bgfg_mog.cl
+++ /dev/null
@@ -1,540 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jin Ma jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (CN1)
-#define T_FRAME uchar
-#define T_MEAN_VAR float
-#define CONVERT_TYPE convert_uchar_sat
-#define F_ZERO (0.0f)
-inline float cvt(uchar val)
-{
-    return val;
-}
-
-inline float sqr(float val)
-{
-    return val * val;
-}
-
-inline float sum(float val)
-{
-    return val;
-}
-
-static float clamp1(float var, float learningRate, float diff, float minVar)
-{
-    return fmax(var + learningRate * (diff * diff - var), minVar);
-}
-
-#else
-
-#define T_FRAME uchar4
-#define T_MEAN_VAR float4
-#define CONVERT_TYPE convert_uchar4_sat
-#define F_ZERO (0.0f, 0.0f, 0.0f, 0.0f)
-
-inline float4 cvt(const uchar4 val)
-{
-    float4 result;
-    result.x = val.x;
-    result.y = val.y;
-    result.z = val.z;
-    result.w = val.w;
-
-    return result;
-}
-
-inline float sqr(const float4 val)
-{
-    return val.x * val.x + val.y * val.y + val.z * val.z;
-}
-
-inline float sum(const float4 val)
-{
-    return (val.x + val.y + val.z);
-}
-
-static void swap4(__global float4* ptr, int x, int y, int k, int rows, int ptr_step)
-{
-    float4 val = ptr[(k * rows + y) * ptr_step + x];
-    ptr[(k * rows + y) * ptr_step + x] = ptr[((k + 1) * rows + y) * ptr_step + x];
-    ptr[((k + 1) * rows + y) * ptr_step + x] = val;
-}
-
-
-static float4 clamp1(const float4 var, float learningRate, const float4 diff, float minVar)
-{
-    float4 result;
-    result.x = fmax(var.x + learningRate * (diff.x * diff.x - var.x), minVar);
-    result.y = fmax(var.y + learningRate * (diff.y * diff.y - var.y), minVar);
-    result.z = fmax(var.z + learningRate * (diff.z * diff.z - var.z), minVar);
-    result.w = 0.0f;
-    return result;
-}
-
-#endif
-
-typedef struct
-{
-    float c_Tb;
-    float c_TB;
-    float c_Tg;
-    float c_varInit;
-    float c_varMin;
-    float c_varMax;
-    float c_tau;
-    uchar c_shadowVal;
-} con_srtuct_t;
-
-static void swap(__global float* ptr, int x, int y, int k, int rows, int ptr_step)
-{
-    float val = ptr[(k * rows + y) * ptr_step + x];
-    ptr[(k * rows + y) * ptr_step + x] = ptr[((k + 1) * rows + y) * ptr_step + x];
-    ptr[((k + 1) * rows + y) * ptr_step + x] = val;
-}
-
-__kernel void mog_withoutLearning_kernel(__global T_FRAME* frame, __global uchar* fgmask,
-    __global float* weight, __global T_MEAN_VAR* mean, __global T_MEAN_VAR* var,
-    int frame_row, int frame_col, int frame_step, int fgmask_step,
-    int weight_step, int mean_step, int var_step,
-    float varThreshold, float backgroundRatio, int fgmask_offset_x,
-    int fgmask_offset_y, int frame_offset_x, int frame_offset_y)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < frame_col && y < frame_row)
-    {
-        T_MEAN_VAR pix = cvt(frame[(y + frame_offset_y) * frame_step + (x + frame_offset_x)]);
-
-        int kHit = -1;
-        int kForeground = -1;
-
-        for (int k = 0; k < (NMIXTURES); ++k)
-        {
-            if (weight[(k * frame_row + y) * weight_step + x] < 1.192092896e-07f)
-                break;
-
-            T_MEAN_VAR mu = mean[(k * frame_row + y) * mean_step + x];
-            T_MEAN_VAR _var = var[(k * frame_row + y) + var_step + x];
-
-            T_MEAN_VAR diff = pix - mu;
-
-            if (sqr(diff) < varThreshold * sum(_var))
-            {
-                kHit = k;
-                break;
-            }
-        }
-
-        if (kHit >= 0)
-        {
-            float wsum = 0.0f;
-            for (int k = 0; k < (NMIXTURES); ++k)
-            {
-                wsum += weight[(k * frame_row + y) * weight_step + x];
-
-                if (wsum > backgroundRatio)
-                {
-                    kForeground = k + 1;
-                    break;
-                }
-            }
-        }
-        if(kHit < 0 || kHit >= kForeground)
-            fgmask[(y + fgmask_offset_y) * fgmask_step + (x + fgmask_offset_x)] = (uchar) (-1);
-        else
-            fgmask[(y + fgmask_offset_y) * fgmask_step + (x + fgmask_offset_x)] = (uchar) (0);
-    }
-}
-
-__kernel void mog_withLearning_kernel(__global T_FRAME* frame, __global int* fgmask,
-    __global float* weight, __global float* sortKey, __global T_MEAN_VAR* mean,
-    __global T_MEAN_VAR* var, int frame_row, int frame_col, int frame_step, int fgmask_step,
-    int weight_step, int sortKey_step, int mean_step, int var_step,
-    float varThreshold, float backgroundRatio, float learningRate, float minVar,
-    int fgmask_offset_x, int fgmask_offset_y, int frame_offset_x, int frame_offset_y)
-{
-    const float w0 = 0.05f;
-    const float sk0 = w0 / 30.0f;
-    const float var0 = 900.f;
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x >= frame_col || y >= frame_row) return;
-    float wsum = 0.0f;
-    int kHit = -1;
-    int kForeground = -1;
-    int k = 0;
-
-    T_MEAN_VAR pix = cvt(frame[(y + frame_offset_y) * frame_step + (x + frame_offset_x)]);
-
-    for (; k < (NMIXTURES); ++k)
-    {
-        float w = weight[(k * frame_row + y) * weight_step + x];
-        wsum += w;
-
-        if (w < 1.192092896e-07f)
-            break;
-
-        T_MEAN_VAR mu = mean[(k * frame_row + y) * mean_step + x];
-        T_MEAN_VAR _var = var[(k * frame_row + y) * var_step + x];
-
-        float sortKey_prev, weight_prev;
-        T_MEAN_VAR mean_prev, var_prev;
-        if (sqr(pix - mu) < varThreshold * sum(_var))
-        {
-            wsum -= w;
-            float dw = learningRate * (1.0f - w);
-
-            _var = clamp1(_var, learningRate, pix - mu, minVar);
-
-            sortKey_prev = w / sqr(sum(_var));
-            sortKey[(k * frame_row + y) * sortKey_step + x] = sortKey_prev;
-
-            weight_prev = w + dw;
-            weight[(k * frame_row + y) * weight_step + x] = weight_prev;
-
-            mean_prev = mu + learningRate * (pix - mu);
-            mean[(k * frame_row + y) * mean_step + x] = mean_prev;
-
-            var_prev = _var;
-            var[(k * frame_row + y) * var_step + x] = var_prev;
-        }
-
-        int k1 = k - 1;
-
-        if (k1 >= 0 && sqr(pix - mu) < varThreshold * sum(_var))
-        {
-            float sortKey_next = sortKey[(k1 * frame_row + y) * sortKey_step + x];
-            float weight_next = weight[(k1 * frame_row + y) * weight_step + x];
-            T_MEAN_VAR mean_next = mean[(k1 * frame_row + y) * mean_step + x];
-            T_MEAN_VAR var_next = var[(k1 * frame_row + y) * var_step + x];
-
-            for (; sortKey_next < sortKey_prev && k1 >= 0; --k1)
-            {
-                sortKey[(k1 * frame_row + y) * sortKey_step + x] = sortKey_prev;
-                sortKey[((k1 + 1) * frame_row + y) * sortKey_step + x] = sortKey_next;
-
-                weight[(k1 * frame_row + y) * weight_step + x] = weight_prev;
-                weight[((k1 + 1) * frame_row + y) * weight_step + x] = weight_next;
-
-                mean[(k1 * frame_row + y) * mean_step + x] = mean_prev;
-                mean[((k1 + 1) * frame_row + y) * mean_step + x] = mean_next;
-
-                var[(k1 * frame_row + y) * var_step + x] = var_prev;
-                var[((k1 + 1) * frame_row + y) * var_step + x] = var_next;
-
-                sortKey_prev = sortKey_next;
-                sortKey_next = k1 > 0 ? sortKey[((k1 - 1) * frame_row + y) * sortKey_step + x] : 0.0f;
-
-                weight_prev = weight_next;
-                weight_next = k1 > 0 ? weight[((k1 - 1) * frame_row + y) * weight_step + x] : 0.0f;
-
-                mean_prev = mean_next;
-                mean_next = k1 > 0 ? mean[((k1 - 1) * frame_row + y) * mean_step + x] : (T_MEAN_VAR)F_ZERO;
-
-                var_prev = var_next;
-                var_next = k1 > 0 ? var[((k1 - 1) * frame_row + y) * var_step + x] : (T_MEAN_VAR)F_ZERO;
-            }
-        }
-
-        kHit = k1 + 1;
-        break;
-    }
-
-    if (kHit < 0)
-    {
-        kHit = k = k < ((NMIXTURES) - 1) ? k : ((NMIXTURES) - 1);
-        wsum += w0 - weight[(k * frame_row + y) * weight_step + x];
-
-        weight[(k * frame_row + y) * weight_step + x] = w0;
-        mean[(k * frame_row + y) * mean_step + x] = pix;
-#if defined (CN1)
-        var[(k * frame_row + y) * var_step + x] = (T_MEAN_VAR)(var0);
-#else
-        var[(k * frame_row + y) * var_step + x] = (T_MEAN_VAR)(var0, var0, var0, var0);
-#endif
-        sortKey[(k * frame_row + y) * sortKey_step + x] = sk0;
-    }
-    else
-    {
-        for( ; k < (NMIXTURES); k++)
-            wsum += weight[(k * frame_row + y) * weight_step + x];
-    }
-
-    float wscale = 1.0f / wsum;
-    wsum = 0;
-    for (k = 0; k < (NMIXTURES); ++k)
-    {
-        float w = weight[(k * frame_row + y) * weight_step + x];
-        w *= wscale;
-        wsum += w;
-
-        weight[(k * frame_row + y) * weight_step + x] = w;
-        sortKey[(k * frame_row + y) * sortKey_step + x] *= wscale;
-
-        kForeground = select(kForeground, k + 1, wsum > backgroundRatio && kForeground < 0);
-    }
-    fgmask[(y + fgmask_offset_y) * fgmask_step + (x + fgmask_offset_x)] = (uchar)(-(kHit >= kForeground));
-}
-
-
-__kernel void getBackgroundImage_kernel(__global float* weight, __global T_MEAN_VAR* mean, __global T_FRAME* dst,
-    int dst_row, int dst_col, int weight_step, int mean_step, int dst_step,
-    float backgroundRatio)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < dst_col && y < dst_row)
-    {
-        T_MEAN_VAR meanVal = (T_MEAN_VAR)F_ZERO;
-        float totalWeight = 0.0f;
-
-        for (int mode = 0; mode < (NMIXTURES); ++mode)
-        {
-            float _weight = weight[(mode * dst_row + y) * weight_step + x];
-
-            T_MEAN_VAR _mean = mean[(mode * dst_row + y) * mean_step + x];
-            meanVal = meanVal + _weight * _mean;
-
-            totalWeight += _weight;
-
-            if(totalWeight > backgroundRatio)
-                break;
-        }
-        meanVal = meanVal * (1.f / totalWeight);
-        dst[y * dst_step + x] = CONVERT_TYPE(meanVal);
-    }
-}
-
-__kernel void mog2_kernel(__global T_FRAME * frame, __global int* fgmask, __global float* weight, __global T_MEAN_VAR * mean,
-        __global int* modesUsed, __global float* variance, int frame_row, int frame_col, int frame_step,
-        int fgmask_step, int weight_step, int mean_step, int modesUsed_step, int var_step, float alphaT, float alpha1, float prune,
-        int detectShadows_flag, int fgmask_offset_x, int fgmask_offset_y, int frame_offset_x, int frame_offset_y, __constant con_srtuct_t* constants)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < frame_col && y < frame_row)
-    {
-        T_MEAN_VAR pix = cvt(frame[(y + frame_offset_y) * frame_step + x + frame_offset_x]);
-
-        bool background = false; // true - the pixel classified as background
-
-        bool fitsPDF = false; //if it remains zero a new GMM mode will be added
-
-        int nmodes = modesUsed[y * modesUsed_step + x];
-        int nNewModes = nmodes; //current number of modes in GMM
-
-        float totalWeight = 0.0f;
-
-        for (int mode = 0; mode < nmodes; ++mode)
-        {
-            float _weight = alpha1 * weight[(mode * frame_row + y) * weight_step + x] + prune;
-
-            if (!fitsPDF)
-            {
-                float var = variance[(mode * frame_row + y) * var_step + x];
-
-                T_MEAN_VAR _mean = mean[(mode * frame_row + y) * mean_step + x];
-
-                T_MEAN_VAR diff = _mean - pix;
-                float dist2 = sqr(diff);
-
-                if (totalWeight < constants -> c_TB && dist2 < constants -> c_Tb * var)
-                    background = true;
-
-                if (dist2 < constants -> c_Tg * var)
-                {
-                    fitsPDF = true;
-                    _weight += alphaT;
-                    float k = alphaT / _weight;
-                    mean[(mode * frame_row + y) * mean_step + x] = _mean - k * diff;
-                    float varnew = var + k * (dist2 - var);
-                    varnew = fmax(varnew, constants -> c_varMin);
-                    varnew = fmin(varnew, constants -> c_varMax);
-
-                    variance[(mode * frame_row + y) * var_step + x] = varnew;
-                    for (int i = mode; i > 0; --i)
-                    {
-                        if (_weight < weight[((i - 1) * frame_row + y) * weight_step + x])
-                            break;
-                        swap(weight, x, y, i - 1, frame_row, weight_step);
-                        swap(variance, x, y, i - 1, frame_row, var_step);
-                        #if defined (CN1)
-                        swap(mean, x, y, i - 1, frame_row, mean_step);
-                        #else
-                        swap4(mean, x, y, i - 1, frame_row, mean_step);
-                        #endif
-                    }
-                }
-            } // !fitsPDF
-
-            if (_weight < -prune)
-            {
-                _weight = 0.0f;
-                nmodes--;
-            }
-
-            weight[(mode * frame_row + y) * weight_step + x] = _weight; //update weight by the calculated value
-            totalWeight += _weight;
-        }
-
-        totalWeight = 1.f / totalWeight;
-        for (int mode = 0; mode < nmodes; ++mode)
-            weight[(mode * frame_row + y) * weight_step + x] *= totalWeight;
-
-        nmodes = nNewModes;
-
-        if (!fitsPDF)
-        {
-            int mode = nmodes == (NMIXTURES) ? (NMIXTURES) - 1 : nmodes++;
-
-            if (nmodes == 1)
-                weight[(mode * frame_row + y) * weight_step + x] = 1.f;
-            else
-            {
-                weight[(mode * frame_row + y) * weight_step + x] = alphaT;
-
-                for (int i = 0; i < nmodes - 1; ++i)
-                    weight[(i * frame_row + y) * weight_step + x] *= alpha1;
-            }
-
-            mean[(mode * frame_row + y) * mean_step + x] = pix;
-            variance[(mode * frame_row + y) * var_step + x] = constants -> c_varInit;
-
-            for (int i = nmodes - 1; i > 0; --i)
-            {
-                // check one up
-                if (alphaT < weight[((i - 1) * frame_row + y) * weight_step + x])
-                    break;
-
-                swap(weight, x, y, i - 1, frame_row, weight_step);
-                swap(variance, x, y, i - 1, frame_row, var_step);
-                #if defined (CN1)
-                swap(mean, x, y, i - 1, frame_row, mean_step);
-                #else
-                swap4(mean, x, y, i - 1, frame_row, mean_step);
-                #endif
-            }
-        }
-
-        modesUsed[y * modesUsed_step + x] = nmodes;
-
-        bool isShadow = false;
-        if (detectShadows_flag && !background)
-        {
-            float tWeight = 0.0f;
-
-            for (int mode = 0; mode < nmodes; ++mode)
-            {
-                T_MEAN_VAR _mean = mean[(mode * frame_row + y) * mean_step + x];
-
-                T_MEAN_VAR pix_mean = pix * _mean;
-
-                float numerator = sum(pix_mean);
-                float denominator = sqr(_mean);
-
-                if (denominator == 0)
-                    break;
-
-                if (numerator <= denominator && numerator >= constants -> c_tau * denominator)
-                {
-                    float a = numerator / denominator;
-
-                    T_MEAN_VAR dD = a * _mean - pix;
-
-                    if (sqr(dD) < constants -> c_Tb * variance[(mode * frame_row + y) * var_step + x] * a * a)
-                    {
-                        isShadow = true;
-                        break;
-                    }
-                }
-
-                tWeight += weight[(mode * frame_row + y) * weight_step + x];
-                if (tWeight > constants -> c_TB)
-                    break;
-            }
-        }
-
-        fgmask[(y + fgmask_offset_y) * fgmask_step + x + fgmask_offset_x] = background ? 0 : isShadow ? constants -> c_shadowVal : 255;
-    }
-}
-
-__kernel void getBackgroundImage2_kernel(__global int* modesUsed, __global float* weight, __global T_MEAN_VAR* mean,
-    __global T_FRAME* dst, float c_TB, int modesUsed_row, int modesUsed_col, int modesUsed_step, int weight_step,
-    int mean_step, int dst_step, int dst_x, int dst_y)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < modesUsed_col && y < modesUsed_row)
-    {
-        int nmodes = modesUsed[y * modesUsed_step + x];
-
-        T_MEAN_VAR meanVal = (T_MEAN_VAR)F_ZERO;
-
-        float totalWeight = 0.0f;
-
-        for (int mode = 0; mode < nmodes; ++mode)
-        {
-            float _weight = weight[(mode * modesUsed_row + y) * weight_step + x];
-
-            T_MEAN_VAR _mean = mean[(mode * modesUsed_row + y) * mean_step + x];
-            meanVal = meanVal + _weight * _mean;
-
-            totalWeight += _weight;
-
-            if(totalWeight > c_TB)
-                break;
-        }
-
-        meanVal = meanVal * (1.f / totalWeight);
-        dst[(y + dst_y) * dst_step + x + dst_x] = CONVERT_TYPE(meanVal);
-    }
-}
diff --git a/modules/ocl/src/opencl/blend_linear.cl b/modules/ocl/src/opencl/blend_linear.cl
deleted file mode 100644
index bc7aa4685..000000000
--- a/modules/ocl/src/opencl/blend_linear.cl
+++ /dev/null
@@ -1,78 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, MulticoreWare Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Liu Liujun, liujun@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-__kernel void blendLinear(__global const T * src1, int src1_offset, int src1_step,
-                          __global const T * src2, int src2_offset, int src2_step,
-                          __global const float * weight1, int weight1_offset, int weight1_step,
-                          __global const float * weight2, int weight2_offset, int weight2_step,
-                          __global T * dst, int dst_offset, int dst_step,
-                          int rows, int cols)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, src1_offset + x);
-        int src2_index = mad24(y, src2_step, src2_offset + x);
-        int weight1_index = mad24(y, weight1_step, weight1_offset + x);
-        int weight2_index = mad24(y, weight2_step, weight2_offset + x);
-        int dst_index = mad24(y, dst_step, dst_offset + x);
-
-        FT w1 = (FT)(weight1[weight1_index]), w2 = (FT)(weight2[weight2_index]);
-        FT den = w1 + w2 + (FT)(1e-5f);
-        FT num = w1 * convertToFT(src1[src1_index]) + w2 * convertToFT(src2[src2_index]);
-
-        dst[dst_index] = convertToT(num / den);
-    }
-}
diff --git a/modules/ocl/src/opencl/convertC3C4.cl b/modules/ocl/src/opencl/convertC3C4.cl
deleted file mode 100644
index 4c519fdf7..000000000
--- a/modules/ocl/src/opencl/convertC3C4.cl
+++ /dev/null
@@ -1,153 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst,
-                         int cols, int rows,
-                         int dstStep_in_piexl, int pixel_end)
-{
-    int id = get_global_id(0);
-    int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
-    pixelid = clamp(pixelid,0,pixel_end);
-    GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3;
-
-    pixel0 = src[pixelid.x];
-    pixel1 = src[pixelid.y];
-    pixel2 = src[pixelid.z];
-
-    outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0);
-    outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0);
-    outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
-    outpix3 = (GENTYPE4)(pixel2.y,pixel2.z,pixel2.w,0);
-
-    int4 outy = (id<<2)/cols;
-    int4 outx = (id<<2)%cols;
-
-    outx += (int4)(0, 1, 2, 3);
-    outy = select(outy, outy+1, outx>=cols);
-    outx = select(outx, outx-cols, outx>=cols);
-
-    // when cols == 1
-    outy = select(outy, outy + 1, outx >= cols);
-    outx = select(outx, outx-cols, outx >= cols);
-    outy = select(outy, outy + 1, outx >= cols);
-    outx = select(outx, outx-cols, outx >= cols);
-
-    int4 addr = mad24(outy,(int4)dstStep_in_piexl,outx);
-
-    if(outx.w<cols && outy.w<rows)
-    {
-        dst[addr.x] = outpix0;
-        dst[addr.y] = outpix1;
-        dst[addr.z] = outpix2;
-        dst[addr.w] = outpix3;
-    }
-    else if(outx.z<cols && outy.z<rows)
-    {
-        dst[addr.x] = outpix0;
-        dst[addr.y] = outpix1;
-        dst[addr.z] = outpix2;
-    }
-    else if(outx.y<cols && outy.y<rows)
-    {
-        dst[addr.x] = outpix0;
-        dst[addr.y] = outpix1;
-    }
-    else if(outx.x<cols && outy.x<rows)
-        dst[addr.x] = outpix0;
-}
-
-__kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst,
-                          int cols, int rows,
-                          int srcStep_in_pixel, int pixel_end)
-{
-    int id = get_global_id(0)<<2;
-    int y = id / cols;
-    int x = id % cols;
-
-    int4 x4 = (int4)(x,x+1,x+2,x+3);
-    int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
-    x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
-
-    // when cols == 1
-    y4 = select(y4, y4 + 1,x4>=(int4)cols);
-    x4 = select(x4, x4 - (int4)cols,x4>=(int4)cols);
-    y4 = select(y4, y4 + 1,x4>=(int4)cols);
-    x4 = select(x4, x4-(int4)cols,x4>=(int4)cols);
-
-    y4=clamp(y4,(int4)0,(int4)(rows-1));
-    int4 addr = mad24(y4, (int4)srcStep_in_pixel, x4);
-
-    GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
-    pixel0 = src[addr.x];
-    pixel1 = src[addr.y];
-    pixel2 = src[addr.z];
-    pixel3 = src[addr.w];
-
-    pixel0.w = pixel1.x;
-    outpixel1.x = pixel1.y;
-    outpixel1.y = pixel1.z;
-    outpixel1.z = pixel2.x;
-    outpixel1.w = pixel2.y;
-    outpixel2.x = pixel2.z;
-    outpixel2.y = pixel3.x;
-    outpixel2.z = pixel3.y;
-    outpixel2.w = pixel3.z;
-
-    int4 outaddr = mul24(id>>2 , 3);
-    outaddr.y++;
-    outaddr.z+=2;
-
-    if(outaddr.z <= pixel_end)
-    {
-        dst[outaddr.x] = pixel0;
-        dst[outaddr.y] = outpixel1;
-        dst[outaddr.z] = outpixel2;
-    }
-    else if(outaddr.y <= pixel_end)
-    {
-        dst[outaddr.x] = pixel0;
-        dst[outaddr.y] = outpixel1;
-    }
-    else if(outaddr.x <= pixel_end)
-        dst[outaddr.x] = pixel0;
-}
diff --git a/modules/ocl/src/opencl/cvt_color.cl b/modules/ocl/src/opencl/cvt_color.cl
deleted file mode 100644
index bf3b6cfa7..000000000
--- a/modules/ocl/src/opencl/cvt_color.cl
+++ /dev/null
@@ -1,1017 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-/**************************************PUBLICFUNC*************************************/
-
-#ifndef hscale
-#define hscale 0
-#endif
-
-#ifndef hrange
-#define hrange 0
-#endif
-
-#ifdef DEPTH_0
-#define DATA_TYPE uchar
-#define COEFF_TYPE int
-#define MAX_NUM  255
-#define HALF_MAX 128
-#define SAT_CAST(num) convert_uchar_sat_rte(num)
-#endif
-
-#ifdef DEPTH_2
-#define DATA_TYPE ushort
-#define COEFF_TYPE int
-#define MAX_NUM  65535
-#define HALF_MAX 32768
-#define SAT_CAST(num) convert_ushort_sat_rte(num)
-#endif
-
-#ifdef DEPTH_5
-#define DATA_TYPE float
-#define COEFF_TYPE float
-#define MAX_NUM  1.0f
-#define HALF_MAX 0.5f
-#define SAT_CAST(num) (num)
-#endif
-
-#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
-
-enum
-{
-    yuv_shift  = 14,
-    xyz_shift  = 12,
-    hsv_shift = 12,
-    R2Y        = 4899,
-    G2Y        = 9617,
-    B2Y        = 1868,
-    BLOCK_SIZE = 256
-};
-
-///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
-
-__kernel void RGB2Gray(int cols, int rows, int src_step, int dst_step,
-                       int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
-                       int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        int src_idx = mad24(y, src_step, src_offset + (x << 2));
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-#ifdef DEPTH_5
-        dst[dst_idx] = src[src_idx + bidx] * 0.114f + src[src_idx + 1] * 0.587f + src[src_idx + (bidx^2)] * 0.299f;
-#else
-        dst[dst_idx] = (DATA_TYPE)CV_DESCALE((src[src_idx + bidx] * B2Y + src[src_idx + 1] * G2Y + src[src_idx + (bidx^2)] * R2Y), yuv_shift);
-#endif
-    }
-}
-
-__kernel void Gray2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
-                       __global const DATA_TYPE* src, __global DATA_TYPE* dst,
-                       int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + (x << 2));
-
-        DATA_TYPE val = src[src_idx];
-        dst[dst_idx] = val;
-        dst[dst_idx + 1] = val;
-        dst[dst_idx + 2] = val;
-#if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
-#endif
-    }
-}
-
-///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
-
-__constant float c_RGB2YUVCoeffs_f[5]  = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
-__constant int   c_RGB2YUVCoeffs_i[5]  = { B2Y, G2Y, R2Y, 8061, 14369 };
-
-__kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step,
-                      int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
-                      int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-        DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] };
-
-#ifdef DEPTH_5
-        __constant float * coeffs = c_RGB2YUVCoeffs_f;
-        DATA_TYPE Y  = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx];
-        DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX;
-        DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX;
-#else
-        __constant int * coeffs = c_RGB2YUVCoeffs_i;
-        int delta = HALF_MAX * (1 << yuv_shift);
-        int Y =  CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift);
-        int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift);
-        int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift);
-#endif
-
-        dst[dst_idx] = SAT_CAST( Y );
-        dst[dst_idx + 1] = SAT_CAST( Cr );
-        dst[dst_idx + 2] = SAT_CAST( Cb );
-    }
-}
-
-__constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f };
-__constant int   c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 };
-
-__kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step,
-                      int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
-                      int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-        DATA_TYPE yuv[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] };
-
-#ifdef DEPTH_5
-        __constant float * coeffs = c_YUV2RGBCoeffs_f;
-        float b = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3];
-        float g = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1];
-        float r = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0];
-#else
-        __constant int * coeffs = c_YUV2RGBCoeffs_i;
-        int b = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift);
-        int g = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift);
-        int r = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift);
-#endif
-
-        dst[dst_idx + bidx] = SAT_CAST( b );
-        dst[dst_idx + 1]      = SAT_CAST( g );
-        dst[dst_idx + (bidx^2)]   = SAT_CAST( r );
-#if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
-#endif
-    }
-}
-
-__constant int ITUR_BT_601_CY = 1220542;
-__constant int ITUR_BT_601_CUB = 2116026;
-__constant int ITUR_BT_601_CUG = 409993;
-__constant int ITUR_BT_601_CVG = 852492;
-__constant int ITUR_BT_601_CVR = 1673527;
-__constant int ITUR_BT_601_SHIFT = 20;
-
-__kernel void YUV2RGBA_NV12(int cols, int rows, int src_step, int dst_step,
-                            int bidx, __global const uchar* src, __global uchar* dst,
-                            int src_offset, int dst_offset)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (y < rows / 2 && x < cols / 2 )
-    {
-        __global const uchar* ysrc = src + mad24(y << 1, src_step, (x << 1) + src_offset);
-        __global const uchar* usrc = src + mad24(rows + y, src_step, (x << 1) + src_offset);
-        __global uchar*       dst1 = dst + mad24(y << 1, dst_step, (x << 3) + dst_offset);
-        __global uchar*       dst2 = dst + mad24((y << 1) + 1, dst_step, (x << 3) + dst_offset);
-
-        int Y1 = ysrc[0];
-        int Y2 = ysrc[1];
-        int Y3 = ysrc[src_step];
-        int Y4 = ysrc[src_step + 1];
-
-        int U  = usrc[0] - 128;
-        int V  = usrc[1] - 128;
-
-        int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V;
-        int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U;
-        int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U;
-
-        Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY;
-        dst1[2 - bidx]     = convert_uchar_sat((Y1 + ruv) >> ITUR_BT_601_SHIFT);
-        dst1[1]        = convert_uchar_sat((Y1 + guv) >> ITUR_BT_601_SHIFT);
-        dst1[bidx] = convert_uchar_sat((Y1 + buv) >> ITUR_BT_601_SHIFT);
-        dst1[3]        = 255;
-
-        Y2 = max(0, Y2 - 16) * ITUR_BT_601_CY;
-        dst1[6 - bidx] = convert_uchar_sat((Y2 + ruv) >> ITUR_BT_601_SHIFT);
-        dst1[5]        = convert_uchar_sat((Y2 + guv) >> ITUR_BT_601_SHIFT);
-        dst1[4 + bidx] = convert_uchar_sat((Y2 + buv) >> ITUR_BT_601_SHIFT);
-        dst1[7]        = 255;
-
-        Y3 = max(0, Y3 - 16) * ITUR_BT_601_CY;
-        dst2[2 - bidx]     = convert_uchar_sat((Y3 + ruv) >> ITUR_BT_601_SHIFT);
-        dst2[1]        = convert_uchar_sat((Y3 + guv) >> ITUR_BT_601_SHIFT);
-        dst2[bidx] = convert_uchar_sat((Y3 + buv) >> ITUR_BT_601_SHIFT);
-        dst2[3]        = 255;
-
-        Y4 = max(0, Y4 - 16) * ITUR_BT_601_CY;
-        dst2[6 - bidx] = convert_uchar_sat((Y4 + ruv) >> ITUR_BT_601_SHIFT);
-        dst2[5]        = convert_uchar_sat((Y4 + guv) >> ITUR_BT_601_SHIFT);
-        dst2[4 + bidx] = convert_uchar_sat((Y4 + buv) >> ITUR_BT_601_SHIFT);
-        dst2[7]        = 255;
-    }
-}
-
-///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
-
-__constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
-__constant int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};
-
-__kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step,
-                        int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
-                        int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] };
-
-#ifdef DEPTH_5
-        __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
-        DATA_TYPE Y  = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx];
-        DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX;
-        DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX;
-#else
-        __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
-        int delta = HALF_MAX * (1 << yuv_shift);
-        int Y =  CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift);
-        int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift);
-        int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift);
-#endif
-
-        dst[dst_idx] = SAT_CAST( Y );
-        dst[dst_idx + 1] = SAT_CAST( Cr );
-        dst[dst_idx + 2] = SAT_CAST( Cb );
-    }
-}
-
-__constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f };
-__constant int   c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 };
-
-__kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step,
-                        int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
-                        int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        DATA_TYPE ycrcb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] };
-
-#ifdef DEPTH_5
-        __constant float * coeff = c_YCrCb2RGBCoeffs_f;
-        float r = ycrcb[0] + coeff[0] * (ycrcb[1] - HALF_MAX);
-        float g = ycrcb[0] + coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX);
-        float b = ycrcb[0] + coeff[3] * (ycrcb[2] - HALF_MAX);
-#else
-        __constant int * coeff = c_YCrCb2RGBCoeffs_i;
-        int r = ycrcb[0] + CV_DESCALE(coeff[0] * (ycrcb[1] - HALF_MAX), yuv_shift);
-        int g = ycrcb[0] + CV_DESCALE(coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX), yuv_shift);
-        int b = ycrcb[0] + CV_DESCALE(coeff[3] * (ycrcb[2] - HALF_MAX), yuv_shift);
-#endif
-
-        dst[dst_idx + (bidx^2)] = SAT_CAST(r);
-        dst[dst_idx + 1] = SAT_CAST(g);
-        dst[dst_idx + bidx] = SAT_CAST(b);
-#if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
-#endif
-    }
-}
-
-///////////////////////////////////// RGB <-> XYZ //////////////////////////////////////
-
-__kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step,
-                      int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
-                      int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs)
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if (dy < rows && dx < cols)
-    {
-        dx <<= 2;
-        int src_idx = mad24(dy, src_step, src_offset + dx);
-        int dst_idx = mad24(dy, dst_step, dst_offset + dx);
-
-        DATA_TYPE r = src[src_idx], g = src[src_idx + 1], b = src[src_idx + 2];
-
-#ifdef DEPTH_5
-        float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2];
-        float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5];
-        float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8];
-#else
-        int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift);
-        int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift);
-        int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift);
-#endif
-        dst[dst_idx] = SAT_CAST(x);
-        dst[dst_idx + 1] = SAT_CAST(y);
-        dst[dst_idx + 2] = SAT_CAST(z);
-    }
-}
-
-__kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step,
-                      int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst,
-                      int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs)
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if (dy < rows && dx < cols)
-    {
-        dx <<= 2;
-        int src_idx = mad24(dy, src_step, src_offset + dx);
-        int dst_idx = mad24(dy, dst_step, dst_offset + dx);
-
-        DATA_TYPE x = src[src_idx], y = src[src_idx + 1], z = src[src_idx + 2];
-
-#ifdef DEPTH_5
-        float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2];
-        float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5];
-        float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8];
-#else
-        int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift);
-        int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift);
-        int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift);
-#endif
-        dst[dst_idx] = SAT_CAST(b);
-        dst[dst_idx + 1] = SAT_CAST(g);
-        dst[dst_idx + 2] = SAT_CAST(r);
-#if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
-#endif
-    }
-}
-
-///////////////////////////////////// RGB[A] <-> BGR[A] //////////////////////////////////////
-
-__kernel void RGB(int cols, int rows, int src_step, int dst_step,
-                  __global const DATA_TYPE * src, __global DATA_TYPE * dst,
-                  int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-#ifdef REVERSE
-        dst[dst_idx] = src[src_idx + 2];
-        dst[dst_idx + 1] = src[src_idx + 1];
-        dst[dst_idx + 2] = src[src_idx];
-#elif defined ORDER
-        dst[dst_idx] = src[src_idx];
-        dst[dst_idx + 1] = src[src_idx + 1];
-        dst[dst_idx + 2] = src[src_idx + 2];
-#endif
-
-#if dcn == 4
-#if scn == 3
-        dst[dst_idx + 3] = MAX_NUM;
-#else
-        dst[dst_idx + 3] = src[src_idx + 3];
-#endif
-#endif
-    }
-}
-
-///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////
-
-__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bidx,
-                         __global const ushort * src, __global uchar * dst,
-                         int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + (x << 2));
-        ushort t = src[src_idx];
-
-#if greenbits == 6
-        dst[dst_idx + bidx] = (uchar)(t << 3);
-        dst[dst_idx + 1] = (uchar)((t >> 3) & ~3);
-        dst[dst_idx + (bidx^2)] = (uchar)((t >> 8) & ~7);
-#else
-        dst[dst_idx + bidx] = (uchar)(t << 3);
-        dst[dst_idx + 1] = (uchar)((t >> 2) & ~7);
-        dst[dst_idx + (bidx^2)] = (uchar)((t >> 7) & ~7);
-#endif
-
-#if dcn == 4
-#if greenbits == 6
-        dst[dst_idx + 3] = 255;
-#else
-        dst[dst_idx + 3] = t & 0x8000 ? 255 : 0;
-#endif
-#endif
-    }
-}
-
-__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bidx,
-                         __global const uchar * src, __global ushort * dst,
-                         int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        int src_idx = mad24(y, src_step, src_offset + (x << 2));
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-#if greenbits == 6
-            dst[dst_idx] = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~3) << 3)|((src[src_idx + (bidx^2)]&~7) << 8));
-#elif scn == 3
-            dst[dst_idx] = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~7) << 2)|((src[src_idx + (bidx^2)]&~7) << 7));
-#else
-            dst[dst_idx] = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~7) << 2)|
-                ((src[src_idx + (bidx^2)]&~7) << 7)|(src[src_idx + 3] ? 0x8000 : 0));
-#endif
-    }
-}
-
-///////////////////////////////////// RGB5x5 <-> RGB //////////////////////////////////////
-
-__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bidx,
-                          __global const ushort * src, __global uchar * dst,
-                          int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-        int t = src[src_idx];
-
-#if greenbits == 6
-        dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
-                                         ((t >> 3) & 0xfc)*G2Y +
-                                         ((t >> 8) & 0xf8)*R2Y, yuv_shift);
-#else
-        dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
-                                         ((t >> 2) & 0xf8)*G2Y +
-                                         ((t >> 7) & 0xf8)*R2Y, yuv_shift);
-#endif
-    }
-}
-
-__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step, int bidx,
-                          __global const uchar * src, __global ushort * dst,
-                          int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-        int t = src[src_idx];
-
-#if greenbits == 6
-        dst[dst_idx] = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
-#else
-        t >>= 3;
-        dst[dst_idx] = (ushort)(t|(t << 5)|(t << 10));
-#endif
-    }
-}
-
-///////////////////////////////////// RGB <-> HSV //////////////////////////////////////
-
-__constant int sector_data[][3] = { {1, 3, 0}, { 1, 0, 2 }, { 3, 0, 1 }, { 0, 2, 1 }, { 0, 1, 3 }, { 2, 1, 0 } };
-
-#ifdef DEPTH_0
-
-__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx,
-                      __global const uchar * src, __global uchar * dst,
-                      int src_offset, int dst_offset,
-                      __constant int * sdiv_table, __constant int * hdiv_table)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        int b = src[src_idx + bidx], g = src[src_idx + 1], r = src[src_idx + (bidx^2)];
-        int h, s, v = b;
-        int vmin = b, diff;
-        int vr, vg;
-
-        v = max( v, g );
-        v = max( v, r );
-        vmin = min( vmin, g );
-        vmin = min( vmin, r );
-
-        diff = v - vmin;
-        vr = v == r ? -1 : 0;
-        vg = v == g ? -1 : 0;
-
-        s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
-        h = (vr & (g - b)) +
-            (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
-        h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
-        h += h < 0 ? hrange : 0;
-
-        dst[dst_idx] = convert_uchar_sat_rte(h);
-        dst[dst_idx + 1] = (uchar)s;
-        dst[dst_idx + 2] = (uchar)v;
-    }
-}
-
-__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
-                      __global const uchar * src, __global uchar * dst,
-                      int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        float h = src[src_idx], s = src[src_idx + 1]*(1/255.f), v = src[src_idx + 2]*(1/255.f);
-        float b, g, r;
-
-        if (s != 0)
-        {
-            float tab[4];
-            int sector;
-            h *= hscale;
-            if( h < 0 )
-                do h += 6; while( h < 0 );
-            else if( h >= 6 )
-                do h -= 6; while( h >= 6 );
-            sector = convert_int_sat_rtn(h);
-            h -= sector;
-            if( (unsigned)sector >= 6u )
-            {
-                sector = 0;
-                h = 0.f;
-            }
-
-            tab[0] = v;
-            tab[1] = v*(1.f - s);
-            tab[2] = v*(1.f - s*h);
-            tab[3] = v*(1.f - s*(1.f - h));
-
-            b = tab[sector_data[sector][0]];
-            g = tab[sector_data[sector][1]];
-            r = tab[sector_data[sector][2]];
-        }
-        else
-            b = g = r = v;
-
-        dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
-        dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
-        dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
-#if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
-#endif
-    }
-}
-
-#elif defined DEPTH_5
-
-__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx,
-                      __global const float * src, __global float * dst,
-                      int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        float b = src[src_idx + bidx], g = src[src_idx + 1], r = src[src_idx + (bidx^2)];
-        float h, s, v;
-
-        float vmin, diff;
-
-        v = vmin = r;
-        if( v < g ) v = g;
-        if( v < b ) v = b;
-        if( vmin > g ) vmin = g;
-        if( vmin > b ) vmin = b;
-
-        diff = v - vmin;
-        s = diff/(float)(fabs(v) + FLT_EPSILON);
-        diff = (float)(60./(diff + FLT_EPSILON));
-        if( v == r )
-            h = (g - b)*diff;
-        else if( v == g )
-            h = (b - r)*diff + 120.f;
-        else
-            h = (r - g)*diff + 240.f;
-
-        if( h < 0 ) h += 360.f;
-
-        dst[dst_idx] = h*hscale;
-        dst[dst_idx + 1] = s;
-        dst[dst_idx + 2] = v;
-    }
-}
-
-__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
-                      __global const float * src, __global float * dst,
-                      int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        float h = src[src_idx], s = src[src_idx + 1], v = src[src_idx + 2];
-        float b, g, r;
-
-        if (s != 0)
-        {
-            float tab[4];
-            int sector;
-            h *= hscale;
-            if(h < 0)
-                do h += 6; while (h < 0);
-            else if (h >= 6)
-                do h -= 6; while (h >= 6);
-            sector = convert_int_sat_rtn(h);
-            h -= sector;
-            if ((unsigned)sector >= 6u)
-            {
-                sector = 0;
-                h = 0.f;
-            }
-
-            tab[0] = v;
-            tab[1] = v*(1.f - s);
-            tab[2] = v*(1.f - s*h);
-            tab[3] = v*(1.f - s*(1.f - h));
-
-            b = tab[sector_data[sector][0]];
-            g = tab[sector_data[sector][1]];
-            r = tab[sector_data[sector][2]];
-        }
-        else
-            b = g = r = v;
-
-        dst[dst_idx + bidx] = b;
-        dst[dst_idx + 1] = g;
-        dst[dst_idx + (bidx^2)] = r;
-#if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
-#endif
-    }
-}
-
-#endif
-
-///////////////////////////////////// RGB <-> HLS //////////////////////////////////////
-
-#ifdef DEPTH_0
-
-__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx,
-                      __global const uchar * src, __global uchar * dst,
-                      int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        float b = src[src_idx + bidx]*(1/255.f), g = src[src_idx + 1]*(1/255.f), r = src[src_idx + (bidx^2)]*(1/255.f);
-        float h = 0.f, s = 0.f, l;
-        float vmin, vmax, diff;
-
-        vmax = vmin = r;
-        if (vmax < g) vmax = g;
-        if (vmax < b) vmax = b;
-        if (vmin > g) vmin = g;
-        if (vmin > b) vmin = b;
-
-        diff = vmax - vmin;
-        l = (vmax + vmin)*0.5f;
-
-        if (diff > FLT_EPSILON)
-        {
-            s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
-            diff = 60.f/diff;
-
-            if( vmax == r )
-                h = (g - b)*diff;
-            else if( vmax == g )
-                h = (b - r)*diff + 120.f;
-            else
-                h = (r - g)*diff + 240.f;
-
-            if( h < 0.f ) h += 360.f;
-        }
-
-        dst[dst_idx] = convert_uchar_sat_rte(h*hscale);
-        dst[dst_idx + 1] = convert_uchar_sat_rte(l*255.f);
-        dst[dst_idx + 2] = convert_uchar_sat_rte(s*255.f);
-    }
-}
-
-__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
-                      __global const uchar * src, __global uchar * dst,
-                      int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        float h = src[src_idx], l = src[src_idx + 1]*(1.f/255.f), s = src[src_idx + 2]*(1.f/255.f);
-        float b, g, r;
-
-        if (s != 0)
-        {
-            float tab[4];
-
-            float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
-            float p1 = 2*l - p2;
-
-            h *= hscale;
-            if( h < 0 )
-                do h += 6; while( h < 0 );
-            else if( h >= 6 )
-                do h -= 6; while( h >= 6 );
-
-            int sector = convert_int_sat_rtn(h);
-            h -= sector;
-
-            tab[0] = p2;
-            tab[1] = p1;
-            tab[2] = p1 + (p2 - p1)*(1-h);
-            tab[3] = p1 + (p2 - p1)*h;
-
-            b = tab[sector_data[sector][0]];
-            g = tab[sector_data[sector][1]];
-            r = tab[sector_data[sector][2]];
-        }
-        else
-            b = g = r = l;
-
-        dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f);
-        dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f);
-        dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
-#if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
-#endif
-    }
-}
-
-#elif defined DEPTH_5
-
-__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx,
-                      __global const float * src, __global float * dst,
-                      int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        float b = src[src_idx + bidx], g = src[src_idx + 1], r = src[src_idx + (bidx^2)];
-        float h = 0.f, s = 0.f, l;
-        float vmin, vmax, diff;
-
-        vmax = vmin = r;
-        if (vmax < g) vmax = g;
-        if (vmax < b) vmax = b;
-        if (vmin > g) vmin = g;
-        if (vmin > b) vmin = b;
-
-        diff = vmax - vmin;
-        l = (vmax + vmin)*0.5f;
-
-        if (diff > FLT_EPSILON)
-        {
-            s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
-            diff = 60.f/diff;
-
-            if( vmax == r )
-                h = (g - b)*diff;
-            else if( vmax == g )
-                h = (b - r)*diff + 120.f;
-            else
-                h = (r - g)*diff + 240.f;
-
-            if( h < 0.f ) h += 360.f;
-        }
-
-        dst[dst_idx] = h*hscale;
-        dst[dst_idx + 1] = l;
-        dst[dst_idx + 2] = s;
-    }
-}
-
-__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx,
-                      __global const float * src, __global float * dst,
-                      int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        float h = src[src_idx], l = src[src_idx + 1], s = src[src_idx + 2];
-        float b, g, r;
-
-        if (s != 0)
-        {
-            float tab[4];
-            int sector;
-
-            float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
-            float p1 = 2*l - p2;
-
-            h *= hscale;
-            if( h < 0 )
-                do h += 6; while( h < 0 );
-            else if( h >= 6 )
-                do h -= 6; while( h >= 6 );
-
-            sector = convert_int_sat_rtn(h);
-            h -= sector;
-
-            tab[0] = p2;
-            tab[1] = p1;
-            tab[2] = p1 + (p2 - p1)*(1-h);
-            tab[3] = p1 + (p2 - p1)*h;
-
-            b = tab[sector_data[sector][0]];
-            g = tab[sector_data[sector][1]];
-            r = tab[sector_data[sector][2]];
-        }
-        else
-            b = g = r = l;
-
-        dst[dst_idx + bidx] = b;
-        dst[dst_idx + 1] = g;
-        dst[dst_idx + (bidx^2)] = r;
-#if dcn == 4
-        dst[dst_idx + 3] = MAX_NUM;
-#endif
-    }
-}
-
-#endif
-
-/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
-
-#ifdef DEPTH_0
-
-__kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step,
-                        int bidx, __global const uchar * src, __global uchar * dst,
-                        int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        uchar v0 = src[src_idx], v1 = src[src_idx + 1];
-        uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3];
-
-        dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM;
-        dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM;
-        dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM;
-        dst[dst_idx + 3] = v3;
-    }
-}
-
-__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bidx,
-                        __global const uchar * src, __global uchar * dst,
-                        int src_offset, int dst_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        x <<= 2;
-        int src_idx = mad24(y, src_step, src_offset + x);
-        int dst_idx = mad24(y, dst_step, dst_offset + x);
-
-        uchar v0 = src[src_idx], v1 = src[src_idx + 1];
-        uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3];
-        uchar v3_half = v3 / 2;
-
-        dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3;
-        dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3;
-        dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3;
-        dst[dst_idx + 3] = v3;
-    }
-}
-
-#endif
diff --git a/modules/ocl/src/opencl/featdetect_fast.cl b/modules/ocl/src/opencl/featdetect_fast.cl
deleted file mode 100644
index 44d4f44e6..000000000
--- a/modules/ocl/src/opencl/featdetect_fast.cl
+++ /dev/null
@@ -1,1331 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-// Authors:
-//  * Peter Andreas Entschev, peter@entschev.com
-//
-//M*/
-
-#define X_ROW 0
-#define Y_ROW 1
-#define RESPONSE_ROW 2
-#define ROWS_COUNT 3
-
-
-__constant uchar c_table[] = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-                               0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
-                               0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
-                               0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
-                               0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
-                               0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80, 0x00,
-                               0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x80,
-                               0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
-                               0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
-                               0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
-                               0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
-                               0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
-                               0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
-                               0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
-                               0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
-                               0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
-                               0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
-                               0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
-                               0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
-                               0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
-                               0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0xc0,
-                               0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                               0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                               0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                               0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                               0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                               0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                               0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-                               0x80, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88,
-                               0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88,
-                               0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88,
-                               0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0xaa, 0xaa, 0xaa,
-                               0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
-                               0xaa, 0xaa, 0xaa, 0xaa, 0xff, 0xff, 0xff, 0xff, 0xff,
-                               0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-                               0xff, 0xff };
-
-
-// 1 -> v > x + th
-// 2 -> v < x - th
-// 0 -> x - th <= v <= x + th
-__inline int diffType(const int v, const int x, const int th)
-{
-    const int diff = x - v;
-
-    return (int)(diff < -th) + ((int)(diff > th) << 1);
-}
-
-// For OpenCL 1.1 compatibility
-__inline int popCount(int x) {
-    x = x - ((x >> 1) & 0x55555555);
-    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
-    x = (x + (x >> 4)) & 0x0F0F0F0F;
-    x = x + (x >> 8);
-    x = x + (x >> 16);
-
-    return x & 0x0000003F;
-}
-
-
-void calcMask(
-    const uint C[4],
-    const int v,
-    const int th,
-    int* mask1,
-    int* mask2)
-{
-    *mask1 = 0;
-    *mask2 = 0;
-
-    int d1, d2;
-
-
-
-    d1 = diffType(v, C[0] & 0xff, th);
-    d2 = diffType(v, C[2] & 0xff, th);
-
-    if ((d1 | d2) == 0)
-        return;
-
-    *mask1 |= (d1 & 1) << 0;
-    *mask2 |= ((d1 & 2) >> 1) << 0;
-
-    *mask1 |= (d2 & 1) << 8;
-    *mask2 |= ((d2 & 2) >> 1) << 8;
-
-
-
-    d1 = diffType(v, C[1] & 0xff, th);
-    d2 = diffType(v, C[3] & 0xff, th);
-
-    if ((d1 | d2) == 0)
-        return;
-
-    *mask1 |= (d1 & 1) << 4;
-    *mask2 |= ((d1 & 2) >> 1) << 4;
-
-    *mask1 |= (d2 & 1) << 12;
-    *mask2 |= ((d2 & 2) >> 1) << 12;
-
-
-
-    d1 = diffType(v, (C[0] >> (2 * 8)) & 0xff, th);
-    d2 = diffType(v, (C[2] >> (2 * 8)) & 0xff, th);
-
-    if ((d1 | d2) == 0)
-        return;
-
-    *mask1 |= (d1 & 1) << 2;
-    *mask2 |= ((d1 & 2) >> 1) << 2;
-
-    *mask1 |= (d2 & 1) << 10;
-    *mask2 |= ((d2 & 2) >> 1) << 10;
-
-
-
-    d1 = diffType(v, (C[1] >> (2 * 8)) & 0xff, th);
-    d2 = diffType(v, (C[3] >> (2 * 8)) & 0xff, th);
-
-    if ((d1 | d2) == 0)
-        return;
-
-    *mask1 |= (d1 & 1) << 6;
-    *mask2 |= ((d1 & 2) >> 1) << 6;
-
-    *mask1 |= (d2 & 1) << 14;
-    *mask2 |= ((d2 & 2) >> 1) << 14;
-
-
-
-    d1 = diffType(v, (C[0] >> (1 * 8)) & 0xff, th);
-    d2 = diffType(v, (C[2] >> (1 * 8)) & 0xff, th);
-
-    /*if ((d1 | d2) == 0)
-        return;*/
-
-    *mask1 |= (d1 & 1) << 1;
-    *mask2 |= ((d1 & 2) >> 1) << 1;
-
-    *mask1 |= (d2 & 1) << 9;
-    *mask2 |= ((d2 & 2) >> 1) << 9;
-
-
-
-    d1 = diffType(v, (C[0] >> (3 * 8)) & 0xff, th);
-    d2 = diffType(v, (C[2] >> (3 * 8)) & 0xff, th);
-
-    /*if ((d1 | d2) == 0)
-        return;*/
-
-    *mask1 |= (d1 & 1) << 3;
-    *mask2 |= ((d1 & 2) >> 1) << 3;
-
-    *mask1 |= (d2 & 1) << 11;
-    *mask2 |= ((d2 & 2) >> 1) << 11;
-
-
-
-    d1 = diffType(v, (C[1] >> (1 * 8)) & 0xff, th);
-    d2 = diffType(v, (C[3] >> (1 * 8)) & 0xff, th);
-
-    /*if ((d1 | d2) == 0)
-        return;*/
-
-    *mask1 |= (d1 & 1) << 5;
-    *mask2 |= ((d1 & 2) >> 1) << 5;
-
-    *mask1 |= (d2 & 1) << 13;
-    *mask2 |= ((d2 & 2) >> 1) << 13;
-
-
-
-    d1 = diffType(v, (C[1] >> (3 * 8)) & 0xff, th);
-    d2 = diffType(v, (C[3] >> (3 * 8)) & 0xff, th);
-
-    *mask1 |= (d1 & 1) << 7;
-    *mask2 |= ((d1 & 2) >> 1) << 7;
-
-    *mask1 |= (d2 & 1) << 15;
-    *mask2 |= ((d2 & 2) >> 1) << 15;
-}
-
-// 1 -> v > x + th
-// 2 -> v < x - th
-// 0 -> not a keypoint
-__inline bool isKeyPoint(int mask1, int mask2)
-{
-    // TODO: Use OpenCL's popcount() function if OpenCL version >= 1.2
-    return (popCount(mask1) > 8 && (c_table[(mask1 >> 3) - 63] & (1 << (mask1 & 7)))) ||
-           (popCount(mask2) > 8 && (c_table[(mask2 >> 3) - 63] & (1 << (mask2 & 7))));
-}
-
-int cornerScore(const uint C[4], const int v, const int threshold)
-{
-    // binary search in [threshold + 1, 255]
-
-    int min = threshold + 1;
-    int max = 255;
-
-    while (min <= max)
-    {
-        const int mid = (min + max) >> 1;
-
-        int mask1 = 0;
-        int mask2 = 0;
-
-        calcMask(C, v, mid, &mask1, &mask2);
-
-        int isKp = (int)isKeyPoint(mask1, mask2);
-
-        min = isKp * (mid + 1) + (isKp ^ 1) * min;
-        max = (isKp ^ 1) * (mid - 1) + isKp * max;
-    }
-
-    return min - 1;
-}
-
-///////////////////////////////////////////////////////////////////////////
-// calcKeypoints
-
-__kernel
-void calcKeypointsWithMask(
-    __global const uchar* img,
-    __global const uchar* mask,
-    __global int* kpLoc,
-    __global uint* score,
-    __global int* counter,
-    const int calcScore,
-    const unsigned int maxKeypoints,
-    const int threshold,
-    const int c_img_step,
-    const int c_img_rows,
-    const int c_img_cols,
-    const int c_mask_step,
-    const int c_kploc_step,
-    const int c_score_step)
-{
-    const int j = get_global_id(0) + 3;
-    const int i = get_global_id(1) + 3;
-
-    if (i < c_img_rows - 3 && j < c_img_cols - 3 && mask[i * c_mask_step + j])
-    {
-        int v;
-        uint C[4] = {0,0,0,0};
-
-        C[2] |= (uint)img[(i - 3) * c_img_step + j - 1] << 8;
-        C[2] |= (uint)img[(i - 3) * c_img_step + j];
-        C[1] |= (uint)img[(i - 3) * c_img_step + j + 1] << (3 * 8);
-
-        C[2] |= (uint)img[(i - 2) * c_img_step + j - 2] << (2 * 8);
-        C[1] |= (uint)img[(i - 2) * c_img_step + j + 2] << (2 * 8);
-
-        C[2] |= (uint)img[(i - 1) * c_img_step + j - 3] << (3 * 8);
-        C[1] |= (uint)img[(i - 1) * c_img_step + j + 3] << 8;
-
-        C[3] |= (uint)img[i * c_img_step + j - 3];
-        v     = (int) img[i * c_img_step + j];
-        C[1] |= (uint)img[i * c_img_step + j + 3];
-
-        int d1 = diffType(v, C[1] & 0xff, threshold);
-        int d2 = diffType(v, C[3] & 0xff, threshold);
-
-        if ((d1 | d2) == 0)
-            return;
-
-        C[3] |= (uint)img[(i + 1) * c_img_step + j - 3] << 8;
-        C[0] |= (uint)img[(i + 1) * c_img_step + j + 3] << (3 * 8);
-
-        C[3] |= (uint)img[(i + 2) * c_img_step + j - 2] << (2 * 8);
-        C[0] |= (uint)img[(i + 2) * c_img_step + j + 2] << (2 * 8);
-
-        C[3] |= (uint)img[(i + 3) * c_img_step + j - 1] << (3 * 8);
-        C[0] |= (uint)img[(i + 3) * c_img_step + j];
-        C[0] |= (uint)img[(i + 3) * c_img_step + j + 1] << 8;
-
-        int mask1 = 0;
-        int mask2 = 0;
-
-        calcMask(C, v, threshold, &mask1, &mask2);
-
-        if (isKeyPoint(mask1, mask2))
-        {
-            if (calcScore) score[i * c_score_step + j] = cornerScore(C, v, threshold);
-
-            uint idx = atomic_inc(counter);
-
-            if (idx < maxKeypoints)
-            {
-                kpLoc[X_ROW * c_kploc_step + idx] = j;
-                kpLoc[Y_ROW * c_kploc_step + idx] = i;
-            }
-        }
-    }
-}
-
-__kernel
-void calcKeypoints(
-    __global const uchar* img,
-    __global int* kpLoc,
-    __global uint* score,
-    __global int* counter,
-    const int calcScore,
-    const unsigned int maxKeypoints,
-    const int threshold,
-    const int c_img_step,
-    const int c_img_rows,
-    const int c_img_cols,
-    const int c_kploc_step,
-    const int c_score_step)
-{
-    const int j = get_global_id(0) + 3;
-    const int i = get_global_id(1) + 3;
-
-    if (i < c_img_rows - 3 && j < c_img_cols - 3)
-    {
-        int v;
-        uint C[4] = {0,0,0,0};
-
-        C[2] |= (uint)img[(i - 3) * c_img_step + j - 1] << 8;
-        C[2] |= (uint)img[(i - 3) * c_img_step + j];
-        C[1] |= (uint)img[(i - 3) * c_img_step + j + 1] << (3 * 8);
-
-        C[2] |= (uint)img[(i - 2) * c_img_step + j - 2] << (2 * 8);
-        C[1] |= (uint)img[(i - 2) * c_img_step + j + 2] << (2 * 8);
-
-        C[2] |= (uint)img[(i - 1) * c_img_step + j - 3] << (3 * 8);
-        C[1] |= (uint)img[(i - 1) * c_img_step + j + 3] << 8;
-
-        C[3] |= (uint)img[i * c_img_step + j - 3];
-        v     = (int) img[i * c_img_step + j];
-        C[1] |= (uint)img[i * c_img_step + j + 3];
-
-        int d1 = diffType(v, C[1] & 0xff, threshold);
-        int d2 = diffType(v, C[3] & 0xff, threshold);
-
-        if ((d1 | d2) == 0)
-            return;
-
-        C[3] |= (uint)img[(i + 1) * c_img_step + j - 3] << 8;
-        C[0] |= (uint)img[(i + 1) * c_img_step + j + 3] << (3 * 8);
-
-        C[3] |= (uint)img[(i + 2) * c_img_step + j - 2] << (2 * 8);
-        C[0] |= (uint)img[(i + 2) * c_img_step + j + 2] << (2 * 8);
-
-        C[3] |= (uint)img[(i + 3) * c_img_step + j - 1] << (3 * 8);
-        C[0] |= (uint)img[(i + 3) * c_img_step + j];
-        C[0] |= (uint)img[(i + 3) * c_img_step + j + 1] << 8;
-
-        int mask1 = 0;
-        int mask2 = 0;
-
-        calcMask(C, v, threshold, &mask1, &mask2);
-
-        if (isKeyPoint(mask1, mask2))
-        {
-            if (calcScore) score[i * c_score_step + j] = cornerScore(C, v, threshold);
-
-            uint idx = atomic_inc(counter);
-
-            if (idx < maxKeypoints)
-            {
-                kpLoc[X_ROW * c_kploc_step + idx] = j;
-                kpLoc[Y_ROW * c_kploc_step + idx] = i;
-            }
-        }
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////
-// nonmaxSupression
-
-__kernel
-void nonmaxSupression(
-    __global const int* kpLoc,
-    __global const uint* score,
-    __global float* keypoints,
-    __global int* new_counter,
-    const int counter,
-    const int c_kploc_step,
-    const int c_score_step,
-    const int c_keypoints_step)
-{
-    const int i = get_global_id(0);
-
-    if (i < counter)
-    {
-        int loc_x = kpLoc[X_ROW * c_kploc_step + i];
-        int loc_y = kpLoc[Y_ROW * c_kploc_step + i];
-
-        int s = score[loc_y * c_score_step + loc_x];
-
-        bool ismax =
-            s > score[(loc_y - 1) * c_score_step + loc_x - 1] &&
-            s > score[(loc_y - 1) * c_score_step + loc_x    ] &&
-            s > score[(loc_y - 1) * c_score_step + loc_x + 1] &&
-
-            s > score[loc_y       * c_score_step + loc_x - 1] &&
-            s > score[loc_y       * c_score_step + loc_x + 1] &&
-
-            s > score[(loc_y + 1) * c_score_step + loc_x - 1] &&
-            s > score[(loc_y + 1) * c_score_step + loc_x    ] &&
-            s > score[(loc_y + 1) * c_score_step + loc_x + 1];
-
-        if (ismax)
-        {
-            uint idx = atomic_inc(new_counter);
-
-            keypoints[X_ROW * c_keypoints_step + idx] = (float)loc_x;
-            keypoints[Y_ROW * c_keypoints_step + idx] = (float)loc_y;
-            keypoints[RESPONSE_ROW * c_keypoints_step + idx] = (float)s;
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/filtering_adaptive_bilateral.cl b/modules/ocl/src/opencl/filtering_adaptive_bilateral.cl
deleted file mode 100644
index 81b29617c..000000000
--- a/modules/ocl/src/opencl/filtering_adaptive_bilateral.cl
+++ /dev/null
@@ -1,429 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Harris Gasparakis, harris.gasparakis@amd.com
-//    Xiaopeng Fu, fuxiaopeng2222@163.com
-//    Yao Wang, bitwangyaoyao@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef BORDER_CONSTANT
-#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
-#elif defined BORDER_REPLICATE
-#define EXTRAPOLATE(x, maxV) \
-    { \
-        x = max(min(x, maxV - 1), 0); \
-    }
-#elif defined BORDER_WRAP
-#define EXTRAPOLATE(x, maxV) \
-    { \
-        if (x < 0) \
-            x -= ((x - maxV + 1) / maxV) * maxV; \
-        if (x >= maxV) \
-            x %= maxV; \
-    }
-#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
-#define EXTRAPOLATE_(x, maxV, delta) \
-    { \
-        if (maxV == 1) \
-            x = 0; \
-        else \
-            do \
-            { \
-                if ( x < 0 ) \
-                    x = -x - 1 + delta; \
-                else \
-                    x = maxV - 1 - (x - maxV) - delta; \
-            } \
-            while (x >= maxV || x < 0); \
-    }
-#ifdef BORDER_REFLECT
-#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0)
-#else
-#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1)
-#endif
-#else
-#error No extrapolation method
-#endif
-
-__kernel void
-adaptiveBilateralFilter_C4_D0(
-    __global const uchar4 * restrict src,
-    __global uchar4 *dst,
-    float alpha,
-    int src_offset,
-    int src_whole_rows,
-    int src_whole_cols,
-    int src_step,
-    int dst_offset,
-    int dst_rows,
-    int dst_cols,
-    int dst_step,
-    __global const float* lut,
-    int lut_step)
-{
-    int col = get_local_id(0);
-    const int gX = get_group_id(0);
-    const int gY = get_group_id(1);
-
-    int src_x_off = (src_offset % src_step) >> 2;
-    int src_y_off = src_offset / src_step;
-    int dst_x_off = (dst_offset % dst_step) >> 2;
-    int dst_y_off = dst_offset / dst_step;
-
-    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
-    int startY = (gY * (1+EXTRA)) - anY + src_y_off;
-
-    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
-    int dst_startY = (gY * (1+EXTRA)) + dst_y_off;
-
-    int posX = dst_startX - dst_x_off + col;
-    int posY = (gY * (1+EXTRA))	;
-
-    __local uchar4 data[ksY+EXTRA][THREADS];
-
-    float4 tmp_sum[1+EXTRA];
-    for(int tmpint = 0; tmpint < 1+EXTRA; tmpint++)
-        tmp_sum[tmpint] = (float4)(0,0,0,0);
-
-#ifdef BORDER_CONSTANT
-    bool con;
-    uchar4 ss;
-    for(int j = 0;	j < ksY+EXTRA; j++)
-    {
-        con = (startX+col >= 0 && startX+col < src_whole_cols && startY+j >= 0 && startY+j < src_whole_rows);
-        int cur_col = clamp(startX + col, 0, src_whole_cols);
-        if (con)
-            ss = src[(startY+j)*(src_step>>2) + cur_col];
-
-        data[j][col] = con ? ss : (uchar4)0;
-    }
-#else
-    for(int j= 0; j < ksY+EXTRA; j++)
-    {
-        int selected_row = startY+j, selected_col = startX+col;
-        EXTRAPOLATE(selected_row, src_whole_rows)
-        EXTRAPOLATE(selected_col, src_whole_cols)
-
-        data[j][col] = src[selected_row * (src_step>>2) + selected_col];
-    }
-#endif
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float4 var[1+EXTRA];
-
-#if VAR_PER_CHANNEL
-    float4 weight;
-    float4 totalWeight = (float4)(0,0,0,0);
-#else
-    float weight;
-    float totalWeight = 0;
-#endif
-
-    int4 currValCenter;
-    int4 currWRTCenter;
-
-    int4 sumVal = 0;
-    int4 sumValSqr = 0;
-
-    if(col < (THREADS-(ksX-1)))
-    {
-        int4 currVal;
-        int howManyAll = (2*anX+1)*(ksY);
-
-        //find variance of all data
-        int startLMj;
-        int endLMj ;
-        // Top row: don't sum the very last element
-        for(int extraCnt = 0; extraCnt <=EXTRA; extraCnt++)
-        {
-#if CALCVAR
-            startLMj = extraCnt;
-            endLMj =  ksY+extraCnt-1;
-            sumVal = (int4)0;
-            sumValSqr= (int4)0;
-            for(int j = startLMj; j < endLMj; j++)
-                for(int i=-anX; i<=anX; i++)
-                {
-                    currVal = convert_int4(data[j][col+anX+i]);
-
-                    sumVal += currVal;
-                    sumValSqr += mul24(currVal, currVal);
-                }
-
-            var[extraCnt] = clamp( convert_float4( ( (sumValSqr * howManyAll)- mul24(sumVal , sumVal) ) ) /  ( (float)(howManyAll*howManyAll) ), (float4)(0.1f, 0.1f, 0.1f, 0.1f), (float4)(MAX_VAR_VAL, MAX_VAR_VAL, MAX_VAR_VAL, MAX_VAR_VAL)) ;
-
-#else
-            var[extraCnt] = (float4)(MAX_VAR_VAL, MAX_VAR_VAL, MAX_VAR_VAL, MAX_VAR_VAL);
-#endif
-        }
-
-        for(int extraCnt = 0; extraCnt <= EXTRA; extraCnt++)
-        {
-
-            // top row: include the very first element, even on first time
-            startLMj = extraCnt;
-            // go all the way, unless this is the last local mem chunk,
-            // then stay within limits - 1
-            endLMj =  extraCnt + ksY;
-
-            // Top row: don't sum the very last element
-            currValCenter = convert_int4( data[ (startLMj + endLMj)/2][col+anX] );
-
-            for(int j = startLMj, lut_j = 0; j < endLMj; j++, lut_j++)
-            {
-                for(int i=-anX; i<=anX; i++)
-                {
-#if FIXED_WEIGHT
-#if VAR_PER_CHANNEL
-                    weight.x = 1.0f;
-                    weight.y = 1.0f;
-                    weight.z = 1.0f;
-                    weight.w = 1.0f;
-#else
-                    weight = 1.0f;
-#endif
-#else // !FIXED_WEIGHT
-                    currVal = convert_int4(data[j][col+anX+i]);
-                    currWRTCenter = currVal-currValCenter;
-
-#if ABF_GAUSSIAN
-
-#if VAR_PER_CHANNEL
-                    weight = exp( (float4)(-0.5f, -0.5f, -0.5f, -0.5f) * convert_float4(currWRTCenter * currWRTCenter) / var[extraCnt] )*
-                        (float4)(lut[lut_j*lut_step+anX+i]);
-#else
-                    weight = exp( -0.5f * (mul24(currWRTCenter.x, currWRTCenter.x) + mul24(currWRTCenter.y, currWRTCenter.y) +
-                        mul24(currWRTCenter.z, currWRTCenter.z) ) / (var[extraCnt].x+var[extraCnt].y+var[extraCnt].z) ) * lut[lut_j*lut_step+anX+i];
-#endif
-
-#else // !ABF_GAUSSIAN
-
-#if VAR_PER_CHANNEL
-                    weight = var[extraCnt] / (var[extraCnt] + convert_float4(currWRTCenter * currWRTCenter)) *
-                        (float4)(lut[lut_j*lut_step+anX+i]);
-#else
-                    weight = ((float)lut[lut_j*lut_step+anX+i]) /(1.0f+( mul24(currWRTCenter.x, currWRTCenter.x) + mul24(currWRTCenter.y, currWRTCenter.y) +
-                        mul24(currWRTCenter.z, currWRTCenter.z))/(var[extraCnt].x+var[extraCnt].y+var[extraCnt].z));
-#endif
-
-#endif //ABF_GAUSSIAN
-
-
-
-#endif  // FIXED_WEIGHT
-
-                    tmp_sum[extraCnt] += convert_float4(data[j][col+anX+i]) * weight;
-                    totalWeight += weight;
-                }
-            }
-
-            if(posX >= 0 && posX < dst_cols && (posY+extraCnt) >= 0 && (posY+extraCnt) < dst_rows)
-                dst[(dst_startY+extraCnt) * (dst_step>>2)+ dst_startX + col] = convert_uchar4_rtz( (tmp_sum[extraCnt] / (float4)totalWeight) + (float4)0.5f);
-
-#if VAR_PER_CHANNEL
-            totalWeight = (float4)(0,0,0,0);
-#else
-            totalWeight = 0.0f;
-#endif
-        }
-    }
-}
-
-
-__kernel void
-adaptiveBilateralFilter_C1_D0(
-    __global const uchar * restrict src,
-    __global uchar *dst,
-    float alpha,
-    int src_offset,
-    int src_whole_rows,
-    int src_whole_cols,
-    int src_step,
-    int dst_offset,
-    int dst_rows,
-    int dst_cols,
-    int dst_step,
-    __global const float * lut,
-    int lut_step)
-{
-    int col = get_local_id(0);
-    const int gX = get_group_id(0);
-    const int gY = get_group_id(1);
-
-    int src_x_off = (src_offset % src_step);
-    int src_y_off = src_offset / src_step;
-    int dst_x_off = (dst_offset % dst_step);
-    int dst_y_off = dst_offset / dst_step;
-
-    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
-    int startY = (gY * (1+EXTRA)) - anY + src_y_off;
-
-    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
-    int dst_startY = (gY * (1+EXTRA)) + dst_y_off;
-
-    int posX = dst_startX - dst_x_off + col;
-    int posY = (gY * (1+EXTRA))	;
-
-    __local uchar data[ksY+EXTRA][THREADS];
-
-    float tmp_sum[1+EXTRA];
-    for(int tmpint = 0; tmpint < 1+EXTRA; tmpint++)
-    {
-        tmp_sum[tmpint] = (float)(0);
-    }
-
-#ifdef BORDER_CONSTANT
-    bool con;
-    uchar ss;
-    for(int j = 0;	j < ksY+EXTRA; j++)
-    {
-        con = (startX+col >= 0 && startX+col < src_whole_cols && startY+j >= 0 && startY+j < src_whole_rows);
-
-        int cur_col = clamp(startX + col, 0, src_whole_cols);
-        if(con)
-        {
-            ss = src[(startY+j)*(src_step) + cur_col];
-        }
-
-        data[j][col] = con ? ss : 0;
-    }
-#else
-    for(int j= 0; j < ksY+EXTRA; j++)
-    {
-        int selected_row = startY+j, selected_col = startX+col;
-        EXTRAPOLATE(selected_row, src_whole_rows)
-        EXTRAPOLATE(selected_col, src_whole_cols)
-
-        data[j][col] = src[selected_row * (src_step) + selected_col];
-    }
-#endif
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float var[1+EXTRA];
-
-    float weight;
-    float totalWeight = 0;
-
-    int currValCenter;
-    int currWRTCenter;
-
-    int sumVal = 0;
-    int sumValSqr = 0;
-
-    if(col < (THREADS-(ksX-1)))
-    {
-        int currVal;
-
-        int howManyAll = (2*anX+1)*(ksY);
-
-        //find variance of all data
-        int startLMj;
-        int endLMj;
-
-        // Top row: don't sum the very last element
-        for(int extraCnt=0; extraCnt<=EXTRA; extraCnt++)
-        {
-#if CALCVAR
-            startLMj = extraCnt;
-            endLMj =  ksY+extraCnt-1;
-            sumVal = 0;
-            sumValSqr =0;
-            for(int j = startLMj; j < endLMj; j++)
-            {
-                for(int i=-anX; i<=anX; i++)
-                {
-                    currVal	= (uint)(data[j][col+anX+i])	;
-
-                    sumVal += currVal;
-                    sumValSqr += mul24(currVal, currVal);
-                }
-            }
-            var[extraCnt] =  clamp((float)( ( (sumValSqr * howManyAll)- mul24(sumVal , sumVal) ) ) /  ( (float)(howManyAll*howManyAll) ) , 0.1f, (float)(MAX_VAR_VAL) );
-#else
-            var[extraCnt] = (float)(MAX_VAR_VAL);
-#endif
-        }
-
-        for(int extraCnt = 0; extraCnt <= EXTRA; extraCnt++)
-        {
-
-            // top row: include the very first element, even on first time
-            startLMj = extraCnt;
-            // go all the way, unless this is the last local mem chunk,
-            // then stay within limits - 1
-            endLMj =  extraCnt + ksY;
-
-            // Top row: don't sum the very last element
-            currValCenter = (int)( data[ (startLMj + endLMj)/2][col+anX] );
-
-            for(int j = startLMj, lut_j = 0; j < endLMj; j++, lut_j++)
-            {
-                for(int i=-anX; i<=anX; i++)
-                {
-#if FIXED_WEIGHT
-                    weight = 1.0f;
-#else
-                    currVal	= (int)(data[j][col+anX+i])	;
-                    currWRTCenter = currVal-currValCenter;
-
-#if ABF_GAUSSIAN
-                    weight = exp( -0.5f * (float)mul24(currWRTCenter,currWRTCenter)/var[extraCnt]) * lut[lut_j*lut_step+anX+i] ;
-#else
-                    weight = var[extraCnt] / (var[extraCnt] + (float)mul24(currWRTCenter,currWRTCenter)) * lut[lut_j*lut_step+anX+i] ;
-#endif
-#endif
-                    tmp_sum[extraCnt] += (float)(data[j][col+anX+i] * weight);
-                    totalWeight += weight;
-                }
-            }
-
-            if(posX >= 0 && posX < dst_cols && (posY+extraCnt) >= 0 && (posY+extraCnt) < dst_rows)
-            {
-                dst[(dst_startY+extraCnt) * (dst_step)+ dst_startX + col] = convert_uchar_rtz(tmp_sum[extraCnt]/totalWeight+0.5f);
-            }
-
-            totalWeight = 0;
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/filtering_boxFilter.cl b/modules/ocl/src/opencl/filtering_boxFilter.cl
deleted file mode 100644
index 96091ce6e..000000000
--- a/modules/ocl/src/opencl/filtering_boxFilter.cl
+++ /dev/null
@@ -1,376 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////Macro for border type////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef BORDER_REPLICATE
-//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
-#endif
-
-#ifdef BORDER_REFLECT
-//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_REFLECT_101
-//BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
-#endif
-
-//blur function does not support BORDER_WRAP
-#ifdef BORDER_WRAP
-//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
-#endif
-
-#ifdef EXTRA_EXTRAPOLATION // border > src image size
-#ifdef BORDER_CONSTANT
-// None
-#elif defined BORDER_REPLICATE
-#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
-    { \
-        x = max(min(x, maxX - 1), minX); \
-        y = max(min(y, maxY - 1), minY); \
-    }
-#elif defined BORDER_WRAP
-#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
-    { \
-        if (x < minX) \
-            x -= ((x - maxX + 1) / maxX) * maxX; \
-        if (x >= maxX) \
-            x %= maxX; \
-        if (y < minY) \
-            y -= ((y - maxY + 1) / maxY) * maxY; \
-        if (y >= maxY) \
-            y %= maxY; \
-    }
-#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
-#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \
-    { \
-        if (maxX - minX == 1) \
-            x = minX; \
-        else \
-            do \
-            { \
-                if (x < minX) \
-                    x = -(x - minX) - 1 + delta; \
-                else \
-                    x = maxX - 1 - (x - maxX) - delta; \
-            } \
-            while (x >= maxX || x < minX); \
-        \
-        if (maxY - minY == 1) \
-            y = minY; \
-        else \
-            do \
-            { \
-                if (y < minY) \
-                    y = -(y - minY) - 1 + delta; \
-                else \
-                    y = maxY - 1 - (y - maxY) - delta; \
-            } \
-            while (y >= maxY || y < minY); \
-    }
-#ifdef BORDER_REFLECT
-#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
-#elif defined(BORDER_REFLECT_101)
-#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
-#endif
-#else
-#error No extrapolation method
-#endif
-#else
-#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
-    { \
-        int _row = y - minY, _col = x - minX; \
-        _row = ADDR_H(_row, 0, maxY - minY); \
-        _row = ADDR_B(_row, maxY - minY, _row); \
-        y = _row + minY; \
-        \
-        _col = ADDR_L(_col, 0, maxX - minX); \
-        _col = ADDR_R(_col, maxX - minX, _col); \
-        x = _col + minX; \
-    }
-#endif
-
-#if USE_DOUBLE
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define FPTYPE double
-#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE)
-#else
-#define FPTYPE float
-#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE)
-#endif
-
-#if DATA_DEPTH == 0
-#define BASE_TYPE uchar
-#elif DATA_DEPTH == 1
-#define BASE_TYPE char
-#elif DATA_DEPTH == 2
-#define BASE_TYPE ushort
-#elif DATA_DEPTH == 3
-#define BASE_TYPE short
-#elif DATA_DEPTH == 4
-#define BASE_TYPE int
-#elif DATA_DEPTH == 5
-#define BASE_TYPE float
-#elif DATA_DEPTH == 6
-#define BASE_TYPE double
-#else
-#error data_depth
-#endif
-
-#define __CAT(x, y) x##y
-#define CAT(x, y) __CAT(x, y)
-
-#define uchar1 uchar
-#define char1 char
-#define ushort1 ushort
-#define short1 short
-#define int1 int
-#define float1 float
-#define double1 double
-
-#define convert_uchar1_sat_rte convert_uchar_sat_rte
-#define convert_char1_sat_rte convert_char_sat_rte
-#define convert_ushort1_sat_rte convert_ushort_sat_rte
-#define convert_short1_sat_rte convert_short_sat_rte
-#define convert_int1_sat_rte convert_int_sat_rte
-#define convert_float1
-#define convert_double1
-
-#if DATA_DEPTH == 5 || DATA_DEPTH == 6
-#define CONVERT_TO_TYPE CAT(CAT(convert_, BASE_TYPE), VEC_SIZE)
-#else
-#define CONVERT_TO_TYPE CAT(CAT(CAT(convert_, BASE_TYPE), VEC_SIZE), _sat_rte)
-#endif
-
-#define VEC_SIZE DATA_CHAN
-
-#define VEC_TYPE CAT(BASE_TYPE, VEC_SIZE)
-#define TYPE VEC_TYPE
-
-#define SCALAR_TYPE CAT(FPTYPE, VEC_SIZE)
-
-#define INTERMEDIATE_TYPE CAT(FPTYPE, VEC_SIZE)
-
-struct RectCoords
-{
-    int x1, y1, x2, y2;
-};
-
-//#define DEBUG
-#ifdef DEBUG
-#define DEBUG_ONLY(x) x
-#define ASSERT(condition) do { if (!(condition)) { printf("BUG in boxFilter kernel (global=%d,%d): " #condition "\n", get_global_id(0), get_global_id(1)); } } while (0)
-#else
-#define DEBUG_ONLY(x)
-#define ASSERT(condition)
-#endif
-
-
-inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global TYPE *src, const unsigned int srcStepBytes, const struct RectCoords srcCoords
-#ifdef BORDER_CONSTANT
-               , SCALAR_TYPE borderValue
-#endif
-    )
-{
-#ifdef BORDER_ISOLATED
-    if(pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
-#else
-    if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
-#endif
-    {
-        __global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
-        return CONVERT_TO_FPTYPE(*ptr);
-    }
-    else
-    {
-#ifdef BORDER_CONSTANT
-        return borderValue;
-#else
-        int selected_col = pos.x;
-        int selected_row = pos.y;
-
-        EXTRAPOLATE(selected_col, selected_row,
-#ifdef BORDER_ISOLATED
-                srcCoords.x1, srcCoords.y1,
-#else
-                0, 0,
-#endif
-                srcCoords.x2, srcCoords.y2
-         );
-
-        // debug border mapping
-        //printf("pos=%d,%d --> %d, %d\n", pos.x, pos.y, selected_col, selected_row);
-
-        pos = (int2)(selected_col, selected_row);
-        if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
-        {
-            __global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
-            return CONVERT_TO_FPTYPE(*ptr);
-        }
-        else
-        {
-            // for debug only
-            DEBUG_ONLY(printf("BUG in boxFilter kernel\n"));
-            return (FPTYPE)(0.0f);
-        }
-#endif
-    }
-}
-
-// INPUT PARAMETER: BLOCK_SIZE_Y (via defines)
-
-__kernel
-__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
-void boxFilter(__global TYPE *src, const unsigned int srcStepBytes, const int4 srcRC,
-               __global TYPE *dst, const unsigned int dstStepBytes, const int4 dstRC,
-#ifdef BORDER_CONSTANT
-               SCALAR_TYPE borderValue,
-#endif
-               FPTYPE alpha
-               )
-{
-    const struct RectCoords srcCoords = {srcRC.s0, srcRC.s1, srcRC.s2, srcRC.s3}; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
-    const struct RectCoords dstCoords = {dstRC.s0, dstRC.s1, dstRC.s2, dstRC.s3};
-
-    const int x = get_local_id(0) + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
-    const int y = get_global_id(1) * BLOCK_SIZE_Y;
-
-    const int local_id = get_local_id(0);
-
-    INTERMEDIATE_TYPE data[KERNEL_SIZE_Y];
-    __local INTERMEDIATE_TYPE sumOfCols[LOCAL_SIZE];
-
-    int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - ANCHOR_Y);
-    for(int sy = 0; sy < KERNEL_SIZE_Y; sy++, srcPos.y++)
-    {
-        data[sy] = readSrcPixel(srcPos, src, srcStepBytes, srcCoords
-#ifdef BORDER_CONSTANT
-                , borderValue
-#endif
-                );
-    }
-
-    INTERMEDIATE_TYPE tmp_sum = 0;
-    for(int sy = 0; sy < KERNEL_SIZE_Y; sy++)
-    {
-        tmp_sum += (data[sy]);
-    }
-
-    sumOfCols[local_id] = tmp_sum;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int2 pos = (int2)(dstCoords.x1 + x, dstCoords.y1 + y);
-    __global TYPE* dstPtr = (__global TYPE*)((__global char*)dst + pos.x * sizeof(TYPE) + pos.y * dstStepBytes); // Pointer can be out of bounds!
-
-    int sy_index = 0; // current index in data[] array
-    int stepsY = min(dstCoords.y2 - pos.y, BLOCK_SIZE_Y);
-    ASSERT(stepsY > 0);
-    for (; ;)
-    {
-        ASSERT(pos.y < dstCoords.y2);
-
-        if(local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&
-            pos.x >= dstCoords.x1 && pos.x < dstCoords.x2)
-        {
-            ASSERT(pos.y >= dstCoords.y1 && pos.y < dstCoords.y2);
-
-            INTERMEDIATE_TYPE total_sum = 0;
-#pragma unroll
-            for (int sx = 0; sx < KERNEL_SIZE_X; sx++)
-            {
-                total_sum += sumOfCols[local_id + sx - ANCHOR_X];
-            }
-            *dstPtr = CONVERT_TO_TYPE(((INTERMEDIATE_TYPE)alpha) * total_sum);
-        }
-
-#if BLOCK_SIZE_Y == 1
-        break;
-#else
-        if (--stepsY == 0)
-            break;
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        tmp_sum = sumOfCols[local_id]; // TODO FIX IT: workaround for BUG in OpenCL compiler
-        // only works with scalars: ASSERT(fabs(tmp_sum - sumOfCols[local_id]) < (INTERMEDIATE_TYPE)1e-6);
-        tmp_sum -= data[sy_index];
-
-        data[sy_index] = readSrcPixel(srcPos, src, srcStepBytes, srcCoords
-#ifdef BORDER_CONSTANT
-                , borderValue
-#endif
-                );
-        srcPos.y++;
-
-        tmp_sum += data[sy_index];
-        sumOfCols[local_id] = tmp_sum;
-
-        sy_index = (sy_index + 1 < KERNEL_SIZE_Y) ? sy_index + 1 : 0;
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // next line
-        DEBUG_ONLY(pos.y++);
-        dstPtr = (__global TYPE*)((__global char*)dstPtr + dstStepBytes); // Pointer can be out of bounds!
-#endif // BLOCK_SIZE_Y == 1
-    }
-}
diff --git a/modules/ocl/src/opencl/filtering_filter2D.cl b/modules/ocl/src/opencl/filtering_filter2D.cl
deleted file mode 100644
index fb7dca509..000000000
--- a/modules/ocl/src/opencl/filtering_filter2D.cl
+++ /dev/null
@@ -1,374 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef BORDER_REPLICATE
-//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
-#endif
-
-#ifdef BORDER_REFLECT
-//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_REFLECT_101
-//BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
-#endif
-
-//blur function does not support BORDER_WRAP
-#ifdef BORDER_WRAP
-//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
-#endif
-
-#ifdef EXTRA_EXTRAPOLATION // border > src image size
-#ifdef BORDER_CONSTANT
-// None
-#elif defined BORDER_REPLICATE
-#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
-    { \
-        x = max(min(x, maxX - 1), minX); \
-        y = max(min(y, maxY - 1), minY); \
-    }
-#elif defined BORDER_WRAP
-#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
-    { \
-        if (x < minX) \
-            x -= ((x - maxX + 1) / maxX) * maxX; \
-        if (x >= maxX) \
-            x %= maxX; \
-        if (y < minY) \
-            y -= ((y - maxY + 1) / maxY) * maxY; \
-        if (y >= maxY) \
-            y %= maxY; \
-    }
-#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
-#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \
-    { \
-        if (maxX - minX == 1) \
-            x = minX; \
-        else \
-            do \
-            { \
-                if (x < minX) \
-                    x = -(x - minX) - 1 + delta; \
-                else \
-                    x = maxX - 1 - (x - maxX) - delta; \
-            } \
-            while (x >= maxX || x < minX); \
-        \
-        if (maxY - minY == 1) \
-            y = minY; \
-        else \
-            do \
-            { \
-                if (y < minY) \
-                    y = -(y - minY) - 1 + delta; \
-                else \
-                    y = maxY - 1 - (y - maxY) - delta; \
-            } \
-            while (y >= maxY || y < minY); \
-    }
-#ifdef BORDER_REFLECT
-#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
-#elif defined(BORDER_REFLECT_101)
-#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
-#endif
-#else
-#error No extrapolation method
-#endif
-#else
-#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
-    { \
-        int _row = y - minY, _col = x - minX; \
-        _row = ADDR_H(_row, 0, maxY - minY); \
-        _row = ADDR_B(_row, maxY - minY, _row); \
-        y = _row + minY; \
-        \
-        _col = ADDR_L(_col, 0, maxX - minX); \
-        _col = ADDR_R(_col, maxX - minX, _col); \
-        x = _col + minX; \
-    }
-#endif
-
-#if USE_DOUBLE
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define FPTYPE double
-#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE)
-#else
-#define FPTYPE float
-#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE)
-#endif
-
-#if DATA_DEPTH == 0
-#define BASE_TYPE uchar
-#elif DATA_DEPTH == 1
-#define BASE_TYPE char
-#elif DATA_DEPTH == 2
-#define BASE_TYPE ushort
-#elif DATA_DEPTH == 3
-#define BASE_TYPE short
-#elif DATA_DEPTH == 4
-#define BASE_TYPE int
-#elif DATA_DEPTH == 5
-#define BASE_TYPE float
-#elif DATA_DEPTH == 6
-#define BASE_TYPE double
-#else
-#error data_depth
-#endif
-
-#define __CAT(x, y) x##y
-#define CAT(x, y) __CAT(x, y)
-
-#define uchar1 uchar
-#define char1 char
-#define ushort1 ushort
-#define short1 short
-#define int1 int
-#define float1 float
-#define double1 double
-
-#define convert_uchar1_sat_rte convert_uchar_sat_rte
-#define convert_char1_sat_rte convert_char_sat_rte
-#define convert_ushort1_sat_rte convert_ushort_sat_rte
-#define convert_short1_sat_rte convert_short_sat_rte
-#define convert_int1_sat_rte convert_int_sat_rte
-#define convert_float1
-#define convert_double1
-
-#if DATA_DEPTH == 5 || DATA_DEPTH == 6
-#define CONVERT_TO_TYPE CAT(CAT(convert_, BASE_TYPE), VEC_SIZE)
-#else
-#define CONVERT_TO_TYPE CAT(CAT(CAT(convert_, BASE_TYPE), VEC_SIZE), _sat_rte)
-#endif
-
-#define VEC_SIZE DATA_CHAN
-
-#define VEC_TYPE CAT(BASE_TYPE, VEC_SIZE)
-#define TYPE VEC_TYPE
-
-#define SCALAR_TYPE CAT(FPTYPE, VEC_SIZE)
-
-#define INTERMEDIATE_TYPE CAT(FPTYPE, VEC_SIZE)
-
-struct RectCoords
-{
-    int x1, y1, x2, y2;
-};
-
-//#define DEBUG
-#ifdef DEBUG
-#define DEBUG_ONLY(x) x
-#define ASSERT(condition) do { if (!(condition)) { printf("BUG in boxFilter kernel (global=%d,%d): " #condition "\n", get_global_id(0), get_global_id(1)); } } while (0)
-#else
-#define DEBUG_ONLY(x) (void)0
-#define ASSERT(condition) (void)0
-#endif
-
-
-inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global TYPE *src, const unsigned int srcStepBytes, const struct RectCoords srcCoords
-#ifdef BORDER_CONSTANT
-               , SCALAR_TYPE borderValue
-#endif
-    )
-{
-#ifdef BORDER_ISOLATED
-    if(pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
-#else
-    if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
-#endif
-    {
-        __global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
-        return CONVERT_TO_FPTYPE(*ptr);
-    }
-    else
-    {
-#ifdef BORDER_CONSTANT
-        return borderValue;
-#else
-        int selected_col = pos.x;
-        int selected_row = pos.y;
-
-        EXTRAPOLATE(selected_col, selected_row,
-#ifdef BORDER_ISOLATED
-                srcCoords.x1, srcCoords.y1,
-#else
-                0, 0,
-#endif
-                srcCoords.x2, srcCoords.y2
-         );
-
-        // debug border mapping
-        //printf("pos=%d,%d --> %d, %d\n", pos.x, pos.y, selected_col, selected_row);
-
-        pos = (int2)(selected_col, selected_row);
-        if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
-        {
-            __global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
-            return CONVERT_TO_FPTYPE(*ptr);
-        }
-        else
-        {
-            // for debug only
-            DEBUG_ONLY(printf("BUG in boxFilter kernel\n"));
-            return (FPTYPE)(0.0f);
-        }
-#endif
-    }
-}
-
-// INPUT PARAMETER: BLOCK_SIZE_Y (via defines)
-
-__kernel
-__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
-void filter2D(__global TYPE *src, const unsigned int srcStepBytes, const int4 srcRC,
-              __global TYPE *dst, const unsigned int dstStepBytes, const int4 dstRC,
-#ifdef BORDER_CONSTANT
-              SCALAR_TYPE borderValue,
-#endif
-              __constant FPTYPE* kernelData // transposed: [KERNEL_SIZE_X][KERNEL_SIZE_Y2_ALIGNED]
-              )
-{
-    const struct RectCoords srcCoords = {srcRC.s0, srcRC.s1, srcRC.s2, srcRC.s3}; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
-    struct RectCoords dstCoords = {dstRC.s0, dstRC.s1, dstRC.s2, dstRC.s3};
-
-    const int local_id = get_local_id(0);
-    const int x = local_id + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
-    const int y = get_global_id(1) * BLOCK_SIZE_Y;
-
-    INTERMEDIATE_TYPE data[KERNEL_SIZE_Y];
-    __local INTERMEDIATE_TYPE sumOfCols[LOCAL_SIZE];
-
-    int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - ANCHOR_Y);
-
-    int2 pos = (int2)(dstCoords.x1 + x, dstCoords.y1 + y);
-    __global TYPE* dstPtr = (__global TYPE*)((__global char*)dst + pos.x * sizeof(TYPE) + pos.y * dstStepBytes); // Pointer can be out of bounds!
-    bool writeResult = (local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&
-                        pos.x >= dstCoords.x1 && pos.x < dstCoords.x2);
-
-#if BLOCK_SIZE_Y > 1
-    bool readAllpixels = true;
-    int sy_index = 0; // current index in data[] array
-
-    dstCoords.y2 = min(dstCoords.y2, pos.y + BLOCK_SIZE_Y);
-    for (;
-         pos.y < dstCoords.y2;
-         pos.y++,
-         dstPtr = (__global TYPE*)((__global char*)dstPtr + dstStepBytes))
-#endif
-    {
-        ASSERT(pos.y < dstCoords.y2);
-
-        for (
-#if BLOCK_SIZE_Y > 1
-            int sy = readAllpixels ? 0 : -1; sy < (readAllpixels ? KERNEL_SIZE_Y : 0);
-#else
-            int sy = 0, sy_index = 0; sy < KERNEL_SIZE_Y;
-#endif
-            sy++, srcPos.y++)
-        {
-            data[sy + sy_index] = readSrcPixel(srcPos, src, srcStepBytes, srcCoords
-#ifdef BORDER_CONSTANT
-                    , borderValue
-#endif
-                    );
-        }
-
-        INTERMEDIATE_TYPE total_sum = 0;
-        for (int sx = 0; sx < KERNEL_SIZE_X; sx++)
-        {
-            {
-                __constant FPTYPE* k = &kernelData[KERNEL_SIZE_Y2_ALIGNED * sx
-#if BLOCK_SIZE_Y > 1
-                                                   + KERNEL_SIZE_Y - sy_index
-#endif
-                                                   ];
-                INTERMEDIATE_TYPE tmp_sum = 0;
-                for (int sy = 0; sy < KERNEL_SIZE_Y; sy++)
-                {
-                    tmp_sum += data[sy] * k[sy];
-                }
-
-                sumOfCols[local_id] = tmp_sum;
-                barrier(CLK_LOCAL_MEM_FENCE);
-            }
-
-            int id = local_id + sx - ANCHOR_X;
-            if (id >= 0 && id < LOCAL_SIZE)
-               total_sum += sumOfCols[id];
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }
-
-        if (writeResult)
-        {
-            ASSERT(pos.y >= dstCoords.y1 && pos.y < dstCoords.y2);
-            *dstPtr = CONVERT_TO_TYPE(total_sum);
-        }
-
-#if BLOCK_SIZE_Y > 1
-        readAllpixels = false;
-#if BLOCK_SIZE_Y > KERNEL_SIZE_Y
-        sy_index = (sy_index + 1 <= KERNEL_SIZE_Y) ? sy_index + 1 : 1;
-#else
-        sy_index++;
-#endif
-#endif // BLOCK_SIZE_Y == 1
-    }
-}
diff --git a/modules/ocl/src/opencl/filtering_morph.cl b/modules/ocl/src/opencl/filtering_morph.cl
deleted file mode 100644
index c402ff721..000000000
--- a/modules/ocl/src/opencl/filtering_morph.cl
+++ /dev/null
@@ -1,228 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Zero Lin, zero.lin@amd.com
-//    Yao Wang, bitwangyaoyao@gmail.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-
-#ifdef ERODE
-#define MORPH_OP(A,B) min((A),(B))
-#endif
-#ifdef DILATE
-#define MORPH_OP(A,B) max((A),(B))
-#endif
-//BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii
-#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
-#ifndef GENTYPE
-
-__kernel void morph_C1_D0(__global const uchar * restrict src,
-                          __global uchar *dst,
-                          int src_offset_x, int src_offset_y,
-                          int cols, int rows,
-                          int src_step_in_pixel, int dst_step_in_pixel,
-                          __constant uchar * mat_kernel,
-                          int src_whole_cols, int src_whole_rows,
-                          int dst_offset_in_pixel)
-{
-    int l_x = get_local_id(0);
-    int l_y = get_local_id(1);
-    int x = get_group_id(0)*4*LSIZE0;
-    int y = get_group_id(1)*LSIZE1;
-    int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
-    int end_x = x + src_offset_x+LSIZE0*4+RADIUSX & 0xfffffffc;
-    int width = (end_x -start_x+4)>>2;
-    int offset = src_offset_x-RADIUSX & 3;
-    int start_y = y+src_offset_y-RADIUSY;
-    int point1 = mad24(l_y,LSIZE0,l_x);
-    int point2 = point1 + LSIZE0*LSIZE1;
-    int tl_x = (point1 % width)<<2;
-    int tl_y = point1 / width;
-    int tl_x2 = (point2 % width)<<2;
-    int tl_y2 = point2 / width;
-    int cur_x = start_x + tl_x;
-    int cur_y = start_y + tl_y;
-    int cur_x2 = start_x + tl_x2;
-    int cur_y2 = start_y + tl_y2;
-    int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
-    int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
-    uchar4 temp0,temp1;
-    __local uchar4 LDS_DAT[2*LSIZE1*LSIZE0];
-
-    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
-    //read pixels from src
-    start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
-    start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
-    temp0 = *(__global uchar4*)&src[start_addr];
-    temp1 = *(__global uchar4*)&src[start_addr2];
-    //judge if read out of boundary
-    temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x);
-    temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y);
-    temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z);
-    temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w);
-    temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0);
-
-    temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x);
-    temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y);
-    temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z);
-    temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w);
-    temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1);
-
-    LDS_DAT[point1] = temp0;
-    LDS_DAT[point2] = temp1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    uchar4 res = (uchar4)VAL;
-
-    for(int i=0; i<2*RADIUSY+1; i++)
-        for(int j=0; j<2*RADIUSX+1; j++)
-        {
-            res =
-#ifndef RECTKERNEL
-                mat_kernel[i*(2*RADIUSX+1)+j] ?
-#endif
-                MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j))
-#ifndef RECTKERNEL
-                :res
-#endif
-                ;
-        }
-
-    int gidx = get_global_id(0)<<2;
-    int gidy = get_global_id(1);
-    int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
-
-    if(gidx+3<cols && gidy<rows && ((dst_offset_in_pixel&3)==0))
-    {
-        *(__global uchar4*)&dst[out_addr] = res;
-    }
-    else
-    {
-        if(gidx+3<cols && gidy<rows)
-        {
-            dst[out_addr] = res.x;
-            dst[out_addr+1] = res.y;
-            dst[out_addr+2] = res.z;
-            dst[out_addr+3] = res.w;
-        }
-        else if(gidx+2<cols && gidy<rows)
-        {
-            dst[out_addr] = res.x;
-            dst[out_addr+1] = res.y;
-            dst[out_addr+2] = res.z;
-        }
-        else if(gidx+1<cols && gidy<rows)
-        {
-            dst[out_addr] = res.x;
-            dst[out_addr+1] = res.y;
-        }
-        else if(gidx<cols && gidy<rows)
-        {
-            dst[out_addr] = res.x;
-        }
-    }
-}
-
-#else
-
-__kernel void morph(__global const GENTYPE * restrict src,
-                    __global GENTYPE *dst,
-                    int src_offset_x, int src_offset_y,
-                    int cols, int rows,
-                    int src_step_in_pixel, int dst_step_in_pixel,
-                    __constant uchar * mat_kernel,
-                    int src_whole_cols, int src_whole_rows,
-                    int dst_offset_in_pixel)
-{
-    int l_x = get_local_id(0);
-    int l_y = get_local_id(1);
-    int x = get_group_id(0)*LSIZE0;
-    int y = get_group_id(1)*LSIZE1;
-    int start_x = x+src_offset_x-RADIUSX;
-    int end_x = x + src_offset_x+LSIZE0+RADIUSX;
-    int width = end_x -(x+src_offset_x-RADIUSX)+1;
-    int start_y = y+src_offset_y-RADIUSY;
-    int point1 = mad24(l_y,LSIZE0,l_x);
-    int point2 = point1 + LSIZE0*LSIZE1;
-    int tl_x = point1 % width;
-    int tl_y = point1 / width;
-    int tl_x2 = point2 % width;
-    int tl_y2 = point2 / width;
-    int cur_x = start_x + tl_x;
-    int cur_y = start_y + tl_y;
-    int cur_x2 = start_x + tl_x2;
-    int cur_y2 = start_y + tl_y2;
-    int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
-    int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
-    GENTYPE temp0,temp1;
-    __local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0];
-
-    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
-    //read pixels from src
-    start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
-    start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
-    temp0 = src[start_addr];
-    temp1 = src[start_addr2];
-    //judge if read out of boundary
-    temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0);
-    temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0);
-
-    temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1);
-    temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1);
-
-    LDS_DAT[point1] = temp0;
-    LDS_DAT[point2] = temp1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    GENTYPE res = (GENTYPE)VAL;
-    for(int i=0; i<2*RADIUSY+1; i++)
-        for(int j=0; j<2*RADIUSX+1; j++)
-        {
-            res =
-#ifndef RECTKERNEL
-                mat_kernel[i*(2*RADIUSX+1)+j] ?
-#endif
-                MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)])
-#ifndef RECTKERNEL
-                :res
-#endif
-                ;
-        }
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
-    if(gidx<cols && gidy<rows)
-    {
-        dst[out_addr] = res;
-    }
-}
-
-#endif
diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl
deleted file mode 100644
index 980e85dd2..000000000
--- a/modules/ocl/src/opencl/haarobjectdetect.cl
+++ /dev/null
@@ -1,596 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Wang Weiyan, wangweiyanster@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Nathan, liujun@multicorewareinc.com
-//    Peng Xiao, pengxiao@outlook.com
-//    Erping Pang, erping@multicorewareinc.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-#define CV_HAAR_FEATURE_MAX           3
-
-#define calc_sum(rect,offset)        (sum[(rect).p0+offset] - sum[(rect).p1+offset] - sum[(rect).p2+offset] + sum[(rect).p3+offset])
-#define calc_sum1(rect,offset,i)     (sum[(rect).p0[i]+offset] - sum[(rect).p1[i]+offset] - sum[(rect).p2[i]+offset] + sum[(rect).p3[i]+offset])
-
-typedef int   sumtype;
-typedef float sqsumtype;
-
-#ifndef STUMP_BASED
-#define STUMP_BASED 1
-#endif
-
-typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
-{
-    int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
-    float weight[CV_HAAR_FEATURE_MAX];
-    float threshold;
-    float alpha[3] __attribute__((aligned (16)));
-    int left __attribute__((aligned (4)));
-    int right __attribute__((aligned (4)));
-}
-GpuHidHaarTreeNode;
-
-
-//typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
-//{
-//    int count __attribute__((aligned (4)));
-//    GpuHidHaarTreeNode* node __attribute__((aligned (8)));
-//    float* alpha __attribute__((aligned (8)));
-//}
-//GpuHidHaarClassifier;
-
-
-typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
-{
-    int  count __attribute__((aligned (4)));
-    float threshold __attribute__((aligned (4)));
-    int two_rects __attribute__((aligned (4)));
-    int reserved0 __attribute__((aligned (8)));
-    int reserved1 __attribute__((aligned (8)));
-    int reserved2 __attribute__((aligned (8)));
-    int reserved3 __attribute__((aligned (8)));
-}
-GpuHidHaarStageClassifier;
-
-
-//typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
-//{
-//    int  count __attribute__((aligned (4)));
-//    int  is_stump_based __attribute__((aligned (4)));
-//    int  has_tilted_features __attribute__((aligned (4)));
-//    int  is_tree __attribute__((aligned (4)));
-//    int pq0 __attribute__((aligned (4)));
-//    int pq1 __attribute__((aligned (4)));
-//    int pq2 __attribute__((aligned (4)));
-//    int pq3 __attribute__((aligned (4)));
-//    int p0 __attribute__((aligned (4)));
-//    int p1 __attribute__((aligned (4)));
-//    int p2 __attribute__((aligned (4)));
-//    int p3 __attribute__((aligned (4)));
-//    float inv_window_area __attribute__((aligned (4)));
-//} GpuHidHaarClassifierCascade;
-
-
-#ifdef PACKED_CLASSIFIER
-// this code is scalar, one pixel -> one workitem
-__kernel void gpuRunHaarClassifierCascadePacked(
-    global const GpuHidHaarStageClassifier * stagecascadeptr,
-    global const int4 * info,
-    global const GpuHidHaarTreeNode * nodeptr,
-    global const int * restrict sum,
-    global const float * restrict sqsum,
-    volatile global int4 * candidate,
-    const int pixelstep,
-    const int loopcount,
-    const int start_stage,
-    const int split_stage,
-    const int end_stage,
-    const int startnode,
-    const int splitnode,
-    const int4 p,
-    const int4 pq,
-    const float correction,
-    global const int* pNodesPK,
-    global const int4* pWGInfo
-    )
-
-{
-// this version used information provided for each workgroup
-// no empty WG
-    int     gid = (int)get_group_id(0);
-    int     lid_x = (int)get_local_id(0);
-    int     lid_y = (int)get_local_id(1);
-    int     lid = lid_y*LSx+lid_x;
-    int4    WGInfo = pWGInfo[gid];
-    int     GroupX = (WGInfo.y >> 16)&0xFFFF;
-    int     GroupY = (WGInfo.y >> 0 )& 0xFFFF;
-    int     Width  = (WGInfo.x >> 16)&0xFFFF;
-    int     Height = (WGInfo.x >> 0 )& 0xFFFF;
-    int     ImgOffset = WGInfo.z;
-    float   ScaleFactor = as_float(WGInfo.w);
-
-#define DATA_SIZE_X (LSx+WND_SIZE_X)
-#define DATA_SIZE_Y (LSy+WND_SIZE_Y)
-#define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y)
-
-    local int SumL[DATA_SIZE];
-
-    // read input data window into local mem
-    for(int i = 0; i<DATA_SIZE; i+=(LSx*LSy))
-    {
-        int     index = i+lid; // index in shared local memory
-        if(index<DATA_SIZE)
-        {// calc global x,y coordinat and read data from there
-            int     x = min(GroupX + (index % (DATA_SIZE_X)),Width-1);
-            int     y = min(GroupY + (index / (DATA_SIZE_X)),Height-1);
-            SumL[index] = sum[ImgOffset+y*pixelstep+x];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // calc variance_norm_factor for all stages
-    float   variance_norm_factor;
-    int     nodecounter= startnode;
-    int4    info1 = p;
-    int4    info2 = pq;
-
-    {
-        int     xl = lid_x;
-        int     yl = lid_y;
-        int     OffsetLocal =          yl * DATA_SIZE_X +         xl;
-        int     OffsetGlobal = (GroupY+yl)* pixelstep   + (GroupX+xl);
-
-        // add shift to get position on scaled image
-        OffsetGlobal += ImgOffset;
-
-        float   mean =
-            SumL[info1.y*DATA_SIZE_X+info1.x+OffsetLocal] -
-            SumL[info1.y*DATA_SIZE_X+info1.z+OffsetLocal] -
-            SumL[info1.w*DATA_SIZE_X+info1.x+OffsetLocal] +
-            SumL[info1.w*DATA_SIZE_X+info1.z+OffsetLocal];
-        float sq =
-            sqsum[info2.y*pixelstep+info2.x+OffsetGlobal] -
-            sqsum[info2.y*pixelstep+info2.z+OffsetGlobal] -
-            sqsum[info2.w*pixelstep+info2.x+OffsetGlobal] +
-            sqsum[info2.w*pixelstep+info2.z+OffsetGlobal];
-
-        mean *= correction;
-        sq *= correction;
-
-        variance_norm_factor = sq - mean * mean;
-        variance_norm_factor = (variance_norm_factor >=0.f) ? sqrt(variance_norm_factor) : 1.f;
-    }// end calc variance_norm_factor for all stages
-
-    int result = (1.0f>0.0f);
-    for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ )
-    {// iterate until candidate is exist
-        float   stage_sum = 0.0f;
-        __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
-            ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
-        int stagecount = stageinfo->count;
-        float stagethreshold = stageinfo->threshold;
-        int     lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
-        for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ )
-        {
-        // simple macro to extract shorts from int
-#define M0(_t) ((_t)&0xFFFF)
-#define M1(_t) (((_t)>>16)&0xFFFF)
-            // load packed node data from global memory (L3) into registers
-            global const int4* pN = (__global int4*)(pNodesPK+nodecounter*NODE_SIZE);
-            int4    n0 = pN[0];
-            int4    n1 = pN[1];
-            int4    n2 = pN[2];
-            float   nodethreshold  = as_float(n2.y) * variance_norm_factor;
-            // calc sum of intensity pixels according to node information
-            float classsum =
-                (SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) +
-                (SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) +
-                (SumL[M0(n1.x)+lcl_off] - SumL[M1(n1.x)+lcl_off] - SumL[M0(n1.y)+lcl_off] + SumL[M1(n1.y)+lcl_off]) * as_float(n2.x);
-            //accumulate stage responce
-            stage_sum += (classsum >= nodethreshold) ? as_float(n2.w) : as_float(n2.z);
-        }
-        result = (stage_sum >= stagethreshold);
-    }// next stage if needed
-
-    if(result)
-    {// all stages will be passed and there is a detected face on the tested position
-        int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info
-        if(index<OUTPUTSZ)
-        {
-            int     x = GroupX+lid_x;
-            int     y = GroupY+lid_y;
-            int4 candidate_result;
-            candidate_result.x = convert_int_rtn(x*ScaleFactor);
-            candidate_result.y = convert_int_rtn(y*ScaleFactor);
-            candidate_result.z = convert_int_rtn(ScaleFactor*WND_SIZE_X);
-            candidate_result.w = convert_int_rtn(ScaleFactor*WND_SIZE_Y);
-            candidate[index] = candidate_result;
-        }
-    }
-}//end gpuRunHaarClassifierCascade
-#else
-
-__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
-    global GpuHidHaarStageClassifier * stagecascadeptr,
-    global int4 * info,
-    global GpuHidHaarTreeNode * nodeptr,
-    global const int * restrict sum1,
-    global const float * restrict sqsum1,
-    global int4 * candidate,
-    const int pixelstep,
-    const int loopcount,
-    const int start_stage,
-    const int split_stage,
-    const int end_stage,
-    const int startnode,
-    const int splitnode,
-    const int4 p,
-    const int4 pq,
-    const float correction)
-{
-    int grpszx = get_local_size(0);
-    int grpszy = get_local_size(1);
-    int grpnumx = get_num_groups(0);
-    int grpidx = get_group_id(0);
-    int lclidx = get_local_id(0);
-    int lclidy = get_local_id(1);
-
-    int lcl_sz = mul24(grpszx,grpszy);
-    int lcl_id = mad24(lclidy,grpszx,lclidx);
-
-    __local int lclshare[1024];
-    __local int* lcldata = lclshare;//for save win data
-    __local int* glboutindex = lcldata + 28*28;//for save global out index
-    __local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel
-    __local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel
-    __local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1));
-    glboutindex[0]=0;
-    int outputoff = mul24(grpidx,256);
-
-    //assume window size is 20X20
-#define WINDOWSIZE 20+1
-    //make sure readwidth is the multiple of 4
-    //ystep =1, from host code
-    int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2;
-    int readheight = grpszy-1+WINDOWSIZE;
-    int read_horiz_cnt = readwidth >> 2;//each read int4
-    int total_read = mul24(read_horiz_cnt,readheight);
-    int read_loop = (total_read + lcl_sz - 1) >> 6;
-    candidate[outputoff+(lcl_id<<2)] = (int4)0;
-    candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
-    candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
-    candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
-    for(int scalei = 0; scalei <loopcount; scalei++)
-    {
-        int4 scaleinfo1= info[scalei];
-        int height = scaleinfo1.x & 0xffff;
-        int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
-        int totalgrp = scaleinfo1.y & 0xffff;
-        int imgoff = scaleinfo1.z;
-        float factor = as_float(scaleinfo1.w);
-
-        __global const int * sum = sum1 + imgoff;
-        __global const float * sqsum = sqsum1 + imgoff;
-        for(int grploop=grpidx; grploop<totalgrp; grploop+=grpnumx)
-        {
-            int grpidy = grploop / grpnumperline;
-            int grpidx = grploop - mul24(grpidy, grpnumperline);
-            int x = mad24(grpidx,grpszx,lclidx);
-            int y = mad24(grpidy,grpszy,lclidy);
-            int grpoffx = x-lclidx;
-            int grpoffy = y-lclidy;
-
-            for(int i=0; i<read_loop; i++)
-            {
-                int pos_id = mad24(i,lcl_sz,lcl_id);
-                pos_id = pos_id < total_read ? pos_id : 0;
-
-                int lcl_y = pos_id / read_horiz_cnt;
-                int lcl_x = pos_id - mul24(lcl_y, read_horiz_cnt);
-
-                int glb_x = grpoffx + (lcl_x<<2);
-                int glb_y = grpoffy + lcl_y;
-
-                int glb_off = mad24(min(glb_y, height + WINDOWSIZE - 1),pixelstep,glb_x);
-                int4 data = *(__global int4*)&sum[glb_off];
-                int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
-
-                vstore4(data, 0, &lcldata[lcl_off]);
-            }
-
-            lcloutindex[lcl_id] = 0;
-            lclcount[0] = 0;
-            int result = 1;
-            int nodecounter= startnode;
-            float mean, variance_norm_factor;
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            int lcl_off = mad24(lclidy,readwidth,lclidx);
-            int4 cascadeinfo1, cascadeinfo2;
-            cascadeinfo1 = p;
-            cascadeinfo2 = pq;
-
-            cascadeinfo1.x +=lcl_off;
-            cascadeinfo1.z +=lcl_off;
-            mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
-                    lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
-                    *correction;
-
-            int p_offset = mad24(y, pixelstep, x);
-
-            cascadeinfo2.x +=p_offset;
-            cascadeinfo2.z +=p_offset;
-            variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
-                                    sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
-
-            variance_norm_factor = variance_norm_factor * correction - mean * mean;
-            variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
-
-            for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
-            {
-                float stage_sum = 0.f;
-                __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
-                    ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
-                int stagecount = stageinfo->count;
-                float stagethreshold = stageinfo->threshold;
-                for(int nodeloop = 0; nodeloop < stagecount; )
-                {
-                    __global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*)
-                        (((__global uchar*)nodeptr) + nodecounter * sizeof(GpuHidHaarTreeNode));
-
-                    int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
-                    int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
-                    int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
-                    float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
-                    float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
-
-                    float nodethreshold  = w.w * variance_norm_factor;
-
-                    info1.x +=lcl_off;
-                    info1.z +=lcl_off;
-                    info2.x +=lcl_off;
-                    info2.z +=lcl_off;
-
-                    float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
-                                        lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
-
-                    classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
-                                    lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
-
-                    info3.x +=lcl_off;
-                    info3.z +=lcl_off;
-                    classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
-                                    lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
-
-                    bool passThres = classsum >= nodethreshold;
-#if STUMP_BASED
-                    stage_sum += passThres ? alpha3.y : alpha3.x;
-                    nodecounter++;
-                    nodeloop++;
-#else
-                    bool isRootNode = (nodecounter & 1) == 0;
-                    if(isRootNode)
-                    {
-                        if( (passThres && currentnodeptr->right) ||
-                            (!passThres && currentnodeptr->left))
-                        {
-                            nodecounter ++;
-                        }
-                        else
-                        {
-                            stage_sum += alpha3.x;
-                            nodecounter += 2;
-                            nodeloop ++;
-                        }
-                    }
-                    else
-                    {
-                        stage_sum += passThres ? alpha3.z : alpha3.y;
-                        nodecounter ++;
-                        nodeloop ++;
-                    }
-#endif
-                }
-
-                result = (stage_sum >= stagethreshold) ? 1 : 0;
-            }
-            if(factor < 2)
-            {
-                if(result && lclidx %2 ==0 && lclidy %2 ==0 )
-                {
-                    int queueindex = atomic_inc(lclcount);
-                    lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
-                    lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor);
-                }
-            }
-            else
-            {
-                if(result)
-                {
-                    int queueindex = atomic_inc(lclcount);
-                    lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
-                    lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor);
-                }
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-            int queuecount  = lclcount[0];
-            barrier(CLK_LOCAL_MEM_FENCE);
-            nodecounter = splitnode;
-            for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
-            {
-                lclcount[0]=0;
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                //int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
-                __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
-                    ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
-                int stagecount = stageinfo->count;
-                float stagethreshold = stageinfo->threshold;
-
-                int perfscale = queuecount > 4 ? 3 : 2;
-                int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
-                int lcl_compute_win = lcl_sz >> perfscale;
-                int lcl_compute_win_id = (lcl_id >>(6-perfscale));
-                int lcl_loops = (stagecount + lcl_compute_win -1) >> (6-perfscale);
-                int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
-                for(int queueloop=0; queueloop<queuecount_loop; queueloop++)
-                {
-                    float stage_sum = 0.f;
-                    int temp_coord = lcloutindex[lcl_compute_win_id<<1];
-                    float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
-                    int queue_pixel = mad24(((temp_coord  & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
-
-                    if(lcl_compute_win_id < queuecount)
-                    {
-                        int tempnodecounter = lcl_compute_id;
-                        float part_sum = 0.f;
-                        const int stump_factor = STUMP_BASED ? 1 : 2;
-                        int root_offset = 0;
-                        for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stagecount;)
-                        {
-                            __global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*)
-                                    (((__global uchar*)nodeptr) + sizeof(GpuHidHaarTreeNode) * ((nodecounter + tempnodecounter) * stump_factor + root_offset));
-
-                            int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
-                            int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
-                            int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
-                            float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
-                            float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
-                            float nodethreshold  = w.w * variance_norm_factor;
-
-                            info1.x +=queue_pixel;
-                            info1.z +=queue_pixel;
-                            info2.x +=queue_pixel;
-                            info2.z +=queue_pixel;
-
-                            float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
-                                                lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
-
-
-                            classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
-                                            lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
-
-                            info3.x +=queue_pixel;
-                            info3.z +=queue_pixel;
-                            classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
-                                            lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
-
-                            bool passThres = classsum >= nodethreshold;
-#if STUMP_BASED
-                            part_sum += passThres ? alpha3.y : alpha3.x;
-                            tempnodecounter += lcl_compute_win;
-                            lcl_loop++;
-#else
-                            if(root_offset == 0)
-                            {
-                                if( (passThres && currentnodeptr->right) ||
-                                    (!passThres && currentnodeptr->left))
-                                {
-                                    root_offset = 1;
-                                }
-                                else
-                                {
-                                    part_sum += alpha3.x;
-                                    tempnodecounter += lcl_compute_win;
-                                    lcl_loop++;
-                                }
-                            }
-                            else
-                            {
-                                part_sum += passThres ? alpha3.z : alpha3.y;
-                                tempnodecounter += lcl_compute_win;
-                                lcl_loop++;
-                                root_offset = 0;
-                            }
-#endif
-                        }//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
-                        partialsum[lcl_id]=part_sum;
-                    }
-                    barrier(CLK_LOCAL_MEM_FENCE);
-                    if(lcl_compute_win_id < queuecount)
-                    {
-                        for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
-                        {
-                            stage_sum += partialsum[lcl_id+i];
-                        }
-                        if(stage_sum >= stagethreshold && (lcl_compute_id==0))
-                        {
-                            int queueindex = atomic_inc(lclcount);
-                            lcloutindex[queueindex<<1] = temp_coord;
-                            lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
-                        }
-                        lcl_compute_win_id +=(1<<perfscale);
-                    }
-                    barrier(CLK_LOCAL_MEM_FENCE);
-                }//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
-
-                queuecount = lclcount[0];
-                barrier(CLK_LOCAL_MEM_FENCE);
-                nodecounter += stagecount;
-            }//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
-
-            if(lcl_id<queuecount)
-            {
-                int temp = lcloutindex[lcl_id<<1];
-                int x = mad24(grpidx,grpszx,temp & 0xffff);
-                int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
-                temp = glboutindex[0];
-                int4 candidate_result;
-                candidate_result.zw = (int2)convert_int_rte(factor*20.f);
-                candidate_result.x = convert_int_rte(x*factor);
-                candidate_result.y = convert_int_rte(y*factor);
-                atomic_inc(glboutindex);
-
-                int i = outputoff+temp+lcl_id;
-                if(candidate[i].z == 0)
-                {
-                    candidate[i] = candidate_result;
-                }
-                else
-                {
-                    for(i=i+1;;i++)
-                    {
-                        if(candidate[i].z == 0)
-                        {
-                            candidate[i] = candidate_result;
-                            break;
-                        }
-                    }
-                }
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-        }//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
-    }//end for(int scalei = 0; scalei <loopcount; scalei++)
-}
-#endif
diff --git a/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
deleted file mode 100644
index 09a26760b..000000000
--- a/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
+++ /dev/null
@@ -1,323 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Wu Xinglong, wxl370@126.com
-//    Sen Liu, swjtuls1987@126.com
-//    Peng Xiao, pengxiao@outlook.com
-//    Erping Pang, erping@multicorewareinc.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#define CV_HAAR_FEATURE_MAX           3
-typedef int   sumtype;
-typedef float sqsumtype;
-
-typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
-{
-    int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
-    float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
-    float threshold /*__attribute__((aligned (4)))*/;
-    float alpha[3] __attribute__((aligned(16)));
-    int left __attribute__((aligned(4)));
-    int right __attribute__((aligned(4)));
-}
-GpuHidHaarTreeNode;
-//typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier
-//{
-//    int count __attribute__((aligned(4)));
-//    GpuHidHaarTreeNode *node __attribute__((aligned(8)));
-//    float *alpha __attribute__((aligned(8)));
-//}
-//GpuHidHaarClassifier;
-typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
-{
-    int  count __attribute__((aligned(4)));
-    float threshold __attribute__((aligned(4)));
-    int two_rects __attribute__((aligned(4)));
-    int reserved0 __attribute__((aligned(8)));
-    int reserved1 __attribute__((aligned(8)));
-    int reserved2 __attribute__((aligned(8)));
-    int reserved3 __attribute__((aligned(8)));
-}
-GpuHidHaarStageClassifier;
-//typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade
-//{
-//    int  count __attribute__((aligned(4)));
-//    int  is_stump_based __attribute__((aligned(4)));
-//    int  has_tilted_features __attribute__((aligned(4)));
-//    int  is_tree __attribute__((aligned(4)));
-//    int pq0 __attribute__((aligned(4)));
-//    int pq1 __attribute__((aligned(4)));
-//    int pq2 __attribute__((aligned(4)));
-//    int pq3 __attribute__((aligned(4)));
-//    int p0 __attribute__((aligned(4)));
-//    int p1 __attribute__((aligned(4)));
-//    int p2 __attribute__((aligned(4)));
-//    int p3 __attribute__((aligned(4)));
-//    float inv_window_area __attribute__((aligned(4)));
-//} GpuHidHaarClassifierCascade;
-
-__kernel void gpuRunHaarClassifierCascade_scaled2(
-    global GpuHidHaarStageClassifier *stagecascadeptr_,
-    global int4 *info,
-    global GpuHidHaarTreeNode *nodeptr_,
-    global const int *restrict sum,
-    global const float *restrict sqsum,
-    global int4 *candidate,
-    const int rows,
-    const int cols,
-    const int step,
-    const int loopcount,
-    const int start_stage,
-    const int split_stage,
-    const int end_stage,
-    const int startnode,
-    global int4 *p,
-    global float *correction,
-    const int nodecount)
-{
-    int grpszx = get_local_size(0);
-    int grpszy = get_local_size(1);
-    int grpnumx = get_num_groups(0);
-    int grpidx = get_group_id(0);
-    int lclidx = get_local_id(0);
-    int lclidy = get_local_id(1);
-    int lcl_id = mad24(lclidy, grpszx, lclidx);
-    __local int glboutindex[1];
-    __local int lclcount[1];
-    __local int lcloutindex[64];
-    glboutindex[0] = 0;
-    int outputoff = mul24(grpidx, 256);
-    candidate[outputoff + (lcl_id << 2)] = (int4)0;
-    candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
-    candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
-    candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
-    int max_idx = rows * cols - 1;
-    for (int scalei = 0; scalei < loopcount; scalei++)
-    {
-        int4 scaleinfo1 = info[scalei];
-        int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
-        int totalgrp = scaleinfo1.y & 0xffff;
-        float factor = as_float(scaleinfo1.w);
-        float correction_t = correction[scalei];
-        float ystep = max(2.0f, factor);
-
-        for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
-        {
-            int4 cascadeinfo = p[scalei];
-            int grpidy = grploop / grpnumperline;
-            int grpidx = grploop - mul24(grpidy, grpnumperline);
-            int ix = mad24(grpidx, grpszx, lclidx);
-            int iy = mad24(grpidy, grpszy, lclidy);
-            int x = round(ix * ystep);
-            int y = round(iy * ystep);
-            lcloutindex[lcl_id] = 0;
-            lclcount[0] = 0;
-            int nodecounter;
-            float mean, variance_norm_factor;
-            //if((ix < width) && (iy < height))
-            {
-                const int p_offset = mad24(y, step, x);
-                cascadeinfo.x += p_offset;
-                cascadeinfo.z += p_offset;
-                mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
-                - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
-                        sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
-                + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
-                       * correction_t;
-                variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
-                - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
-                                       sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
-                + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
-                variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
-                variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
-                bool result = true;
-                nodecounter = startnode + nodecount * scalei;
-                for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
-                {
-                    float stage_sum = 0.f;
-                    __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
-                        (((__global uchar*)stagecascadeptr_)+stageloop*sizeof(GpuHidHaarStageClassifier));
-                    int stagecount = stageinfo->count;
-                    for (int nodeloop = 0; nodeloop < stagecount;)
-                    {
-                        __global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*)
-                            (((__global uchar*)nodeptr_) + nodecounter * sizeof(GpuHidHaarTreeNode));
-                        int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
-                        int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
-                        int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
-                        float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
-                        float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
-                        float nodethreshold  = w.w * variance_norm_factor;
-
-                        info1.x += p_offset;
-                        info1.z += p_offset;
-                        info2.x += p_offset;
-                        info2.z += p_offset;
-                        info3.x += p_offset;
-                        info3.z += p_offset;
-                        float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
-                        - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
-                                          sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
-                        + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
-                        classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
-                        - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
-                                     sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
-                        + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
-                        classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
-                        - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
-                                     sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
-                        + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
-
-                        bool passThres = (classsum >= nodethreshold) ? 1 : 0;
-
-#if STUMP_BASED
-                        stage_sum += passThres ? alpha3.y : alpha3.x;
-                        nodecounter++;
-                        nodeloop++;
-#else
-                        bool isRootNode = (nodecounter & 1) == 0;
-                        if(isRootNode)
-                        {
-                            if( (passThres && currentnodeptr->right) ||
-                                (!passThres && currentnodeptr->left))
-                            {
-                                nodecounter ++;
-                            }
-                            else
-                            {
-                                stage_sum += alpha3.x;
-                                nodecounter += 2;
-                                nodeloop ++;
-                            }
-                        }
-                        else
-                        {
-                            stage_sum += (passThres ? alpha3.z : alpha3.y);
-                            nodecounter ++;
-                            nodeloop ++;
-                        }
-#endif
-                    }
-
-                    result = (stage_sum >= stageinfo->threshold) ? 1 : 0;
-                }
-
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                if (result)
-                {
-                    int queueindex = atomic_inc(lclcount);
-                    lcloutindex[queueindex] = (y << 16) | x;
-                }
-                barrier(CLK_LOCAL_MEM_FENCE);
-                int queuecount = lclcount[0];
-
-                if (lcl_id < queuecount)
-                {
-                    int temp = lcloutindex[lcl_id];
-                    int x = temp & 0xffff;
-                    int y = (temp & (int)0xffff0000) >> 16;
-                    temp = atomic_inc(glboutindex);
-                    int4 candidate_result;
-                    candidate_result.zw = (int2)convert_int_rte(factor * 20.f);
-                    candidate_result.x = x;
-                    candidate_result.y = y;
-
-                    int i = outputoff+temp+lcl_id;
-                    if(candidate[i].z == 0)
-                    {
-                        candidate[i] = candidate_result;
-                    }
-                    else
-                    {
-                        for(i=i+1;;i++)
-                        {
-                            if(candidate[i].z == 0)
-                            {
-                                candidate[i] = candidate_result;
-                                break;
-                            }
-                        }
-                    }
-                }
-
-                barrier(CLK_LOCAL_MEM_FENCE);
-            }
-        }
-    }
-}
-__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, const int nodenum)
-{
-    const int counter = get_global_id(0);
-    int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
-    GpuHidHaarTreeNode t1 = *(__global GpuHidHaarTreeNode*)
-        (((__global uchar*)orinode) + counter * sizeof(GpuHidHaarTreeNode));
-    __global GpuHidHaarTreeNode* pNew = (__global GpuHidHaarTreeNode*)
-        (((__global uchar*)newnode) + (counter + nodenum) * sizeof(GpuHidHaarTreeNode));
-
-    #pragma unroll
-    for (i = 0; i < 3; i++)
-    {
-        tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
-        tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
-        tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
-        tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
-    }
-
-    t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]);
-
-    #pragma unroll
-    for (i = 0; i < 3; i++)
-    {
-        pNew->p[i][0] = tr_x[i];
-        pNew->p[i][1] = tr_y[i];
-        pNew->p[i][2] = tr_x[i] + tr_w[i];
-        pNew->p[i][3] = tr_y[i] + tr_h[i];
-        pNew->weight[i] = t1.weight[i] * weight_scale;
-    }
-
-    pNew->left = t1.left;
-    pNew->right = t1.right;
-    pNew->threshold = t1.threshold;
-    pNew->alpha[0] = t1.alpha[0];
-    pNew->alpha[1] = t1.alpha[1];
-    pNew->alpha[2] = t1.alpha[2];
-}
diff --git a/modules/ocl/src/opencl/imgproc_bilateral.cl b/modules/ocl/src/opencl/imgproc_bilateral.cl
deleted file mode 100644
index cb317a005..000000000
--- a/modules/ocl/src/opencl/imgproc_bilateral.cl
+++ /dev/null
@@ -1,145 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Rock Li, Rock.li@amd.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-
-__kernel void bilateral_C1_D0(__global uchar *dst,
-        __global const uchar *src,
-        const int dst_rows,
-        const int dst_cols,
-        const int maxk,
-        const int radius,
-        const int dst_step,
-        const int dst_offset,
-        const int src_step,
-        const int src_rows,
-        const int src_cols,
-        __constant float *color_weight,
-        __constant float *space_weight,
-        __constant int *space_ofs)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < dst_rows && x < dst_cols)
-    {
-        int src_index = mad24(y + radius, src_step, x + radius);
-        int dst_index = mad24(y, dst_step, x + dst_offset);
-        float sum = 0.f, wsum = 0.f;
-
-        int val0 = (int)src[src_index];
-        for(int k = 0; k < maxk; k++ )
-        {
-            int val = (int)src[src_index + space_ofs[k]];
-            float w = space_weight[k] * color_weight[abs(val - val0)];
-            sum += (float)(val) * w;
-            wsum += w;
-        }
-        dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f);
-    }
-}
-
-__kernel void bilateral2_C1_D0(__global uchar *dst,
-        __global const uchar *src,
-        const int dst_rows,
-        const int dst_cols,
-        const int maxk,
-        const int radius,
-        const int dst_step,
-        const int dst_offset,
-        const int src_step,
-        const int src_rows,
-        const int src_cols,
-        __constant float *color_weight,
-        __constant float *space_weight,
-        __constant int *space_ofs)
-{
-    int x = get_global_id(0) << 2;
-    int y = get_global_id(1);
-
-    if (y < dst_rows && x < dst_cols)
-    {
-        int src_index = mad24(y + radius, src_step, x + radius);
-        int dst_index = mad24(y, dst_step, x + dst_offset);
-        float4 sum = (float4)(0.f), wsum = (float4)(0.f);
-
-        int4 val0 = convert_int4(vload4(0,src + src_index));
-        for(int k = 0; k < maxk; k++ )
-        {
-            int4 val = convert_int4(vload4(0,src+src_index + space_ofs[k]));
-            float4 w = (float4)(space_weight[k]) * (float4)(color_weight[abs(val.x - val0.x)], color_weight[abs(val.y - val0.y)],
-                color_weight[abs(val.z - val0.z)], color_weight[abs(val.w - val0.w)]);
-            sum += convert_float4(val) * w;
-            wsum += w;
-        }
-        *(__global uchar4*)(dst+dst_index) = convert_uchar4_rtz(sum/wsum+0.5f);
-    }
-}
-
-__kernel void bilateral_C4_D0(__global uchar4 *dst,
-        __global const uchar4 *src,
-        const int dst_rows,
-        const int dst_cols,
-        const int maxk,
-        const int radius,
-        const int dst_step,
-        const int dst_offset,
-        const int src_step,
-        const int src_rows,
-        const int src_cols,
-        __constant float *color_weight,
-        __constant float *space_weight,
-        __constant int *space_ofs)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < dst_rows && x < dst_cols)
-    {
-        int src_index = mad24(y + radius, src_step, x + radius);
-        int dst_index = mad24(y, dst_step, x + dst_offset);
-        float4 sum = (float4)0.f;
-        float wsum = 0.f;
-
-        int4 val0 = convert_int4(src[src_index]);
-        for(int k = 0; k < maxk; k++ )
-        {
-            int4 val = convert_int4(src[src_index + space_ofs[k]]);
-            float w = space_weight[k] * color_weight[abs(val.x - val0.x) + abs(val.y - val0.y) + abs(val.z - val0.z)];
-            sum += convert_float4(val) * (float4)w;
-            wsum += w;
-        }
-
-        wsum = 1.f / wsum;
-        dst[dst_index] = convert_uchar4_rtz(sum * (float4)wsum + (float4)0.5f);
-    }
-}
diff --git a/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl b/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl
deleted file mode 100644
index 7cb4c8ff3..000000000
--- a/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl
+++ /dev/null
@@ -1,204 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////Macro for border type////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#ifdef BORDER_CONSTANT
-#elif defined BORDER_REPLICATE
-#define EXTRAPOLATE(x, maxV) \
-    { \
-        x = max(min(x, maxV - 1), 0); \
-    }
-#elif defined BORDER_WRAP
-#define EXTRAPOLATE(x, maxV) \
-    { \
-        if (x < 0) \
-            x -= ((x - maxV + 1) / maxV) * maxV; \
-        if (x >= maxV) \
-            x %= maxV; \
-    }
-#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT101)
-#define EXTRAPOLATE_(x, maxV, delta) \
-    { \
-        if (maxV == 1) \
-            x = 0; \
-        else \
-            do \
-            { \
-                if ( x < 0 ) \
-                    x = -x - 1 + delta; \
-                else \
-                    x = maxV - 1 - (x - maxV) - delta; \
-            } \
-            while (x >= maxV || x < 0); \
-    }
-#ifdef BORDER_REFLECT
-#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0)
-#else
-#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1)
-#endif
-#else
-#error No extrapolation method
-#endif
-
-#define THREADS 256
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////calcHarris////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, __global float *dst,
-                              int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
-                              int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
-                              int dst_offset, int dst_rows, int dst_cols, int dst_step, float k)
-{
-    int col = get_local_id(0);
-    int gX = get_group_id(0);
-    int gY = get_group_id(1);
-    int gly = get_global_id(1);
-
-    int dx_x_off = (dx_offset % dx_step) >> 2;
-    int dx_y_off = dx_offset / dx_step;
-    int dy_x_off = (dy_offset % dy_step) >> 2;
-    int dy_y_off = dy_offset / dy_step;
-    int dst_x_off = (dst_offset % dst_step) >> 2;
-    int dst_y_off = dst_offset / dst_step;
-
-    int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off;
-    int dx_startY = (gY << 1) - anY + dx_y_off;
-    int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off;
-    int dy_startY = (gY << 1) - anY + dy_y_off;
-    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
-    int dst_startY = (gY << 1) + dst_y_off;
-
-    float dx_data[ksY+1], dy_data[ksY+1], data[3][ksY+1];
-    __local float temp[6][THREADS];
-
-#ifdef BORDER_CONSTANT
-    for (int i=0; i < ksY+1; i++)
-    {
-        bool dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
-        int indexDx = (dx_startY+i)*(dx_step>>2)+(dx_startX+col);
-        float dx_s = dx_con ? Dx[indexDx] : 0.0f;
-        dx_data[i] = dx_s;
-
-        bool dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
-        int indexDy = (dy_startY+i)*(dy_step>>2)+(dy_startX+col);
-        float dy_s = dy_con ? Dy[indexDy] : 0.0f;
-        dy_data[i] = dy_s;
-
-        data[0][i] = dx_data[i] * dx_data[i];
-        data[1][i] = dx_data[i] * dy_data[i];
-        data[2][i] = dy_data[i] * dy_data[i];
-    }
-#else
-    int clamped_col = min(dst_cols, col);
-    for (int i=0; i < ksY+1; i++)
-    {
-        int dx_selected_row = dx_startY+i, dx_selected_col = dx_startX+clamped_col;
-        EXTRAPOLATE(dx_selected_row, dx_whole_rows)
-        EXTRAPOLATE(dx_selected_col, dx_whole_cols)
-        dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
-
-        int dy_selected_row = dy_startY+i, dy_selected_col = dy_startX+clamped_col;
-        EXTRAPOLATE(dy_selected_row, dy_whole_rows)
-        EXTRAPOLATE(dy_selected_col, dy_whole_cols)
-        dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
-
-        data[0][i] = dx_data[i] * dx_data[i];
-        data[1][i] = dx_data[i] * dy_data[i];
-        data[2][i] = dy_data[i] * dy_data[i];
-    }
-#endif
-    float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;
-    for (int i=1; i < ksY; i++)
-    {
-        sum0 += (data[0][i]);
-        sum1 += (data[1][i]);
-        sum2 += (data[2][i]);
-    }
-
-    float sum01 = sum0 + (data[0][0]);
-    float sum02 = sum0 + (data[0][ksY]);
-    temp[0][col] = sum01;
-    temp[1][col] = sum02;
-    float sum11 = sum1 + (data[1][0]);
-    float sum12 = sum1 + (data[1][ksY]);
-    temp[2][col] = sum11;
-    temp[3][col] = sum12;
-    float sum21 = sum2 + (data[2][0]);
-    float sum22 = sum2 + (data[2][ksY]);
-    temp[4][col] = sum21;
-    temp[5][col] = sum22;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(col < (THREADS-(ksX-1)))
-    {
-        col += anX;
-        int posX = dst_startX - dst_x_off + col - anX;
-        int posY = (gly << 1);
-        int till = (ksX + 1)%2;
-        float tmp_sum[6] = { 0.0f, 0.0f , 0.0f, 0.0f, 0.0f, 0.0f };
-        for (int k=0; k<6; k++)
-            for (int i=-anX; i<=anX - till; i++)
-                tmp_sum[k] += temp[k][col+i];
-
-        if(posX < dst_cols && (posY) < dst_rows)
-        {
-            float a = tmp_sum[0] * 0.5f;
-            float b = tmp_sum[2];
-            float c = tmp_sum[4] * 0.5f;
-            dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
-        }
-        if (posX < dst_cols && (posY + 1) < dst_rows)
-        {
-            float a = tmp_sum[1] * 0.5f;
-            float b = tmp_sum[3];
-            float c = tmp_sum[5] * 0.5f;
-            dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/imgproc_columnsum.cl b/modules/ocl/src/opencl/imgproc_columnsum.cl
deleted file mode 100644
index 6b596a322..000000000
--- a/modules/ocl/src/opencl/imgproc_columnsum.cl
+++ /dev/null
@@ -1,70 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Chunpeng Zhang chunpeng@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-////////////////////////////////////////////////////////////////////
-///////////////////////// columnSum ////////////////////////////////
-////////////////////////////////////////////////////////////////////
-
-__kernel void columnSum_C1_D5(__global float * src, __global float * dst,
-    int cols, int rows, int src_step, int dst_step, int src_offset, int dst_offset)
-{
-    const int x = get_global_id(0);
-
-    if (x < cols)
-    {
-        int srcIdx = x + src_offset;
-        int dstIdx = x + dst_offset;
-
-        float sum = 0;
-
-        for (int y = 0; y < rows; ++y)
-        {
-            sum += src[srcIdx];
-            dst[dstIdx] = sum;
-            srcIdx += src_step;
-            dstIdx += dst_step;
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/imgproc_convolve.cl b/modules/ocl/src/opencl/imgproc_convolve.cl
deleted file mode 100644
index b8f974219..000000000
--- a/modules/ocl/src/opencl/imgproc_convolve.cl
+++ /dev/null
@@ -1,111 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jiang Liyuan, jlyuan001.good@163.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-/************************************** convolve **************************************/
-
-__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst,
-                          int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight,
-                          int src_offset, int dst_offset, int koffset)
-{
-    __local float smem[16 + 2 * 8][16 + 2 * 8];
-
-    int x = get_local_id(0);
-    int y = get_local_id(1);
-    int gx = get_global_id(0);
-    int gy = get_global_id(1);
-
-            // x | x 0 | 0
-            // -----------
-            // x | x 0 | 0
-            // 0 | 0 0 | 0
-            // -----------
-            // 0 | 0 0 | 0
-    smem[y][x] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset];
-
-            // 0 | 0 x | x
-            // -----------
-            // 0 | 0 x | x
-            // 0 | 0 0 | 0
-            // -----------
-            // 0 | 0 0 | 0
-    smem[y][x + 16] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset];
-
-            // 0 | 0 0 | 0
-            // -----------
-            // 0 | 0 0 | 0
-            // x | x 0 | 0
-            // -----------
-            // x | x 0 | 0
-    smem[y + 16][x] = src[min(gy + 8, rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset];
-
-            // 0 | 0 0 | 0
-            // -----------
-            // 0 | 0 0 | 0
-            // 0 | 0 x | x
-            // -----------
-            // 0 | 0 x | x
-    smem[y + 16][x + 16] = src[min(gy + 8, rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset];
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (gx < cols && gy < rows)
-    {
-        float res = 0;
-
-        for (int i = 0; i < kHeight; ++i)
-            for (int j = 0; j < kWidth; ++j)
-                res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * k_step + j + koffset];
-
-        dst[gy * dst_step + gx + dst_offset] = res;
-    }
-}
diff --git a/modules/ocl/src/opencl/imgproc_copymakeboder.cl b/modules/ocl/src/opencl/imgproc_copymakeboder.cl
deleted file mode 100644
index ac149a46b..000000000
--- a/modules/ocl/src/opencl/imgproc_copymakeboder.cl
+++ /dev/null
@@ -1,134 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Zero Lin zero.lin@amd.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#ifdef BORDER_CONSTANT
-#define EXTRAPOLATE(x, y, v) v = scalar;
-#elif defined BORDER_REPLICATE
-#define EXTRAPOLATE(x, y, v) \
-    { \
-        x = max(min(x, src_cols - 1), 0); \
-        y = max(min(y, src_rows - 1), 0); \
-        v = src[mad24(y, src_step, x + src_offset)]; \
-    }
-#elif defined BORDER_WRAP
-#define EXTRAPOLATE(x, y, v) \
-    { \
-        if (x < 0) \
-            x -= ((x - src_cols + 1) / src_cols) * src_cols; \
-        if (x >= src_cols) \
-            x %= src_cols; \
-        \
-        if (y < 0) \
-            y -= ((y - src_rows + 1) / src_rows) * src_rows; \
-        if( y >= src_rows ) \
-            y %= src_rows; \
-        v = src[mad24(y, src_step, x + src_offset)]; \
-    }
-#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
-#ifdef BORDER_REFLECT
-#define DELTA int delta = 0
-#else
-#define DELTA int delta = 1
-#endif
-#define EXTRAPOLATE(x, y, v) \
-    { \
-        DELTA; \
-        if (src_cols == 1) \
-            x = 0; \
-        else \
-            do \
-            { \
-                if( x < 0 ) \
-                    x = -x - 1 + delta; \
-                else \
-                    x = src_cols - 1 - (x - src_cols) - delta; \
-            } \
-            while (x >= src_cols || x < 0); \
-        \
-        if (src_rows == 1) \
-            y = 0; \
-        else \
-            do \
-            { \
-                if( y < 0 ) \
-                    y = -y - 1 + delta; \
-                else \
-                    y = src_rows - 1 - (y - src_rows) - delta; \
-            } \
-            while (y >= src_rows || y < 0); \
-        v = src[mad24(y, src_step, x + src_offset)]; \
-    }
-#else
-#error No extrapolation method
-#endif
-
-#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0)
-
-__kernel void copymakeborder
-                        (__global const GENTYPE *src,
-                         __global GENTYPE *dst,
-                         int dst_cols, int dst_rows,
-                         int src_cols, int src_rows,
-                         int src_step, int src_offset,
-                         int dst_step, int dst_offset,
-                         int top, int left, GENTYPE scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < dst_cols && y < dst_rows)
-    {
-        int src_x = x - left;
-        int src_y = y - top;
-        int dst_index = mad24(y, dst_step, x + dst_offset);
-
-        if (NEED_EXTRAPOLATION(src_x, src_y))
-            EXTRAPOLATE(src_x, src_y, dst[dst_index])
-        else
-        {
-            int src_index = mad24(src_y, src_step, src_x + src_offset);
-            dst[dst_index] = src[src_index];
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/imgproc_gftt.cl b/modules/ocl/src/opencl/imgproc_gftt.cl
deleted file mode 100644
index 80bdec08f..000000000
--- a/modules/ocl/src/opencl/imgproc_gftt.cl
+++ /dev/null
@@ -1,275 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef WITH_MASK
-#define WITH_MASK 0
-#endif
-
-__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
-
-inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
-{
-    return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
-}
-
-inline float ELEM_FLT2(image2d_t _eig, float2 pt)
-{
-    return read_imagef(_eig, sampler, pt).x;
-}
-
-__kernel
-    void findCorners
-    (
-        image2d_t eig,
-        __global const char * mask,
-        __global float2 * corners,
-        const int mask_strip,// in pixels
-        const float threshold,
-        const int rows,
-        const int cols,
-        const int max_count,
-        __global int * g_counter
-    )
-{
-    const int j = get_global_id(0);
-    const int i = get_global_id(1);
-
-    if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1
-#if WITH_MASK
-        && mask[i * mask_strip + j] != 0
-#endif
-        )
-    {
-        const float val = ELEM_INT2(eig, j, i);
-
-        if (val > threshold)
-        {
-            float maxVal = val;
-
-            maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j    , i - 1), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
-
-            maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
-
-            maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j    , i + 1), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
-
-            if (val == maxVal)
-            {
-                const int ind = atomic_inc(g_counter);
-
-                if (ind < max_count)
-                    corners[ind] = (float2)(j, i);
-            }
-        }
-    }
-}
-
-//bitonic sort
-__kernel
-    void sortCorners_bitonicSort
-    (
-        image2d_t eig,
-        __global float2 * corners,
-        const int count,
-        const int stage,
-        const int passOfStage
-    )
-{
-    const int threadId = get_global_id(0);
-    if(threadId >= count / 2)
-    {
-        return;
-    }
-
-    const int sortOrder = (((threadId/(1 << stage)) % 2)) == 1 ? 1 : 0; // 0 is descent
-
-    const int pairDistance = 1 << (stage - passOfStage);
-    const int blockWidth   = 2 * pairDistance;
-
-    const int leftId = min( (threadId % pairDistance)
-                   + (threadId / pairDistance) * blockWidth, count );
-
-    const int rightId = min( leftId + pairDistance, count );
-
-    const float2 leftPt  = corners[leftId];
-    const float2 rightPt = corners[rightId];
-
-    const float leftVal  = ELEM_FLT2(eig, leftPt);
-    const float rightVal = ELEM_FLT2(eig, rightPt);
-
-    const bool compareResult = leftVal > rightVal;
-
-    float2 greater = compareResult ? leftPt:rightPt;
-    float2 lesser  = compareResult ? rightPt:leftPt;
-
-    corners[leftId]  = sortOrder ? lesser : greater;
-    corners[rightId] = sortOrder ? greater : lesser;
-}
-
-//selection sort for gfft
-//kernel is ported from Bolt library:
-//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
-//  Local sort will firstly sort elements of each workgroup using selection sort
-//  its performance is O(n)
-__kernel
-    void sortCorners_selectionSortLocal
-    (
-        image2d_t eig,
-        __global float2 * corners,
-        const int count,
-        __local float2 * scratch
-    )
-{
-    int          i  = get_local_id(0); // index in workgroup
-    int numOfGroups = get_num_groups(0); // index in workgroup
-    int groupID     = get_group_id(0);
-    int         wg  = get_local_size(0); // workgroup size = block size
-    int n; // number of elements to be processed for this work group
-
-    int offset   = groupID * wg;
-    int same     = 0;
-    corners      += offset;
-    n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
-    float2 pt1, pt2;
-
-    pt1 = corners[min(i, n)];
-    scratch[i] = pt1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(i >= n)
-    {
-        return;
-    }
-
-    float val1 = ELEM_FLT2(eig, pt1);
-    float val2;
-
-    int pos = 0;
-    for (int j=0;j<n;++j)
-    {
-        pt2  = scratch[j];
-        val2 = ELEM_FLT2(eig, pt2);
-        if(val2 > val1)
-            pos++;//calculate the rank of this element in this work group
-        else
-        {
-            if(val1 > val2)
-                continue;
-            else
-            {
-                // val1 and val2 are same
-                same++;
-            }
-        }
-    }
-    for (int j=0; j< same; j++)
-        corners[pos + j] = pt1;
-}
-__kernel
-    void sortCorners_selectionSortFinal
-    (
-        image2d_t eig,
-        __global float2 * corners,
-        const int count
-    )
-{
-    const int          i  = get_local_id(0); // index in workgroup
-    const int numOfGroups = get_num_groups(0); // index in workgroup
-    const int groupID     = get_group_id(0);
-    const int         wg  = get_local_size(0); // workgroup size = block size
-    int pos = 0, same = 0;
-    const int offset = get_group_id(0) * wg;
-    const int remainder = count - wg*(numOfGroups-1);
-
-    if((offset + i ) >= count)
-        return;
-    float2 pt1, pt2;
-    pt1 = corners[groupID*wg + i];
-
-    float val1 = ELEM_FLT2(eig, pt1);
-    float val2;
-
-    for(int j=0; j<numOfGroups-1; j++ )
-    {
-        for(int k=0; k<wg; k++)
-        {
-            pt2  = corners[j*wg + k];
-            val2 = ELEM_FLT2(eig, pt2);
-            if(val1 > val2)
-                break;
-            else
-            {
-                //Increment only if the value is not the same.
-                if( val2 > val1 )
-                    pos++;
-                else
-                    same++;
-            }
-        }
-    }
-
-    for(int k=0; k<remainder; k++)
-    {
-        pt2  = corners[(numOfGroups-1)*wg + k];
-        val2 = ELEM_FLT2(eig, pt2);
-        if(val1 > val2)
-            break;
-        else
-        {
-            //Don't increment if the value is the same.
-            //Two elements are same if (*userComp)(jData, iData)  and (*userComp)(iData, jData) are both false
-            if(val2 > val1)
-                pos++;
-            else
-                same++;
-        }
-    }
-    for (int j=0; j< same; j++)
-        corners[pos + j] = pt1;
-}
diff --git a/modules/ocl/src/opencl/imgproc_histogram.cl b/modules/ocl/src/opencl/imgproc_histogram.cl
deleted file mode 100644
index bac9a6b89..000000000
--- a/modules/ocl/src/opencl/imgproc_histogram.cl
+++ /dev/null
@@ -1,279 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Xu Pang, pangxu010@163.com
-//    Wenju He, wenju@multicorewareinc.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-#define PARTIAL_HISTOGRAM256_COUNT     (256)
-#define HISTOGRAM256_BIN_COUNT         (256)
-
-#define HISTOGRAM256_WORK_GROUP_SIZE     (256)
-#define HISTOGRAM256_LOCAL_MEM_SIZE      (HISTOGRAM256_BIN_COUNT)
-
-#define NBANKS (16)
-#define NBANKS_BIT (4)
-
-
-__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void calc_sub_hist_D0(
-                                                                      __global const uint4* src,
-                                          int src_step, int src_offset,
-                                                                      __global int* globalHist,
-                                                                      int dataCount,  int cols,
-                                          int inc_x, int inc_y,
-                                          int hist_step)
-{
-        __local int subhist[(HISTOGRAM256_BIN_COUNT << NBANKS_BIT)]; // NBINS*NBANKS
-        int gid = get_global_id(0);
-        int lid = get_local_id(0);
-        int gx  = get_group_id(0);
-        int gsize = get_global_size(0);
-        int lsize  = get_local_size(0);
-        const int shift = 8;
-        const int mask = HISTOGRAM256_BIN_COUNT-1;
-        int offset = (lid & (NBANKS-1));// lid % NBANKS
-        uint4 data, temp1, temp2, temp3, temp4;
-        src += src_offset;
-
-        //clear LDS
-        for(int i=0, idx=lid; i<(NBANKS >> 2); i++, idx += lsize)
-        {
-            subhist[idx] = 0;
-            subhist[idx+=lsize] = 0;
-            subhist[idx+=lsize] = 0;
-            subhist[idx+=lsize] = 0;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        //read and scatter
-        int y = gid/cols;
-        int x = gid - mul24(y, cols);
-        for(int idx=gid; idx<dataCount; idx+=gsize)
-        {
-              data = src[mad24(y, src_step, x)];
-              temp1 = ((data & mask) << NBANKS_BIT) + offset;
-              data >>= shift;
-              temp2 = ((data & mask) << NBANKS_BIT) + offset;
-              data >>= shift;
-              temp3 = ((data & mask) << NBANKS_BIT) + offset;
-              data >>= shift;
-              temp4 = ((data & mask) << NBANKS_BIT) + offset;
-
-              atomic_inc(subhist + temp1.x);
-              atomic_inc(subhist + temp1.y);
-              atomic_inc(subhist + temp1.z);
-              atomic_inc(subhist + temp1.w);
-
-              atomic_inc(subhist + temp2.x);
-              atomic_inc(subhist + temp2.y);
-              atomic_inc(subhist + temp2.z);
-              atomic_inc(subhist + temp2.w);
-
-              atomic_inc(subhist + temp3.x);
-              atomic_inc(subhist + temp3.y);
-              atomic_inc(subhist + temp3.z);
-              atomic_inc(subhist + temp3.w);
-
-              atomic_inc(subhist + temp4.x);
-              atomic_inc(subhist + temp4.y);
-              atomic_inc(subhist + temp4.z);
-              atomic_inc(subhist + temp4.w);
-
-              x += inc_x;
-              int off = ((x>=cols) ? -1 : 0);
-              x = mad24(off, cols, x);
-              y += inc_y - off;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        //reduce local banks to single histogram per workgroup
-        int bin1=0, bin2=0, bin3=0, bin4=0;
-        for(int i=0; i<NBANKS; i+=4)
-        {
-             bin1 += subhist[(lid << NBANKS_BIT) + i];
-             bin2 += subhist[(lid << NBANKS_BIT) + i+1];
-             bin3 += subhist[(lid << NBANKS_BIT) + i+2];
-             bin4 += subhist[(lid << NBANKS_BIT) + i+3];
-        }
-
-        globalHist[mad24(gx, hist_step, lid)] = bin1+bin2+bin3+bin4;
-}
-
-__kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))
-calc_sub_hist_border_D0(__global const uchar* src, int src_step, int src_offset,
-                        __global int* globalHist, int left_col, int cols,
-                        int rows, int hist_step)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-        int lidy = get_local_id(1);
-        int gx = get_group_id(0);
-        int gy = get_group_id(1);
-        int gn = get_num_groups(0);
-        int rowIndex = mad24(gy, gn, gx);
-//        rowIndex &= (PARTIAL_HISTOGRAM256_COUNT - 1);
-
-        __local int subhist[HISTOGRAM256_LOCAL_MEM_SIZE];
-        subhist[lidy] = 0;
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        gidx = ((gidx>=left_col) ? (gidx+cols) : gidx);
-        if(gidy<rows)
-        {
-            int src_index = src_offset + mad24(gidy, src_step, gidx);
-            int p = (int)src[src_index];
-//	    p = gidy >= rows ? HISTOGRAM256_LOCAL_MEM_SIZE : p;
-            atomic_inc(subhist + p);
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy];
-}
-
-__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf,
-                __global int* hist,
-                int src_step)
-{
-    int lx = get_local_id(0);
-    int gx = get_group_id(0);
-
-    int sum = 0;
-
-    for(int i = lx; i < PARTIAL_HISTOGRAM256_COUNT; i += HISTOGRAM256_WORK_GROUP_SIZE)
-        sum += buf[ mad24(i, src_step, gx)];
-
-    __local int data[HISTOGRAM256_WORK_GROUP_SIZE];
-    data[lx] = sum;
-
-    for(int stride = HISTOGRAM256_WORK_GROUP_SIZE /2; stride > 0; stride >>= 1)
-    {
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lx < stride)
-            data[lx] += data[lx + stride];
-    }
-
-    if(lx == 0)
-        hist[gx] = data[0];
-}
-
-__kernel __attribute__((reqd_work_group_size(256,1,1)))
-void calLUT(__global uchar * dst, __constant int * hist, int total)
-{
-    int lid = get_local_id(0);
-    __local int sumhist[HISTOGRAM256_BIN_COUNT];
-    __local float scale;
-
-    sumhist[lid] = hist[lid];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (lid == 0)
-    {
-        int sum = 0, i = 0;
-        while (!sumhist[i])
-            ++i;
-
-        if (total == sumhist[i])
-        {
-            scale = 1;
-            for (int j = 0; j < HISTOGRAM256_BIN_COUNT; ++j)
-                sumhist[i] = i;
-        }
-        else
-        {
-            scale = 255.f/(total - sumhist[i]);
-
-            for (sumhist[i++] = 0; i < HISTOGRAM256_BIN_COUNT; i++)
-            {
-                sum += sumhist[i];
-                sumhist[i] = sum;
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    dst[lid]= convert_uchar_sat_rte(convert_float(sumhist[lid])*scale);
-}
-
-/*
-///////////////////////////////equalizeHist//////////////////////////////////////////////////
-__kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist(
-                            __global uchar * src,
-                            __global uchar * dst,
-                            __constant int * hist,
-                            int srcstep,
-                            int srcoffset,
-                            int dststep,
-                            int dstoffset,
-                            int width,
-                            int height,
-                            float scale,
-                            int inc_x,
-                            int inc_y)
-{
-    int gidx = get_global_id(0);
-    int lid = get_local_id(0);
-    int glb_size = get_global_size(0);
-    src+=srcoffset;
-    dst+=dstoffset;
-    __local int sumhist[HISTOGRAM256_BIN_COUNT];
-    __local uchar lut[HISTOGRAM256_BIN_COUNT+1];
-
-    sumhist[lid]=hist[lid];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(lid==0)
-    {
-        int sum = 0;
-        for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
-        {
-            sum+=sumhist[i];
-            sumhist[i]=sum;
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    lut[lid]= convert_uchar_sat(convert_float(sumhist[lid])*scale);
-    lut[0]=0;
-    int pos_y = gidx / width;
-    int pos_x = gidx - mul24(pos_y, width);
-
-    for(int pos = gidx; pos < mul24(width,height); pos += glb_size)
-    {
-        int inaddr = mad24(pos_y,srcstep,pos_x);
-        int outaddr = mad24(pos_y,dststep,pos_x);
-        dst[outaddr] = lut[src[inaddr]];
-        pos_x +=inc_x;
-        int off = (pos_x >= width ? -1 : 0);
-        pos_x =  mad24(off,width,pos_x);
-        pos_y += inc_y - off;
-    }
-}
-*/
diff --git a/modules/ocl/src/opencl/imgproc_hough.cl b/modules/ocl/src/opencl/imgproc_hough.cl
deleted file mode 100644
index fd1c5b9a8..000000000
--- a/modules/ocl/src/opencl/imgproc_hough.cl
+++ /dev/null
@@ -1,280 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or bpied warranties, including, but not limited to, the bpied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-
-////////////////////////////////////////////////////////////////////////
-// buildPointList
-
-#define PIXELS_PER_THREAD 16
-
-// TODO: add offset to support ROI
-__kernel void buildPointList(__global const uchar* src,
-                             int cols,
-                             int rows,
-                             int step,
-                             __global unsigned int* list,
-                             __global int* counter)
-{
-    __local unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
-    __local int s_qsize[4];
-    __local int s_globStart[4];
-
-    const int x = get_group_id(0) * get_local_size(0) * PIXELS_PER_THREAD + get_local_id(0);
-    const int y = get_global_id(1);
-
-    if (get_local_id(0) == 0)
-        s_qsize[get_local_id(1)] = 0;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (y < rows)
-    {
-        // fill the queue
-        __global const uchar* srcRow = &src[y * step];
-        for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < cols; ++i, xx += get_local_size(0))
-        {
-            if (srcRow[xx])
-            {
-                const unsigned int val = (y << 16) | xx;
-                const int qidx = atomic_add(&s_qsize[get_local_id(1)], 1);
-                s_queues[get_local_id(1)][qidx] = val;
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // let one work-item reserve the space required in the global list
-    if (get_local_id(0) == 0 && get_local_id(1) == 0)
-    {
-        // find how many items are stored in each list
-        int totalSize = 0;
-        for (int i = 0; i < get_local_size(1); ++i)
-        {
-            s_globStart[i] = totalSize;
-            totalSize += s_qsize[i];
-        }
-
-        // calculate the offset in the global list
-        const int globalOffset = atomic_add(counter, totalSize);
-        for (int i = 0; i < get_local_size(1); ++i)
-            s_globStart[i] += globalOffset;
-    }
-
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
-    // copy local queues to global queue
-    const int qsize = s_qsize[get_local_id(1)];
-    int gidx = s_globStart[get_local_id(1)] + get_local_id(0);
-    for(int i = get_local_id(0); i < qsize; i += get_local_size(0), gidx += get_local_size(0))
-        list[gidx] = s_queues[get_local_id(1)][i];
-}
-
-////////////////////////////////////////////////////////////////////////
-// circlesAccumCenters
-
-// TODO: add offset to support ROI
-__kernel void circlesAccumCenters(__global const unsigned int* list,
-                                  const int count,
-                                  __global const int* dx,
-                                  const int dxStep,
-                                  __global const int* dy,
-                                  const int dyStep,
-                                  __global int* accum,
-                                  const int accumStep,
-                                  const int width,
-                                  const int height,
-                                  const int minRadius,
-                                  const int maxRadius,
-                                  const float idp)
-{
-    const int dxStepInPixel    = dxStep    / sizeof(int);
-    const int dyStepInPixel    = dyStep    / sizeof(int);
-    const int accumStepInPixel = accumStep / sizeof(int);
-
-    const int SHIFT = 10;
-    const int ONE = 1 << SHIFT;
-
-    // const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int wid = get_global_id(0);
-
-    if (wid >= count)
-        return;
-
-    const unsigned int val = list[wid];
-
-    const int x = (val & 0xFFFF);
-    const int y = (val >> 16) & 0xFFFF;
-
-    const int vx = dx[mad24(y, dxStepInPixel, x)];
-    const int vy = dy[mad24(y, dyStepInPixel, x)];
-
-    if (vx == 0 && vy == 0)
-        return;
-
-    const float mag = sqrt(convert_float(vx * vx + vy * vy));
-
-    const int x0 = convert_int_rte((x * idp) * ONE);
-    const int y0 = convert_int_rte((y * idp) * ONE);
-
-    int sx = convert_int_rte((vx * idp) * ONE / mag);
-    int sy = convert_int_rte((vy * idp) * ONE / mag);
-
-    // Step from minRadius to maxRadius in both directions of the gradient
-    for (int k1 = 0; k1 < 2; ++k1)
-    {
-        int x1 = x0 + minRadius * sx;
-        int y1 = y0 + minRadius * sy;
-
-        for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r)
-        {
-            const int x2 = x1 >> SHIFT;
-            const int y2 = y1 >> SHIFT;
-
-            if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height)
-                break;
-
-            atomic_add(&accum[mad24(y2+1, accumStepInPixel, x2+1)], 1);
-        }
-
-        sx = -sx;
-        sy = -sy;
-    }
-}
-
-// ////////////////////////////////////////////////////////////////////////
-// // buildCentersList
-
-// TODO: add offset to support ROI
-__kernel void buildCentersList(__global const int* accum,
-                               const int accumCols,
-                               const int accumRows,
-                               const int accumStep,
-                               __global unsigned int* centers,
-                               const int threshold,
-                               __global int* counter)
-{
-    const int accumStepInPixel = accumStep/sizeof(int);
-
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (x < accumCols - 2 && y < accumRows - 2)
-    {
-        const int top    = accum[mad24(y,     accumStepInPixel, x + 1)];
-
-        const int left   = accum[mad24(y + 1, accumStepInPixel, x)];
-        const int cur    = accum[mad24(y + 1, accumStepInPixel, x + 1)];
-        const int right  = accum[mad24(y + 1, accumStepInPixel, x + 2)];
-
-        const int bottom = accum[mad24(y + 2, accumStepInPixel, x + 1)];;
-
-        if (cur > threshold && cur > top && cur >= bottom && cur >  left && cur >= right)
-        {
-            const unsigned int val = (y << 16) | x;
-            const int idx = atomic_add(counter, 1);
-            centers[idx] = val;
-        }
-    }
-}
-
-
-// ////////////////////////////////////////////////////////////////////////
-// // circlesAccumRadius
-
-// TODO: add offset to support ROI
-__kernel void circlesAccumRadius(__global const unsigned int* centers,
-                                 __global const unsigned int* list, const int count,
-                                 __global float4* circles, const int maxCircles,
-                                 const float dp,
-                                 const int minRadius, const int maxRadius,
-                                 const int histSize,
-                                 const int threshold,
-                                 __local int* smem,
-                                 __global int* counter)
-{
-    for (int i = get_local_id(0); i < histSize + 2; i += get_local_size(0))
-        smem[i] = 0;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    unsigned int val = centers[get_group_id(0)];
-
-    float cx = convert_float(val & 0xFFFF);
-    float cy = convert_float((val >> 16) & 0xFFFF);
-
-    cx = (cx + 0.5f) * dp;
-    cy = (cy + 0.5f) * dp;
-
-    for (int i = get_local_id(0); i < count; i += get_local_size(0))
-    {
-        val = list[i];
-
-        const int x = (val & 0xFFFF);
-        const int y = (val >> 16) & 0xFFFF;
-
-        const float rad = sqrt((cx - x) * (cx - x) + (cy - y) * (cy - y));
-        if (rad >= minRadius && rad <= maxRadius)
-        {
-            const int r = convert_int_rte(rad - minRadius);
-
-            atomic_add(&smem[r + 1], 1);
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for (int i = get_local_id(0); i < histSize; i += get_local_size(0))
-    {
-        const int curVotes = smem[i + 1];
-
-        if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
-
-        {
-            const int ind = atomic_add(counter, 1);
-            if (ind < maxCircles)
-            {
-                circles[ind] = (float4)(cx, cy, convert_float(i + minRadius), 0.0f);
-            }
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/imgproc_integral.cl b/modules/ocl/src/opencl/imgproc_integral.cl
deleted file mode 100644
index 1d90e507f..000000000
--- a/modules/ocl/src/opencl/imgproc_integral.cl
+++ /dev/null
@@ -1,503 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define CONVERT(step) ((step)>>1)
-#else
-#define CONVERT(step) ((step))
-#endif
-
-#define LSIZE 256
-#define LSIZE_1 255
-#define LSIZE_2 254
-#define HF_LSIZE 128
-#define LOG_LSIZE 8
-#define LOG_NUM_BANKS 5
-#define NUM_BANKS 32
-#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
-
-
-kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global TYPE *sqsum,
-                          int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step,int dst1_step)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    int4 src_t[2], sum_t[2];
-    TYPE4 sqsum_t[2];
-    __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
-    __local TYPE4 lm_sqsum[2][LSIZE + LOG_LSIZE];
-    __local int* sum_p;
-    __local TYPE* sqsum_p;
-    src_step = src_step >> 2;
-    gid = gid << 1;
-    for(int i = 0; i < rows; i =i + LSIZE_1)
-    {
-        src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, cols - 1)]) : 0);
-        src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, cols - 1)]) : 0);
-
-        sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
-        sqsum_t[0] = (i == 0 ? (TYPE4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
-        sum_t[1] =  (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
-        sqsum_t[1] =  (i == 0 ? (TYPE4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
-        lm_sum[0][bf_loc] = src_t[0];
-        lm_sqsum[0][bf_loc] = convert_TYPE4(src_t[0] * src_t[0]);
-
-        lm_sum[1][bf_loc] = src_t[1];
-        lm_sqsum[1][bf_loc] = convert_TYPE4(src_t[1] * src_t[1]);
-
-        int offset = 1;
-        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
-                lm_sqsum[lid >> 7][bi]  +=  lm_sqsum[lid >> 7][ai];
-            }
-            offset <<= 1;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lid < 2)
-        {
-            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
-            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
-        }
-        for(int d = 1;  d < LSIZE; d <<= 1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            offset >>= 1;
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
-                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
-
-                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
-                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        int loc_s0 = gid * dst_step  + i + lid - 1 - pre_invalid * dst_step /4, loc_s1 = loc_s0 + dst_step ;
-        int loc_sq0 = gid * CONVERT(dst1_step) + i + lid - 1 - pre_invalid * dst1_step / sizeof(TYPE),loc_sq1 = loc_sq0 + CONVERT(dst1_step);
-        if(lid > 0 && (i+lid) <= rows)
-        {
-            lm_sum[0][bf_loc] += sum_t[0];
-            lm_sum[1][bf_loc] += sum_t[1];
-            lm_sqsum[0][bf_loc] += sqsum_t[0];
-            lm_sqsum[1][bf_loc] += sqsum_t[1];
-            sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
-            sqsum_p = (__local TYPE*)(&(lm_sqsum[0][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
-                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
-                sqsum[loc_sq0 + k * dst1_step / sizeof(TYPE)] = sqsum_p[k];
-            }
-            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
-            sqsum_p = (__local TYPE*)(&(lm_sqsum[1][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
-                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
-                sqsum[loc_sq1 + k * dst1_step / sizeof(TYPE)] = sqsum_p[k];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-}
-
-
-kernel void integral_rows_D4(__global int4 *srcsum,__global TYPE4 * srcsqsum,__global int *sum ,
-                          __global TYPE *sqsum,int rows,int cols,int src_step,int src1_step,int sum_step,
-                          int sqsum_step,int sum_offset,int sqsum_offset)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    int4 src_t[2], sum_t[2];
-    TYPE4 sqsrc_t[2],sqsum_t[2];
-    __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
-    __local TYPE4 lm_sqsum[2][LSIZE + LOG_LSIZE];
-    __local int *sum_p;
-    __local TYPE *sqsum_p;
-    src_step = src_step >> 4;
-    src1_step = (src1_step / sizeof(TYPE)) >> 2 ;
-    gid <<= 1;
-    for(int i = 0; i < rows; i =i + LSIZE_1)
-    {
-        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid ] : (int4)0;
-        sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src1_step + gid ] : (TYPE4)0;
-        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid  + 1] : (int4)0;
-        sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src1_step + gid  + 1] : (TYPE4)0;
-
-        sum_t[0] =  (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
-        sqsum_t[0] =  (i == 0 ? (TYPE4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
-        sum_t[1] =  (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
-        sqsum_t[1] =  (i == 0 ? (TYPE4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
-        lm_sum[0][bf_loc] = src_t[0];
-        lm_sqsum[0][bf_loc] = sqsrc_t[0];
-
-        lm_sum[1][bf_loc] = src_t[1];
-        lm_sqsum[1][bf_loc] = sqsrc_t[1];
-
-        int offset = 1;
-        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
-                lm_sqsum[lid >> 7][bi]  +=  lm_sqsum[lid >> 7][ai];
-            }
-            offset <<= 1;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lid < 2)
-        {
-            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
-            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
-        }
-        for(int d = 1;  d < LSIZE; d <<= 1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            offset >>= 1;
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
-                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
-
-                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
-                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(gid == 0 && (i + lid) <= rows)
-        {
-            sum[sum_offset + i + lid] = 0;
-            sqsum[sqsum_offset + i + lid] = 0;
-        }
-        if(i + lid == 0)
-        {
-            int loc0 = gid  * sum_step;
-            int loc1 = gid  * CONVERT(sqsum_step);
-            for(int k = 1; k <= 8; k++)
-            {
-                if(gid * 4 + k > cols) break;
-                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
-                sqsum[sqsum_offset + loc1 + k * sqsum_step / sizeof(TYPE)] = 0;
-            }
-        }
-        int loc_s0 = sum_offset + gid  * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
-        int loc_sq0 = sqsum_offset + gid  * CONVERT(sqsum_step) + sqsum_step / sizeof(TYPE) + i + lid, loc_sq1 = loc_sq0 + CONVERT(sqsum_step) ;
-
-        if(lid > 0 && (i+lid) <= rows)
-        {
-            lm_sum[0][bf_loc] += sum_t[0];
-            lm_sum[1][bf_loc] += sum_t[1];
-            lm_sqsum[0][bf_loc] += sqsum_t[0];
-            lm_sqsum[1][bf_loc] += sqsum_t[1];
-            sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
-            sqsum_p = (__local TYPE*)(&(lm_sqsum[0][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 4 + k >= cols) break;
-                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
-                sqsum[loc_sq0 + k * sqsum_step / sizeof(TYPE)] = sqsum_p[k];
-            }
-            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
-            sqsum_p = (__local TYPE*)(&(lm_sqsum[1][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 4 + 4 + k >= cols) break;
-                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
-                sqsum[loc_sq1 + k * sqsum_step / sizeof(TYPE)] = sqsum_p[k];
-            }
-          }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-}
-
-kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global TYPE *sqsum,
-                          int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step, int dst1_step)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    float4 src_t[2], sum_t[2];
-    TYPE4 sqsum_t[2];
-    __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
-    __local TYPE4 lm_sqsum[2][LSIZE + LOG_LSIZE];
-    __local float* sum_p;
-    __local TYPE* sqsum_p;
-    src_step = src_step >> 2;
-    gid = gid << 1;
-    for(int i = 0; i < rows; i =i + LSIZE_1)
-    {
-        src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, cols - 1)]) : (float4)0);
-        src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, cols - 1)]) : (float4)0);
-
-        sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
-        sqsum_t[0] = (i == 0 ? (TYPE4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
-        sum_t[1] =  (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
-        sqsum_t[1] =  (i == 0 ? (TYPE4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
-        lm_sum[0][bf_loc] = src_t[0];
-        lm_sqsum[0][bf_loc] = convert_TYPE4(src_t[0] * src_t[0]);
-
-        lm_sum[1][bf_loc] = src_t[1];
-        lm_sqsum[1][bf_loc] = convert_TYPE4(src_t[1] * src_t[1]);
-
-        int offset = 1;
-        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
-                lm_sqsum[lid >> 7][bi]  +=  lm_sqsum[lid >> 7][ai];
-            }
-            offset <<= 1;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lid < 2)
-        {
-            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
-            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
-        }
-        for(int d = 1;  d < LSIZE; d <<= 1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            offset >>= 1;
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
-                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
-
-                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
-                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
-        int loc_sq0 = gid * CONVERT(dst1_step) + i + lid - 1 - pre_invalid * dst1_step / sizeof(TYPE), loc_sq1 = loc_sq0 + CONVERT(dst1_step);
-        if(lid > 0 && (i+lid) <= rows)
-        {
-            lm_sum[0][bf_loc] += sum_t[0];
-            lm_sum[1][bf_loc] += sum_t[1];
-            lm_sqsum[0][bf_loc] += sqsum_t[0];
-            lm_sqsum[1][bf_loc] += sqsum_t[1];
-            sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
-            sqsum_p = (__local TYPE*)(&(lm_sqsum[0][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
-                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
-                sqsum[loc_sq0 + k * dst1_step / sizeof(TYPE)] = sqsum_p[k];
-            }
-            sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
-            sqsum_p = (__local TYPE*)(&(lm_sqsum[1][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
-                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
-                sqsum[loc_sq1 + k * dst1_step / sizeof(TYPE)] = sqsum_p[k];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-}
-
-
-kernel void integral_rows_D5(__global float4 *srcsum,__global TYPE4 * srcsqsum,__global float *sum ,
-                          __global TYPE *sqsum,int rows,int cols,int src_step,int src1_step, int sum_step,
-                          int sqsum_step,int sum_offset,int sqsum_offset)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    float4 src_t[2], sum_t[2];
-    TYPE4 sqsrc_t[2],sqsum_t[2];
-    __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
-    __local TYPE4 lm_sqsum[2][LSIZE + LOG_LSIZE];
-    __local float *sum_p;
-    __local TYPE *sqsum_p;
-    src_step = src_step >> 4;
-    src1_step = (src1_step / sizeof(TYPE)) >> 2;
-    for(int i = 0; i < rows; i =i + LSIZE_1)
-    {
-        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
-        sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src1_step + gid * 2] : (TYPE4)0;
-        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
-        sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src1_step + gid * 2 + 1] : (TYPE4)0;
-
-        sum_t[0] =  (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
-        sqsum_t[0] =  (i == 0 ? (TYPE4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
-        sum_t[1] =  (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
-        sqsum_t[1] =  (i == 0 ? (TYPE4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
-        lm_sum[0][bf_loc] = src_t[0];
-        lm_sqsum[0][bf_loc] = sqsrc_t[0];
-
-        lm_sum[1][bf_loc] = src_t[1];
-        lm_sqsum[1][bf_loc] = sqsrc_t[1];
-
-        int offset = 1;
-        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
-                lm_sqsum[lid >> 7][bi]  +=  lm_sqsum[lid >> 7][ai];
-            }
-            offset <<= 1;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lid < 2)
-        {
-            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
-            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
-        }
-        for(int d = 1;  d < LSIZE; d <<= 1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            offset >>= 1;
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
-                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
-
-                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
-                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(gid == 0 && (i + lid) <= rows)
-        {
-            sum[sum_offset + i + lid] = 0;
-            sqsum[sqsum_offset + i + lid] = 0;
-        }
-        if(i + lid == 0)
-        {
-            int loc0 = gid * 2 * sum_step;
-            int loc1 = gid * 2 * CONVERT(sqsum_step);
-            for(int k = 1; k <= 8; k++)
-            {
-                if(gid * 8 + k > cols) break;
-                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
-                sqsum[sqsum_offset + loc1 + k * sqsum_step / sizeof(TYPE)] = 0;
-            }
-        }
-        int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
-        int loc_sq0 = sqsum_offset + gid * 2 * CONVERT(sqsum_step) + sqsum_step / sizeof(TYPE) + i + lid, loc_sq1 = loc_sq0 + CONVERT(sqsum_step) ;
-        if(lid > 0 && (i+lid) <= rows)
-        {
-            lm_sum[0][bf_loc] += sum_t[0];
-            lm_sum[1][bf_loc] += sum_t[1];
-            lm_sqsum[0][bf_loc] += sqsum_t[0];
-            lm_sqsum[1][bf_loc] += sqsum_t[1];
-            sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
-            sqsum_p = (__local TYPE*)(&(lm_sqsum[0][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 8 + k >= cols) break;
-                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
-                sqsum[loc_sq0 + k * sqsum_step / sizeof(TYPE)] = sqsum_p[k];
-            }
-            sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
-            sqsum_p = (__local TYPE*)(&(lm_sqsum[1][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 8 + 4 + k >= cols) break;
-                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
-                sqsum[loc_sq1 + k * sqsum_step / sizeof(TYPE)] = sqsum_p[k];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-}
diff --git a/modules/ocl/src/opencl/imgproc_integral_sum.cl b/modules/ocl/src/opencl/imgproc_integral_sum.cl
deleted file mode 100644
index 662406140..000000000
--- a/modules/ocl/src/opencl/imgproc_integral_sum.cl
+++ /dev/null
@@ -1,412 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#define LSIZE 256
-#define LSIZE_1 255
-#define LSIZE_2 254
-#define HF_LSIZE 128
-#define LOG_LSIZE 8
-#define LOG_NUM_BANKS 5
-#define NUM_BANKS 32
-#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
-
-
-kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum ,
-                              int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    int4 src_t[2], sum_t[2];
-    __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
-    __local int* sum_p;
-    src_step = src_step >> 2;
-    gid = gid << 1;
-    for(int i = 0; i < rows; i =i + LSIZE_1)
-    {
-        src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
-        src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
-
-        sum_t[0] =  (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
-        sum_t[1] =  (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
-        lm_sum[0][bf_loc] = src_t[0];
-
-        lm_sum[1][bf_loc] = src_t[1];
-
-        int offset = 1;
-        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
-            }
-            offset <<= 1;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lid < 2)
-        {
-            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
-        }
-        for(int d = 1;  d < LSIZE; d <<= 1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            offset >>= 1;
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
-                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lid > 0 && (i+lid) <= rows)
-        {
-            int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
-            lm_sum[0][bf_loc] += sum_t[0];
-            lm_sum[1][bf_loc] += sum_t[1];
-            sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
-                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
-            }
-            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
-                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-}
-
-
-kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum ,
-                              int rows,int cols,int src_step,int sum_step,
-                              int sum_offset)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    int4 src_t[2], sum_t[2];
-    __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
-    __local int *sum_p;
-    src_step = src_step >> 4;
-    for(int i = 0; i < rows; i =i + LSIZE_1)
-    {
-        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0;
-        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
-
-        sum_t[0] =  (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
-        sum_t[1] =  (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
-        lm_sum[0][bf_loc] = src_t[0];
-
-        lm_sum[1][bf_loc] = src_t[1];
-
-        int offset = 1;
-        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
-            }
-            offset <<= 1;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lid < 2)
-        {
-            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
-        }
-        for(int d = 1;  d < LSIZE; d <<= 1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            offset >>= 1;
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
-                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(gid == 0 && (i + lid) <= rows)
-        {
-            sum[sum_offset + i + lid] = 0;
-        }
-        if(i + lid == 0)
-        {
-            int loc0 = gid * 2 * sum_step;
-            for(int k = 1; k <= 8; k++)
-            {
-                if(gid * 8 + k > cols) break;
-                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
-            }
-        }
-
-        if(lid > 0 && (i+lid) <= rows)
-        {
-            int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
-            lm_sum[0][bf_loc] += sum_t[0];
-            lm_sum[1][bf_loc] += sum_t[1];
-            sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 8 + k >= cols) break;
-                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
-            }
-            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 8 + 4 + k >= cols) break;
-                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-}
-
-kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum ,
-                              int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    float4 src_t[2], sum_t[2];
-    __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
-    __local float* sum_p;
-    src_step = src_step >> 2;
-    gid = gid << 1;
-    for(int i = 0; i < rows; i =i + LSIZE_1)
-    {
-        src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid]) : (float4)0);
-        src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid + 1]) : (float4)0);
-
-        sum_t[0] =  (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
-        sum_t[1] =  (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
-        lm_sum[0][bf_loc] = src_t[0];
-
-        lm_sum[1][bf_loc] = src_t[1];
-
-        int offset = 1;
-        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
-            }
-            offset <<= 1;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lid < 2)
-        {
-            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
-        }
-        for(int d = 1;  d < LSIZE; d <<= 1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            offset >>= 1;
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
-                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lid > 0 && (i+lid) <= rows)
-        {
-            int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
-            lm_sum[0][bf_loc] += sum_t[0];
-            lm_sum[1][bf_loc] += sum_t[1];
-            sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
-                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
-            }
-            sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
-                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-}
-
-
-kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum ,
-                              int rows,int cols,int src_step,int sum_step,
-                              int sum_offset)
-{
-    int lid = get_local_id(0);
-    int gid = get_group_id(0);
-    float4 src_t[2], sum_t[2];
-    __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
-    __local float *sum_p;
-    src_step = src_step >> 4;
-    for(int i = 0; i < rows; i =i + LSIZE_1)
-    {
-        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
-        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
-
-        sum_t[0] =  (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
-        sum_t[1] =  (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
-        lm_sum[0][bf_loc] = src_t[0];
-
-        lm_sum[1][bf_loc] = src_t[1];
-
-        int offset = 1;
-        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
-            }
-            offset <<= 1;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lid < 2)
-        {
-            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
-        }
-        for(int d = 1;  d < LSIZE; d <<= 1)
-        {
-            barrier(CLK_LOCAL_MEM_FENCE);
-            offset >>= 1;
-            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
-            ai += GET_CONFLICT_OFFSET(ai);
-            bi += GET_CONFLICT_OFFSET(bi);
-
-            if((lid & 127) < d)
-            {
-                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
-                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(gid == 0 && (i + lid) <= rows)
-        {
-            sum[sum_offset + i + lid] = 0;
-        }
-        if(i + lid == 0)
-        {
-            int loc0 = gid * 2 * sum_step;
-            for(int k = 1; k <= 8; k++)
-            {
-                if(gid * 8 + k > cols) break;
-                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
-            }
-        }
-
-        if(lid > 0 && (i+lid) <= rows)
-        {
-            int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
-            lm_sum[0][bf_loc] += sum_t[0];
-            lm_sum[1][bf_loc] += sum_t[1];
-            sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 8 + k >= cols) break;
-                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
-            }
-            sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
-            for(int k = 0; k < 4; k++)
-            {
-                if(gid * 8 + 4 + k >= cols) break;
-                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
-            }
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-}
diff --git a/modules/ocl/src/opencl/imgproc_median.cl b/modules/ocl/src/opencl/imgproc_median.cl
deleted file mode 100644
index 5fa7a17b8..000000000
--- a/modules/ocl/src/opencl/imgproc_median.cl
+++ /dev/null
@@ -1,486 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Zero Lin, zero.lin@amd.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-
-/*
-__kernel void medianFilter_C1(__global uchar * src, __global uchar * dst,  int srcOffset, int dstOffset, int cols,
-                                int rows, int srcStep, int dstStep, int m)
-{
-    int dx = get_global_id(0)-(m>>1);
-    int dy = get_global_id(1)-(m>>1);
-
-    short histom[256];
-    for(int i=0;i<256;++i)
-        histom[i]=0;
-
-
-    for(int i=0;i<m;++i)
-    {
-        __global uchar * data = src + srcOffset + mul24(srcStep,clamp(dy + (i), 0, rows-1));
-        for(int j=dx;j<dx+m;++j)
-        {
-            histom[data[clamp(j, 0, cols-1)]]++;
-        }
-    }
-
-    int now=0;
-    int goal=(m*m+1)>>1;
-    int v;
-    for(int i=0;i<256;++i)
-    {
-        v=(now<goal?i:v);
-        now+=histom[i];
-    }
-
-    if(dy<rows && dx<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=v;
-}
-*/
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst,  int srcOffset, int dstOffset, int cols,
-                                int rows, int srcStep, int dstStep)
-{
-
-    __local uchar4 data[18][18];
-    __global uchar4* source=src + srcOffset;
-
-    int dx = get_global_id(0) - get_local_id(0) -1;
-    int dy = get_global_id(1) - get_local_id(1) -1;
-
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
-
-    int dr=id/18;
-    int dc=id%18;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+9, 0, rows-1);
-    data[dr+9][dc] = source[r*srcStep + c];
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
-    uchar4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
-    uchar4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
-    uchar4 mid;
-
-    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
-    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
-    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
-    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
-    op(p4, p2); op(p6, p4); op(p4, p2);
-
-    if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
-}
-#undef op
-
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst,  int srcOffset, int dstOffset, int cols,
-                                int rows, int srcStep, int dstStep)
-{
-
-    __local uchar data[18][18];
-    __global uchar* source=src + srcOffset;
-
-    int dx = get_global_id(0) - get_local_id(0) -1;
-    int dy = get_global_id(1) - get_local_id(1) -1;
-
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
-
-    int dr=id/18;
-    int dc=id%18;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+9, 0, rows-1);
-    data[dr+9][dc] = source[r*srcStep + c];
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
-    uchar p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
-    uchar p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
-    uchar mid;
-
-    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
-    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
-    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
-    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
-    op(p4, p2); op(p6, p4); op(p4, p2);
-
-    if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
-}
-#undef op
-
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter3_C1_D5(__global float * src, __global float * dst,  int srcOffset, int dstOffset, int cols,
-                                int rows, int srcStep, int dstStep)
-{
-
-    __local float data[18][18];
-    __global float* source=src + srcOffset;
-
-    int dx = get_global_id(0) - get_local_id(0) -1;
-    int dy = get_global_id(1) - get_local_id(1) -1;
-
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
-
-    int dr=id/18;
-    int dc=id%18;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+9, 0, rows-1);
-    data[dr+9][dc] = source[r*srcStep + c];
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
-    float p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
-    float p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
-    float mid;
-
-    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
-    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
-    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
-    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
-    op(p4, p2); op(p6, p4); op(p4, p2);
-
-    if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
-}
-#undef op
-
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst,  int srcOffset, int dstOffset, int cols,
-                                int rows, int srcStep, int dstStep)
-{
-
-    __local float4 data[18][18];
-    __global float4* source=src + srcOffset;
-
-    int dx = get_global_id(0) - get_local_id(0) -1;
-    int dy = get_global_id(1) - get_local_id(1) -1;
-
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
-
-    int dr=id/18;
-    int dc=id%18;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+9, 0, rows-1);
-    data[dr+9][dc] = source[r*srcStep + c];
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
-    float4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
-    float4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
-    float4 mid;
-
-    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
-    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
-    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
-    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
-    op(p4, p2); op(p6, p4); op(p4, p2);
-
-    if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
-}
-#undef op
-
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst,  int srcOffset, int dstOffset, int cols,
-                                int rows, int srcStep, int dstStep)
-{
-
-    __local uchar4 data[20][20];
-    __global uchar4* source=src + srcOffset;
-
-    int dx = get_global_id(0) - get_local_id(0) -2;
-    int dy = get_global_id(1) - get_local_id(1) -2;
-
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
-
-    int dr=id/20;
-    int dc=id%20;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+10, 0, rows-1);
-    data[dr+10][dc] = source[r*srcStep + c];
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
-    uchar4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
-    uchar4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
-    uchar4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
-    uchar4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
-    uchar4 mid;
-
-    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
-    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
-    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
-    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
-    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
-    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
-    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
-    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
-    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
-    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
-    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
-    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
-    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
-    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
-    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
-    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
-    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
-    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
-    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
-    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
-    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
-    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
-    op(p7, p11); op(p11, p13); op(p11, p12);
-
-    if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
-}
-#undef op
-
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst,  int srcOffset, int dstOffset, int cols,
-                                int rows, int srcStep, int dstStep)
-{
-
-    __local uchar data[20][20];
-    __global uchar* source=src + srcOffset;
-
-    int dx = get_global_id(0) - get_local_id(0) -2;
-    int dy = get_global_id(1) - get_local_id(1) -2;
-
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
-
-    int dr=id/20;
-    int dc=id%20;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+10, 0, rows-1);
-    data[dr+10][dc] = source[r*srcStep + c];
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
-    uchar p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
-    uchar p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
-    uchar p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
-    uchar p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
-    uchar mid;
-
-    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
-    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
-    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
-    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
-    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
-    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
-    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
-    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
-    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
-    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
-    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
-    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
-    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
-    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
-    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
-    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
-    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
-    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
-    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
-    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
-    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
-    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
-    op(p7, p11); op(p11, p13); op(p11, p12);
-
-    if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
-}
-#undef op
-
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst,  int srcOffset, int dstOffset, int cols,
-                                int rows, int srcStep, int dstStep)
-{
-
-    __local float4 data[20][20];
-    __global float4* source=src + srcOffset;
-
-    int dx = get_global_id(0) - get_local_id(0) -2;
-    int dy = get_global_id(1) - get_local_id(1) -2;
-
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
-
-    int dr=id/20;
-    int dc=id%20;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+10, 0, rows-1);
-    data[dr+10][dc] = source[r*srcStep + c];
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
-    float4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
-    float4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
-    float4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
-    float4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
-    float4 mid;
-
-    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
-    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
-    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
-    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
-    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
-    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
-    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
-    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
-    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
-    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
-    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
-    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
-    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
-    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
-    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
-    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
-    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
-    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
-    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
-    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
-    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
-    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
-    op(p7, p11); op(p11, p13); op(p11, p12);
-
-    if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
-}
-#undef op
-
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter5_C1_D5(__global float * src, __global float * dst,  int srcOffset, int dstOffset, int cols,
-                                int rows, int srcStep, int dstStep)
-{
-
-    __local float data[20][20];
-    __global float* source=src + srcOffset;
-
-    int dx = get_global_id(0) - get_local_id(0) -2;
-    int dy = get_global_id(1) - get_local_id(1) -2;
-
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
-
-    int dr=id/20;
-    int dc=id%20;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+10, 0, rows-1);
-    data[dr+10][dc] = source[r*srcStep + c];
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
-    float p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
-    float p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
-    float p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
-    float p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
-    float mid;
-
-    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
-    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
-    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
-    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
-    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
-    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
-    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
-    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
-    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
-    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
-    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
-    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
-    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
-    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
-    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
-    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
-    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
-    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
-    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
-    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
-    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
-    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
-    op(p7, p11); op(p11, p13); op(p11, p12);
-
-    if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
-}
-#undef op
diff --git a/modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl b/modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl
deleted file mode 100644
index 86d4e5d52..000000000
--- a/modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl
+++ /dev/null
@@ -1,96 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other oclMaterials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the uintel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business uinterruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-typedef float2 cfloat;
-inline cfloat cmulf(cfloat a, cfloat b)
-{
-    return (cfloat)( a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
-}
-
-inline cfloat conjf(cfloat a)
-{
-    return (cfloat)( a.x, - a.y );
-}
-
-__kernel void
-mulAndScaleSpectrumsKernel(
-    __global const cfloat* a,
-    __global const cfloat* b,
-    float scale,
-    __global cfloat* dst,
-    uint cols,
-    uint rows,
-    uint mstep
-)
-{
-    const uint x = get_global_id(0);
-    const uint y = get_global_id(1);
-    const uint idx = mad24(y, mstep / sizeof(cfloat), x);
-    if (x < cols && y < rows)
-    {
-        cfloat v = cmulf(a[idx], b[idx]);
-        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
-    }
-}
-__kernel void
-mulAndScaleSpectrumsKernel_CONJ(
-    __global const cfloat* a,
-    __global const cfloat* b,
-    float scale,
-    __global cfloat* dst,
-    uint cols,
-    uint rows,
-    uint mstep
-)
-{
-    const uint x = get_global_id(0);
-    const uint y = get_global_id(1);
-    const uint idx = mad24(y, mstep / sizeof(cfloat), x);
-    if (x < cols && y < rows)
-    {
-        cfloat v = cmulf(a[idx], conjf(b[idx]));
-        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
-    }
-}
diff --git a/modules/ocl/src/opencl/imgproc_remap.cl b/modules/ocl/src/opencl/imgproc_remap.cl
deleted file mode 100644
index e1e3ca8a0..000000000
--- a/modules/ocl/src/opencl/imgproc_remap.cl
+++ /dev/null
@@ -1,408 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Wu Zailong, bullet@yeah.net
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-enum
-{
-    INTER_BITS = 5,
-    INTER_TAB_SIZE = 1 << INTER_BITS,
-    INTER_TAB_SIZE2 = INTER_TAB_SIZE * INTER_TAB_SIZE
-};
-
-#ifdef INTER_NEAREST
-#define convertToWT
-#endif
-
-#ifdef BORDER_CONSTANT
-#define EXTRAPOLATE(v2, v) v = scalar;
-#elif defined BORDER_REPLICATE
-#define EXTRAPOLATE(v2, v) \
-    { \
-        v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), (int2)(0)); \
-        v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
-    }
-#elif defined BORDER_WRAP
-#define EXTRAPOLATE(v2, v) \
-    { \
-        if (v2.x < 0) \
-            v2.x -= ((v2.x - src_cols + 1) / src_cols) * src_cols; \
-        if (v2.x >= src_cols) \
-            v2.x %= src_cols; \
-        \
-        if (v2.y < 0) \
-            v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \
-        if( v2.y >= src_rows ) \
-            v2.y %= src_rows; \
-        v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
-    }
-#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
-#ifdef BORDER_REFLECT
-#define DELTA int delta = 0
-#else
-#define DELTA int delta = 1
-#endif
-#define EXTRAPOLATE(v2, v) \
-    { \
-        DELTA; \
-        if (src_cols == 1) \
-            v2.x = 0; \
-        else \
-            do \
-            { \
-                if( v2.x < 0 ) \
-                    v2.x = -v2.x - 1 + delta; \
-                else \
-                    v2.x = src_cols - 1 - (v2.x - src_cols) - delta; \
-            } \
-            while (v2.x >= src_cols || v2.x < 0); \
-        \
-        if (src_rows == 1) \
-            v2.y = 0; \
-        else \
-            do \
-            { \
-                if( v2.y < 0 ) \
-                    v2.y = -v2.y - 1 + delta; \
-                else \
-                    v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \
-            } \
-            while (v2.y >= src_rows || v2.y < 0); \
-        v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
-    }
-#else
-#error No extrapolation method
-#endif
-
-#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0)
-
-#ifdef INTER_NEAREST
-
-__kernel void remap_2_32FC1(__global const T * restrict src, __global T * dst,
-        __global float * map1, __global float * map2,
-        int src_offset, int dst_offset, int map1_offset, int map2_offset,
-        int src_step, int dst_step, int map1_step, int map2_step,
-        int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < dst_cols && y < dst_rows)
-    {
-        int dstIdx = mad24(y, dst_step, x + dst_offset);
-        int map1Idx = mad24(y, map1_step, x + map1_offset);
-        int map2Idx = mad24(y, map2_step, x + map2_offset);
-
-        int gx = convert_int_sat_rte(map1[map1Idx]);
-        int gy = convert_int_sat_rte(map2[map2Idx]);
-
-        if (NEED_EXTRAPOLATION(gx, gy))
-        {
-#ifndef BORDER_CONSTANT
-            int2 gxy = (int2)(gx, gy);
-#endif
-            EXTRAPOLATE(gxy, dst[dstIdx]);
-        }
-        else
-        {
-            int srcIdx = mad24(gy, src_step, gx + src_offset);
-            dst[dstIdx] = src[srcIdx];
-        }
-    }
-}
-
-__kernel void remap_32FC2(__global const T * restrict src, __global T * dst, __global float2 * map1,
-        int src_offset, int dst_offset, int map1_offset,
-        int src_step, int dst_step, int map1_step,
-        int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < dst_cols && y < dst_rows)
-    {
-        int dstIdx = mad24(y, dst_step, x + dst_offset);
-        int map1Idx = mad24(y, map1_step, x + map1_offset);
-
-        int2 gxy = convert_int2_sat_rte(map1[map1Idx]);
-        int gx = gxy.x, gy = gxy.y;
-
-        if (NEED_EXTRAPOLATION(gx, gy))
-            EXTRAPOLATE(gxy, dst[dstIdx])
-        else
-        {
-            int srcIdx = mad24(gy, src_step, gx + src_offset);
-            dst[dstIdx] = src[srcIdx];
-        }
-    }
-}
-
-__kernel void remap_16SC2(__global const T * restrict src, __global T * dst, __global short2 * map1,
-        int src_offset, int dst_offset, int map1_offset,
-        int src_step, int dst_step, int map1_step,
-        int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < dst_cols && y < dst_rows)
-    {
-        int dstIdx = mad24(y, dst_step, x + dst_offset);
-        int map1Idx = mad24(y, map1_step, x + map1_offset);
-
-        int2 gxy = convert_int2(map1[map1Idx]);
-        int gx = gxy.x, gy = gxy.y;
-
-        if (NEED_EXTRAPOLATION(gx, gy))
-            EXTRAPOLATE(gxy, dst[dstIdx])
-        else
-        {
-            int srcIdx = mad24(gy, src_step, gx + src_offset);
-            dst[dstIdx] = src[srcIdx];
-        }
-    }
-}
-
-__kernel void remap_16SC2_16UC1(__global const T * restrict src, __global T * dst, __global short2 * map1, __global ushort * map2,
-        int src_offset, int dst_offset, int map1_offset, int map2_offset,
-        int src_step, int dst_step, int map1_step, int map2_step,
-        int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < dst_cols && y < dst_rows)
-    {
-        int dstIdx = mad24(y, dst_step, x + dst_offset);
-        int map1Idx = mad24(y, map1_step, x + map1_offset);
-        int map2Idx = mad24(y, map2_step, x + map2_offset);
-
-        int map2Value = convert_int(map2[map2Idx]) & (INTER_TAB_SIZE2 - 1);
-        int dx = (map2Value & (INTER_TAB_SIZE - 1)) < (INTER_TAB_SIZE >> 1) ? 1 : 0;
-        int dy = (map2Value >> INTER_BITS) < (INTER_TAB_SIZE >> 1) ? 1 : 0;
-        int2 gxy = convert_int2(map1[map1Idx]) + (int2)(dx, dy);
-        int gx = gxy.x, gy = gxy.y;
-
-        if (NEED_EXTRAPOLATION(gx, gy))
-            EXTRAPOLATE(gxy, dst[dstIdx])
-        else
-        {
-            int srcIdx = mad24(gy, src_step, gx + src_offset);
-            dst[dstIdx] = src[srcIdx];
-        }
-    }
-}
-
-#elif INTER_LINEAR
-
-__kernel void remap_16SC2_16UC1(__global T const * restrict src, __global T * dst,
-        __global short2 * restrict map1, __global ushort * restrict map2,
-        int src_offset, int dst_offset, int map1_offset, int map2_offset,
-        int src_step, int dst_step, int map1_step, int map2_step,
-        int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < dst_cols && y < dst_rows)
-    {
-        int dstIdx = mad24(y, dst_step, x + dst_offset);
-        int map1Idx = mad24(y, map1_step, x + map1_offset);
-        int map2Idx = mad24(y, map2_step, x + map2_offset);
-
-        int2 map_dataA = convert_int2(map1[map1Idx]);
-        int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
-        int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
-        int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
-
-        ushort map2Value = (ushort)(map2[map2Idx] & (INTER_TAB_SIZE2 - 1));
-        WT2 u = (WT2)(map2Value & (INTER_TAB_SIZE - 1), map2Value >> INTER_BITS) / (WT2)(INTER_TAB_SIZE);
-
-        WT scalar = convertToWT(nVal);
-        WT a = scalar, b = scalar, c = scalar, d = scalar;
-
-        if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
-            a = convertToWT(src[mad24(map_dataA.y, src_step, map_dataA.x + src_offset)]);
-        else
-            EXTRAPOLATE(map_dataA, a);
-
-        if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
-            b = convertToWT(src[mad24(map_dataB.y, src_step, map_dataB.x + src_offset)]);
-        else
-            EXTRAPOLATE(map_dataB, b);
-
-        if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
-            c = convertToWT(src[mad24(map_dataC.y, src_step, map_dataC.x + src_offset)]);
-        else
-            EXTRAPOLATE(map_dataC, c);
-
-        if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
-            d = convertToWT(src[mad24(map_dataD.y, src_step, map_dataD.x + src_offset)]);
-        else
-            EXTRAPOLATE(map_dataD, d);
-
-        WT dst_data = a * (1 - u.x) * (1 - u.y) +
-                      b * (u.x)     * (1 - u.y) +
-                      c * (1 - u.x) * (u.y) +
-                      d * (u.x)     * (u.y);
-        dst[dstIdx] = convertToT(dst_data);
-    }
-}
-
-__kernel void remap_2_32FC1(__global T const * restrict  src, __global T * dst,
-        __global float * map1, __global float * map2,
-        int src_offset, int dst_offset, int map1_offset, int map2_offset,
-        int src_step, int dst_step, int map1_step, int map2_step,
-        int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < dst_cols && y < dst_rows)
-    {
-        int dstIdx = mad24(y, dst_step, x + dst_offset);
-        int map1Idx = mad24(y, map1_step, x + map1_offset);
-        int map2Idx = mad24(y, map2_step, x + map2_offset);
-
-        float2 map_data = (float2)(map1[map1Idx], map2[map2Idx]);
-
-        int2 map_dataA = convert_int2_sat_rtn(map_data);
-        int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
-        int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
-        int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
-
-        float2 _u = map_data - convert_float2(map_dataA);
-        WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
-        WT scalar = convertToWT(nVal);
-        WT a = scalar, b = scalar, c = scalar, d = scalar;
-
-        if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
-            a = convertToWT(src[mad24(map_dataA.y, src_step, map_dataA.x + src_offset)]);
-        else
-            EXTRAPOLATE(map_dataA, a);
-
-        if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
-            b = convertToWT(src[mad24(map_dataB.y, src_step, map_dataB.x + src_offset)]);
-        else
-            EXTRAPOLATE(map_dataB, b);
-
-        if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
-            c = convertToWT(src[mad24(map_dataC.y, src_step, map_dataC.x + src_offset)]);
-        else
-            EXTRAPOLATE(map_dataC, c);
-
-        if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
-            d = convertToWT(src[mad24(map_dataD.y, src_step, map_dataD.x + src_offset)]);
-        else
-            EXTRAPOLATE(map_dataD, d);
-
-        WT dst_data = a * (1 - u.x) * (1 - u.y) +
-                      b * (u.x)     * (1 - u.y) +
-                      c * (1 - u.x) * (u.y) +
-                      d * (u.x)     * (u.y);
-        dst[dstIdx] = convertToT(dst_data);
-    }
-}
-
-__kernel void remap_32FC2(__global T const * restrict  src, __global T * dst,
-        __global float2 * map1,
-        int src_offset, int dst_offset, int map1_offset,
-        int src_step, int dst_step, int map1_step,
-        int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < dst_cols && y < dst_rows)
-    {
-        int dstIdx = mad24(y, dst_step, x + dst_offset);
-        int map1Idx = mad24(y, map1_step, x + map1_offset);
-
-        float2 map_data = map1[map1Idx];
-        int2 map_dataA = convert_int2_sat_rtn(map_data);
-        int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
-        int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
-        int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
-
-        float2 _u = map_data - convert_float2(map_dataA);
-        WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
-        WT scalar = convertToWT(nVal);
-        WT a = scalar, b = scalar, c = scalar, d = scalar;
-
-        if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
-            a = convertToWT(src[mad24(map_dataA.y, src_step, map_dataA.x + src_offset)]);
-        else
-            EXTRAPOLATE(map_dataA, a);
-
-        if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
-            b = convertToWT(src[mad24(map_dataB.y, src_step, map_dataB.x + src_offset)]);
-        else
-            EXTRAPOLATE(map_dataB, b);
-
-        if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
-            c = convertToWT(src[mad24(map_dataC.y, src_step, map_dataC.x + src_offset)]);
-        else
-            EXTRAPOLATE(map_dataC, c);
-
-        if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
-            d = convertToWT(src[mad24(map_dataD.y, src_step, map_dataD.x + src_offset)]);
-        else
-            EXTRAPOLATE(map_dataD, d);
-
-        WT dst_data = a * (1 - u.x) * (1 - u.y) +
-                      b * (u.x)     * (1 - u.y) +
-                      c * (1 - u.x) * (u.y) +
-                      d * (u.x)     * (u.y);
-        dst[dstIdx] = convertToT(dst_data);
-    }
-}
-
-#endif
diff --git a/modules/ocl/src/opencl/imgproc_resize.cl b/modules/ocl/src/opencl/imgproc_resize.cl
deleted file mode 100644
index ebf8c712b..000000000
--- a/modules/ocl/src/opencl/imgproc_resize.cl
+++ /dev/null
@@ -1,405 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Zhang Ying, zhangying913@gmail.com
-//	  Niko Li, newlife20080214@gmail.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-
-// resize kernel
-// Currently, CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 are supported.
-// We shall support other types later if necessary.
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#define INTER_RESIZE_COEF_BITS 11
-#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
-#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
-#define CAST_SCALE (1.0f/(1<<CAST_BITS))
-#define INC(x,l) ((x+1) >= (l) ? (x):((x)+1))
-
-#ifdef LN
-
-__kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restrict src,
-                     int dst_offset, int src_offset,int dst_step, int src_step,
-                     int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
-{
-    int gx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    float4  sx, u, xf;
-    int4 x, DX;
-    gx = (gx<<2) - (dst_offset&3);
-    DX = (int4)(gx, gx+1, gx+2, gx+3);
-    sx = (convert_float4(DX) + 0.5f) * ifx - 0.5f;
-    xf = floor(sx);
-    x = convert_int4(xf);
-    u = sx - xf;
-    float sy = ((dy+0.5f) * ify - 0.5f);
-    int y = floor(sy);
-    float v = sy - y;
-
-    u = x < 0 ? 0 : u;
-    u = (x >= src_cols) ? 0 : u;
-    x = x < 0 ? 0 : x;
-    x = (x >= src_cols) ? src_cols-1 : x;
-
-    y<0 ? y=0,v=0 : y;
-    y>=src_rows ? y=src_rows-1,v=0 : y;
-
-    int4 U, U1;
-    int V, V1;
-    float4 utmp1, utmp2;
-    float vtmp;
-    float4 scale_vec = INTER_RESIZE_COEF_SCALE;
-    utmp1 = u * scale_vec;
-    utmp2 = scale_vec - utmp1;
-    U = convert_int4(rint(utmp1));
-    U1 = convert_int4(rint(utmp2));
-    vtmp = v * INTER_RESIZE_COEF_SCALE;
-    V = rint(vtmp);
-    V1= rint(INTER_RESIZE_COEF_SCALE - vtmp);
-
-    int y_ = INC(y,src_rows);
-    int4 x_;
-    x_ =  ((x+1 >= src_cols) != 0) ? x : x+1;
-
-    int4 val1, val2, val;
-    int4 sdata1, sdata2, sdata3, sdata4;
-
-    int4 pos1 = mad24((int4)y, (int4)src_step, x+(int4)src_offset);
-    int4 pos2 = mad24((int4)y, (int4)src_step, x_+(int4)src_offset);
-    int4 pos3 = mad24((int4)y_, (int4)src_step, x+(int4)src_offset);
-    int4 pos4 = mad24((int4)y_, (int4)src_step, x_+(int4)src_offset);
-
-    sdata1.s0 = src[pos1.s0];
-    sdata1.s1 = src[pos1.s1];
-    sdata1.s2 = src[pos1.s2];
-    sdata1.s3 = src[pos1.s3];
-
-    sdata2.s0 = src[pos2.s0];
-    sdata2.s1 = src[pos2.s1];
-    sdata2.s2 = src[pos2.s2];
-    sdata2.s3 = src[pos2.s3];
-
-    sdata3.s0 = src[pos3.s0];
-    sdata3.s1 = src[pos3.s1];
-    sdata3.s2 = src[pos3.s2];
-    sdata3.s3 = src[pos3.s3];
-
-    sdata4.s0 = src[pos4.s0];
-    sdata4.s1 = src[pos4.s1];
-    sdata4.s2 = src[pos4.s2];
-    sdata4.s3 = src[pos4.s3];
-
-    val1 = mul24(U1 , sdata1) + mul24(U , sdata2);
-    val2 = mul24(U1 , sdata3) + mul24(U , sdata4);
-    val = mul24((int4)V1 , val1) + mul24((int4)V , val2);
-
-    val = ((val + (1<<(CAST_BITS-1))) >> CAST_BITS);
-
-    pos4 = mad24(dy, dst_step, gx+dst_offset);
-    pos4.y++;
-    pos4.z+=2;
-    pos4.w+=3;
-    uchar4 uval = convert_uchar4_sat(val);
-        int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows && (dst_offset&3)==0);
-    if(con)
-    {
-        *(__global uchar4*)(dst + pos4.x)=uval;
-    }
-    else
-    {
-        if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            dst[pos4.x]=uval.x;
-        }
-        if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            dst[pos4.y]=uval.y;
-        }
-        if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            dst[pos4.z]=uval.z;
-        }
-        if(gx+3 >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            dst[pos4.w]=uval.w;
-        }
-    }
-}
-
-__kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
-                     int dst_offset, int src_offset,int dst_step, int src_step,
-                     int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
-    int x = floor(sx), y = floor(sy);
-    float u = sx - x, v = sy - y;
-
-    if ( x<0 ) x=0,u=0;
-    if ( x>=src_cols ) x=src_cols-1,u=0;
-    if ( y<0 ) y=0,v=0;
-    if (y>=src_rows ) y=src_rows-1,v=0;
-
-    u = u * INTER_RESIZE_COEF_SCALE;
-    v = v * INTER_RESIZE_COEF_SCALE;
-
-    int U = rint(u);
-    int V = rint(v);
-    int U1= rint(INTER_RESIZE_COEF_SCALE - u);
-    int V1= rint(INTER_RESIZE_COEF_SCALE - v);
-
-    int y_ = INC(y,src_rows);
-    int x_ = INC(x,src_cols);
-    int4 srcpos;
-    srcpos.x = mad24(y, src_step, x+src_offset);
-    srcpos.y = mad24(y, src_step, x_+src_offset);
-    srcpos.z = mad24(y_, src_step, x+src_offset);
-    srcpos.w = mad24(y_, src_step, x_+src_offset);
-    int4 data0 = convert_int4(src[srcpos.x]);
-    int4 data1 = convert_int4(src[srcpos.y]);
-    int4 data2 = convert_int4(src[srcpos.z]);
-    int4 data3 = convert_int4(src[srcpos.w]);
-    int4 val = mul24((int4)mul24(U1, V1) ,  data0) + mul24((int4)mul24(U, V1) ,  data1)
-               +mul24((int4)mul24(U1, V) ,  data2)+mul24((int4)mul24(U, V) ,  data3);
-    int dstpos = mad24(dy, dst_step, dx+dst_offset);
-    uchar4 uval =   convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
-    if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
-         dst[dstpos] = uval;
-}
-
-__kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
-                     int dst_offset, int src_offset,int dst_step, int src_step,
-                     int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
-    int x = floor(sx), y = floor(sy);
-    float u = sx - x, v = sy - y;
-
-    if ( x<0 ) x=0,u=0;
-    if ( x>=src_cols ) x=src_cols-1,u=0;
-    if ( y<0 ) y=0,v=0;
-    if (y>=src_rows ) y=src_rows-1,v=0;
-
-    int y_ = INC(y,src_rows);
-    int x_ = INC(x,src_cols);
-    float u1 = 1.f-u;
-    float v1 = 1.f-v;
-    int4 srcpos;
-    srcpos.x = mad24(y, src_step, x+src_offset);
-    srcpos.y = mad24(y, src_step, x_+src_offset);
-    srcpos.z = mad24(y_, src_step, x+src_offset);
-    srcpos.w = mad24(y_, src_step, x_+src_offset);
-    float data0 = src[srcpos.x];
-    float data1 = src[srcpos.y];
-    float data2 = src[srcpos.z];
-    float data3 = src[srcpos.w];
-    float val1 = u1 *  data0 +
-                u  *  data1 ;
-    float val2 = u1 *  data2 +
-                u *  data3;
-    float val = v1 * val1 + v * val2;
-    int dstpos = mad24(dy, dst_step, dx+dst_offset);
-    if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
-         dst[dstpos] = val;
-}
-
-__kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src,
-                     int dst_offset, int src_offset,int dst_step, int src_step,
-                     int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
-    int x = floor(sx), y = floor(sy);
-    float u = sx - x, v = sy - y;
-
-    if ( x<0 ) x=0,u=0;
-    if ( x>=src_cols ) x=src_cols-1,u=0;
-    if ( y<0 ) y=0,v=0;
-    if (y>=src_rows ) y=src_rows-1,v=0;
-
-    int y_ = INC(y,src_rows);
-    int x_ = INC(x,src_cols);
-    float u1 = 1.f-u;
-    float v1 = 1.f-v;
-    int4 srcpos;
-    srcpos.x = mad24(y, src_step, x+src_offset);
-    srcpos.y = mad24(y, src_step, x_+src_offset);
-    srcpos.z = mad24(y_, src_step, x+src_offset);
-    srcpos.w = mad24(y_, src_step, x_+src_offset);
-    float4 s_data1, s_data2, s_data3, s_data4;
-    s_data1 = src[srcpos.x];
-    s_data2 = src[srcpos.y];
-    s_data3 = src[srcpos.z];
-    s_data4 = src[srcpos.w];
-    float4 val = u1 * v1 * s_data1 + u * v1 * s_data2
-              +u1 * v *s_data3 + u * v *s_data4;
-    int dstpos = mad24(dy, dst_step, dx+dst_offset);
-
-    if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
-         dst[dstpos] = val;
-}
-
-#elif defined NN
-
-__kernel void resizeNN(__global T * dst, __global T * src,
-                       int dst_offset, int src_offset, int dst_step, int src_step,
-                       int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify)
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if (dx < dst_cols && dy < dst_rows)
-    {
-        float s1 = dx * ifx, s2 = dy * ify;
-        int sx = min(convert_int_sat_rtn(s1), src_cols - 1);
-        int sy = min(convert_int_sat_rtn(s2), src_rows - 1);
-
-        int dst_index = mad24(dy, dst_step, dx + dst_offset);
-        int src_index = mad24(sy, src_step, sx + src_offset);
-
-        dst[dst_index] = src[src_index];
-    }
-}
-
-#elif defined AREA
-
-#ifdef AREA_FAST
-
-__kernel void resizeAREA_FAST(__global T * dst, __global T * src,
-                         int dst_offset, int src_offset, int dst_step, int src_step,
-                         int src_cols, int src_rows, int dst_cols, int dst_rows, WT ifx, WT ify,
-                         __global const int * dmap_tab, __global const int * smap_tab)
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if (dx < dst_cols && dy < dst_rows)
-    {
-        int dst_index = mad24(dy, dst_step, dst_offset + dx);
-
-        __global const int * xmap_tab = dmap_tab;
-        __global const int * ymap_tab = dmap_tab + dst_cols;
-        __global const int * sxmap_tab = smap_tab;
-        __global const int * symap_tab = smap_tab + XSCALE * dst_cols;
-
-        int sx = xmap_tab[dx], sy = ymap_tab[dy];
-        WTV sum = (WTV)(0);
-
-        #pragma unroll
-        for (int y = 0; y < YSCALE; ++y)
-        {
-            int src_index = mad24(symap_tab[y + sy], src_step, src_offset);
-            #pragma unroll
-            for (int x = 0; x < XSCALE; ++x)
-                sum += convertToWTV(src[src_index + sxmap_tab[sx + x]]);
-        }
-
-        dst[dst_index] = convertToT(convertToWT2V(sum) * (WT2V)(SCALE));
-    }
-}
-
-#else
-
-__kernel void resizeAREA(__global T * dst, __global T * src,
-                         int dst_offset, int src_offset, int dst_step, int src_step,
-                         int src_cols, int src_rows, int dst_cols, int dst_rows, WT ifx, WT ify,
-                         __global const int * ofs_tab, __global const int * map_tab,
-                         __global const float * alpha_tab)
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if (dx < dst_cols && dy < dst_rows)
-    {
-        int dst_index = mad24(dy, dst_step, dst_offset + dx);
-
-        __global const int * xmap_tab = map_tab;
-        __global const int * ymap_tab = (__global const int *)(map_tab + (src_cols << 1));
-        __global const float * xalpha_tab = alpha_tab;
-        __global const float * yalpha_tab = (__global const float *)(alpha_tab + (src_cols << 1));
-        __global const int * xofs_tab = ofs_tab;
-        __global const int * yofs_tab = (__global const int *)(ofs_tab + dst_cols + 1);
-
-        int xk0 = xofs_tab[dx], xk1 = xofs_tab[dx + 1];
-        int yk0 = yofs_tab[dy], yk1 = yofs_tab[dy + 1];
-
-        int sy0 = ymap_tab[yk0], sy1 = ymap_tab[yk1 - 1];
-        int sx0 = xmap_tab[xk0], sx1 = xmap_tab[xk1 - 1];
-
-        WTV sum = (WTV)(0), buf;
-        int src_index = mad24(sy0, src_step, src_offset);
-
-        for (int sy = sy0, yk = yk0; sy <= sy1; ++sy, src_index += src_step, ++yk)
-        {
-            WTV beta = (WTV)(yalpha_tab[yk]);
-            buf = (WTV)(0);
-
-            for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
-            {
-                WTV alpha = (WTV)(xalpha_tab[xk]);
-                buf += convertToWTV(src[src_index + sx]) * alpha;
-            }
-            sum += buf * beta;
-        }
-
-        dst[dst_index] = convertToT(sum);
-    }
-}
-
-#endif
-
-#endif
diff --git a/modules/ocl/src/opencl/imgproc_sobel3.cl b/modules/ocl/src/opencl/imgproc_sobel3.cl
deleted file mode 100644
index d6a995f55..000000000
--- a/modules/ocl/src/opencl/imgproc_sobel3.cl
+++ /dev/null
@@ -1,108 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////Macro for border type////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef BORDER_REPLICATE
-//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
-#endif
-
-#ifdef BORDER_REFLECT
-//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_REFLECT101
-//BORDER_REFLECT101:   gfedcb|abcdefgh|gfedcba
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_WRAP
-//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
-#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
-#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
-#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))
-#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
-#endif
-
-__kernel void sobel3(
-        __global uchar* Src,
-        __global float* DstX,
-        __global float* DstY,
-        int width, int height,
-        uint srcStride, uint dstStride,
-        float scale
-        )
-{
-    __local float lsmem[BLK_Y+2][BLK_X+2];
-
-    int lix = get_local_id(0);
-    int liy = get_local_id(1);
-
-    int gix = get_group_id(0);
-    int giy = get_group_id(1);
-
-    int id_x = get_global_id(0);
-    int id_y = get_global_id(1);
-
-    lsmem[liy+1][lix+1] = convert_float(Src[ id_y * srcStride + id_x ]);
-
-    int id_y_h = ADDR_H(id_y-1, 0,height);
-    int id_y_b = ADDR_B(id_y+1, height,id_y+1);
-
-    int id_x_l = ADDR_L(id_x-1, 0,width);
-    int id_x_r = ADDR_R(id_x+1, width,id_x+1);
-
-    if(liy==0)
-    {
-        lsmem[0][lix+1]=convert_float(Src[ id_y_h * srcStride + id_x ]);
-
-        if(lix==0)
-            lsmem[0][0]=convert_float(Src[ id_y_h * srcStride + id_x_l ]);
-        else if(lix==BLK_X-1)
-            lsmem[0][BLK_X+1]=convert_float(Src[ id_y_h * srcStride + id_x_r ]);
-    }
-    else if(liy==BLK_Y-1)
-    {
-        lsmem[BLK_Y+1][lix+1]=convert_float(Src[ id_y_b * srcStride + id_x ]);
-
-        if(lix==0)
-            lsmem[BLK_Y+1][0]=convert_float(Src[ id_y_b * srcStride + id_x_l ]);
-        else if(lix==BLK_X-1)
-            lsmem[BLK_Y+1][BLK_X+1]=convert_float(Src[ id_y_b * srcStride + id_x_r ]);
-    }
-
-    if(lix==0)
-        lsmem[liy+1][0]    = convert_float(Src[ id_y * srcStride + id_x_l ]);
-    else if(lix==BLK_X-1)
-        lsmem[liy+1][BLK_X+1] = convert_float(Src[ id_y * srcStride + id_x_r ]);
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float u1 = lsmem[liy][lix];
-    float u2 = lsmem[liy][lix+1];
-    float u3 = lsmem[liy][lix+2];
-
-    float m1 = lsmem[liy+1][lix];
-    float m2 = lsmem[liy+1][lix+1];
-    float m3 = lsmem[liy+1][lix+2];
-
-    float b1 = lsmem[liy+2][lix];
-    float b2 = lsmem[liy+2][lix+1];
-    float b3 = lsmem[liy+2][lix+2];
-
-    //m2 * scale;//
-    float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 );
-    DstX[ id_y * dstStride + id_x ] = dx * scale;
-
-    float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3);
-    DstY[ id_y * dstStride + id_x ] = dy * scale;
-}
\ No newline at end of file
diff --git a/modules/ocl/src/opencl/imgproc_threshold.cl b/modules/ocl/src/opencl/imgproc_threshold.cl
deleted file mode 100644
index 85631be36..000000000
--- a/modules/ocl/src/opencl/imgproc_threshold.cl
+++ /dev/null
@@ -1,136 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Zhang Ying, zhangying913@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#ifdef VECTORIZED
-
-__kernel void threshold(__global const T * restrict src, int src_offset, int src_step,
-                        __global T * dst, int dst_offset, int dst_step,
-                        T thresh, T max_val, int max_index, int rows, int cols)
-{
-    int gx = get_global_id(0);
-    int gy = get_global_id(1);
-
-    if (gx < cols && gy < rows)
-    {
-        gx *= VECSIZE;
-        int src_index = mad24(gy, src_step, src_offset + gx);
-        int dst_index = mad24(gy, dst_step, dst_offset + gx);
-
-#ifdef SRC_ALIGNED
-        VT sdata = *((__global VT *)(src + src_index));
-#else
-        VT sdata = VLOADN(0, src + src_index);
-#endif
-        VT vthresh = (VT)(thresh);
-
-#ifdef THRESH_BINARY
-        VT vecValue = sdata > vthresh ? (VT)max_val : (VT)(0);
-#elif defined THRESH_BINARY_INV
-        VT vecValue = sdata > vthresh ? (VT)(0) : (VT)max_val;
-#elif defined THRESH_TRUNC
-        VT vecValue = sdata > vthresh ? (VT)thresh : sdata;
-#elif defined THRESH_TOZERO
-        VT vecValue = sdata > vthresh ? sdata : (VT)(0);
-#elif defined THRESH_TOZERO_INV
-        VT vecValue = sdata > vthresh ? (VT)(0) : sdata;
-#endif
-
-        if (gx + VECSIZE <= max_index)
-#ifdef DST_ALIGNED
-            *(__global VT*)(dst + dst_index) = vecValue;
-#else
-            VSTOREN(vecValue, 0, dst + dst_index);
-#endif
-        else
-        {
-            __attribute__(( aligned(sizeof(VT)) )) T array[VECSIZE];
-            *((VT*)array) = vecValue;
-            #pragma unroll
-            for (int i = 0; i < VECSIZE; ++i)
-                if (gx + i < max_index)
-                    dst[dst_index + i] = array[i];
-        }
-    }
-}
-
-#else
-
-__kernel void threshold(__global const T * restrict src, int src_offset, int src_step,
-                        __global T * dst, int dst_offset, int dst_step,
-                        T thresh, T max_val, int rows, int cols)
-{
-    int gx = get_global_id(0);
-    int gy = get_global_id(1);
-
-    if (gx < cols && gy < rows)
-    {
-        int src_index = mad24(gy, src_step, src_offset + gx);
-        int dst_index = mad24(gy, dst_step, dst_offset + gx);
-
-        T sdata = src[src_index];
-
-#ifdef THRESH_BINARY
-        dst[dst_index] = sdata > thresh ? max_val : (T)(0);
-#elif defined THRESH_BINARY_INV
-        dst[dst_index] = sdata > thresh ? (T)(0) : max_val;
-#elif defined THRESH_TRUNC
-        dst[dst_index] = sdata > thresh ? thresh : sdata;
-#elif defined THRESH_TOZERO
-        dst[dst_index] = sdata > thresh ? sdata : (T)(0);
-#elif defined THRESH_TOZERO_INV
-        dst[dst_index] = sdata > thresh ? (T)(0) : sdata;
-#endif
-    }
-}
-
-#endif
diff --git a/modules/ocl/src/opencl/imgproc_warpAffine.cl b/modules/ocl/src/opencl/imgproc_warpAffine.cl
deleted file mode 100644
index 27f99e005..000000000
--- a/modules/ocl/src/opencl/imgproc_warpAffine.cl
+++ /dev/null
@@ -1,761 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Zhang Ying, zhangying913@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-
-//warpAffine kernel
-//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-typedef double F;
-typedef double4 F4;
-#define convert_F4 convert_double4
-#else
-typedef float F;
-typedef float4 F4;
-#define convert_F4 convert_float4
-#endif
-
-#define INTER_BITS 5
-#define INTER_TAB_SIZE (1 << INTER_BITS)
-#define INTER_SCALE 1.f/INTER_TAB_SIZE
-#define AB_BITS max(10, (int)INTER_BITS)
-#define AB_SCALE (1 << AB_BITS)
-#define INTER_REMAP_COEF_BITS 15
-#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
-
-inline void interpolateCubic( float x, float* coeffs )
-{
-    const float A = -0.75f;
-
-    coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A;
-    coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f;
-    coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f;
-    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
-}
-
-
-/**********************************************8UC1*********************************************
-***********************************************************************************************/
-__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
-                                 int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                 int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        dx = (dx<<2) - (dst_offset&3);
-
-        int round_delta = (AB_SCALE>>1);
-
-        int4 X, Y;
-        int4 sx, sy;
-        int4 DX = (int4)(dx, dx+1, dx+2, dx+3);
-        DX = (DX << AB_BITS);
-        F4 M0DX, M3DX;
-        M0DX = M[0] * convert_F4(DX);
-        M3DX = M[3] * convert_F4(DX);
-        X = convert_int4(rint(M0DX));
-        Y = convert_int4(rint(M3DX));
-        int tmp1, tmp2;
-        tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
-        tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
-
-        X += tmp1 + round_delta;
-        Y += tmp2 + round_delta;
-
-        sx = convert_int4(convert_short4(X >> AB_BITS));
-        sy = convert_int4(convert_short4(Y >> AB_BITS));
-
-        __global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
-        uchar4 dval = *d;
-        DX = (int4)(dx, dx+1, dx+2, dx+3);
-        int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows;
-        int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows;
-        int4 spos = src_offset + sy * srcStep + sx;
-        uchar4 sval;
-        sval.s0 = scon.s0 ? src[spos.s0] : 0;
-        sval.s1 = scon.s1 ? src[spos.s1] : 0;
-        sval.s2 = scon.s2 ? src[spos.s2] : 0;
-        sval.s3 = scon.s3 ? src[spos.s3] : 0;
-        dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
-        *d = dval;
-    }
-}
-
-__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows,
-                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                     int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        dx = (dx<<2) - (dst_offset&3);
-
-        int round_delta = ((AB_SCALE >> INTER_BITS) >> 1);
-
-        int4 X, Y;
-        short4  ax, ay;
-        int4 sx, sy;
-        int4 DX = (int4)(dx, dx+1, dx+2, dx+3);
-        DX = (DX << AB_BITS);
-        F4 M0DX, M3DX;
-        M0DX = M[0] * convert_F4(DX);
-        M3DX = M[3] * convert_F4(DX);
-        X = convert_int4(rint(M0DX));
-        Y = convert_int4(rint(M3DX));
-
-        int tmp1, tmp2;
-        tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
-        tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
-
-        X += tmp1 + round_delta;
-        Y += tmp2 + round_delta;
-
-        X = X >> (AB_BITS - INTER_BITS);
-        Y = Y >> (AB_BITS - INTER_BITS);
-
-        sx = convert_int4(convert_short4(X >> INTER_BITS));
-        sy = convert_int4(convert_short4(Y >> INTER_BITS));
-        ax = convert_short4(X & (INTER_TAB_SIZE-1));
-        ay = convert_short4(Y & (INTER_TAB_SIZE-1));
-
-        uchar4 v0, v1, v2,v3;
-        int4 scon0, scon1, scon2, scon3;
-        int4 spos0, spos1, spos2, spos3;
-
-        scon0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows);
-        scon1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows);
-        scon2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows);
-        scon3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows);
-        spos0 = src_offset + sy * srcStep + sx;
-        spos1 = src_offset + sy * srcStep + sx + 1;
-        spos2 = src_offset + (sy+1) * srcStep + sx;
-        spos3 = src_offset + (sy+1) * srcStep + sx + 1;
-
-        v0.s0 = scon0.s0 ? src[spos0.s0] : 0;
-        v1.s0 = scon1.s0 ? src[spos1.s0] : 0;
-        v2.s0 = scon2.s0 ? src[spos2.s0] : 0;
-        v3.s0 = scon3.s0 ? src[spos3.s0] : 0;
-
-        v0.s1 = scon0.s1 ? src[spos0.s1] : 0;
-        v1.s1 = scon1.s1 ? src[spos1.s1] : 0;
-        v2.s1 = scon2.s1 ? src[spos2.s1] : 0;
-        v3.s1 = scon3.s1 ? src[spos3.s1] : 0;
-
-        v0.s2 = scon0.s2 ? src[spos0.s2] : 0;
-        v1.s2 = scon1.s2 ? src[spos1.s2] : 0;
-        v2.s2 = scon2.s2 ? src[spos2.s2] : 0;
-        v3.s2 = scon3.s2 ? src[spos3.s2] : 0;
-
-        v0.s3 = scon0.s3 ? src[spos0.s3] : 0;
-        v1.s3 = scon1.s3 ? src[spos1.s3] : 0;
-        v2.s3 = scon2.s3 ? src[spos2.s3] : 0;
-        v3.s3 = scon3.s3 ? src[spos3.s3] : 0;
-
-        short4 itab0, itab1, itab2, itab3;
-        float4 taby, tabx;
-        taby = INTER_SCALE * convert_float4(ay);
-        tabx = INTER_SCALE * convert_float4(ax);
-
-        itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
-        itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE ));
-        itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
-        itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE ));
-
-
-        int4 val;
-        uchar4 tval;
-        val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1)
-              + convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3);
-        tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
-
-        __global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
-        uchar4 dval = *d;
-        DX = (int4)(dx, dx+1, dx+2, dx+3);
-        int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows;
-        dval = convert_uchar4(dcon != 0) ? tval : dval;
-        *d = dval;
-    }
-}
-
-__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
-                                    int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                    int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
-
-        int X0 = rint(M[0] * dx * AB_SCALE);
-        int Y0 = rint(M[3] * dx * AB_SCALE);
-        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
-        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-        int X = X0 >> (AB_BITS - INTER_BITS);
-        int Y = Y0 >> (AB_BITS - INTER_BITS);
-
-        short sx = (short)(X >> INTER_BITS) - 1;
-        short sy = (short)(Y >> INTER_BITS) - 1;
-        short ay = (short)(Y & (INTER_TAB_SIZE-1));
-        short ax = (short)(X & (INTER_TAB_SIZE-1));
-
-        uchar v[16];
-        int i, j;
-
-#pragma unroll 4
-        for(i=0; i<4;  i++)
-            for(j=0; j<4;  j++)
-            {
-                v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0;
-            }
-
-        short itab[16];
-        float tab1y[4], tab1x[4];
-        float axx, ayy;
-
-        ayy = 1.f/INTER_TAB_SIZE * ay;
-        axx = 1.f/INTER_TAB_SIZE * ax;
-        interpolateCubic(ayy, tab1y);
-        interpolateCubic(axx, tab1x);
-        int isum = 0;
-
-#pragma unroll 16
-        for( i=0; i<16; i++ )
-        {
-            F v = tab1y[(i>>2)] * tab1x[(i&3)];
-            isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) );
-        }
-
-        if( isum != INTER_REMAP_COEF_SCALE )
-        {
-            int k1, k2;
-            int diff = isum - INTER_REMAP_COEF_SCALE;
-            int Mk1=2, Mk2=2, mk1=2, mk2=2;
-            for( k1 = 2; k1 < 4; k1++ )
-                for( k2 = 2; k2 < 4; k2++ )
-                {
-                    if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
-                        mk1 = k1, mk2 = k2;
-                    else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
-                        Mk1 = k1, Mk2 = k2;
-                }
-            diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
-        }
-
-        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            int sum=0;
-            for ( i =0; i<16; i++ )
-            {
-                sum += v[i] * itab[i] ;
-            }
-            dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
-        }
-    }
-}
-
-/**********************************************8UC4*********************************************
-***********************************************************************************************/
-
-__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
-                                 int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                 int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        int round_delta = (AB_SCALE >> 1);
-
-        int X0 = rint(M[0] * dx * AB_SCALE);
-        int Y0 = rint(M[3] * dx * AB_SCALE);
-        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
-        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-
-        int sx0 = (short)(X0 >> AB_BITS);
-        int sy0 = (short)(Y0 >> AB_BITS);
-
-        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-            dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*(srcStep>>2)+sx0] : (uchar4)0;
-    }
-}
-
-__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
-                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                     int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
-
-        src_offset = (src_offset>>2);
-        srcStep = (srcStep>>2);
-
-        int tmp = (dx << AB_BITS);
-        int X0 = rint(M[0] * tmp);
-        int Y0 = rint(M[3] * tmp);
-        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
-        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-        X0 = X0 >> (AB_BITS - INTER_BITS);
-        Y0 = Y0 >> (AB_BITS - INTER_BITS);
-
-        short sx0 = (short)(X0 >> INTER_BITS);
-        short sy0 = (short)(Y0 >> INTER_BITS);
-        short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
-        short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
-
-        int4 v0, v1, v2, v3;
-
-        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0]) : 0;
-        v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0+1]) : 0;
-        v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? convert_int4(src[src_offset+(sy0+1) * srcStep + sx0]) : 0;
-        v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? convert_int4(src[src_offset+(sy0+1) * srcStep + sx0+1]) : 0;
-
-        int itab0, itab1, itab2, itab3;
-        float taby, tabx;
-        taby = 1.f/INTER_TAB_SIZE*ay0;
-        tabx = 1.f/INTER_TAB_SIZE*ax0;
-
-        itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
-        itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
-        itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
-        itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));
-
-        int4 val;
-        val = v0 * itab0 +  v1 * itab1 + v2 * itab2 + v3 * itab3;
-
-        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-            dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] =  convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
-    }
-}
-
-__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
-                                    int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                    int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
-
-        src_offset = (src_offset>>2);
-        srcStep = (srcStep>>2);
-        dst_offset = (dst_offset>>2);
-        dstStep = (dstStep>>2);
-
-        int tmp = (dx << AB_BITS);
-        int X0 = rint(M[0] * tmp);
-        int Y0 = rint(M[3] * tmp);
-        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
-        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-        X0 = X0 >> (AB_BITS - INTER_BITS);
-        Y0 = Y0 >> (AB_BITS - INTER_BITS);
-
-        int sx = (short)(X0 >> INTER_BITS) - 1;
-        int sy = (short)(Y0 >> INTER_BITS) - 1;
-        int ay = (short)(Y0 & (INTER_TAB_SIZE-1));
-        int ax = (short)(X0 & (INTER_TAB_SIZE-1));
-
-        uchar4 v[16];
-        int i,j;
-#pragma unroll 4
-        for(i=0; i<4; i++)
-            for(j=0; j<4; j++)
-            {
-                v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)])  : (uchar4)0;
-            }
-        int itab[16];
-        float tab1y[4], tab1x[4];
-        float axx, ayy;
-
-        ayy = INTER_SCALE * ay;
-        axx = INTER_SCALE * ax;
-        interpolateCubic(ayy, tab1y);
-        interpolateCubic(axx, tab1x);
-        int isum = 0;
-
-#pragma unroll 16
-        for( i=0; i<16; i++ )
-        {
-            float tmp;
-            tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE;
-            itab[i] = rint(tmp);
-            isum += itab[i];
-        }
-
-        if( isum != INTER_REMAP_COEF_SCALE )
-        {
-            int k1, k2;
-            int diff = isum - INTER_REMAP_COEF_SCALE;
-            int Mk1=2, Mk2=2, mk1=2, mk2=2;
-
-            for( k1 = 2; k1 < 4; k1++ )
-                for( k2 = 2; k2 < 4; k2++ )
-                {
-
-                    if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
-                        mk1 = k1, mk2 = k2;
-                    else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
-                        Mk1 = k1, Mk2 = k2;
-                }
-
-            diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
-        }
-
-        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            int4 sum=0;
-            for ( i =0; i<16; i++ )
-            {
-                sum += convert_int4(v[i]) * itab[i];
-            }
-            dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
-        }
-    }
-}
-
-
-/**********************************************32FC1********************************************
-***********************************************************************************************/
-
-__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
-                                 int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                 int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        int round_delta = AB_SCALE/2;
-
-        int X0 = rint(M[0] * dx * AB_SCALE);
-        int Y0 = rint(M[3] * dx * AB_SCALE);
-        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
-        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-
-        short sx0 = (short)(X0 >> AB_BITS);
-        short sy0 = (short)(Y0 >> AB_BITS);
-
-        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-            dst[(dst_offset>>2)+dy*dstStep+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*srcStep+sx0] : 0;
-    }
-}
-
-__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
-                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                     int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
-
-        src_offset = (src_offset>>2);
-
-        int X0 = rint(M[0] * dx * AB_SCALE);
-        int Y0 = rint(M[3] * dx * AB_SCALE);
-        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
-        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-        X0 = X0 >> (AB_BITS - INTER_BITS);
-        Y0 = Y0 >> (AB_BITS - INTER_BITS);
-
-        short sx0 = (short)(X0 >> INTER_BITS);
-        short sy0 = (short)(Y0 >> INTER_BITS);
-        short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
-        short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
-
-        float v0, v1, v2, v3;
-
-        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
-        v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0;
-        v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0;
-        v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0;
-
-        float tab[4];
-        float taby[2], tabx[2];
-        taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0;
-        taby[1] = 1.f/INTER_TAB_SIZE*ay0;
-        tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0;
-        tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
-
-        tab[0] = taby[0] * tabx[0];
-        tab[1] = taby[0] * tabx[1];
-        tab[2] = taby[1] * tabx[0];
-        tab[3] = taby[1] * tabx[1];
-
-        float sum = 0;
-        sum += v0 * tab[0] +  v1 * tab[1] +  v2 * tab[2] +  v3 * tab[3];
-        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-            dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
-    }
-}
-
-__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
-                                    int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                    int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
-
-        src_offset = (src_offset>>2);
-        dst_offset = (dst_offset>>2);
-
-        int X0 = rint(M[0] * dx * AB_SCALE);
-        int Y0 = rint(M[3] * dx * AB_SCALE);
-        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
-        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-        X0 = X0 >> (AB_BITS - INTER_BITS);
-        Y0 = Y0 >> (AB_BITS - INTER_BITS);
-
-        short sx = (short)(X0 >> INTER_BITS) - 1;
-        short sy = (short)(Y0 >> INTER_BITS) - 1;
-        short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
-        short ax = (short)(X0 & (INTER_TAB_SIZE-1));
-
-        float v[16];
-        int i;
-
-        for(i=0; i<16;  i++)
-            v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
-
-        float tab[16];
-        float tab1y[4], tab1x[4];
-        float axx, ayy;
-
-        ayy = 1.f/INTER_TAB_SIZE * ay;
-        axx = 1.f/INTER_TAB_SIZE * ax;
-        interpolateCubic(ayy, tab1y);
-        interpolateCubic(axx, tab1x);
-
-#pragma unroll 4
-        for( i=0; i<16; i++ )
-        {
-            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
-        }
-
-        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            float sum = 0;
-#pragma unroll 4
-            for ( i =0; i<16; i++ )
-            {
-                sum += v[i] * tab[i];
-            }
-            dst[dst_offset+dy*dstStep+dx] = sum;
-
-        }
-    }
-}
-
-
-/**********************************************32FC4********************************************
-***********************************************************************************************/
-
-__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
-                                 int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                 int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        int round_delta = AB_SCALE/2;
-
-        int X0 = rint(M[0] * dx * AB_SCALE);
-        int Y0 = rint(M[3] * dx * AB_SCALE);
-        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
-        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-
-        short sx0 = (short)(X0 >> AB_BITS);
-        short sy0 = (short)(Y0 >> AB_BITS);
-
-        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-            dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : (float4)0;
-    }
-}
-
-__kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
-                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                     int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
-
-        src_offset = (src_offset>>4);
-        dst_offset = (dst_offset>>4);
-        srcStep = (srcStep>>2);
-        dstStep = (dstStep>>2);
-
-        int X0 = rint(M[0] * dx * AB_SCALE);
-        int Y0 = rint(M[3] * dx * AB_SCALE);
-        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
-        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-        X0 = X0 >> (AB_BITS - INTER_BITS);
-        Y0 = Y0 >> (AB_BITS - INTER_BITS);
-
-        short sx0 = (short)(X0 >> INTER_BITS);
-        short sy0 = (short)(Y0 >> INTER_BITS);
-        short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
-        short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
-
-        float4 v0, v1, v2, v3;
-
-        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
-        v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
-        v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
-        v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
-
-        float tab[4];
-        float taby[2], tabx[2];
-        taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0;
-        taby[1] = 1.f/INTER_TAB_SIZE*ay0;
-        tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0;
-        tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
-
-        tab[0] = taby[0] * tabx[0];
-        tab[1] = taby[0] * tabx[1];
-        tab[2] = taby[1] * tabx[0];
-        tab[3] = taby[1] * tabx[1];
-
-        float4 sum = 0;
-        sum += v0 * tab[0] +  v1 * tab[1] +  v2 * tab[2] +  v3 * tab[3];
-        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-            dst[dst_offset+dy*dstStep+dx] = sum;
-    }
-}
-
-__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
-                                    int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                    int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
-
-        src_offset = (src_offset>>4);
-        dst_offset = (dst_offset>>4);
-        srcStep = (srcStep>>2);
-        dstStep = (dstStep>>2);
-
-        int X0 = rint(M[0] * dx * AB_SCALE);
-        int Y0 = rint(M[3] * dx * AB_SCALE);
-        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
-        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-        X0 = X0 >> (AB_BITS - INTER_BITS);
-        Y0 = Y0 >> (AB_BITS - INTER_BITS);
-
-        short sx = (short)(X0 >> INTER_BITS) - 1;
-        short sy = (short)(Y0 >> INTER_BITS) - 1;
-        short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
-        short ax = (short)(X0 & (INTER_TAB_SIZE-1));
-
-        float4 v[16];
-        int i;
-
-        for(i=0; i<16;  i++)
-            v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
-
-        float tab[16];
-        float tab1y[4], tab1x[4];
-        float axx, ayy;
-
-        ayy = 1.f/INTER_TAB_SIZE * ay;
-        axx = 1.f/INTER_TAB_SIZE * ax;
-        interpolateCubic(ayy, tab1y);
-        interpolateCubic(axx, tab1x);
-
-#pragma unroll 4
-        for( i=0; i<16; i++ )
-        {
-            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
-        }
-
-        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            float4 sum = 0;
-#pragma unroll 4
-            for ( i =0; i<16; i++ )
-            {
-                sum += v[i] * tab[i];
-            }
-            dst[dst_offset+dy*dstStep+dx] = sum;
-
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/imgproc_warpPerspective.cl b/modules/ocl/src/opencl/imgproc_warpPerspective.cl
deleted file mode 100644
index 97f86640b..000000000
--- a/modules/ocl/src/opencl/imgproc_warpPerspective.cl
+++ /dev/null
@@ -1,682 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Zhang Ying, zhangying913@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-
-//wrapPerspective kernel
-//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-typedef double F;
-typedef double4 F4;
-#define convert_F4 convert_double4
-#else
-typedef float F;
-typedef float4 F4;
-#define convert_F4 convert_float4
-#endif
-
-
-#define INTER_BITS 5
-#define INTER_TAB_SIZE (1 << INTER_BITS)
-#define INTER_SCALE 1.f/INTER_TAB_SIZE
-#define AB_BITS max(10, (int)INTER_BITS)
-#define AB_SCALE (1 << AB_BITS)
-#define INTER_REMAP_COEF_BITS 15
-#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
-
-inline void interpolateCubic( float x, float* coeffs )
-{
-    const float A = -0.75f;
-
-    coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A;
-    coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f;
-    coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f;
-    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
-}
-
-
-/**********************************************8UC1*********************************************
-***********************************************************************************************/
-__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
-                                      int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                      int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        dx = (dx<<2) - (dst_offset&3);
-
-        F4 DX = (F4)(dx, dx+1, dx+2, dx+3);
-        F4 X0 = M[0]*DX + M[1]*dy + M[2];
-        F4 Y0 = M[3]*DX + M[4]*dy + M[5];
-        F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0;
-        W = (W!=zero) ? one/W : zero;
-        short4 X = convert_short4_sat_rte(X0*W);
-        short4 Y = convert_short4_sat_rte(Y0*W);
-        int4 sx = convert_int4(X);
-        int4 sy = convert_int4(Y);
-
-        int4 DXD = (int4)(dx, dx+1, dx+2, dx+3);
-        __global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
-        uchar4 dval = *d;
-        int4 dcon = DXD >= 0 && DXD < dst_cols && dy >= 0 && dy < dst_rows;
-        int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows;
-        int4 spos = src_offset + sy * srcStep + sx;
-        uchar4 sval;
-        sval.s0 = scon.s0 ? src[spos.s0] : 0;
-        sval.s1 = scon.s1 ? src[spos.s1] : 0;
-        sval.s2 = scon.s2 ? src[spos.s2] : 0;
-        sval.s3 = scon.s3 ? src[spos.s3] : 0;
-        dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
-        *d = dval;
-    }
-}
-
-__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst,
-        int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
-        int dstStep, int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        F X0 = M[0]*dx + M[1]*dy + M[2];
-        F Y0 = M[3]*dx + M[4]*dy + M[5];
-        F W = M[6]*dx + M[7]*dy + M[8];
-        W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
-        int X = rint(X0*W);
-        int Y = rint(Y0*W);
-
-        int sx = convert_short_sat(X >> INTER_BITS);
-        int sy = convert_short_sat(Y >> INTER_BITS);
-        int ay = (short)(Y & (INTER_TAB_SIZE-1));
-        int ax = (short)(X & (INTER_TAB_SIZE-1));
-
-        uchar v[4];
-        int i;
-#pragma unroll 4
-        for(i=0; i<4;  i++)
-            v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0;
-
-        short itab[4];
-        float tab1y[2], tab1x[2];
-        tab1y[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay;
-        tab1y[1] = 1.f/INTER_TAB_SIZE*ay;
-        tab1x[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax;
-        tab1x[1] = 1.f/INTER_TAB_SIZE*ax;
-
-#pragma unroll 4
-        for(i=0; i<4;  i++)
-        {
-            float v = tab1y[(i>>1)] * tab1x[(i&1)];
-            itab[i] = convert_short_sat_rte( v * INTER_REMAP_COEF_SCALE );
-        }
-        if(dx >=0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            int sum = 0;
-            for ( i =0; i<4; i++ )
-            {
-                sum += v[i] * itab[i] ;
-            }
-            dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat ( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
-        }
-    }
-}
-
-__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
-        int dst_cols, int dst_rows, int srcStep, int dstStep,
-        int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        F X0 = M[0]*dx + M[1]*dy + M[2];
-        F Y0 = M[3]*dx + M[4]*dy + M[5];
-        F W = M[6]*dx + M[7]*dy + M[8];
-        W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
-        int X = rint(X0*W);
-        int Y = rint(Y0*W);
-
-        short sx = convert_short_sat(X >> INTER_BITS) - 1;
-        short sy = convert_short_sat(Y >> INTER_BITS) - 1;
-        short ay = (short)(Y & (INTER_TAB_SIZE-1));
-        short ax = (short)(X & (INTER_TAB_SIZE-1));
-
-        uchar v[16];
-        int i, j;
-
-#pragma unroll 4
-        for(i=0; i<4;  i++)
-            for(j=0; j<4;  j++)
-            {
-                v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0;
-            }
-
-        short itab[16];
-        float tab1y[4], tab1x[4];
-        float axx, ayy;
-
-        ayy = 1.f/INTER_TAB_SIZE * ay;
-        axx = 1.f/INTER_TAB_SIZE * ax;
-        interpolateCubic(ayy, tab1y);
-        interpolateCubic(axx, tab1x);
-
-        int isum = 0;
-#pragma unroll 16
-        for( i=0; i<16; i++ )
-        {
-            F v = tab1y[(i>>2)] * tab1x[(i&3)];
-            isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) );
-        }
-        if( isum != INTER_REMAP_COEF_SCALE )
-        {
-            int k1, k2;
-            int diff = isum - INTER_REMAP_COEF_SCALE;
-            int Mk1=2, Mk2=2, mk1=2, mk2=2;
-            for( k1 = 2; k1 < 4; k1++ )
-                for( k2 = 2; k2 < 4; k2++ )
-                {
-                    if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
-                        mk1 = k1, mk2 = k2;
-                    else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
-                        Mk1 = k1, Mk2 = k2;
-                }
-            diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
-        }
-
-
-        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            int sum=0;
-            for ( i =0; i<16; i++ )
-            {
-                sum += v[i] * itab[i] ;
-            }
-            dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
-        }
-    }
-}
-
-/**********************************************8UC4*********************************************
-***********************************************************************************************/
-
-__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
-                                      int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
-                                      int dstStep, int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-
-        F X0 = M[0]*dx + M[1]*dy + M[2];
-        F Y0 = M[3]*dx + M[4]*dy + M[5];
-        F W = M[6]*dx + M[7]*dy + M[8];
-        W = (W != 0.0f) ? 1.f/W : 0.0f;
-        short sx = convert_short_sat_rte(X0*W);
-        short sy = convert_short_sat_rte(Y0*W);
-
-        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-            dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*(srcStep>>2)+sx] : (uchar4)0;
-    }
-}
-
-__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
-        int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
-        int dstStep, int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        src_offset = (src_offset>>2);
-        srcStep = (srcStep>>2);
-
-        F X0 = M[0]*dx + M[1]*dy + M[2];
-        F Y0 = M[3]*dx + M[4]*dy + M[5];
-        F W = M[6]*dx + M[7]*dy + M[8];
-        W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
-        int X = rint(X0*W);
-        int Y = rint(Y0*W);
-
-        short sx = convert_short_sat(X >> INTER_BITS);
-        short sy = convert_short_sat(Y >> INTER_BITS);
-        short ay = (short)(Y & (INTER_TAB_SIZE-1));
-        short ax = (short)(X & (INTER_TAB_SIZE-1));
-
-
-        int4 v0, v1, v2, v3;
-
-        v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0;
-        v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0;
-        v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0;
-        v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0;
-
-        int itab0, itab1, itab2, itab3;
-        float taby, tabx;
-        taby = 1.f/INTER_TAB_SIZE*ay;
-        tabx = 1.f/INTER_TAB_SIZE*ax;
-
-        itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
-        itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
-        itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
-        itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));
-
-        int4 val;
-        val = v0 * itab0 +  v1 * itab1 + v2 * itab2 + v3 * itab3;
-
-        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-            dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] =  convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
-    }
-}
-
-__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
-        int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
-        int dstStep, int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        src_offset = (src_offset>>2);
-        srcStep = (srcStep>>2);
-        dst_offset = (dst_offset>>2);
-        dstStep = (dstStep>>2);
-
-        F X0 = M[0]*dx + M[1]*dy + M[2];
-        F Y0 = M[3]*dx + M[4]*dy + M[5];
-        F W = M[6]*dx + M[7]*dy + M[8];
-        W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
-        int X = rint(X0*W);
-        int Y = rint(Y0*W);
-
-        short sx = convert_short_sat(X >> INTER_BITS) - 1;
-        short sy = convert_short_sat(Y >> INTER_BITS) - 1;
-        short ay = (short)(Y & (INTER_TAB_SIZE-1));
-        short ax = (short)(X & (INTER_TAB_SIZE-1));
-
-        uchar4 v[16];
-        int i,j;
-#pragma unroll 4
-        for(i=0; i<4; i++)
-            for(j=0; j<4; j++)
-            {
-                v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)])  : (uchar4)0;
-            }
-        int itab[16];
-        float tab1y[4], tab1x[4];
-        float axx, ayy;
-
-        ayy = INTER_SCALE * ay;
-        axx = INTER_SCALE * ax;
-        interpolateCubic(ayy, tab1y);
-        interpolateCubic(axx, tab1x);
-        int isum = 0;
-
-#pragma unroll 16
-        for( i=0; i<16; i++ )
-        {
-            float tmp;
-            tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE;
-            itab[i] = rint(tmp);
-            isum += itab[i];
-        }
-
-        if( isum != INTER_REMAP_COEF_SCALE )
-        {
-            int k1, k2;
-            int diff = isum - INTER_REMAP_COEF_SCALE;
-            int Mk1=2, Mk2=2, mk1=2, mk2=2;
-
-            for( k1 = 2; k1 < 4; k1++ )
-                for( k2 = 2; k2 < 4; k2++ )
-                {
-
-                    if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
-                        mk1 = k1, mk2 = k2;
-                    else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
-                        Mk1 = k1, Mk2 = k2;
-                }
-
-            diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
-        }
-
-        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            int4 sum=0;
-            for ( i =0; i<16; i++ )
-            {
-                sum += convert_int4(v[i]) * itab[i];
-            }
-            dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
-        }
-    }
-}
-
-
-/**********************************************32FC1********************************************
-***********************************************************************************************/
-
-__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
-                                      int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                      int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        F X0 = M[0]*dx + M[1]*dy + M[2];
-        F Y0 = M[3]*dx + M[4]*dy + M[5];
-        F W = M[6]*dx + M[7]*dy + M[8];
-        W = (W != 0.0f) ? 1.f/W : 0.0f;
-        short sx = convert_short_sat_rte(X0*W);
-        short sy = convert_short_sat_rte(Y0*W);
-
-        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-            dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0;
-    }
-}
-
-__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
-        int dst_cols, int dst_rows, int srcStep, int dstStep,
-        int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        src_offset = (src_offset>>2);
-
-        F X0 = M[0]*dx + M[1]*dy + M[2];
-        F Y0 = M[3]*dx + M[4]*dy + M[5];
-        F W = M[6]*dx + M[7]*dy + M[8];
-        W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
-        int X = rint(X0*W);
-        int Y = rint(Y0*W);
-
-        short sx = convert_short_sat(X >> INTER_BITS);
-        short sy = convert_short_sat(Y >> INTER_BITS);
-        short ay = (short)(Y & (INTER_TAB_SIZE-1));
-        short ax = (short)(X & (INTER_TAB_SIZE-1));
-
-        float v0, v1, v2, v3;
-
-        v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0;
-        v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0;
-        v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0;
-        v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0;
-
-        float tab[4];
-        float taby[2], tabx[2];
-        taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay;
-        taby[1] = 1.f/INTER_TAB_SIZE*ay;
-        tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax;
-        tabx[1] = 1.f/INTER_TAB_SIZE*ax;
-
-        tab[0] = taby[0] * tabx[0];
-        tab[1] = taby[0] * tabx[1];
-        tab[2] = taby[1] * tabx[0];
-        tab[3] = taby[1] * tabx[1];
-
-        float sum = 0;
-        sum += v0 * tab[0] +  v1 * tab[1] +  v2 * tab[2] +  v3 * tab[3];
-        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-            dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
-    }
-}
-
-__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
-        int dst_cols, int dst_rows, int srcStep, int dstStep,
-        int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        src_offset = (src_offset>>2);
-        dst_offset = (dst_offset>>2);
-
-        F X0 = M[0]*dx + M[1]*dy + M[2];
-        F Y0 = M[3]*dx + M[4]*dy + M[5];
-        F W = M[6]*dx + M[7]*dy + M[8];
-        W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
-        int X = rint(X0*W);
-        int Y = rint(Y0*W);
-
-        short sx = convert_short_sat(X >> INTER_BITS) - 1;
-        short sy = convert_short_sat(Y >> INTER_BITS) - 1;
-        short ay = (short)(Y & (INTER_TAB_SIZE-1));
-        short ax = (short)(X & (INTER_TAB_SIZE-1));
-
-        float v[16];
-        int i;
-
-        for(i=0; i<16;  i++)
-            v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0;
-
-        float tab[16];
-        float tab1y[4], tab1x[4];
-        float axx, ayy;
-
-        ayy = 1.f/INTER_TAB_SIZE * ay;
-        axx = 1.f/INTER_TAB_SIZE * ax;
-        interpolateCubic(ayy, tab1y);
-        interpolateCubic(axx, tab1x);
-
-#pragma unroll 4
-        for( i=0; i<16; i++ )
-        {
-            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
-        }
-
-        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            float sum = 0;
-#pragma unroll 4
-            for ( i =0; i<16; i++ )
-            {
-                sum += v[i] * tab[i];
-            }
-            dst[dst_offset+dy*dstStep+dx] = sum;
-
-        }
-    }
-}
-
-
-/**********************************************32FC4********************************************
-***********************************************************************************************/
-
-__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
-                                      int dst_cols, int dst_rows, int srcStep, int dstStep,
-                                      int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        F X0 = M[0]*dx + M[1]*dy + M[2];
-        F Y0 = M[3]*dx + M[4]*dy + M[5];
-        F W = M[6]*dx + M[7]*dy + M[8];
-        W =(W != 0.0f)? 1.f/W : 0.0f;
-        short sx = convert_short_sat_rte(X0*W);
-        short sy = convert_short_sat_rte(Y0*W);
-
-        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-            dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : (float)0;
-    }
-}
-
-__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
-        int dst_cols, int dst_rows, int srcStep, int dstStep,
-        int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows)
-    {
-        src_offset = (src_offset>>4);
-        dst_offset = (dst_offset>>4);
-        srcStep = (srcStep>>2);
-        dstStep = (dstStep>>2);
-
-        F X0 = M[0]*dx + M[1]*dy + M[2];
-        F Y0 = M[3]*dx + M[4]*dy + M[5];
-        F W = M[6]*dx + M[7]*dy + M[8];
-        W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
-        int X = rint(X0*W);
-        int Y = rint(Y0*W);
-
-        short sx0 = convert_short_sat(X >> INTER_BITS);
-        short sy0 = convert_short_sat(Y >> INTER_BITS);
-        short ay0 = (short)(Y & (INTER_TAB_SIZE-1));
-        short ax0 = (short)(X & (INTER_TAB_SIZE-1));
-
-
-        float4 v0, v1, v2, v3;
-
-        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
-        v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
-        v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
-        v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
-
-        float tab[4];
-        float taby[2], tabx[2];
-        taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0;
-        taby[1] = 1.f/INTER_TAB_SIZE*ay0;
-        tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0;
-        tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
-
-        tab[0] = taby[0] * tabx[0];
-        tab[1] = taby[0] * tabx[1];
-        tab[2] = taby[1] * tabx[0];
-        tab[3] = taby[1] * tabx[1];
-
-        float4 sum = 0;
-        sum += v0 * tab[0] +  v1 * tab[1] +  v2 * tab[2] +  v3 * tab[3];
-        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-            dst[dst_offset+dy*dstStep+dx] = sum;
-    }
-}
-
-__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst,
-        int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
-        int dstStep, int src_offset, int dst_offset,  __constant F * M, int threadCols )
-{
-    int dx = get_global_id(0);
-    int dy = get_global_id(1);
-
-    if( dx < threadCols && dy < dst_rows )
-    {
-        src_offset = (src_offset>>4);
-        dst_offset = (dst_offset>>4);
-        srcStep = (srcStep>>2);
-        dstStep = (dstStep>>2);
-
-        F X0 = M[0]*dx + M[1]*dy + M[2];
-        F Y0 = M[3]*dx + M[4]*dy + M[5];
-        F W = M[6]*dx + M[7]*dy + M[8];
-        W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
-        int X = rint(X0*W);
-        int Y = rint(Y0*W);
-
-        short sx = convert_short_sat(X >> INTER_BITS)-1;
-        short sy = convert_short_sat(Y >> INTER_BITS)-1;
-        short ay = (short)(Y & (INTER_TAB_SIZE-1));
-        short ax = (short)(X & (INTER_TAB_SIZE-1));
-
-
-        float4 v[16];
-        int i;
-
-        for(i=0; i<16;  i++)
-            v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
-
-        float tab[16];
-        float tab1y[4], tab1x[4];
-        float axx, ayy;
-
-        ayy = 1.f/INTER_TAB_SIZE * ay;
-        axx = 1.f/INTER_TAB_SIZE * ax;
-        interpolateCubic(ayy, tab1y);
-        interpolateCubic(axx, tab1x);
-
-#pragma unroll 4
-        for( i=0; i<16; i++ )
-        {
-            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
-        }
-
-        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
-        {
-            float4 sum = 0;
-#pragma unroll 4
-            for ( i =0; i<16; i++ )
-            {
-                sum += v[i] * tab[i];
-            }
-            dst[dst_offset+dy*dstStep+dx] = sum;
-
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/interpolate_frames.cl b/modules/ocl/src/opencl/interpolate_frames.cl
deleted file mode 100644
index eb0b55f33..000000000
--- a/modules/ocl/src/opencl/interpolate_frames.cl
+++ /dev/null
@@ -1,252 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-
-// Image read mode
-__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
-
-// atomic add for 32bit floating point
-inline void atomic_addf(volatile __global float *source, const float operand) {
-    union {
-        unsigned int intVal;
-        float floatVal;
-    } newVal;
-    union {
-        unsigned int intVal;
-        float floatVal;
-    } prevVal;
-    do {
-        prevVal.floatVal = *source;
-        newVal.floatVal = prevVal.floatVal + operand;
-    } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
-}
-
-__kernel void memsetKernel(
-    float val,
-    __global float * image,
-    int width,
-    int height,
-    int step, // in element
-    int offset
-    )
-{
-    if(get_global_id(0) >= width || get_global_id(1) >= height)
-    {
-        return;
-    }
-    image += offset;
-    image[get_global_id(0) + get_global_id(1) * step] = val;
-}
-
-__kernel void normalizeKernel(
-    __global float * buffer,
-    int width,
-    int height,
-    int step,
-    int f_offset,
-    int d_offset
-    )
-{
-    __global float * factors = buffer + f_offset;
-    __global float * dst     = buffer + d_offset;
-
-    int j = get_global_id(0);
-    int i = get_global_id(1);
-
-    if(j >= width || i >= height)
-    {
-        return;
-    }
-    float scale = factors[step * i + j];
-    float invScale = (scale == 0.0f) ? 1.0f : (1.0f / scale);
-
-    dst[step * i + j] *= invScale;
-}
-
-__kernel void forwardWarpKernel(
-    __global const float * src,
-    __global float * buffer,
-    __global const float * u,
-    __global const float * v,
-    const int w,
-    const int h,
-    const int flow_stride,
-    const int image_stride,
-    const int factor_offset,
-    const int dst_offset,
-    const float time_scale
-    )
-{
-    int j = get_global_id(0);
-    int i = get_global_id(1);
-
-    if (i >= h || j >= w) return;
-
-    volatile __global float * normalization_factor = (volatile __global float *) buffer + factor_offset;
-    volatile __global float * dst = (volatile __global float *)buffer + dst_offset;
-
-    int flow_row_offset  = i * flow_stride;
-    int image_row_offset = i * image_stride;
-
-    //bottom left corner of a target pixel
-    float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f;
-    float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f;
-    // pixel containing bottom left corner
-    float px;
-    float py;
-    float dx = modf(cx, &px);
-    float dy = modf(cy, &py);
-    // target pixel integer coords
-    int tx;
-    int ty;
-    tx = (int) px;
-    ty = (int) py;
-    float value = src[image_row_offset + j];
-    float weight;
-    // fill pixel containing bottom right corner
-    if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
-    {
-        weight = dx * dy;
-        atomic_addf(dst + ty * image_stride + tx, value * weight);
-        atomic_addf(normalization_factor + ty * image_stride + tx, weight);
-    }
-
-    // fill pixel containing bottom left corner
-    tx -= 1;
-    if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
-    {
-        weight = (1.0f - dx) * dy;
-        atomic_addf(dst + ty * image_stride + tx, value * weight);
-        atomic_addf(normalization_factor + ty * image_stride + tx, weight);
-    }
-
-    // fill pixel containing upper left corner
-    ty -= 1;
-    if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
-    {
-        weight = (1.0f - dx) * (1.0f - dy);
-        atomic_addf(dst + ty * image_stride + tx, value * weight);
-        atomic_addf(normalization_factor + ty * image_stride + tx, weight);
-    }
-
-    // fill pixel containing upper right corner
-    tx += 1;
-    if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
-    {
-        weight = dx * (1.0f - dy);
-        atomic_addf(dst + ty * image_stride + tx, value * weight);
-        atomic_addf(normalization_factor + ty * image_stride + tx, weight);
-    }
-}
-
-// define buffer offsets
-enum
-{
-    O0_OS = 0,
-    O1_OS,
-    U_OS,
-    V_OS,
-    UR_OS,
-    VR_OS
-};
-
-__kernel void blendFramesKernel(
-    image2d_t tex_src0,
-    image2d_t tex_src1,
-    __global float * buffer,
-    __global float * out,
-    int w,
-    int h,
-    int step,
-    float theta
-    )
-{
-    __global float * u  = buffer + h * step * U_OS;
-    __global float * v  = buffer + h * step * V_OS;
-    __global float * ur = buffer + h * step * UR_OS;
-    __global float * vr = buffer + h * step * VR_OS;
-    __global float * o0 = buffer + h * step * O0_OS;
-    __global float * o1 = buffer + h * step * O1_OS;
-
-    int ix = get_global_id(0);
-    int iy = get_global_id(1);
-
-    if(ix >= w || iy >= h) return;
-
-    int pos = ix + step * iy;
-
-    float _u  = u[pos];
-    float _v  = v[pos];
-
-    float _ur = ur[pos];
-    float _vr = vr[pos];
-
-    float x = (float)ix + 0.5f;
-    float y = (float)iy + 0.5f;
-    bool b0 = o0[pos] > 1e-4f;
-    bool b1 = o1[pos] > 1e-4f;
-
-    float2 coord0 = (float2)(x - _u * theta, y - _v * theta);
-    float2 coord1 = (float2)(x + _u * (1.0f - theta), y + _v * (1.0f - theta));
-
-    if (b0 && b1)
-    {
-        // pixel is visible on both frames
-        out[pos] = read_imagef(tex_src0, sampler, coord0).x * (1.0f - theta) +
-            read_imagef(tex_src1, sampler, coord1).x * theta;
-    }
-    else if (b0)
-    {
-        // visible on the first frame only
-        out[pos] = read_imagef(tex_src0, sampler, coord0).x;
-    }
-    else
-    {
-        // visible on the second frame only
-        out[pos] = read_imagef(tex_src1, sampler, coord1).x;
-    }
-}
diff --git a/modules/ocl/src/opencl/kernel_radix_sort_by_key.cl b/modules/ocl/src/opencl/kernel_radix_sort_by_key.cl
deleted file mode 100644
index 7e09f3fc5..000000000
--- a/modules/ocl/src/opencl/kernel_radix_sort_by_key.cl
+++ /dev/null
@@ -1,176 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
-
-#ifndef N   // number of radices
-#define N 4
-#endif
-
-#ifndef K_T
-#define K_T float
-#endif
-
-#ifndef V_T
-#define V_T float
-#endif
-
-#ifndef IS_GT
-#define IS_GT 0
-#endif
-
-
-// from Thrust::b40c, link:
-// https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h
-__inline uint convertKey(uint converted_key)
-{
-#ifdef K_FLT
-    unsigned int mask = (converted_key & 0x80000000) ? 0xffffffff : 0x80000000;
-    converted_key ^= mask;
-#elif defined(K_INT)
-    const uint SIGN_MASK = 1u << ((sizeof(int) * 8) - 1);
-    converted_key ^= SIGN_MASK;
-#else
-
-#endif
-    return converted_key;
-}
-
-//FIXME(pengx17):
-// exclusive scan, need to be optimized as this is too naive...
-kernel
-    void naiveScanAddition(
-    __global int * input,
-    __global int * output,
-    int size
-    )
-{
-    if(get_global_id(0) == 0)
-    {
-        output[0] = 0;
-        for(int i = 1; i < size; i ++)
-        {
-            output[i] = output[i - 1] + input[i - 1];
-        }
-    }
-}
-
-// following is ported from
-// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_uint_kernels.cl
-kernel
-    void histogramRadixN (
-    __global K_T* unsortedKeys,
-    __global int * buckets,
-    uint shiftCount
-    )
-{
-    const int RADIX_T     = N;
-    const int RADICES_T   = (1 << RADIX_T);
-    const int NUM_OF_ELEMENTS_PER_WORK_ITEM_T = RADICES_T;
-    const int MASK_T      = (1 << RADIX_T) - 1;
-    int localBuckets[16] = {0,0,0,0,0,0,0,0,
-                            0,0,0,0,0,0,0,0};
-    int globalId    = get_global_id(0);
-    int numOfGroups = get_num_groups(0);
-
-    /* Calculate thread-histograms */
-    for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
-    {
-        uint value = convertKey(as_uint(unsortedKeys[mad24(globalId, NUM_OF_ELEMENTS_PER_WORK_ITEM_T, i)]));
-        value = (value >> shiftCount) & MASK_T;
-#if IS_GT
-        localBuckets[RADICES_T - value - 1]++;
-#else
-        localBuckets[value]++;
-#endif
-    }
-
-    for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
-    {
-        buckets[mad24(i, RADICES_T * numOfGroups, globalId) ] = localBuckets[i];
-    }
-}
-
-kernel
-    void permuteRadixN (
-    __global K_T*  unsortedKeys,
-    __global V_T*  unsortedVals,
-    __global int* scanedBuckets,
-    uint shiftCount,
-    __global K_T*  sortedKeys,
-    __global V_T*  sortedVals
-    )
-{
-    const int RADIX_T     = N;
-    const int RADICES_T   = (1 << RADIX_T);
-    const int MASK_T = (1<<RADIX_T)  -1;
-
-    int globalId  = get_global_id(0);
-    int numOfGroups = get_num_groups(0);
-    const int NUM_OF_ELEMENTS_PER_WORK_GROUP_T = numOfGroups << N;
-    int  localIndex[16];
-
-    /*Load the index to local memory*/
-    for(int i = 0; i < RADICES_T; ++i)
-    {
-#if IS_GT
-        localIndex[i] = scanedBuckets[mad24(RADICES_T - i - 1, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
-#else
-        localIndex[i] = scanedBuckets[mad24(i, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
-#endif
-    }
-    /* Permute elements to appropriate location */
-    for(int i = 0; i < RADICES_T; ++i)
-    {
-        int old_idx = mad24(globalId, RADICES_T, i);
-        K_T  ovalue = unsortedKeys[old_idx];
-        uint value = convertKey(as_uint(ovalue));
-        uint maskedValue = (value >> shiftCount) & MASK_T;
-        uint index = localIndex[maskedValue];
-        sortedKeys[index] = ovalue;
-        sortedVals[index] = unsortedVals[old_idx];
-        localIndex[maskedValue] = index + 1;
-    }
-}
diff --git a/modules/ocl/src/opencl/kernel_sort_by_key.cl b/modules/ocl/src/opencl/kernel_sort_by_key.cl
deleted file mode 100644
index 0e8d581b7..000000000
--- a/modules/ocl/src/opencl/kernel_sort_by_key.cl
+++ /dev/null
@@ -1,244 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef K_T
-#define K_T float
-#endif
-
-#ifndef V_T
-#define V_T float
-#endif
-
-#ifndef IS_GT
-#define IS_GT false
-#endif
-
-#if IS_GT
-#define my_comp(x,y) ((x) > (y))
-#else
-#define my_comp(x,y) ((x) < (y))
-#endif
-
-/////////////////////// Bitonic sort ////////////////////////////
-// ported from
-// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_by_key_kernels.cl
-__kernel
-    void bitonicSort
-    (
-        __global K_T * keys,
-        __global V_T * vals,
-        int count,
-        int stage,
-        int passOfStage
-    )
-{
-    const int threadId = get_global_id(0);
-    if(threadId >= count / 2)
-    {
-        return;
-    }
-    const int pairDistance = 1 << (stage - passOfStage);
-    const int blockWidth   = 2 * pairDistance;
-
-    int leftId = min( (threadId % pairDistance)
-                   + (threadId / pairDistance) * blockWidth, count );
-
-    int rightId = min( leftId + pairDistance, count );
-
-    int temp;
-
-    const V_T lval = vals[leftId];
-    const V_T rval = vals[rightId];
-
-    const K_T lkey = keys[leftId];
-    const K_T rkey = keys[rightId];
-
-    int sameDirectionBlockWidth = 1 << stage;
-
-    if((threadId/sameDirectionBlockWidth) % 2 == 1)
-    {
-        temp = rightId;
-        rightId = leftId;
-        leftId = temp;
-    }
-
-    const bool compareResult = my_comp(lkey, rkey);
-
-    if(compareResult)
-    {
-        keys[rightId] = rkey;
-        keys[leftId]  = lkey;
-        vals[rightId] = rval;
-        vals[leftId]  = lval;
-    }
-    else
-    {
-        keys[rightId] = lkey;
-        keys[leftId]  = rkey;
-        vals[rightId] = lval;
-        vals[leftId]  = rval;
-    }
-}
-
-/////////////////////// Selection sort ////////////////////////////
-//kernel is ported from Bolt library:
-//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
-__kernel
-    void selectionSortLocal
-    (
-        __global K_T * keys,
-        __global V_T * vals,
-        const int count,
-        __local  K_T * scratch
-    )
-{
-    int          i  = get_local_id(0); // index in workgroup
-    int numOfGroups = get_num_groups(0); // index in workgroup
-    int groupID     = get_group_id(0);
-    int         wg  = get_local_size(0); // workgroup size = block size
-    int n; // number of elements to be processed for this work group
-
-    int offset   = groupID * wg;
-    int same     = 0;
-
-    vals      += offset;
-    keys      += offset;
-    n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
-
-    int clamped_i= min(i, n - 1);
-
-    K_T key1 = keys[clamped_i], key2;
-    V_T val1 = vals[clamped_i];
-    scratch[i] = key1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(i >= n)
-    {
-        return;
-    }
-
-    int pos = 0;
-    for (int j=0;j<n;++j)
-    {
-        key2  = scratch[j];
-        if(my_comp(key2, key1))
-            pos++;//calculate the rank of this element in this work group
-        else
-        {
-            if(my_comp(key1, key2))
-                continue;
-            else
-            {
-                // key1 and key2 are same
-                same++;
-            }
-        }
-    }
-    for (int j=0; j< same; j++)
-    {
-        vals[pos + j] = val1;
-        keys[pos + j] = key1;
-    }
-}
-__kernel
-    void selectionSortFinal
-    (
-        __global K_T * keys,
-        __global V_T * vals,
-        const int count
-    )
-{
-    const int          i  = get_local_id(0); // index in workgroup
-    const int numOfGroups = get_num_groups(0); // index in workgroup
-    const int         wg  = get_local_size(0); // workgroup size = block size
-    int pos = 0, same = 0;
-    const int offset = get_group_id(0) * wg;
-    const int remainder = count - wg*(numOfGroups-1);
-
-    if((offset + i ) >= count)
-        return;
-    V_T val1 = vals[offset + i];
-
-    K_T key1 = keys[offset + i];
-    K_T key2;
-
-    for(int j=0; j<numOfGroups-1; j++ )
-    {
-        for(int k=0; k<wg; k++)
-        {
-            key2 = keys[j*wg + k];
-            if(my_comp(key1, key2))
-                break;
-            else
-            {
-                //Increment only if the value is not the same.
-                if(my_comp(key2, key1))
-                    pos++;
-                else
-                    same++;
-            }
-        }
-    }
-
-    for(int k=0; k<remainder; k++)
-    {
-        key2 = keys[(numOfGroups-1)*wg + k];
-        if(my_comp(key1, key2))
-            break;
-        else
-        {
-            //Don't increment if the value is the same.
-            if(my_comp(key2, key1))
-                pos++;
-            else
-                same++;
-        }
-    }
-    for (int j=0; j< same; j++)
-    {
-        vals[pos + j] = val1;
-        keys[pos + j] = key1;
-    }
-}
diff --git a/modules/ocl/src/opencl/kernel_stablesort_by_key.cl b/modules/ocl/src/opencl/kernel_stablesort_by_key.cl
deleted file mode 100644
index c573e3ebb..000000000
--- a/modules/ocl/src/opencl/kernel_stablesort_by_key.cl
+++ /dev/null
@@ -1,264 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef K_T
-#define K_T float
-#endif
-
-#ifndef V_T
-#define V_T float
-#endif
-
-#ifndef IS_GT
-#define IS_GT false
-#endif
-
-#if IS_GT
-#define my_comp(x,y) ((x) > (y))
-#else
-#define my_comp(x,y) ((x) < (y))
-#endif
-
-//  This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
-//  by a base pointer and left and right index for a particular candidate value.  The comparison operator is
-//  passed as a functor parameter my_comp
-//  This function returns an index that is the first index whos value would be equal to the searched value
-inline uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
-{
-    //  The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
-    uint firstIndex = left;
-    uint lastIndex = right;
-
-    //  This loops through [firstIndex, lastIndex)
-    //  Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
-    //  this while loop will be divergent within a wavefront
-    while( firstIndex < lastIndex )
-    {
-        //  midIndex is the average of first and last, rounded down
-        uint midIndex = ( firstIndex + lastIndex ) / 2;
-        K_T midValue = data[ midIndex ];
-
-        //  This branch will create divergent wavefronts
-        if( my_comp( midValue, searchVal ) )
-        {
-            firstIndex = midIndex+1;
-            // printf( "lowerBound: lastIndex[ %i ]=%i\n", get_local_id( 0 ), lastIndex );
-        }
-        else
-        {
-            lastIndex = midIndex;
-            // printf( "lowerBound: firstIndex[ %i ]=%i\n", get_local_id( 0 ), firstIndex );
-        }
-    }
-
-    return firstIndex;
-}
-
-//  This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
-//  by a base pointer and left and right index for a particular candidate value.  The comparison operator is
-//  passed as a functor parameter my_comp
-//  This function returns an index that is the first index whos value would be greater than the searched value
-//  If the search value is not found in the sequence, upperbound returns the same result as lowerbound
-inline uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
-{
-    uint upperBound = lowerBoundBinary( data, left, right, searchVal );
-
-    // printf( "upperBoundBinary: upperBound[ %i, %i ]= %i\n", left, right, upperBound );
-    //  If upperBound == right, then  searchVal was not found in the sequence.  Just return.
-    if( upperBound != right )
-    {
-        //  While the values are equal i.e. !(x < y) && !(y < x) increment the index
-        K_T upperValue = data[ upperBound ];
-        while( !my_comp( upperValue, searchVal ) && !my_comp( searchVal, upperValue) && (upperBound != right) )
-        {
-            upperBound++;
-            upperValue = data[ upperBound ];
-        }
-    }
-
-    return upperBound;
-}
-
-//  This kernel implements merging of blocks of sorted data.  The input to this kernel most likely is
-//  the output of blockInsertionSortTemplate.  It is expected that the source array contains multiple
-//  blocks, each block is independently sorted.  The goal is to write into the output buffer half as
-//  many blocks, of double the size.  The even and odd blocks are stably merged together to form
-//  a new sorted block of twice the size.  The algorithm is out-of-place.
-kernel void merge(
-    global K_T*   iKey_ptr,
-    global V_T*   iValue_ptr,
-    global K_T*   oKey_ptr,
-    global V_T*   oValue_ptr,
-    const uint    srcVecSize,
-    const uint    srcLogicalBlockSize,
-    local K_T*    key_lds,
-    local V_T*    val_lds
-)
-{
-    size_t globalID     = get_global_id( 0 );
-
-    //  Abort threads that are passed the end of the input vector
-    if( globalID >= srcVecSize )
-        return; // on SI this doesn't mess-up barriers
-
-    //  For an element in sequence A, find the lowerbound index for it in sequence B
-    uint srcBlockNum   = globalID / srcLogicalBlockSize;
-    uint srcBlockIndex = globalID % srcLogicalBlockSize;
-
-    // printf( "mergeTemplate: srcBlockNum[%i]=%i\n", srcBlockNum, srcBlockIndex );
-
-    //  Pairs of even-odd blocks will be merged together
-    //  An even block should search for an insertion point in the next odd block,
-    //  and the odd block should look for an insertion point in the corresponding previous even block
-    uint dstLogicalBlockSize = srcLogicalBlockSize<<1;
-    uint leftBlockIndex = globalID & ~((dstLogicalBlockSize) - 1 );
-    leftBlockIndex += (srcBlockNum & 0x1) ? 0 : srcLogicalBlockSize;
-    leftBlockIndex = min( leftBlockIndex, srcVecSize );
-    uint rightBlockIndex = min( leftBlockIndex + srcLogicalBlockSize, srcVecSize );
-
-    // if( localID == 0 )
-    // {
-    // printf( "mergeTemplate: wavefront[ %i ] logicalBlock[ %i ] logicalIndex[ %i ] leftBlockIndex[ %i ] <=> rightBlockIndex[ %i ]\n", groupID, srcBlockNum, srcBlockIndex, leftBlockIndex, rightBlockIndex );
-    // }
-
-    //  For a particular element in the input array, find the lowerbound index for it in the search sequence given by leftBlockIndex & rightBlockIndex
-    // uint insertionIndex = lowerBoundLinear( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ], my_comp ) - leftBlockIndex;
-    uint insertionIndex = 0;
-    if( (srcBlockNum & 0x1) == 0 )
-    {
-        insertionIndex = lowerBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
-    }
-    else
-    {
-        insertionIndex = upperBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
-    }
-
-    //  The index of an element in the result sequence is the summation of it's indixes in the two input
-    //  sequences
-    uint dstBlockIndex = srcBlockIndex + insertionIndex;
-    uint dstBlockNum = srcBlockNum/2;
-
-    // if( (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex == 395 )
-    // {
-    // printf( "mergeTemplate: (dstBlockNum[ %i ] * dstLogicalBlockSize[ %i ]) + dstBlockIndex[ %i ] = srcBlockIndex[ %i ] + insertionIndex[ %i ]\n", dstBlockNum, dstLogicalBlockSize, dstBlockIndex, srcBlockIndex, insertionIndex );
-    // printf( "mergeTemplate: dstBlockIndex[ %i ] = iKey_ptr[ %i ] ( %i )\n", (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex, globalID, iKey_ptr[ globalID ] );
-    // }
-    oKey_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iKey_ptr[ globalID ];
-    oValue_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iValue_ptr[ globalID ];
-    // printf( "mergeTemplate: leftResultIndex[ %i ]=%i + %i\n", leftResultIndex, srcBlockIndex, leftInsertionIndex );
-}
-
-kernel void blockInsertionSort(
-    global K_T*   key_ptr,
-    global V_T*   value_ptr,
-    const uint    vecSize,
-    local K_T*    key_lds,
-    local V_T*    val_lds
-)
-{
-    int gloId    = get_global_id( 0 );
-    int groId    = get_group_id( 0 );
-    int locId    = get_local_id( 0 );
-    int wgSize   = get_local_size( 0 );
-
-    bool in_range = gloId < (int)vecSize;
-    K_T key;
-    V_T val;
-    //  Abort threads that are passed the end of the input vector
-    if (in_range)
-    {
-        //  Make a copy of the entire input array into fast local memory
-        key = key_ptr[ gloId ];
-        val = value_ptr[ gloId ];
-        key_lds[ locId ] = key;
-        val_lds[ locId ] = val;
-    }
-    barrier( CLK_LOCAL_MEM_FENCE );
-    //  Sorts a workgroup using a naive insertion sort
-    //  The sort uses one thread within a workgroup to sort the entire workgroup
-    if( locId == 0 && in_range )
-    {
-        //  The last workgroup may have an irregular size, so we calculate a per-block endIndex
-        //  endIndex is essentially emulating a mod operator with subtraction and multiply
-        int endIndex = vecSize - ( groId * wgSize );
-        endIndex = min( endIndex, wgSize );
-
-        // printf( "Debug: endIndex[%i]=%i\n", groId, endIndex );
-
-        //  Indices are signed because the while loop will generate a -1 index inside of the max function
-        for( int currIndex = 1; currIndex < endIndex; ++currIndex )
-        {
-            key = key_lds[ currIndex ];
-            val = val_lds[ currIndex ];
-            int scanIndex = currIndex;
-            K_T ldsKey = key_lds[scanIndex - 1];
-            while( scanIndex > 0 && my_comp( key, ldsKey ) )
-            {
-                V_T ldsVal = val_lds[scanIndex - 1];
-
-                //  If the keys are being swapped, make sure the values are swapped identicaly
-                key_lds[ scanIndex ] = ldsKey;
-                val_lds[ scanIndex ] = ldsVal;
-
-                scanIndex = scanIndex - 1;
-                ldsKey = key_lds[ max( 0, scanIndex - 1 ) ];  // scanIndex-1 may be -1
-            }
-            key_lds[ scanIndex ] = key;
-            val_lds[ scanIndex ] = val;
-        }
-    }
-    barrier( CLK_LOCAL_MEM_FENCE );
-
-    if(in_range)
-    {
-        key = key_lds[ locId ];
-        key_ptr[ gloId ] = key;
-
-        val = val_lds[ locId ];
-        value_ptr[ gloId ] = val;
-    }
-}
-
-///////////// Radix sort from b40c library /////////////
diff --git a/modules/ocl/src/opencl/kmeans_kernel.cl b/modules/ocl/src/opencl/kmeans_kernel.cl
deleted file mode 100644
index 244d52ca3..000000000
--- a/modules/ocl/src/opencl/kmeans_kernel.cl
+++ /dev/null
@@ -1,107 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Xiaopeng Fu, fuxiaopeng2222@163.com
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-static float distance_(__global const float * center, __global const float * src, int feature_length)
-{
-    float res = 0;
-    float4 v0, v1, v2;
-    int i = 0;
-
-#ifdef L1_DIST
-    float4 sum = (float4)(0.0f);
-#endif
-
-    for ( ; i <= feature_length - 4; i += 4)
-    {
-        v0 = vload4(0, center + i);
-        v1 = vload4(0, src + i);
-        v2 = v1 - v0;
-#ifdef L1_DIST
-        v0 = fabs(v2);
-        sum += v0;
-#else
-        res += dot(v2, v2);
-#endif
-    }
-
-#ifdef L1_DIST
-    res = sum.x + sum.y + sum.z + sum.w;
-#endif
-
-    for ( ; i < feature_length; ++i)
-    {
-        float t0 = src[i];
-        float t1 = center[i];
-#ifdef L1_DIST
-        res += fabs(t0 - t1);
-#else
-        float t2 = t0 - t1;
-        res += t2 * t2;
-#endif
-    }
-
-    return res;
-}
-
-__kernel void distanceToCenters(__global const float * src, __global const float * centers,
-                                __global float * dists, int feature_length,
-                                int src_step, int centers_step,
-                                int features_count, int centers_count,
-                                int src_offset, int centers_offset)
-{
-    int gid = get_global_id(0);
-
-    if (gid < (features_count * centers_count))
-    {
-        int feature_index = gid / centers_count;
-        int center_index = gid % centers_count;
-
-        int center_idx = mad24(center_index, centers_step, centers_offset);
-        int src_idx = mad24(feature_index, src_step, src_offset);
-
-        dists[gid] = distance_(centers + center_idx, src + src_idx, feature_length);
-    }
-}
diff --git a/modules/ocl/src/opencl/knearest.cl b/modules/ocl/src/opencl/knearest.cl
deleted file mode 100644
index 85e24517d..000000000
--- a/modules/ocl/src/opencl/knearest.cl
+++ /dev/null
@@ -1,186 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jin Ma, jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define TYPE double
-#else
-#define TYPE float
-#endif
-
-#define CV_SWAP(a,b,t) ((t) = (a), (a) = (b), (b) = (t))
-///////////////////////////////////// find_nearest //////////////////////////////////////
-__kernel void knn_find_nearest(__global float* sample, int sample_row, int sample_col, int sample_step,
-                               int k, __global float* samples_ocl, int sample_ocl_row, int sample_ocl_step,
-                               __global float* _results, int _results_step, int _regression, int K1,
-                               int sample_ocl_col, int nThreads, __local float* nr)
-{
-    int k1 = 0;
-    int k2 = 0;
-
-    bool regression = false;
-
-    if(_regression)
-        regression = true;
-
-    TYPE inv_scale;
-#ifdef DOUBLE_SUPPORT
-    inv_scale = 1.0/K1;
-#else
-    inv_scale = 1.0f/K1;
-#endif
-
-    int y = get_global_id(1);
-    int j, j1;
-    int threadY = (y % nThreads);
-    __local float* dd = nr + nThreads * k;
-    if(y >= sample_row)
-    {
-        return;
-    }
-    for(j = 0; j < sample_ocl_row; j++)
-    {
-        TYPE sum;
-#ifdef DOUBLE_SUPPORT
-        sum = 0.0;
-#else
-        sum = 0.0f;
-#endif
-        float si;
-        int t, ii, ii1;
-        for(t = 0; t < sample_col - 16; t += 16)
-        {
-            float16 t0 = vload16(0, sample + y * sample_step + t) - vload16(0, samples_ocl + j * sample_ocl_step + t);
-            t0 *= t0;
-            sum += t0.s0 + t0.s1 + t0.s2 + t0.s3 + t0.s4 + t0.s5 + t0.s6 + t0.s7 +
-                t0.s8 + t0.s9 + t0.sa + t0.sb + t0.sc + t0.sd + t0.se + t0.sf;
-        }
-
-        for(; t < sample_col; t++)
-        {
-#ifdef DOUBLE_SUPPORT
-            double t0 = sample[y * sample_step + t] - samples_ocl[j * sample_ocl_step + t];
-#else
-            float t0 = sample[y * sample_step + t] - samples_ocl[j * sample_ocl_step + t];
-#endif
-            sum = sum + t0 * t0;
-        }
-
-        si = (float)sum;
-        for(ii = k1 - 1; ii >= 0; ii--)
-        {
-            if(as_int(si) > as_int(dd[ii * nThreads + threadY]))
-                break;
-        }
-        if(ii < k - 1)
-        {
-            for(ii1 = k2 - 1; ii1 > ii; ii1--)
-            {
-                dd[(ii1 + 1) * nThreads + threadY] = dd[ii1 * nThreads + threadY];
-                nr[(ii1 + 1) * nThreads + threadY] = nr[ii1 * nThreads + threadY];
-            }
-
-            dd[(ii + 1) * nThreads + threadY] = si;
-            nr[(ii + 1) * nThreads + threadY] = samples_ocl[sample_col + j * sample_ocl_step];
-        }
-        k1 = (k1 + 1) < k ? (k1 + 1) : k;
-        k2 = k1 < (k - 1) ? k1 : (k - 1);
-    }
-    /*! find_nearest_neighbor done!*/
-    /*! write_results start!*/
-    if (regression)
-    {
-        TYPE s;
-#ifdef DOUBLE_SUPPORT
-        s = 0.0;
-#else
-        s = 0.0f;
-#endif
-        for(j = 0; j < K1; j++)
-            s += nr[j * nThreads + threadY];
-
-        _results[y * _results_step] = (float)(s * inv_scale);
-    }
-    else
-    {
-        int prev_start = 0, best_count = 0, cur_count;
-        float best_val;
-
-        for(j = K1 - 1; j > 0; j--)
-        {
-            bool swap_f1 = false;
-            for(j1 = 0; j1 < j; j1++)
-            {
-                if(nr[j1 * nThreads + threadY] > nr[(j1 + 1) * nThreads + threadY])
-                {
-                    int t;
-                    CV_SWAP(nr[j1 * nThreads + threadY], nr[(j1 + 1) * nThreads + threadY], t);
-                    swap_f1 = true;
-                }
-            }
-            if(!swap_f1)
-                break;
-        }
-
-        best_val = 0;
-        for(j = 1; j <= K1; j++)
-            if(j == K1 || nr[j * nThreads + threadY] != nr[(j - 1) * nThreads + threadY])
-            {
-                cur_count = j - prev_start;
-                if(best_count < cur_count)
-                {
-                    best_count = cur_count;
-                    best_val = nr[(j - 1) * nThreads + threadY];
-                }
-                prev_start = j;
-            }
-            _results[y * _results_step] = best_val;
-    }
-    ///*! write_results done!*/
-}
diff --git a/modules/ocl/src/opencl/match_template.cl b/modules/ocl/src/opencl/match_template.cl
deleted file mode 100644
index 4d46d0084..000000000
--- a/modules/ocl/src/opencl/match_template.cl
+++ /dev/null
@@ -1,853 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define TYPE_IMAGE_SQSUM double
-#else
-#define TYPE_IMAGE_SQSUM float
-#endif
-
-#ifndef CN4
-#define CN4 1
-#else
-#define CN4 4
-#endif
-
-//////////////////////////////////////////////////
-// utilities
-#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, (gidx + img_sqsums_offset + ox) * CN4)
-#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox)
-// normAcc* are accurate normalization routines which make GPU matchTemplate
-// consistent with CPU one
-inline float normAcc(float num, float denum)
-{
-    if(fabs(num) < denum)
-    {
-        return num / denum;
-    }
-    if(fabs(num) < denum * 1.125f)
-    {
-        return num > 0 ? 1 : -1;
-    }
-    return 0;
-}
-
-inline float normAcc_SQDIFF(float num, float denum)
-{
-    if(fabs(num) < denum)
-    {
-        return num / denum;
-    }
-    if(fabs(num) < denum * 1.125f)
-    {
-        return num > 0 ? 1 : -1;
-    }
-    return 1;
-}
-//////////////////////////////////////////////////////////////////////
-// normalize
-
-__kernel
-void normalizeKernel_C1_D0
-(
-    __global const float * img_sqsums,
-    __global float * res,
-    ulong tpl_sqsum,
-    int res_rows,
-    int res_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int img_sqsums_offset,
-    int img_sqsums_step,
-    int res_offset,
-    int res_step
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-    img_sqsums_step /= sizeof(*img_sqsums);
-    img_sqsums_offset /= sizeof(*img_sqsums);
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        float image_sqsum_ = (float)(
-                                 (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
-                                 (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
-        res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum));
-    }
-}
-
-__kernel
-void matchTemplate_Prepared_SQDIFF_C1_D0
-(
-    __global const TYPE_IMAGE_SQSUM * img_sqsums,
-    __global float * res,
-    ulong tpl_sqsum,
-    int res_rows,
-    int res_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int img_sqsums_offset,
-    int img_sqsums_step,
-    int res_offset,
-    int res_step
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-    img_sqsums_step /= sizeof(*img_sqsums);
-    img_sqsums_offset /= sizeof(*img_sqsums);
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        float image_sqsum_ = (float)(
-                                 (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
-                                 (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
-        res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum;
-    }
-}
-
-__kernel
-void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
-(
-    __global const float * img_sqsums,
-    __global float * res,
-    ulong tpl_sqsum,
-    int res_rows,
-    int res_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int img_sqsums_offset,
-    int img_sqsums_step,
-    int res_offset,
-    int res_step
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-    img_sqsums_step /= sizeof(*img_sqsums);
-    img_sqsums_offset /= sizeof(*img_sqsums);
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        float image_sqsum_ = (float)(
-                                 (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
-                                 (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
-        res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum,
-                                      sqrt(image_sqsum_ * tpl_sqsum));
-    }
-}
-
-//////////////////////////////////////////////////
-// SQDIFF
-__kernel
-void matchTemplate_Naive_SQDIFF_C1_D0
-(
-    __global const uchar * img,
-    __global const uchar * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    int i,j;
-    int delta;
-    int sum = 0;
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        for(i = 0; i < tpl_rows; i ++)
-        {
-            // get specific rows of img data
-            __global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
-            __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
-            for(j = 0; j < tpl_cols; j ++)
-            {
-                delta = img_ptr[j] - tpl_ptr[j];
-                sum   = mad24(delta, delta, sum);
-            }
-        }
-        res[res_idx] = sum;
-    }
-}
-
-__kernel
-void matchTemplate_Naive_SQDIFF_C1_D5
-(
-    __global const float * img,
-    __global const float * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    int i,j;
-    float delta;
-    float sum = 0;
-    img_step   /= sizeof(*img);
-    img_offset /= sizeof(*img);
-    tpl_step   /= sizeof(*tpl);
-    tpl_offset /= sizeof(*tpl);
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        for(i = 0; i < tpl_rows; i ++)
-        {
-            // get specific rows of img data
-            __global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
-            __global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
-            for(j = 0; j < tpl_cols; j ++)
-            {
-                delta = img_ptr[j] - tpl_ptr[j];
-                sum   = mad(delta, delta, sum);
-            }
-        }
-        res[res_idx] = sum;
-    }
-}
-
-__kernel
-void matchTemplate_Naive_SQDIFF_C4_D0
-(
-    __global const uchar4 * img,
-    __global const uchar4 * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    int i,j;
-    int4 delta;
-    int4 sum = (int4)(0, 0, 0, 0);
-    img_step   /= sizeof(*img);
-    img_offset /= sizeof(*img);
-    tpl_step   /= sizeof(*tpl);
-    tpl_offset /= sizeof(*tpl);
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        for(i = 0; i < tpl_rows; i ++)
-        {
-            // get specific rows of img data
-            __global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
-            __global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
-            for(j = 0; j < tpl_cols; j ++)
-            {
-                //delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect
-                delta.x = img_ptr[j].x - tpl_ptr[j].x;
-                delta.y = img_ptr[j].y - tpl_ptr[j].y;
-                delta.z = img_ptr[j].z - tpl_ptr[j].z;
-                delta.w = img_ptr[j].w - tpl_ptr[j].w;
-                sum   = mad24(delta, delta, sum);
-            }
-        }
-        res[res_idx] = sum.x + sum.y + sum.z + sum.w;
-    }
-}
-
-__kernel
-void matchTemplate_Naive_SQDIFF_C4_D5
-(
-    __global const float4 * img,
-    __global const float4 * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    int i,j;
-    float4 delta;
-    float4 sum = (float4)(0, 0, 0, 0);
-    img_step   /= sizeof(*img);
-    img_offset /= sizeof(*img);
-    tpl_step   /= sizeof(*tpl);
-    tpl_offset /= sizeof(*tpl);
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        for(i = 0; i < tpl_rows; i ++)
-        {
-            // get specific rows of img data
-            __global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
-            __global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
-            for(j = 0; j < tpl_cols; j ++)
-            {
-                //delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect
-                delta.x = img_ptr[j].x - tpl_ptr[j].x;
-                delta.y = img_ptr[j].y - tpl_ptr[j].y;
-                delta.z = img_ptr[j].z - tpl_ptr[j].z;
-                delta.w = img_ptr[j].w - tpl_ptr[j].w;
-                sum   = mad(delta, delta, sum);
-            }
-        }
-        res[res_idx] = sum.x + sum.y + sum.z + sum.w;
-    }
-}
-
-//////////////////////////////////////////////////
-// CCORR
-__kernel
-void matchTemplate_Naive_CCORR_C1_D0
-(
-    __global const uchar * img,
-    __global const uchar * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    int i,j;
-    int sum = 0;
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        for(i = 0; i < tpl_rows; i ++)
-        {
-            // get specific rows of img data
-            __global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
-            __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
-            for(j = 0; j < tpl_cols; j ++)
-            {
-                sum = mad24(convert_int(img_ptr[j]), convert_int(tpl_ptr[j]), sum);
-            }
-        }
-        res[res_idx] = (float)sum;
-    }
-}
-
-__kernel
-void matchTemplate_Naive_CCORR_C1_D5
-(
-    __global const float * img,
-    __global const float * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    int i,j;
-    float sum = 0;
-    img_step   /= sizeof(*img);
-    img_offset /= sizeof(*img);
-    tpl_step   /= sizeof(*tpl);
-    tpl_offset /= sizeof(*tpl);
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        for(i = 0; i < tpl_rows; i ++)
-        {
-            // get specific rows of img data
-            __global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
-            __global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
-            for(j = 0; j < tpl_cols; j ++)
-            {
-                sum = mad(img_ptr[j], tpl_ptr[j], sum);
-            }
-        }
-        res[res_idx] = sum;
-    }
-}
-
-__kernel
-void matchTemplate_Naive_CCORR_C4_D0
-(
-    __global const uchar4 * img,
-    __global const uchar4 * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    int i,j;
-    int4 sum = (int4)(0, 0, 0, 0);
-    img_step   /= sizeof(*img);
-    img_offset /= sizeof(*img);
-    tpl_step   /= sizeof(*tpl);
-    tpl_offset /= sizeof(*tpl);
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        for(i = 0; i < tpl_rows; i ++)
-        {
-            // get specific rows of img data
-            __global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
-            __global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
-            for(j = 0; j < tpl_cols; j ++)
-            {
-                sum   = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum);
-            }
-        }
-        res[res_idx] = (float)(sum.x + sum.y + sum.z + sum.w);
-    }
-}
-
-__kernel
-void matchTemplate_Naive_CCORR_C4_D5
-(
-    __global const float4 * img,
-    __global const float4 * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    int i,j;
-    float4 sum = (float4)(0, 0, 0, 0);
-    img_step   /= sizeof(*img);
-    img_offset /= sizeof(*img);
-    tpl_step   /= sizeof(*tpl);
-    tpl_offset /= sizeof(*tpl);
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        for(i = 0; i < tpl_rows; i ++)
-        {
-            // get specific rows of img data
-            __global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset);
-            __global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset);
-            for(j = 0; j < tpl_cols; j ++)
-            {
-                sum = mad(convert_float4(img_ptr[j]), convert_float4(tpl_ptr[j]), sum);
-            }
-        }
-        res[res_idx] = sum.x + sum.y + sum.z + sum.w;
-    }
-}
-
-//////////////////////////////////////////////////
-// CCOFF
-__kernel
-void matchTemplate_Prepared_CCOFF_C1_D0
-(
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int res_offset,
-    int res_step,
-    __global const uint * img_sums,
-    int img_sums_offset,
-    int img_sums_step,
-    float tpl_sum
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    img_sums_offset   /= sizeof(*img_sums);
-    img_sums_step     /= sizeof(*img_sums);
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        float sum = (float)((img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
-                            -(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
-        res[res_idx] -= sum * tpl_sum;
-    }
-}
-__kernel
-void matchTemplate_Prepared_CCOFF_C4_D0
-(
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int res_offset,
-    int res_step,
-    __global const uint * img_sums_c0,
-    __global const uint * img_sums_c1,
-    __global const uint * img_sums_c2,
-    __global const uint * img_sums_c3,
-    int img_sums_offset,
-    int img_sums_step,
-    float tpl_sum_c0,
-    float tpl_sum_c1,
-    float tpl_sum_c2,
-    float tpl_sum_c3
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    img_sums_offset   /= sizeof(*img_sums_c0);
-    img_sums_step     /= sizeof(*img_sums_c0);
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        float ccorr = res[res_idx];
-        ccorr -= tpl_sum_c0*(float)(
-                     (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
-                     - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
-        ccorr -= tpl_sum_c1*(float)(
-                     (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
-                     - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
-        ccorr -= tpl_sum_c2*(float)(
-                     (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
-                     - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
-        ccorr -= tpl_sum_c3*(float)(
-                     (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
-                     - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
-        res[res_idx] = ccorr;
-    }
-}
-
-__kernel
-void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
-(
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int res_offset,
-    int res_step,
-    float weight,
-    __global const uint * img_sums,
-    int img_sums_offset,
-    int img_sums_step,
-    __global const float * img_sqsums,
-    int img_sqsums_offset,
-    int img_sqsums_step,
-    float tpl_sum,
-    float tpl_sqsum
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    img_sqsums_step   /= sizeof(*img_sqsums);
-    img_sqsums_offset /= sizeof(*img_sqsums);
-    img_sums_offset   /= sizeof(*img_sums);
-    img_sums_step     /= sizeof(*img_sums);
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-
-
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        float image_sum_ =  (float)(
-                                (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
-                                - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
-
-        float image_sqsum_ = (float)(
-                                 (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
-                                 (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
-        res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum,
-                               sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_)));
-    }
-}
-__kernel
-void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
-(
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int res_offset,
-    int res_step,
-    float weight,
-    __global const uint * img_sums_c0,
-    __global const uint * img_sums_c1,
-    __global const uint * img_sums_c2,
-    __global const uint * img_sums_c3,
-    int img_sums_offset,
-    int img_sums_step,
-    __global const float * img_sqsums_c0,
-    __global const float * img_sqsums_c1,
-    __global const float * img_sqsums_c2,
-    __global const float * img_sqsums_c3,
-    int img_sqsums_offset,
-    int img_sqsums_step,
-    float tpl_sum_c0,
-    float tpl_sum_c1,
-    float tpl_sum_c2,
-    float tpl_sum_c3,
-    float tpl_sqsum
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    img_sqsums_step   /= sizeof(*img_sqsums_c0);
-    img_sqsums_offset /= sizeof(*img_sqsums_c0);
-    img_sums_offset   /= sizeof(*img_sums_c0);
-    img_sums_step     /= sizeof(*img_sums_c0);
-    res_step   /= sizeof(*res);
-    res_offset /= sizeof(*res);
-
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        float image_sum_c0 =  (float)(
-                                  (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
-                                  - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
-        float image_sum_c1 =  (float)(
-                                  (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
-                                  - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
-        float image_sum_c2 =  (float)(
-                                  (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
-                                  - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
-        float image_sum_c3 =  (float)(
-                                  (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
-                                  - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
-
-        float image_sqsum_c0 = (float)(
-                                   (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
-                                   (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
-        float image_sqsum_c1 = (float)(
-                                   (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
-                                   (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
-        float image_sqsum_c2 = (float)(
-                                   (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
-                                   (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
-        float image_sqsum_c3 = (float)(
-                                   (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
-                                   (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));
-
-        float num = res[res_idx] -
-                    image_sum_c0 * tpl_sum_c0 -
-                    image_sum_c1 * tpl_sum_c1 -
-                    image_sum_c2 * tpl_sum_c2 -
-                    image_sum_c3 * tpl_sum_c3;
-        float denum = sqrt( tpl_sqsum * (
-                                image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
-                                image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
-                                image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
-                                image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3)
-                          );
-        res[res_idx] = normAcc(num, denum);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// extractFirstChannel
-__kernel
-void extractFirstChannel
-(
-    const __global float4* img,
-    __global float* res,
-    int rows,
-    int cols,
-    int img_offset,
-    int res_offset,
-    int img_step,
-    int res_step
-)
-{
-    img_step   /= sizeof(float4);
-    res_step   /= sizeof(float);
-    img_offset /= sizeof(float4);
-    res_offset /= sizeof(float);
-    img += img_offset;
-    res += res_offset;
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-    if(gidx < cols && gidy < rows)
-    {
-        res[gidx + gidy * res_step] = img[gidx + gidy * img_step].x;
-    }
-}
diff --git a/modules/ocl/src/opencl/meanShift.cl b/modules/ocl/src/opencl/meanShift.cl
deleted file mode 100644
index ea5060e46..000000000
--- a/modules/ocl/src/opencl/meanShift.cl
+++ /dev/null
@@ -1,241 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Shengen Yan,yanshengen@gmail.com
-//    Xu Pang, pangxu010@163.com
-//    Wenju He, wenju@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-static short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
-               __global uchar4* in, int in_step, int dst_off, int src_off,
-               int cols, int rows, int sp, int sr, int maxIter, float eps)
-{
-    int isr2 = sr*sr;
-    in_step = in_step >> 2;
-    out_step = out_step >> 2;
-    src_off = src_off >> 2;
-    dst_off = dst_off >> 2;
-    int idx = src_off + y0 * in_step + x0;
-    uchar4 c = in[idx];
-    int base = dst_off + get_global_id(1)*out_step + get_global_id(0) ;
-
-    // iterate meanshift procedure
-    for( int iter = 0; iter < maxIter; iter++ )
-    {
-        int count = 0;
-        int4 s = (int4)0;
-        int sx = 0, sy = 0;
-
-        //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
-        //deal with the image boundary
-        int minx = (x0-sp)>0 ? x0-sp : 0;
-        int miny = (y0-sp)>0 ? y0-sp : 0;
-        int maxx = (x0+sp)<cols ? x0+sp : cols-1;
-        int maxy = (y0+sp)<rows ? y0+sp : rows-1;
-
-        for( int y = miny; y <= maxy; y++)
-        {
-            int rowCount = 0;
-            int x = minx;
-            for( ; x+3 <= maxx; x+=4 )
-            {
-                int id = src_off + y*in_step + x;
-                uchar16 t = (uchar16)(in[id],in[id+1],in[id+2],in[id+3]);
-                int norm2_1 = (t.s0 - c.x) * (t.s0 - c.x) + (t.s1 - c.y) * (t.s1 - c.y) +
-                              (t.s2 - c.z) * (t.s2 - c.z);
-                int norm2_2 = (t.s4 - c.x) * (t.s4 - c.x) + (t.s5 - c.y) * (t.s5 - c.y) +
-                              (t.s6 - c.z) * (t.s6 - c.z);
-                int norm2_3 = (t.s8 - c.x) * (t.s8 - c.x) + (t.s9 - c.y) * (t.s9 - c.y) +
-                              (t.sa - c.z) * (t.sa - c.z);
-                int norm2_4 = (t.sc - c.x) * (t.sc - c.x) + (t.sd - c.y) * (t.sd - c.y) +
-                              (t.se - c.z) * (t.se - c.z);
-                if( norm2_1 <= isr2 )
-                {
-                    s.x += t.s0; s.y += t.s1; s.z += t.s2;
-                    sx += x; rowCount++;
-                }
-                if( norm2_2 <= isr2 )
-                {
-                    s.x += t.s4; s.y += t.s5; s.z += t.s6;
-                    sx += x+1; rowCount++;
-                }
-                if( norm2_3 <= isr2 )
-                {
-                    s.x += t.s8; s.y += t.s9; s.z += t.sa;
-                    sx += x+2; rowCount++;
-                }
-                if( norm2_4 <= isr2 )
-                {
-                    s.x += t.sc; s.y += t.sd; s.z += t.se;
-                    sx += x+3; rowCount++;
-                }
-            }
-            if(x == maxx)
-            {
-                int id = src_off + y*in_step + x;
-                uchar4 t = in[id];
-                int norm2 = (t.s0 - c.x) * (t.s0 - c.x) + (t.s1 - c.y) * (t.s1 - c.y) +
-                            (t.s2 - c.z) * (t.s2 - c.z);
-                if( norm2 <= isr2 )
-                {
-                    s.x += t.s0; s.y += t.s1; s.z += t.s2;
-                    sx += x; rowCount++;
-                }
-
-            }
-            if(x+1 == maxx)
-            {
-                  int id = src_off + y*in_step + x;
-                  uchar8 t = (uchar8)(in[id],in[id+1]);
-                  int norm2_1 = (t.s0 - c.x) * (t.s0 - c.x) + (t.s1 - c.y) * (t.s1 - c.y) +
-                                (t.s2 - c.z) * (t.s2 - c.z);
-                  int norm2_2 = (t.s4 - c.x) * (t.s4 - c.x) + (t.s5 - c.y) * (t.s5 - c.y) +
-                                (t.s6 - c.z) * (t.s6 - c.z);
-                  if( norm2_1 <= isr2 )
-                  {
-                      s.x += t.s0; s.y += t.s1; s.z += t.s2;
-                      sx += x; rowCount++;
-                  }
-                  if( norm2_2 <= isr2 )
-                  {
-                      s.x += t.s4; s.y += t.s5; s.z += t.s6;
-                      sx += x+1; rowCount++;
-                  }
-            }
-            if(x+2 == maxx)
-            {
-                  int id = src_off + y*in_step + x;
-                  uchar16 t = (uchar16)(in[id],in[id+1],in[id+2],in[id+3]);
-                  int norm2_1 = (t.s0 - c.x) * (t.s0 - c.x) + (t.s1 - c.y) * (t.s1 - c.y) +
-                                (t.s2 - c.z) * (t.s2 - c.z);
-                  int norm2_2 = (t.s4 - c.x) * (t.s4 - c.x) + (t.s5 - c.y) * (t.s5 - c.y) +
-                                (t.s6 - c.z) * (t.s6 - c.z);
-                  int norm2_3 = (t.s8 - c.x) * (t.s8 - c.x) + (t.s9 - c.y) * (t.s9 - c.y) +
-                                (t.sa - c.z) * (t.sa - c.z);
-                  if( norm2_1 <= isr2 )
-                  {
-                      s.x += t.s0; s.y += t.s1; s.z += t.s2;
-                      sx += x; rowCount++;
-                  }
-                  if( norm2_2 <= isr2 )
-                  {
-                      s.x += t.s4; s.y += t.s5; s.z += t.s6;
-                      sx += x+1; rowCount++;
-                  }
-                  if( norm2_3 <= isr2 )
-                  {
-                      s.x += t.s8; s.y += t.s9; s.z += t.sa;
-                      sx += x+2; rowCount++;
-                  }
-            }
-            if(rowCount == 0)
-               continue;
-            count += rowCount;
-            if(y == 0)
-               continue;
-            sy += y*rowCount;
-        }
-
-        if( count == 0 )
-            break;
-
-        int x1 = sx/count;
-        int y1 = sy/count;
-        s.x = s.x/count;
-        s.y = s.y/count;
-        s.z = s.z/count;
-
-        int4 tmp = s - convert_int4(c);
-        int norm2 = tmp.x * tmp.x + tmp.y *  tmp.y +
-                    tmp.z * tmp.z;
-
-        bool stopFlag = (x1 == x0 && y1 == y0) || (abs(x1-x0) + abs(y1-y0) + norm2 <= eps);
-
-        x0 = x1;
-        y0 = y1;
-        c.x = s.x;
-        c.y = s.y;
-        c.z = s.z;
-
-        if( stopFlag )
-            break;
-    }
-
-    out[base] = c;
-
-    return (short2)((short)x0, (short)y0);
-}
-
-
-__kernel void meanshift_kernel(__global uchar4* out, int out_step,
-                               __global uchar4* in, int in_step,
-                        int dst_off, int src_off, int cols, int rows,
-                        int sp, int sr, int maxIter, float eps)
-{
-    int x0 = get_global_id(0);
-    int y0 = get_global_id(1);
-    if( x0 < cols && y0 < rows )
-        do_mean_shift(x0, y0, out, out_step, in, in_step, dst_off, src_off,
-                          cols, rows, sp, sr, maxIter, eps);
-}
-
-__kernel void meanshiftproc_kernel( __global uchar4* in, __global uchar4* outr,
-                             __global short2* outsp, int instep, int outrstep,
-                             int outspstep, int in_off, int outr_off, int outsp_off,
-                             int cols, int rows, int sp, int sr, int maxIter, float eps )
-{
-    int x0 = get_global_id(0);
-    int y0 = get_global_id(1);
-
-    if( x0 < cols && y0 < rows )
-    {
-        //int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
-        //*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
-        // we have ensured before that ((outspstep & 0x11)==0).
-        outsp_off >>= 2;
-        outspstep >>= 2;
-        int basesp = outsp_off + y0 * outspstep + x0;
-        outsp[basesp] = do_mean_shift(x0, y0, outr, outrstep, in, instep, outr_off, in_off, cols, rows, sp, sr, maxIter, eps);
-//        outsp[basesp] =(short2)((short)x0,(short)y0);
-    }
-}
diff --git a/modules/ocl/src/opencl/merge_mat.cl b/modules/ocl/src/opencl/merge_mat.cl
deleted file mode 100644
index aea05aeb8..000000000
--- a/modules/ocl/src/opencl/merge_mat.cl
+++ /dev/null
@@ -1,1378 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////optimized code using vector roi//////////////////////////
-////////////vector fuction name format: merge_vector_C(channels number)D_(data type depth)//////
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void merge_vector_C2_D0(__global uchar *mat_dst,  int dst_step,  int dst_offset,
-                                 __global uchar *mat_src0, int src0_step, int src0_offset,
-                                 __global uchar *mat_src1, int src1_step, int src1_offset,
-                                 int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        x = x << 1;
-
-        #define dst_align  ((dst_offset & 3) >> 1)
-        int src0_index = mad24(y, src0_step, src0_offset + x - dst_align);
-        int src1_index = mad24(y, src1_step, src1_offset + x - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        __global uchar4 * dst  = (__global uchar4 *)(mat_dst + dst_index);
-        __global uchar  * src0 = mat_src0 + src0_index;
-        __global uchar  * src1 = src0     + 1;
-        __global uchar  * src2 = mat_src1 + src1_index;
-        __global uchar  * src3 = src2     + 1;
-
-        uchar4 dst_data = *dst;
-        uchar  data_0   = *(src0);
-        uchar  data_1   = *(src1);
-        uchar  data_2   = *(src2);
-        uchar  data_3   = *(src3);
-
-        uchar4 tmp_data = (uchar4)(data_0, data_2, data_1, data_3);
-
-        tmp_data.xy = dst_index + 0 >= dst_start ? tmp_data.xy : dst_data.xy;
-        tmp_data.zw = dst_index + 2 <  dst_end   ? tmp_data.zw : dst_data.zw;
-
-        *dst = tmp_data;
-    }
-}
-__kernel void merge_vector_C2_D1(__global char *mat_dst,  int dst_step,  int dst_offset,
-                                 __global char *mat_src0, int src0_step, int src0_offset,
-                                 __global char *mat_src1, int src1_step, int src1_offset,
-                                 int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        x = x << 1;
-
-        #define dst_align  ((dst_offset & 3) >> 1)
-        int src0_index = mad24(y, src0_step, src0_offset + x - dst_align);
-        int src1_index = mad24(y, src1_step, src1_offset + x - dst_align);
-
-        int dst_start  = mad24(y, dst_step, dst_offset);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
-
-        __global char4 * dst  = (__global char4 *)(mat_dst + dst_index);
-        __global char  * src0 = mat_src0 + src0_index;
-        __global char  * src1 = src0     + 1;
-        __global char  * src2 = mat_src1 + src1_index;
-        __global char  * src3 = src2     + 1;
-
-        char4 dst_data = *dst;
-        char  data_0   = *(src0);
-        char  data_1   = *(src1);
-        char  data_2   = *(src2);
-        char  data_3   = *(src3);
-
-        char4 tmp_data = (char4)(data_0, data_2, data_1, data_3);
-
-        tmp_data.xy = dst_index + 0 >= dst_start ? tmp_data.xy : dst_data.xy;
-        tmp_data.zw = dst_index + 2 <  dst_end   ? tmp_data.zw : dst_data.zw;
-
-        *dst = tmp_data;
-    }
-}
-__kernel void merge_vector_C2_D2(__global ushort *mat_dst,  int dst_step,  int dst_offset,
-                                 __global ushort *mat_src0, int src0_step, int src0_offset,
-                                 __global ushort *mat_src1, int src1_step, int src1_offset,
-                                 int rows, int cols, int dst_step1)
-
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        __global ushort*  src0 = (__global ushort * )((__global uchar *)mat_src0 + src0_index + (x << 1));
-        __global ushort*  src1 = (__global ushort * )((__global uchar *)mat_src1 + src1_index + (x << 1));
-        __global ushort2* dist = (__global ushort2 *)((__global uchar *)mat_dst  + dst_index  + (x << 2));
-
-        ushort  src0_data = *src0;
-        ushort  src1_data = *src1;
-
-        *dist = (ushort2)(src0_data, src1_data);
-
-    }
-}
-__kernel void merge_vector_C2_D3(__global short *mat_dst,  int dst_step,  int dst_offset,
-                                 __global short *mat_src0, int src0_step, int src0_offset,
-                                 __global short *mat_src1, int src1_step, int src1_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        __global short*  src0 = (__global short * )((__global uchar *)mat_src0 + src0_index + (x << 1));
-        __global short*  src1 = (__global short * )((__global uchar *)mat_src1 + src1_index + (x << 1));
-        __global short2* dist = (__global short2 *)((__global uchar *)mat_dst  + dst_index   + (x << 2));
-
-        short  src0_data = *src0;
-        short  src1_data = *src1;
-
-        *dist = (short2)(src0_data, src1_data);
-    }
-}
-
-__kernel void merge_vector_C2_D4(__global int *mat_dst,  int dst_step,  int dst_offset,
-                                 __global int *mat_src0, int src0_step, int src0_offset,
-                                 __global int *mat_src1, int src1_step, int src1_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        int src0 = *((__global int *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
-        int src1 = *((__global int *)((__global uchar *)mat_src1 + src1_index + (x << 2)));
-
-        *((__global int2 *)((__global uchar *)mat_dst  + dst_index + (x << 3))) = (int2)(src0, src1);
-    }
-}
-__kernel void merge_vector_C2_D5(__global float *mat_dst,  int dst_step,  int dst_offset,
-                                 __global float *mat_src0, int src0_step, int src0_offset,
-                                 __global float *mat_src1, int src1_step, int src1_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        float src0 = *((__global float *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
-        float src1 = *((__global float *)((__global uchar *)mat_src1 + src1_index + (x << 2)));
-
-        *((__global float2 *)((__global uchar *)mat_dst  + dst_index + (x << 3))) = (float2)(src0, src1);
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void merge_vector_C2_D6(__global double *mat_dst,  int dst_step,  int dst_offset,
-                                 __global double *mat_src0, int src0_step, int src0_offset,
-                                 __global double *mat_src1, int src1_step, int src1_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        double src0 = *((__global double *)((__global uchar *)mat_src0 + src0_index + (x << 3)));
-        double src1 = *((__global double *)((__global uchar *)mat_src1 + src1_index + (x << 3)));
-
-        *((__global double2 *)((__global uchar *)mat_dst  + dst_index + (x << 4))) = (double2)(src0, src1);
-    }
-}
-#endif
-
-__kernel void merge_vector_C3_D0(__global uchar *mat_dst,  int dst_step,  int dst_offset,
-                                 __global uchar *mat_src0, int src0_step, int src0_offset,
-                                 __global uchar *mat_src1, int src1_step, int src1_offset,
-                                 __global uchar *mat_src2, int src2_step, int src2_offset, int offset_cols,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        x = x << 2;
-
-        int src0_index = mad24(y, src0_step, x + src0_offset - offset_cols);
-        int src1_index = mad24(y, src1_step, x + src1_offset - offset_cols);
-        int src2_index = mad24(y, src2_step, x + src2_offset - offset_cols);
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + 3 * x - offset_cols * 3);
-
-        uchar data0_0 = *(mat_src0 + src0_index + 0);
-        uchar data0_1 = *(mat_src0 + src0_index + 1);
-        uchar data0_2 = *(mat_src0 + src0_index + 2);
-        uchar data0_3 = *(mat_src0 + src0_index + 3);
-
-        uchar data1_0 = *(mat_src1 + src1_index + 0);
-        uchar data1_1 = *(mat_src1 + src1_index + 1);
-        uchar data1_2 = *(mat_src1 + src1_index + 2);
-        uchar data1_3 = *(mat_src1 + src1_index + 3);
-
-        uchar data2_0 = *(mat_src2 + src2_index + 0);
-        uchar data2_1 = *(mat_src2 + src2_index + 1);
-        uchar data2_2 = *(mat_src2 + src2_index + 2);
-        uchar data2_3 = *(mat_src2 + src2_index + 3);
-
-        uchar4 tmp_data0 = (uchar4)(data0_0, data1_0, data2_0, data0_1);
-        uchar4 tmp_data1 = (uchar4)(data1_1, data2_1, data0_2, data1_2);
-        uchar4 tmp_data2 = (uchar4)(data2_2, data0_3, data1_3, data2_3);
-
-        uchar4 dst_data0 = *((__global uchar4*)(mat_dst + dst_index + 0));
-        uchar4 dst_data1 = *((__global uchar4*)(mat_dst + dst_index + 4));
-        uchar4 dst_data2 = *((__global uchar4*)(mat_dst + dst_index + 8));
-
-        tmp_data0.x = ((dst_index + 0  >= dst_start) && (dst_index + 0  < dst_end)) ? tmp_data0.x : dst_data0.x;
-        tmp_data0.y = ((dst_index + 1  >= dst_start) && (dst_index + 1  < dst_end)) ? tmp_data0.y : dst_data0.y;
-        tmp_data0.z = ((dst_index + 2  >= dst_start) && (dst_index + 2  < dst_end)) ? tmp_data0.z : dst_data0.z;
-        tmp_data0.w = ((dst_index + 3  >= dst_start) && (dst_index + 3  < dst_end)) ? tmp_data0.w : dst_data0.w;
-
-        tmp_data1.x = ((dst_index + 4  >= dst_start) && (dst_index + 4  < dst_end)) ? tmp_data1.x : dst_data1.x;
-        tmp_data1.y = ((dst_index + 5  >= dst_start) && (dst_index + 5  < dst_end)) ? tmp_data1.y : dst_data1.y;
-        tmp_data1.z = ((dst_index + 6  >= dst_start) && (dst_index + 6  < dst_end)) ? tmp_data1.z : dst_data1.z;
-        tmp_data1.w = ((dst_index + 7  >= dst_start) && (dst_index + 7  < dst_end)) ? tmp_data1.w : dst_data1.w;
-
-        tmp_data2.x = ((dst_index + 8  >= dst_start) && (dst_index + 8  < dst_end)) ? tmp_data2.x : dst_data2.x;
-        tmp_data2.y = ((dst_index + 9  >= dst_start) && (dst_index + 9  < dst_end)) ? tmp_data2.y : dst_data2.y;
-        tmp_data2.z = ((dst_index + 10 >= dst_start) && (dst_index + 10 < dst_end)) ? tmp_data2.z : dst_data2.z;
-        tmp_data2.w = ((dst_index + 11 >= dst_start) && (dst_index + 11 < dst_end)) ? tmp_data2.w : dst_data2.w;
-
-        *((__global uchar4*)(mat_dst + dst_index + 0)) = tmp_data0;
-        *((__global uchar4*)(mat_dst + dst_index + 4)) = tmp_data1;
-        *((__global uchar4*)(mat_dst + dst_index + 8)) = tmp_data2;
-    }
-}
-__kernel void merge_vector_C3_D1(__global char *mat_dst,  int dst_step,  int dst_offset,
-                                 __global char *mat_src0, int src0_step, int src0_offset,
-                                 __global char *mat_src1, int src1_step, int src1_offset,
-                                 __global char *mat_src2, int src2_step, int src2_offset, int offset_cols,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        x = x << 2;
-
-        int src0_index = mad24(y, src0_step, x + src0_offset - offset_cols);
-        int src1_index = mad24(y, src1_step, x + src1_offset - offset_cols);
-        int src2_index = mad24(y, src2_step, x + src2_offset - offset_cols);
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + 3 * x - offset_cols * 3);
-
-        char data0_0 = *(mat_src0 + src0_index + 0);
-        char data0_1 = *(mat_src0 + src0_index + 1);
-        char data0_2 = *(mat_src0 + src0_index + 2);
-        char data0_3 = *(mat_src0 + src0_index + 3);
-
-        char data1_0 = *(mat_src1 + src1_index + 0);
-        char data1_1 = *(mat_src1 + src1_index + 1);
-        char data1_2 = *(mat_src1 + src1_index + 2);
-        char data1_3 = *(mat_src1 + src1_index + 3);
-
-        char data2_0 = *(mat_src2 + src2_index + 0);
-        char data2_1 = *(mat_src2 + src2_index + 1);
-        char data2_2 = *(mat_src2 + src2_index + 2);
-        char data2_3 = *(mat_src2 + src2_index + 3);
-
-        char4 tmp_data0 = (char4)(data0_0, data1_0, data2_0, data0_1);
-        char4 tmp_data1 = (char4)(data1_1, data2_1, data0_2, data1_2);
-        char4 tmp_data2 = (char4)(data2_2, data0_3, data1_3, data2_3);
-
-        char4 dst_data0 = *((__global char4*)(mat_dst + dst_index + 0));
-        char4 dst_data1 = *((__global char4*)(mat_dst + dst_index + 4));
-        char4 dst_data2 = *((__global char4*)(mat_dst + dst_index + 8));
-
-        tmp_data0.x = ((dst_index + 0  >= dst_start) && (dst_index + 0  < dst_end)) ? tmp_data0.x : dst_data0.x;
-        tmp_data0.y = ((dst_index + 1  >= dst_start) && (dst_index + 1  < dst_end)) ? tmp_data0.y : dst_data0.y;
-        tmp_data0.z = ((dst_index + 2  >= dst_start) && (dst_index + 2  < dst_end)) ? tmp_data0.z : dst_data0.z;
-        tmp_data0.w = ((dst_index + 3  >= dst_start) && (dst_index + 3  < dst_end)) ? tmp_data0.w : dst_data0.w;
-
-        tmp_data1.x = ((dst_index + 4  >= dst_start) && (dst_index + 4  < dst_end)) ? tmp_data1.x : dst_data1.x;
-        tmp_data1.y = ((dst_index + 5  >= dst_start) && (dst_index + 5  < dst_end)) ? tmp_data1.y : dst_data1.y;
-        tmp_data1.z = ((dst_index + 6  >= dst_start) && (dst_index + 6  < dst_end)) ? tmp_data1.z : dst_data1.z;
-        tmp_data1.w = ((dst_index + 7  >= dst_start) && (dst_index + 7  < dst_end)) ? tmp_data1.w : dst_data1.w;
-
-        tmp_data2.x = ((dst_index + 8  >= dst_start) && (dst_index + 8  < dst_end)) ? tmp_data2.x : dst_data2.x;
-        tmp_data2.y = ((dst_index + 9  >= dst_start) && (dst_index + 9  < dst_end)) ? tmp_data2.y : dst_data2.y;
-        tmp_data2.z = ((dst_index + 10 >= dst_start) && (dst_index + 10 < dst_end)) ? tmp_data2.z : dst_data2.z;
-        tmp_data2.w = ((dst_index + 11 >= dst_start) && (dst_index + 11 < dst_end)) ? tmp_data2.w : dst_data2.w;
-
-        *((__global char4*)(mat_dst + dst_index + 0)) = tmp_data0;
-        *((__global char4*)(mat_dst + dst_index + 4)) = tmp_data1;
-        *((__global char4*)(mat_dst + dst_index + 8)) = tmp_data2;
-    }
-}
-__kernel void merge_vector_C3_D2(__global ushort *mat_dst,  int dst_step,  int dst_offset,
-                                 __global ushort *mat_src0, int src0_step, int src0_offset,
-                                 __global ushort *mat_src1, int src1_step, int src1_offset,
-                                 __global ushort *mat_src2, int src2_step, int src2_offset, int offset_cols,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        x = x << 1;
-
-        int src0_index = mad24(y, src0_step, (x << 1) + src0_offset - offset_cols);
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - offset_cols);
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - offset_cols);
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + 6 * x - offset_cols * 6);
-
-        ushort data0_0 = *((__global ushort *)((__global char *)mat_src0 + src0_index + 0));
-        ushort data0_1 = *((__global ushort *)((__global char *)mat_src0 + src0_index + 2));
-
-        ushort data1_0 = *((__global ushort *)((__global char *)mat_src1 + src1_index + 0));
-        ushort data1_1 = *((__global ushort *)((__global char *)mat_src1 + src1_index + 2));
-
-        ushort data2_0 = *((__global ushort *)((__global char *)mat_src2 + src2_index + 0));
-        ushort data2_1 = *((__global ushort *)((__global char *)mat_src2 + src2_index + 2));
-
-        ushort2 tmp_data0 = (ushort2)(data0_0, data1_0);
-        ushort2 tmp_data1 = (ushort2)(data2_0, data0_1);
-        ushort2 tmp_data2 = (ushort2)(data1_1, data2_1);
-
-        ushort2 dst_data0 = *((__global ushort2*)((__global char *)mat_dst + dst_index + 0));
-        ushort2 dst_data1 = *((__global ushort2*)((__global char *)mat_dst + dst_index + 4));
-        ushort2 dst_data2 = *((__global ushort2*)((__global char *)mat_dst + dst_index + 8));
-
-        tmp_data0.x = ((dst_index + 0  >= dst_start) && (dst_index + 0  < dst_end)) ? tmp_data0.x : dst_data0.x;
-        tmp_data0.y = ((dst_index + 2  >= dst_start) && (dst_index + 2  < dst_end)) ? tmp_data0.y : dst_data0.y;
-
-        tmp_data1.x = ((dst_index + 4  >= dst_start) && (dst_index + 4  < dst_end)) ? tmp_data1.x : dst_data1.x;
-        tmp_data1.y = ((dst_index + 6  >= dst_start) && (dst_index + 6  < dst_end)) ? tmp_data1.y : dst_data1.y;
-
-        tmp_data2.x = ((dst_index + 8  >= dst_start) && (dst_index + 8  < dst_end)) ? tmp_data2.x : dst_data2.x;
-        tmp_data2.y = ((dst_index + 10 >= dst_start) && (dst_index + 10 < dst_end)) ? tmp_data2.y : dst_data2.y;
-
-        *((__global ushort2*)((__global char *)mat_dst + dst_index + 0)) = tmp_data0;
-        *((__global ushort2*)((__global char *)mat_dst + dst_index + 4)) = tmp_data1;
-        *((__global ushort2*)((__global char *)mat_dst + dst_index + 8)) = tmp_data2;
-    }
-}
-__kernel void merge_vector_C3_D3(__global short *mat_dst,  int dst_step,  int dst_offset,
-                                 __global short *mat_src0, int src0_step, int src0_offset,
-                                 __global short *mat_src1, int src1_step, int src1_offset,
-                                 __global short *mat_src2, int src2_step, int src2_offset, int offset_cols,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        x = x << 1;
-
-        int src0_index = mad24(y, src0_step, (x << 1) + src0_offset - offset_cols);
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - offset_cols);
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - offset_cols);
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + 6 * x - offset_cols * 6);
-
-        short data0_0 = *((__global short *)((__global char *)mat_src0 + src0_index + 0));
-        short data0_1 = *((__global short *)((__global char *)mat_src0 + src0_index + 2));
-
-        short data1_0 = *((__global short *)((__global char *)mat_src1 + src1_index + 0));
-        short data1_1 = *((__global short *)((__global char *)mat_src1 + src1_index + 2));
-
-        short data2_0 = *((__global short *)((__global char *)mat_src2 + src2_index + 0));
-        short data2_1 = *((__global short *)((__global char *)mat_src2 + src2_index + 2));
-
-        short2 tmp_data0 = (short2)(data0_0, data1_0);
-        short2 tmp_data1 = (short2)(data2_0, data0_1);
-        short2 tmp_data2 = (short2)(data1_1, data2_1);
-
-        short2 dst_data0 = *((__global short2*)((__global char *)mat_dst + dst_index + 0));
-        short2 dst_data1 = *((__global short2*)((__global char *)mat_dst + dst_index + 4));
-        short2 dst_data2 = *((__global short2*)((__global char *)mat_dst + dst_index + 8));
-
-        tmp_data0.x = ((dst_index + 0  >= dst_start) && (dst_index + 0  < dst_end)) ? tmp_data0.x : dst_data0.x;
-        tmp_data0.y = ((dst_index + 2  >= dst_start) && (dst_index + 2  < dst_end)) ? tmp_data0.y : dst_data0.y;
-
-        tmp_data1.x = ((dst_index + 4  >= dst_start) && (dst_index + 4  < dst_end)) ? tmp_data1.x : dst_data1.x;
-        tmp_data1.y = ((dst_index + 6  >= dst_start) && (dst_index + 6  < dst_end)) ? tmp_data1.y : dst_data1.y;
-
-        tmp_data2.x = ((dst_index + 8  >= dst_start) && (dst_index + 8  < dst_end)) ? tmp_data2.x : dst_data2.x;
-        tmp_data2.y = ((dst_index + 10 >= dst_start) && (dst_index + 10 < dst_end)) ? tmp_data2.y : dst_data2.y;
-
-        *((__global short2*)((__global char *)mat_dst + dst_index + 0)) = tmp_data0;
-        *((__global short2*)((__global char *)mat_dst + dst_index + 4)) = tmp_data1;
-        *((__global short2*)((__global char *)mat_dst + dst_index + 8)) = tmp_data2;
-    }
-}
-__kernel void merge_vector_C3_D4(__global int *mat_dst,  int dst_step,  int dst_offset,
-                                 __global int *mat_src0, int src0_step, int src0_offset,
-                                 __global int *mat_src1, int src1_step, int src1_offset,
-                                 __global int *mat_src2, int src2_step, int src2_offset, int offset_cols,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int src2_index = mad24(y, src2_step, src2_offset);
-
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        __global int* src0 = (__global int * )((__global uchar *)mat_src0 + src0_index + (x << 2));
-        __global int* src1 = (__global int * )((__global uchar *)mat_src1 + src1_index + (x << 2));
-        __global int* src2 = (__global int * )((__global uchar *)mat_src2 + src2_index + (x << 2));
-
-        __global int* dist0 = (__global int *)((__global uchar *)mat_dst  + dst_index  + 3 * (x << 2));
-        __global int* dist1 = dist0 + 1;
-        __global int* dist2 = dist0 + 2;
-
-        int  src0_data = *src0;
-        int  src1_data = *src1;
-        int  src2_data = *src2;
-
-        *dist0 = src0_data;
-        *dist1 = src1_data;
-        *dist2 = src2_data;
-    }
-}
-__kernel void merge_vector_C3_D5(__global float *mat_dst,  int dst_step,  int dst_offset,
-                                 __global float *mat_src0, int src0_step, int src0_offset,
-                                 __global float *mat_src1, int src1_step, int src1_offset,
-                                 __global float *mat_src2, int src2_step, int src2_offset, int offset_cols,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int src2_index = mad24(y, src2_step, src2_offset);
-
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        __global float* src0 = (__global float * )((__global uchar *)mat_src0 + src0_index + (x << 2));
-        __global float* src1 = (__global float * )((__global uchar *)mat_src1 + src1_index + (x << 2));
-        __global float* src2 = (__global float * )((__global uchar *)mat_src2 + src2_index + (x << 2));
-
-        __global float* dist0 = (__global float *)((__global uchar *)mat_dst  + dst_index  + 3 * (x << 2));
-        __global float* dist1 = dist0 + 1;
-        __global float* dist2 = dist0 + 2;
-
-        float  src0_data = *src0;
-        float  src1_data = *src1;
-        float  src2_data = *src2;
-
-        *dist0 = src0_data;
-        *dist1 = src1_data;
-        *dist2 = src2_data;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void merge_vector_C3_D6(__global double *mat_dst,  int dst_step,  int dst_offset,
-                                 __global double *mat_src0, int src0_step, int src0_offset,
-                                 __global double *mat_src1, int src1_step, int src1_offset,
-                                 __global double *mat_src2, int src2_step, int src2_offset, int offset_cols,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int src2_index = mad24(y, src2_step, src2_offset);
-
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        __global double* src0 = (__global double * )((__global uchar *)mat_src0 + src0_index + (x << 3));
-        __global double* src1 = (__global double * )((__global uchar *)mat_src1 + src1_index + (x << 3));
-        __global double* src2 = (__global double * )((__global uchar *)mat_src2 + src2_index + (x << 3));
-
-        __global double* dist0 = (__global double *)((__global uchar *)mat_dst  + dst_index  + 3 * (x << 3));
-        __global double* dist1 = dist0 + 1;
-        __global double* dist2 = dist0 + 2;
-
-        double  src0_data = *src0;
-        double  src1_data = *src1;
-        double  src2_data = *src2;
-
-        *dist0 = src0_data;
-        *dist1 = src1_data;
-        *dist2 = src2_data;
-    }
-}
-#endif
-__kernel void merge_vector_C4_D0(__global uchar *mat_dst,  int dst_step,  int dst_offset,
-                                 __global uchar *mat_src0, int src0_step, int src0_offset,
-                                 __global uchar *mat_src1, int src1_step, int src1_offset,
-                                 __global uchar *mat_src2, int src2_step, int src2_offset,
-                                 __global uchar *mat_src3, int src3_step, int src3_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int src2_index = mad24(y, src2_step, src2_offset);
-        int src3_index = mad24(y, src3_step, src3_offset);
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        uchar src0 = *(mat_src0 + src0_index + x );
-        uchar src1 = *(mat_src1 + src1_index + x);
-        uchar src2 = *(mat_src2 + src2_index + x);
-        uchar src3 = *(mat_src3 + src3_index + x);
-
-        *((__global uchar4 *)(mat_dst  + dst_index + (x << 2))) = (uchar4)(src0, src1, src2, src3);
-    }
-}
-__kernel void merge_vector_C4_D1(__global char *mat_dst,  int dst_step,  int dst_offset,
-                                 __global char *mat_src0, int src0_step, int src0_offset,
-                                 __global char *mat_src1, int src1_step, int src1_offset,
-                                 __global char *mat_src2, int src2_step, int src2_offset,
-                                 __global char *mat_src3, int src3_step, int src3_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int src2_index = mad24(y, src2_step, src2_offset);
-        int src3_index = mad24(y, src3_step, src3_offset);
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        char src0 = *(mat_src0 + src0_index + x );
-        char src1 = *(mat_src1 + src1_index + x);
-        char src2 = *(mat_src2 + src2_index + x);
-        char src3 = *(mat_src3 + src3_index + x);
-
-        *((__global char4 *)(mat_dst  + dst_index + (x << 2))) = (char4)(src0, src1, src2, src3);
-    }
-}
-__kernel void merge_vector_C4_D2(__global ushort *mat_dst,  int dst_step,  int dst_offset,
-                                 __global ushort *mat_src0, int src0_step, int src0_offset,
-                                 __global ushort *mat_src1, int src1_step, int src1_offset,
-                                 __global ushort *mat_src2, int src2_step, int src2_offset,
-                                 __global ushort *mat_src3, int src3_step, int src3_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int src2_index = mad24(y, src2_step, src2_offset);
-        int src3_index = mad24(y, src3_step, src3_offset);
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        ushort src0 = *((__global ushort *)((__global uchar *)mat_src0 + src0_index + (x << 1)));
-        ushort src1 = *((__global ushort *)((__global uchar *)mat_src1 + src1_index + (x << 1)));
-        ushort src2 = *((__global ushort *)((__global uchar *)mat_src2 + src2_index + (x << 1)));
-        ushort src3 = *((__global ushort *)((__global uchar *)mat_src3 + src3_index + (x << 1)));
-
-        *((__global ushort4 *)((__global uchar *)mat_dst  + dst_index + (x << 3))) = (ushort4)(src0, src1, src2, src3);
-    }
-}
-__kernel void merge_vector_C4_D3(__global short *mat_dst,  int dst_step,  int dst_offset,
-                                 __global short *mat_src0, int src0_step, int src0_offset,
-                                 __global short *mat_src1, int src1_step, int src1_offset,
-                                 __global short *mat_src2, int src2_step, int src2_offset,
-                                 __global short *mat_src3, int src3_step, int src3_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int src2_index = mad24(y, src2_step, src2_offset);
-        int src3_index = mad24(y, src3_step, src3_offset);
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        short src0 = *((__global short *)((__global uchar *)mat_src0 + src0_index + (x << 1)));
-        short src1 = *((__global short *)((__global uchar *)mat_src1 + src1_index + (x << 1)));
-        short src2 = *((__global short *)((__global uchar *)mat_src2 + src2_index + (x << 1)));
-        short src3 = *((__global short *)((__global uchar *)mat_src3 + src3_index + (x << 1)));
-
-        *((__global short4 *)((__global uchar *)mat_dst  + dst_index + (x << 3))) = (short4)(src0, src1, src2, src3);
-    }
-}
-__kernel void merge_vector_C4_D4(__global int *mat_dst,  int dst_step,  int dst_offset,
-                                 __global int *mat_src0, int src0_step, int src0_offset,
-                                 __global int *mat_src1, int src1_step, int src1_offset,
-                                 __global int *mat_src2, int src2_step, int src2_offset,
-                                 __global int *mat_src3, int src3_step, int src3_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int src2_index = mad24(y, src2_step, src2_offset);
-        int src3_index = mad24(y, src3_step, src3_offset);
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        int src0 = *((__global int *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
-        int src1 = *((__global int *)((__global uchar *)mat_src1 + src1_index + (x << 2)));
-        int src2 = *((__global int *)((__global uchar *)mat_src2 + src2_index + (x << 2)));
-        int src3 = *((__global int *)((__global uchar *)mat_src3 + src3_index + (x << 2)));
-
-        *((__global int4 *)((__global uchar *)mat_dst  + dst_index + (x << 4))) = (int4)(src0, src1, src2, src3);
-    }
-}
-__kernel void merge_vector_C4_D5(__global float *mat_dst,  int dst_step,  int dst_offset,
-                                 __global float *mat_src0, int src0_step, int src0_offset,
-                                 __global float *mat_src1, int src1_step, int src1_offset,
-                                 __global float *mat_src2, int src2_step, int src2_offset,
-                                 __global float *mat_src3, int src3_step, int src3_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int src2_index = mad24(y, src2_step, src2_offset);
-        int src3_index = mad24(y, src3_step, src3_offset);
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        float src0 = *((__global float *)((__global uchar *)mat_src0 + src0_index + (x << 2)));
-        float src1 = *((__global float *)((__global uchar *)mat_src1 + src1_index + (x << 2)));
-        float src2 = *((__global float *)((__global uchar *)mat_src2 + src2_index + (x << 2)));
-        float src3 = *((__global float *)((__global uchar *)mat_src3 + src3_index + (x << 2)));
-
-        *((__global float4 *)((__global uchar *)mat_dst  + dst_index + (x << 4))) = (float4)(src0, src1, src2, src3);
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void merge_vector_C4_D6(__global double *mat_dst,  int dst_step,  int dst_offset,
-                                 __global double *mat_src0, int src0_step, int src0_offset,
-                                 __global double *mat_src1, int src1_step, int src1_offset,
-                                 __global double *mat_src2, int src2_step, int src2_offset,
-                                 __global double *mat_src3, int src3_step, int src3_offset,
-                                 int rows, int cols, int dst_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        int src0_index = mad24(y, src0_step, src0_offset);
-        int src1_index = mad24(y, src1_step, src1_offset);
-        int src2_index = mad24(y, src2_step, src2_offset);
-        int src3_index = mad24(y, src3_step, src3_offset);
-        int dst_index  = mad24(y, dst_step , dst_offset);
-
-        double src0 = *((__global double *)((__global uchar *)mat_src0 + src0_index + (x << 3)));
-        double src1 = *((__global double *)((__global uchar *)mat_src1 + src1_index + (x << 3)));
-        double src2 = *((__global double *)((__global uchar *)mat_src2 + src2_index + (x << 3)));
-        double src3 = *((__global double *)((__global uchar *)mat_src3 + src3_index + (x << 3)));
-
-        *((__global double4 *)((__global uchar *)mat_dst  + dst_index + (x << 5))) = (double4)(src0, src1, src2, src3);
-    }
-}
-#endif
-///////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////optimized code using vector  no roi//////////////////////////
-////////////vector fuction name format: merge_vector_C(channels number)D_(data type depth)//////
-////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void merge_vector_C2_D0_1(int rows, int cols,
-                                   __global uchar *mat_dst,  int dst_step,
-                                   __global uchar *mat_src0, int src0_step,
-                                   __global uchar *mat_src1, int src1_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global uchar4  *src0_y = (__global uchar4 * )(mat_src0 + y * src0_step);
-        __global uchar4  *src1_y = (__global uchar4 * )(mat_src1 + y * src1_step);
-        __global uchar8 *dst_y  = (__global uchar8 *)(mat_dst  + y * dst_step);
-
-        uchar4 value1 = src0_y[x];
-        uchar4 value2 = src1_y[x];
-
-        uchar8 value;
-        value.even = value1;
-        value.odd = value2;
-
-        dst_y[x] = value;
-    }
-}
-__kernel void merge_vector_C2_D1_1(int rows, int cols,
-                                   __global char *mat_dst,  int dst_step,
-                                   __global char *mat_src0, int src0_step,
-                                   __global char *mat_src1, int src1_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global char4  *src0_y = (__global char4 * )(mat_src0 + y * src0_step);
-        __global char4  *src1_y = (__global char4 * )(mat_src1 + y * src1_step);
-        __global char8 *dst_y  = (__global char8 *)(mat_dst  + y * dst_step);
-
-        char4 value1 = src0_y[x];
-        char4 value2 = src1_y[x];
-
-        char8 value;
-        value.even = value1;
-        value.odd = value2;
-
-        dst_y[x] = value;
-    }
-}
-__kernel void merge_vector_C2_D2_1(int rows, int cols,
-                                   __global ushort *mat_dst,  int dst_step,
-                                   __global ushort *mat_src0, int src0_step,
-                                   __global ushort *mat_src1, int src1_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global ushort2  *src0_y = (__global ushort2 *)((__global uchar *)mat_src0 + y * src0_step);
-        __global ushort2  *src1_y = (__global ushort2 *)((__global uchar *)mat_src1 + y * src1_step);
-        __global ushort4  *dst_y  = (__global ushort4 *)((__global uchar *)mat_dst  + y * dst_step);
-
-        ushort2 value1 = src0_y[x];
-        ushort2 value2 = src1_y[x];
-
-        ushort4 value;
-        value.even = value1;
-        value.odd = value2;
-
-        dst_y[x] = value;
-    }
-}
-__kernel void merge_vector_C2_D3_1(int rows, int cols,
-                                   __global short *mat_dst,  int dst_step,
-                                   __global short *mat_src0, int src0_step,
-                                   __global short *mat_src1, int src1_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global short2  *src0_y = (__global short2 *)((__global uchar *)mat_src0 + y * src0_step);
-        __global short2  *src1_y = (__global short2 *)((__global uchar *)mat_src1 + y * src1_step);
-        __global short4 *dst_y   = (__global short4 *)((__global uchar *)mat_dst  + y * dst_step);
-
-        short2 value1 = src0_y[x];
-        short2 value2 = src1_y[x];
-
-        short4 value;
-        value.even = value1;
-        value.odd = value2;
-
-        dst_y[x] = value;
-    }
-}
-
-__kernel void merge_vector_C2_D4_1(int rows, int cols,
-                                   __global int *mat_dst,  int dst_step,
-                                   __global int *mat_src0, int src0_step,
-                                   __global int *mat_src1, int src1_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global int  *src0_y = (__global int *)((__global uchar *)mat_src0 + y * src0_step);
-        __global int  *src1_y = (__global int *)((__global uchar *)mat_src1 + y * src1_step);
-        __global int2  *dst_y  = (__global int2 *)((__global uchar *)mat_dst  + y * dst_step);
-
-        int value1 = src0_y[x];
-        int value2 = src1_y[x];
-
-        int2 value;
-        value.even = value1;
-        value.odd = value2;
-
-        dst_y[x] = value;
-    }
-}
-__kernel void merge_vector_C2_D5_1(int rows, int cols,
-                                   __global float *mat_dst,  int dst_step,
-                                   __global float *mat_src0, int src0_step,
-                                   __global float *mat_src1, int src1_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global float  *src0_y = (__global float *)((__global uchar *)mat_src0 + y * src0_step);
-        __global float  *src1_y = (__global float *)((__global uchar *)mat_src1 + y * src1_step);
-        __global float2  *dst_y  = (__global float2 *)((__global uchar *)mat_dst  + y * dst_step);
-
-        float value1 = src0_y[x];
-        float value2 = src1_y[x];
-
-        dst_y[x] = (float2)(value1, value2);
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void merge_vector_C2_D6_1(int rows, int cols,
-                                   __global double *mat_dst,  int dst_step,
-                                   __global double *mat_src0, int src0_step,
-                                   __global double *mat_src1, int src1_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global double  *src0_y = (__global double *)((__global uchar *)mat_src0 + y * src0_step);
-        __global double  *src1_y = (__global double *)((__global uchar *)mat_src1 + y * src1_step);
-        __global double2 *dst_y  = (__global double2 *)((__global uchar *)mat_dst  + y * dst_step);
-
-        double value1 = src0_y[x];
-        double value2 = src1_y[x];
-
-        dst_y[x] = (double2)(value1, value2);
-    }
-}
-#endif
-
-__kernel void merge_vector_C3_D0_1(int rows, int cols,
-                                   __global uchar *mat_dst,  int dst_step,
-                                   __global uchar *mat_src0, int src0_step,
-                                   __global uchar *mat_src1, int src1_step,
-                                   __global uchar *mat_src2, int src2_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global uchar4  *src0_y = (__global uchar4 * )(mat_src0 + y * src0_step);
-        __global uchar4  *src1_y = (__global uchar4 * )(mat_src1 + y * src1_step);
-        __global uchar4  *src2_y = (__global uchar4 * )(mat_src2 + y * src0_step);
-
-        __global uchar4 *dst_y  = (__global uchar4 *)(mat_dst  + y * dst_step);
-
-        uchar4 value0 = src0_y[x];
-        uchar4 value1 = src1_y[x];
-        uchar4 value2 = src2_y[x];
-
-        dst_y[3 * x + 0] = (uchar4)(value0.s0, value1.s0, value2.s0,
-                                    value0.s1);
-
-        dst_y[3 * x + 1] = (uchar4)(value1.s1, value2.s1,
-                                    value0.s2, value1.s2);
-
-        dst_y[3 * x + 2] = (uchar4)(value2.s2,
-                                    value0.s3, value1.s3, value2.s3);
-
-    }
-}
-__kernel void merge_vector_C3_D1_1(int rows, int cols,
-                                   __global char *mat_dst,  int dst_step,
-                                   __global char *mat_src0, int src0_step,
-                                   __global char *mat_src1, int src1_step,
-                                   __global char *mat_src2, int src2_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global char4  *src0_y = (__global char4 * )(mat_src0 + y * src0_step);
-        __global char4  *src1_y = (__global char4 * )(mat_src1 + y * src1_step);
-        __global char4  *src2_y = (__global char4 * )(mat_src2 + y * src0_step);
-
-        __global char4 *dst_y  = (__global char4 *)(mat_dst  + y * dst_step);
-
-        char4 value0 = src0_y[x];
-        char4 value1 = src1_y[x];
-        char4 value2 = src2_y[x];
-
-        dst_y[3 * x + 0] = (char4)(value0.s0, value1.s0, value2.s0,
-                                   value0.s1);
-
-        dst_y[3 * x + 1] = (char4)(value1.s1, value2.s1,
-                                     value0.s2, value1.s2);
-
-        dst_y[3 * x + 2] = (char4)(value2.s2,
-                                     value0.s3, value1.s3, value2.s3);
-
-        /* for test do not delete
-        dst_y[3 * x + 0] = (char8)(value0.s0, value1.s0, value2.s0,
-                                    value0.s1, value1.s1, value2.s1,
-                                    value0.s2, value1.s2);
-
-        dst_y[3 * x + 1] = (char8)(value2.s2,
-                                    value0.s3, value1.s3, value2.s3,
-                                    value0.s4, value1.s4, value2.s4,
-                                    value0.s5);
-
-        dst_y[3 * x + 2] = (char8)(value1.s5, value2.s5,
-                                    value0.s6, value1.s6, value2.s6,
-                                    value0.s7, value1.s7, value2.s7);
-                                    */
-    }
-}
-__kernel void merge_vector_C3_D2_1(int rows, int cols,
-                                   __global ushort *mat_dst,  int dst_step,
-                                   __global ushort *mat_src0, int src0_step,
-                                   __global ushort *mat_src1, int src1_step,
-                                   __global ushort *mat_src2, int src2_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global ushort2  *src0_y = (__global ushort2 * )((__global char *)mat_src0 + y * src0_step);
-        __global ushort2  *src1_y = (__global ushort2 * )((__global char *)mat_src1 + y * src1_step);
-        __global ushort2  *src2_y = (__global ushort2 * )((__global char *)mat_src2 + y * src0_step);
-
-        __global ushort2 *dst_y  = (__global ushort2 *)((__global char *)mat_dst  + y * dst_step);
-
-        ushort2 value0 = src0_y[x];
-        ushort2 value1 = src1_y[x];
-        ushort2 value2 = src2_y[x];
-
-        dst_y[3 * x + 0] = (ushort2)(value0.x, value1.x);
-        dst_y[3 * x + 1] = (ushort2)(value2.x, value0.y);
-        dst_y[3 * x + 2] = (ushort2)(value1.y, value2.y);
-
-    }
-}
-__kernel void merge_vector_C3_D3_1(int rows, int cols,
-                                   __global short *mat_dst,  int dst_step,
-                                   __global short *mat_src0, int src0_step,
-                                   __global short *mat_src1, int src1_step,
-                                   __global short *mat_src2, int src2_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global short2  *src0_y = (__global short2 * )((__global char *)mat_src0 + y * src0_step);
-        __global short2  *src1_y = (__global short2 * )((__global char *)mat_src1 + y * src1_step);
-        __global short2  *src2_y = (__global short2 * )((__global char *)mat_src2 + y * src0_step);
-
-        __global short2 *dst_y  = (__global short2 *)((__global char *)mat_dst  + y * dst_step);
-
-        short2 value0 = src0_y[x];
-        short2 value1 = src1_y[x];
-        short2 value2 = src2_y[x];
-
-        dst_y[3 * x + 0] = (short2)(value0.x, value1.x);
-        dst_y[3 * x + 1] = (short2)(value2.x, value0.y);
-        dst_y[3 * x + 2] = (short2)(value1.y, value2.y);
-
-        /*
-        dst_y[3 * x + 0] = (short4)(value0.s0, value1.s0, value2.s0,
-                                    value0.s1);
-
-        dst_y[3 * x + 1] = (short4)(value1.s1, value2.s1,
-                                    value0.s2, value1.s2);
-
-        dst_y[3 * x + 2] = (short4)(value2.s2,
-                                    value0.s3, value1.s3, value2.s3);
-                                    */
-    }
-}
-__kernel void merge_vector_C3_D4_1(int rows, int cols,
-                                   __global int *mat_dst,  int dst_step,
-                                   __global int *mat_src0, int src0_step,
-                                   __global int *mat_src1, int src1_step,
-                                   __global int *mat_src2, int src2_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global int  *src0_y = (__global int * )((__global char *)mat_src0 + y * src0_step);
-        __global int  *src1_y = (__global int * )((__global char *)mat_src1 + y * src1_step);
-        __global int  *src2_y = (__global int * )((__global char *)mat_src2 + y * src0_step);
-
-        __global int *dst_y  = (__global int *)((__global char *)mat_dst  + y * dst_step);
-
-        int value0 = src0_y[x];
-        int value1 = src1_y[x];
-        int value2 = src2_y[x];
-
-        dst_y[3 * x + 0] = value0;
-        dst_y[3 * x + 1] = value1;
-        dst_y[3 * x + 2] = value2;
-
-        /*for test do not delete
-        dst_y[3 * x + 0] = (int2)(value0.x, value1.x);
-        dst_y[3 * x + 1] = (int2)(value2.x, value0.y);
-        dst_y[3 * x + 2] = (int2)(value1.y, value2.y);
-        */
-    }
-}
-__kernel void merge_vector_C3_D5_1(int rows, int cols,
-                                   __global float *mat_dst,  int dst_step,
-                                   __global float *mat_src0, int src0_step,
-                                   __global float *mat_src1, int src1_step,
-                                   __global float *mat_src2, int src2_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global float  *src0_y = (__global float * )((__global char *)mat_src0 + y * src0_step);
-        __global float  *src1_y = (__global float * )((__global char *)mat_src1 + y * src1_step);
-        __global float  *src2_y = (__global float * )((__global char *)mat_src2 + y * src0_step);
-
-        __global float *dst_y  = (__global float *)((__global char *)mat_dst  + y * dst_step);
-
-        float value0 = src0_y[x];
-        float value1 = src1_y[x];
-        float value2 = src2_y[x];
-
-        dst_y[3 * x + 0] = value0;
-        dst_y[3 * x + 1] = value1;
-        dst_y[3 * x + 2] = value2;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void merge_vector_C3_D6_1(int rows, int cols,
-                                   __global double *mat_dst,  int dst_step,
-                                   __global double *mat_src0, int src0_step,
-                                   __global double *mat_src1, int src1_step,
-                                   __global double *mat_src2, int src2_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global double  *src0_y = (__global double * )((__global char *)mat_src0 + y * src0_step);
-        __global double  *src1_y = (__global double * )((__global char *)mat_src1 + y * src1_step);
-        __global double  *src2_y = (__global double * )((__global char *)mat_src2 + y * src0_step);
-
-        __global double *dst_y  = (__global double *)((__global char *)mat_dst  + y * dst_step);
-
-        double value0 = src0_y[x];
-        double value1 = src1_y[x];
-        double value2 = src2_y[x];
-
-        dst_y[3 * x + 0] = value0;
-        dst_y[3 * x + 1] = value1;
-        dst_y[3 * x + 2] = value2;
-    }
-}
-#endif
-__kernel void merge_vector_C4_D0_1(int rows, int cols,
-                                   __global uchar *mat_dst,  int dst_step,
-                                   __global uchar *mat_src0, int src0_step,
-                                   __global uchar *mat_src1, int src1_step,
-                                   __global uchar *mat_src2, int src2_step,
-                                   __global uchar *mat_src3, int src3_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global uchar4  *src0_y = (__global uchar4 * )(mat_src0 + y * src0_step);
-        __global uchar4  *src1_y = (__global uchar4 * )(mat_src1 + y * src1_step);
-        __global uchar4  *src2_y = (__global uchar4 * )(mat_src2 + y * src0_step);
-        __global uchar4  *src3_y = (__global uchar4 * )(mat_src3 + y * src1_step);
-
-        __global uchar16 *dst_y  = (__global uchar16 *)(mat_dst  + y * dst_step);
-
-        uchar4 value0 = src0_y[x];
-        uchar4 value1 = src1_y[x];
-        uchar4 value2 = src2_y[x];
-        uchar4 value3 = src3_y[x];
-
-        dst_y[x] = (uchar16)(value0.x, value1.x, value2.x, value3.x,
-                             value0.y, value1.y, value2.y, value3.y,
-                             value0.z, value1.z, value2.z, value3.z,
-                             value0.w, value1.w, value2.w, value3.w);
-    }
-}
-
-__kernel void merge_vector_C4_D1_1(int rows, int cols,
-                                   __global char *mat_dst,  int dst_step,
-                                   __global char *mat_src0, int src0_step,
-                                   __global char *mat_src1, int src1_step,
-                                   __global char *mat_src2, int src2_step,
-                                   __global char *mat_src3, int src3_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global char4  *src0_y = (__global char4 * )(mat_src0 + y * src0_step);
-        __global char4  *src1_y = (__global char4 * )(mat_src1 + y * src1_step);
-        __global char4  *src2_y = (__global char4 * )(mat_src2 + y * src0_step);
-        __global char4  *src3_y = (__global char4 * )(mat_src3 + y * src1_step);
-
-        __global char16 *dst_y  = (__global char16 *)(mat_dst  + y * dst_step);
-
-        char4 value0 = src0_y[x];
-        char4 value1 = src1_y[x];
-        char4 value2 = src2_y[x];
-        char4 value3 = src3_y[x];
-
-        dst_y[x] = (char16)(value0.x, value1.x, value2.x, value3.x,
-                            value0.y, value1.y, value2.y, value3.y,
-                            value0.z, value1.z, value2.z, value3.z,
-                            value0.w, value1.w, value2.w, value3.w);
-    }
-}
-__kernel void merge_vector_C4_D2_1(int rows, int cols,
-                                   __global ushort *mat_dst,  int dst_step,
-                                   __global ushort *mat_src0, int src0_step,
-                                   __global ushort *mat_src1, int src1_step,
-                                   __global ushort *mat_src2, int src2_step,
-                                   __global ushort *mat_src3, int src3_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global ushort2  *src0_y = (__global ushort2 * )((__global uchar*)mat_src0 + y * src0_step);
-        __global ushort2  *src1_y = (__global ushort2 * )((__global uchar*)mat_src1 + y * src1_step);
-        __global ushort2  *src2_y = (__global ushort2 * )((__global uchar*)mat_src2 + y * src0_step);
-        __global ushort2  *src3_y = (__global ushort2 * )((__global uchar*)mat_src3 + y * src1_step);
-
-        __global ushort8 *dst_y  = (__global ushort8 *)((__global uchar*)mat_dst  + y * dst_step);
-
-        ushort2 value0 = src0_y[x];
-        ushort2 value1 = src1_y[x];
-        ushort2 value2 = src2_y[x];
-        ushort2 value3 = src3_y[x];
-
-        dst_y[x] = (ushort8)(value0.x, value1.x, value2.x, value3.x,
-                             value0.y, value1.y, value2.y, value3.y);
-    }
-}
-__kernel void merge_vector_C4_D3_1(int rows, int cols,
-                                   __global short *mat_dst,  int dst_step,
-                                   __global short *mat_src0, int src0_step,
-                                   __global short *mat_src1, int src1_step,
-                                   __global short *mat_src2, int src2_step,
-                                   __global short *mat_src3, int src3_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global short2  *src0_y = (__global short2 * )((__global uchar*)mat_src0 + y * src0_step);
-        __global short2  *src1_y = (__global short2 * )((__global uchar*)mat_src1 + y * src1_step);
-        __global short2  *src2_y = (__global short2 * )((__global uchar*)mat_src2 + y * src0_step);
-        __global short2  *src3_y = (__global short2 * )((__global uchar*)mat_src3 + y * src1_step);
-
-        __global short8 *dst_y  = (__global short8 *)((__global uchar*)mat_dst  + y * dst_step);
-
-        short2 value0 = src0_y[x];
-        short2 value1 = src1_y[x];
-        short2 value2 = src2_y[x];
-        short2 value3 = src3_y[x];
-
-        dst_y[x] = (short8)(value0.x, value1.x, value2.x, value3.x,
-                            value0.y, value1.y, value2.y, value3.y);
-    }
-}
-__kernel void merge_vector_C4_D4_1(int rows, int cols,
-                                   __global int *mat_dst,  int dst_step,
-                                   __global int *mat_src0, int src0_step,
-                                   __global int *mat_src1, int src1_step,
-                                   __global int *mat_src2, int src2_step,
-                                   __global int *mat_src3, int src3_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global int *src0_y = (__global int * )((__global uchar*)mat_src0 + y * src0_step);
-        __global int *src1_y = (__global int * )((__global uchar*)mat_src1 + y * src1_step);
-        __global int *src2_y = (__global int * )((__global uchar*)mat_src2 + y * src0_step);
-        __global int *src3_y = (__global int * )((__global uchar*)mat_src3 + y * src1_step);
-
-        __global int4 *dst_y  = (__global int4 *)((__global uchar*)mat_dst  + y * dst_step);
-
-        int value0 = src0_y[x];
-        int value1 = src1_y[x];
-        int value2 = src2_y[x];
-        int value3 = src3_y[x];
-
-        dst_y[x] = (int4)(value0, value1, value2, value3);
-    }
-}
-__kernel void merge_vector_C4_D5_1(int rows, int cols,
-                                   __global float *mat_dst,  int dst_step,
-                                   __global float *mat_src0, int src0_step,
-                                   __global float *mat_src1, int src1_step,
-                                   __global float *mat_src2, int src2_step,
-                                   __global float *mat_src3, int src3_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global float *src0_y = (__global float * )((__global uchar*)mat_src0 + y * src0_step);
-        __global float *src1_y = (__global float * )((__global uchar*)mat_src1 + y * src1_step);
-        __global float *src2_y = (__global float * )((__global uchar*)mat_src2 + y * src0_step);
-        __global float *src3_y = (__global float * )((__global uchar*)mat_src3 + y * src1_step);
-
-        __global float4 *dst_y  = (__global float4 *)((__global uchar*)mat_dst  + y * dst_step);
-
-        float value0 = src0_y[x];
-        float value1 = src1_y[x];
-        float value2 = src2_y[x];
-        float value3 = src3_y[x];
-
-        dst_y[x] = (float4)(value0, value1, value2, value3);
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void merge_vector_C4_D6_1(int rows, int cols,
-                                   __global double *mat_dst,  int dst_step,
-                                   __global double *mat_src0, int src0_step,
-                                   __global double *mat_src1, int src1_step,
-                                   __global double *mat_src2, int src2_step,
-                                   __global double *mat_src3, int src3_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if ((x < cols) && (y < rows))
-    {
-        __global double *src0_y = (__global double * )((__global uchar*)mat_src0 + y * src0_step);
-        __global double *src1_y = (__global double * )((__global uchar*)mat_src1 + y * src1_step);
-        __global double *src2_y = (__global double * )((__global uchar*)mat_src2 + y * src0_step);
-        __global double *src3_y = (__global double * )((__global uchar*)mat_src3 + y * src1_step);
-
-        __global double4 *dst_y  = (__global double4 *)((__global uchar*)mat_dst  + y * dst_step);
-
-        double value0 = src0_y[x];
-        double value1 = src1_y[x];
-        double value2 = src2_y[x];
-        double value3 = src3_y[x];
-
-        dst_y[x] = (double4)(value0, value1, value2, value3);
-    }
-}
-#endif
diff --git a/modules/ocl/src/opencl/moments.cl b/modules/ocl/src/opencl/moments.cl
deleted file mode 100644
index 09c79c4b5..000000000
--- a/modules/ocl/src/opencl/moments.cl
+++ /dev/null
@@ -1,432 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jin Ma,  jin@multicorewareinc.com
-//    Sen Liu, swjtuls1987@126.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-typedef double T;
-#else
-typedef long T;
-#endif
-
-#define DST_ROW_00     0
-#define DST_ROW_10     1
-#define DST_ROW_01     2
-#define DST_ROW_20     3
-#define DST_ROW_11     4
-#define DST_ROW_02     5
-#define DST_ROW_30     6
-#define DST_ROW_21     7
-#define DST_ROW_12     8
-#define DST_ROW_03     9
-
-__kernel void icvContourMoments(int contour_total,
-                                __global float* reader_oclmat_data,
-                                __global T* dst_a,
-                                int dst_step)
-{
-    T xi_1, yi_1, xi_12, yi_12, xi, yi, xi2, yi2, dxy, xii_1, yii_1;
-    int idx = get_global_id(0);
-
-    if (idx < 0 || idx >= contour_total)
-        return;
-
-    xi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1)));
-    yi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1) + 1));
-    xi_12 = xi_1 * xi_1;
-    yi_12 = yi_1 * yi_1;
-
-    if(idx == contour_total - 1)
-    {
-        xi = (T)(*(reader_oclmat_data));
-        yi = (T)(*(reader_oclmat_data + 1));
-    }
-    else
-    {
-        xi = (T)(*(reader_oclmat_data + (idx + 1) * 2));
-        yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1));
-    }
-    xi2 = xi * xi;
-    yi2 = yi * yi;
-    dxy = xi_1 * yi - xi * yi_1;
-    xii_1 = xi_1 + xi;
-    yii_1 = yi_1 + yi;
-
-    dst_step /= sizeof(T);
-    *( dst_a + DST_ROW_00 * dst_step + idx) = dxy;
-    *( dst_a + DST_ROW_10 * dst_step + idx) = dxy * xii_1;
-    *( dst_a + DST_ROW_01 * dst_step + idx) = dxy * yii_1;
-    *( dst_a + DST_ROW_20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2);
-    *( dst_a + DST_ROW_11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi));
-    *( dst_a + DST_ROW_02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2);
-    *( dst_a + DST_ROW_30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2);
-    *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2);
-    *( dst_a + DST_ROW_21 * dst_step + idx) =
-        dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 +
-        xi2 * (yi_1 + 3 * yi));
-    *( dst_a + DST_ROW_12 * dst_step + idx) =
-        dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 +
-        yi2 * (xi_1 + 3 * xi));
-}
-
-#if defined (DOUBLE_SUPPORT)
-#define WT double
-#define WT4 double4
-#define convert_T4 convert_double4
-#define convert_T convert_double
-#else
-#define WT float
-#define WT4 float4
-#define convert_T4 convert_float4
-#define convert_T convert_float
-#endif
-
-#ifdef CV_8UC1
-#define TT uchar
-#elif defined CV_16UC1
-#define TT ushort
-#elif defined CV_16SC1
-#define TT short
-#elif defined CV_32FC1
-#define TT float
-#elif defined CV_64FC1
-#ifdef DOUBLE_SUPPORT
-#define TT double
-#else
-#define TT float
-#endif
-#endif
-__kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int src_step,
-                        __global WT* dst_m,
-                        int dst_cols, int dst_step, int binary)
-{
-    int dy = get_global_id(1);
-    int ly = get_local_id(1);
-    int gidx = get_group_id(0);
-    int gidy = get_group_id(1);
-    int x_rest = src_cols % 256;
-    int y_rest = src_rows % 256;
-    __local int codxy[256];
-    codxy[ly] = ly;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    WT4 x0 = (WT4)(0.f);
-    WT4 x1 = (WT4)(0.f);
-    WT4 x2 = (WT4)(0.f);
-    WT4 x3 = (WT4)(0.f);
-
-    __global TT* row = src_data + gidy * src_step + ly * src_step + gidx * 256;
-
-    WT4 p;
-    WT4 x;
-    WT4 xp;
-    WT4 xxp;
-
-    WT py = 0.f, sy = 0.f;
-
-    if(dy < src_rows)
-    {
-        if((x_rest > 0) && (gidx == ((int)get_num_groups(0) - 1)))
-        {
-            int i;
-            for(i = 0; i < x_rest - 4; i += 4)
-            {
-                p = convert_T4(vload4(0, row + i));
-                x = convert_T4(vload4(0, codxy + i));
-                xp = x * p;
-                xxp = xp * x;
-
-                x0 += p;
-                x1 += xp;
-                x2 += xxp;
-                x3 += convert_T4(xxp * x);
-            }
-
-            x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3;
-            x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3;
-            x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3;
-            x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3;
-
-            WT x0_ = 0;
-            WT x1_ = 0;
-            WT x2_ = 0;
-            WT x3_ = 0;
-
-            for(; i < x_rest; i++)
-            {
-                WT p_ = 0;
-                p_ = row[i];
-                WT x_ = convert_T(codxy[i]);
-
-
-                WT xp_ = x_ * p_;
-                WT xxp_ = xp_ * x_;
-
-                x0_ += p_;
-                x1_ += xp_;
-                x2_ += xxp_;
-                x3_ += xxp_ * x_;
-            }
-
-            x0.s0 += x0_;
-            x1.s0 += x1_;
-            x2.s0 += x2_;
-            x3.s0 += x3_;
-        }else
-        {
-            for(int i = 0; i < 256; i += 4)
-            {
-                p = convert_T4(vload4(0, row + i));
-                x = convert_T4(vload4(0, codxy + i));
-                xp = x * p;
-                xxp = xp * x;
-
-                x0 += p;
-                x1 += xp;
-                x2 += xxp;
-                x3 += convert_T4(xxp * x);
-            }
-
-            x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3;
-            x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3;
-            x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3;
-            x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3;
-        }
-
-        py = ly * x0.s0;
-        sy = ly * ly;
-    }
-    __local WT mom[10][256];
-
-    if((y_rest > 0) && (gidy == ((int)get_num_groups(1) - 1)))
-    {
-        if(ly < y_rest)
-        {
-            mom[9][ly] = py * sy;
-            mom[8][ly] = x1.s0 * sy;
-            mom[7][ly] = x2.s0 * ly;
-            mom[6][ly] = x3.s0;
-            mom[5][ly] = x0.s0 * sy;
-            mom[4][ly] = x1.s0 * ly;
-            mom[3][ly] = x2.s0;
-            mom[2][ly] = py;
-            mom[1][ly] = x1.s0;
-            mom[0][ly] = x0.s0;
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(ly < 10)
-            for(int i = 1; i < y_rest; i++)
-                mom[ly][0] = mom[ly][i] + mom[ly][0];
-    }
-    else
-    {
-        mom[9][ly] = py * sy;
-        mom[8][ly] = x1.s0 * sy;
-        mom[7][ly] = x2.s0 * ly;
-        mom[6][ly] = x3.s0;
-        mom[5][ly] = x0.s0 * sy;
-        mom[4][ly] = x1.s0 * ly;
-        mom[3][ly] = x2.s0;
-        mom[2][ly] = py;
-        mom[1][ly] = x1.s0;
-        mom[0][ly] = x0.s0;
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(ly < 128)
-        {
-            mom[0][ly] = mom[0][ly] + mom[0][ly + 128];
-            mom[1][ly] = mom[1][ly] + mom[1][ly + 128];
-            mom[2][ly] = mom[2][ly] + mom[2][ly + 128];
-            mom[3][ly] = mom[3][ly] + mom[3][ly + 128];
-            mom[4][ly] = mom[4][ly] + mom[4][ly + 128];
-            mom[5][ly] = mom[5][ly] + mom[5][ly + 128];
-            mom[6][ly] = mom[6][ly] + mom[6][ly + 128];
-            mom[7][ly] = mom[7][ly] + mom[7][ly + 128];
-            mom[8][ly] = mom[8][ly] + mom[8][ly + 128];
-            mom[9][ly] = mom[9][ly] + mom[9][ly + 128];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(ly < 64)
-        {
-            mom[0][ly] = mom[0][ly] + mom[0][ly + 64];
-            mom[1][ly] = mom[1][ly] + mom[1][ly + 64];
-            mom[2][ly] = mom[2][ly] + mom[2][ly + 64];
-            mom[3][ly] = mom[3][ly] + mom[3][ly + 64];
-            mom[4][ly] = mom[4][ly] + mom[4][ly + 64];
-            mom[5][ly] = mom[5][ly] + mom[5][ly + 64];
-            mom[6][ly] = mom[6][ly] + mom[6][ly + 64];
-            mom[7][ly] = mom[7][ly] + mom[7][ly + 64];
-            mom[8][ly] = mom[8][ly] + mom[8][ly + 64];
-            mom[9][ly] = mom[9][ly] + mom[9][ly + 64];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(ly < 32)
-        {
-            mom[0][ly] = mom[0][ly] + mom[0][ly + 32];
-            mom[1][ly] = mom[1][ly] + mom[1][ly + 32];
-            mom[2][ly] = mom[2][ly] + mom[2][ly + 32];
-            mom[3][ly] = mom[3][ly] + mom[3][ly + 32];
-            mom[4][ly] = mom[4][ly] + mom[4][ly + 32];
-            mom[5][ly] = mom[5][ly] + mom[5][ly + 32];
-            mom[6][ly] = mom[6][ly] + mom[6][ly + 32];
-            mom[7][ly] = mom[7][ly] + mom[7][ly + 32];
-            mom[8][ly] = mom[8][ly] + mom[8][ly + 32];
-            mom[9][ly] = mom[9][ly] + mom[9][ly + 32];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(ly < 16)
-        {
-            mom[0][ly] = mom[0][ly] + mom[0][ly + 16];
-            mom[1][ly] = mom[1][ly] + mom[1][ly + 16];
-            mom[2][ly] = mom[2][ly] + mom[2][ly + 16];
-            mom[3][ly] = mom[3][ly] + mom[3][ly + 16];
-            mom[4][ly] = mom[4][ly] + mom[4][ly + 16];
-            mom[5][ly] = mom[5][ly] + mom[5][ly + 16];
-            mom[6][ly] = mom[6][ly] + mom[6][ly + 16];
-            mom[7][ly] = mom[7][ly] + mom[7][ly + 16];
-            mom[8][ly] = mom[8][ly] + mom[8][ly + 16];
-            mom[9][ly] = mom[9][ly] + mom[9][ly + 16];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(ly < 8)
-        {
-            mom[0][ly] = mom[0][ly] + mom[0][ly + 8];
-            mom[1][ly] = mom[1][ly] + mom[1][ly + 8];
-            mom[2][ly] = mom[2][ly] + mom[2][ly + 8];
-            mom[3][ly] = mom[3][ly] + mom[3][ly + 8];
-            mom[4][ly] = mom[4][ly] + mom[4][ly + 8];
-            mom[5][ly] = mom[5][ly] + mom[5][ly + 8];
-            mom[6][ly] = mom[6][ly] + mom[6][ly + 8];
-            mom[7][ly] = mom[7][ly] + mom[7][ly + 8];
-            mom[8][ly] = mom[8][ly] + mom[8][ly + 8];
-            mom[9][ly] = mom[9][ly] + mom[9][ly + 8];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(ly < 4)
-        {
-            mom[0][ly] = mom[0][ly] + mom[0][ly + 4];
-            mom[1][ly] = mom[1][ly] + mom[1][ly + 4];
-            mom[2][ly] = mom[2][ly] + mom[2][ly + 4];
-            mom[3][ly] = mom[3][ly] + mom[3][ly + 4];
-            mom[4][ly] = mom[4][ly] + mom[4][ly + 4];
-            mom[5][ly] = mom[5][ly] + mom[5][ly + 4];
-            mom[6][ly] = mom[6][ly] + mom[6][ly + 4];
-            mom[7][ly] = mom[7][ly] + mom[7][ly + 4];
-            mom[8][ly] = mom[8][ly] + mom[8][ly + 4];
-            mom[9][ly] = mom[9][ly] + mom[9][ly + 4];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(ly < 2)
-        {
-            mom[0][ly] = mom[0][ly] + mom[0][ly + 2];
-            mom[1][ly] = mom[1][ly] + mom[1][ly + 2];
-            mom[2][ly] = mom[2][ly] + mom[2][ly + 2];
-            mom[3][ly] = mom[3][ly] + mom[3][ly + 2];
-            mom[4][ly] = mom[4][ly] + mom[4][ly + 2];
-            mom[5][ly] = mom[5][ly] + mom[5][ly + 2];
-            mom[6][ly] = mom[6][ly] + mom[6][ly + 2];
-            mom[7][ly] = mom[7][ly] + mom[7][ly + 2];
-            mom[8][ly] = mom[8][ly] + mom[8][ly + 2];
-            mom[9][ly] = mom[9][ly] + mom[9][ly + 2];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if(ly < 1)
-        {
-            mom[0][ly] = mom[0][ly] + mom[0][ly + 1];
-            mom[1][ly] = mom[1][ly] + mom[1][ly + 1];
-            mom[2][ly] = mom[2][ly] + mom[2][ly + 1];
-            mom[3][ly] = mom[3][ly] + mom[3][ly + 1];
-            mom[4][ly] = mom[4][ly] + mom[4][ly + 1];
-            mom[5][ly] = mom[5][ly] + mom[5][ly + 1];
-            mom[6][ly] = mom[6][ly] + mom[6][ly + 1];
-            mom[7][ly] = mom[7][ly] + mom[7][ly + 1];
-            mom[8][ly] = mom[8][ly] + mom[8][ly + 1];
-            mom[9][ly] = mom[9][ly] + mom[9][ly + 1];
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(binary)
-    {
-        WT s = 1.0f/255;
-        if(ly < 10)
-            mom[ly][0] *= s;
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    WT xm = (gidx * 256) * mom[0][0];
-    WT ym = (gidy * 256) * mom[0][0];
-
-    if(ly == 0)
-    {
-        mom[0][1] = mom[0][0];
-        mom[1][1] = mom[1][0] + xm;
-        mom[2][1] = mom[2][0] + ym;
-        mom[3][1] = mom[3][0] + gidx * 256 * (mom[1][0] * 2 + xm);
-        mom[4][1] = mom[4][0] + gidx * 256 * (mom[2][0] + ym) + gidy * 256 * mom[1][0];
-        mom[5][1] = mom[5][0] + gidy * 256 * (mom[2][0] * 2 + ym);
-        mom[6][1] = mom[6][0] + gidx * 256 * (3 * mom[3][0] + 256 * gidx * (3 * mom[1][0] + xm));
-        mom[7][1] = mom[7][0] + gidx * 256 * (2 * (mom[4][0] + 256 * gidy * mom[1][0]) + 256 * gidx * (mom[2][0] + ym)) + 256 * gidy * mom[3][0];
-        mom[8][1] = mom[8][0] + gidy * 256 * (2 * (mom[4][0] + 256 * gidx * mom[2][0]) + 256 * gidy * (mom[1][0] + xm)) + 256 * gidx * mom[5][0];
-        mom[9][1] = mom[9][0] + gidy * 256 * (3 * mom[5][0] + 256 * gidy * (3 * mom[2][0] + ym));
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(ly < 10)
-        dst_m[10 * gidy * dst_step + ly * dst_step + gidx] = mom[ly][1];
-}
diff --git a/modules/ocl/src/opencl/operator_convertTo.cl b/modules/ocl/src/opencl/operator_convertTo.cl
deleted file mode 100644
index ca38bd550..000000000
--- a/modules/ocl/src/opencl/operator_convertTo.cl
+++ /dev/null
@@ -1,64 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-__kernel void convert_to(
-        __global const srcT* restrict srcMat,
-        __global dstT* dstMat,
-        int cols1, int rows,
-        int sstep1, int soffset1,
-        int dstep1, int doffset1,
-        float alpha, float beta)
-{
-        int x = get_global_id(0);
-        int y = get_global_id(1);
-
-        int srcidx = mad24(y, sstep1, x + soffset1);
-        int dstidx = mad24(y, dstep1, x + doffset1);
-
-        if ( (x < cols1) && (y < rows) )
-        {
-            float temp_src = convert_float(srcMat[srcidx]);
-            dstMat[dstidx] = convertToDstType(temp_src*alpha+beta);
-        }
-}
diff --git a/modules/ocl/src/opencl/operator_copyToM.cl b/modules/ocl/src/opencl/operator_copyToM.cl
deleted file mode 100644
index 69e1798ad..000000000
--- a/modules/ocl/src/opencl/operator_copyToM.cl
+++ /dev/null
@@ -1,71 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-__kernel void copy_to_with_mask(
-        __global const GENTYPE* restrict srcMat,
-        __global GENTYPE* dstMat,
-        __global const uchar* restrict maskMat,
-        int cols,
-        int rows,
-        int srcStep_in_pixel,
-        int srcoffset_in_pixel,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        int maskStep,
-        int maskoffset)
-{
-    int x=get_global_id(0);
-    int y=get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int maskidx = mad24(y,maskStep,x+ maskoffset);
-        if ( maskMat[maskidx])
-        {
-            int srcidx = mad24(y,srcStep_in_pixel,x+ srcoffset_in_pixel);
-            int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-            dstMat[dstidx] = srcMat[srcidx];
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/operator_setTo.cl b/modules/ocl/src/opencl/operator_setTo.cl
deleted file mode 100644
index 20c5cf211..000000000
--- a/modules/ocl/src/opencl/operator_setTo.cl
+++ /dev/null
@@ -1,95 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-__kernel void set_to_without_mask_C1_D0(__global uchar * scalar,__global uchar * dstMat,
-        int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
-{
-        int x=get_global_id(0)<<2;
-        int y=get_global_id(1);
-        int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
-        uchar4 out;
-        out.x = out.y = out.z = out.w = scalar[0];
-
-        if ( (x+3 < cols) && (y < rows)&& ((offset_in_pixel&3) == 0))
-        {
-            *(__global uchar4*)(dstMat+idx) = out;
-        }
-        else
-        {
-             if((x+3 < cols) && (y < rows))
-             {
-                dstMat[idx] = out.x;
-                dstMat[idx+1] = out.y;
-                dstMat[idx+2] = out.z;
-                dstMat[idx+3] = out.w;
-             }
-             if((x+2 < cols) && (y < rows))
-             {
-                dstMat[idx] = out.x;
-                dstMat[idx+1] = out.y;
-                dstMat[idx+2] = out.z;
-             }
-             else if((x+1 < cols) && (y < rows))
-             {
-                dstMat[idx] = out.x;
-                dstMat[idx+1] = out.y;
-             }
-             else if((x < cols) && (y < rows))
-             {
-                dstMat[idx] = out.x;
-             }
-        }
-}
-
-__kernel void set_to_without_mask(__global GENTYPE * scalar,__global GENTYPE * dstMat,
-        int cols, int rows, int dstStep_in_pixel, int offset_in_pixel)
-{
-        int x = get_global_id(0);
-        int y = get_global_id(1);
-        if ( (x < cols) & (y < rows))
-        {
-            int idx = mad24(y, dstStep_in_pixel, x + offset_in_pixel);
-            dstMat[idx] = scalar[0];
-        }
-}
diff --git a/modules/ocl/src/opencl/operator_setToM.cl b/modules/ocl/src/opencl/operator_setToM.cl
deleted file mode 100644
index afaa2e61f..000000000
--- a/modules/ocl/src/opencl/operator_setToM.cl
+++ /dev/null
@@ -1,68 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-__kernel void set_to_with_mask(
-        __global GENTYPE * scalar,
-        __global GENTYPE * dstMat,
-        int cols,
-        int rows,
-        int dstStep_in_pixel,
-        int dstoffset_in_pixel,
-        __global const uchar * restrict maskMat,
-        int maskStep,
-        int maskoffset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int maskidx = mad24(y,maskStep,x+ maskoffset);
-        if (maskMat[maskidx])
-        {
-            int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
-            dstMat[dstidx] = scalar[0];
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/pyr_down.cl b/modules/ocl/src/opencl/pyr_down.cl
deleted file mode 100644
index 6f10067e9..000000000
--- a/modules/ocl/src/opencl/pyr_down.cl
+++ /dev/null
@@ -1,1010 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Dachuan Zhao, dachuan@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-inline int idx_row_low(int y, int last_row)
-{
-    return abs(y) % (last_row + 1);
-}
-
-inline int idx_row_high(int y, int last_row)
-{
-    return abs(last_row - (int)abs(last_row - y)) % (last_row + 1);
-}
-
-inline int idx_row(int y, int last_row)
-{
-    return idx_row_low(idx_row_high(y, last_row), last_row);
-}
-
-inline int idx_col_low(int x, int last_col)
-{
-    return abs(x) % (last_col + 1);
-}
-
-inline int idx_col_high(int x, int last_col)
-{
-    return abs(last_col - (int)abs(last_col - x)) % (last_col + 1);
-}
-
-inline int idx_col(int x, int last_col)
-{
-    return idx_col_low(idx_col_high(x, last_col), last_col);
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_8UC1 ///////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstCols)
-{
-    const int x = get_global_id(0);
-    const int y = get_group_id(1);
-
-    __local float smem[256 + 4];
-
-    float sum;
-
-    const int src_y = 2*y;
-    const int last_row = srcRows - 1;
-    const int last_col = srcCols - 1;
-
-    if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
-    {
-        sum =       0.0625f * (((srcData + (src_y - 2) * srcStep))[x]);
-        sum = sum + 0.25f   * (((srcData + (src_y - 1) * srcStep))[x]);
-        sum = sum + 0.375f  * (((srcData + (src_y    ) * srcStep))[x]);
-        sum = sum + 0.25f   * (((srcData + (src_y + 1) * srcStep))[x]);
-        sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[x]);
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            sum =       0.0625f * (((srcData + (src_y - 2) * srcStep))[left_x]);
-            sum = sum + 0.25f   * (((srcData + (src_y - 1) * srcStep))[left_x]);
-            sum = sum + 0.375f  * (((srcData + (src_y    ) * srcStep))[left_x]);
-            sum = sum + 0.25f   * (((srcData + (src_y + 1) * srcStep))[left_x]);
-            sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[left_x]);
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            sum =       0.0625f * (((srcData + (src_y - 2) * srcStep))[right_x]);
-            sum = sum + 0.25f   * (((srcData + (src_y - 1) * srcStep))[right_x]);
-            sum = sum + 0.375f  * (((srcData + (src_y    ) * srcStep))[right_x]);
-            sum = sum + 0.25f   * (((srcData + (src_y + 1) * srcStep))[right_x]);
-            sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[right_x]);
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-    else
-    {
-        int col = idx_col(x, last_col);
-
-        sum =       0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]);
-        sum = sum + 0.25f   * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]);
-        sum = sum + 0.375f  * (((srcData + idx_row(src_y    , last_row) * srcStep))[col]);
-        sum = sum + 0.25f   * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]);
-        sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]);
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            col = idx_col(left_x, last_col);
-
-            sum =       0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]);
-            sum = sum + 0.25f   * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]);
-            sum = sum + 0.375f  * (((srcData + idx_row(src_y    , last_row) * srcStep))[col]);
-            sum = sum + 0.25f   * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]);
-            sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]);
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            col = idx_col(right_x, last_col);
-
-            sum =       0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]);
-            sum = sum + 0.25f   * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]);
-            sum = sum + 0.375f  * (((srcData + idx_row(src_y    , last_row) * srcStep))[col]);
-            sum = sum + 0.25f   * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]);
-            sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]);
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (get_local_id(0) < 128)
-    {
-        const int tid2 = get_local_id(0) * 2;
-
-        sum =       0.0625f * smem[2 + tid2 - 2];
-        sum = sum + 0.25f   * smem[2 + tid2 - 1];
-        sum = sum + 0.375f  * smem[2 + tid2    ];
-        sum = sum + 0.25f   * smem[2 + tid2 + 1];
-        sum = sum + 0.0625f * smem[2 + tid2 + 2];
-
-        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
-        if (dst_x < dstCols)
-            dst[y * dstStep + dst_x] = convert_uchar_sat_rte(sum);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_8UC4 ///////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int dstCols)
-{
-    const int x = get_global_id(0);
-    const int y = get_group_id(1);
-
-    __local float4 smem[256 + 4];
-
-    float4 sum;
-
-    const int src_y = 2*y;
-    const int last_row = srcRows - 1;
-    const int last_col = srcCols - 1;
-
-    float4 co1 = 0.375f;
-    float4 co2 = 0.25f;
-    float4 co3 = 0.0625f;
-
-    if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
-    {
-        sum =       co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[x]));
-        sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[x]));
-        sum = sum + co1 * convert_float4((((srcData + (src_y    ) * srcStep / 4))[x]));
-        sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[x]));
-        sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[x]));
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            sum =       co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[left_x]));
-            sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[left_x]));
-            sum = sum + co1 * convert_float4((((srcData + (src_y    ) * srcStep / 4))[left_x]));
-            sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[left_x]));
-            sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[left_x]));
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            sum =       co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[right_x]));
-            sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[right_x]));
-            sum = sum + co1 * convert_float4((((srcData + (src_y    ) * srcStep / 4))[right_x]));
-            sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[right_x]));
-            sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[right_x]));
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-    else
-    {
-        int col = idx_col(x, last_col);
-
-        sum =       co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
-        sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
-        sum = sum + co1 * convert_float4((((srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]));
-        sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
-        sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            col = idx_col(left_x, last_col);
-
-            sum =       co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
-            sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
-            sum = sum + co1 * convert_float4((((srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]));
-            sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
-            sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            col = idx_col(right_x, last_col);
-
-            sum =       co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
-            sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
-            sum = sum + co1 * convert_float4((((srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]));
-            sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
-            sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (get_local_id(0) < 128)
-    {
-        const int tid2 = get_local_id(0) * 2;
-
-        sum =       co3 * smem[2 + tid2 - 2];
-        sum = sum + co2 * smem[2 + tid2 - 1];
-        sum = sum + co1 * smem[2 + tid2    ];
-        sum = sum + co2 * smem[2 + tid2 + 1];
-        sum = sum + co3 * smem[2 + tid2 + 2];
-
-        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
-        if (dst_x < dstCols)
-            dst[y * dstStep / 4 + dst_x] = convert_uchar4_sat_rte(sum);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_16UC1 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C1_D2(__global ushort * srcData, int srcStep, int srcRows, int srcCols, __global ushort *dst, int dstStep, int dstCols)
-{
-    const int x = get_global_id(0);
-    const int y = get_group_id(1);
-
-    __local float smem[256 + 4];
-
-    float sum;
-
-    const int src_y = 2*y;
-    const int last_row = srcRows - 1;
-    const int last_col = srcCols - 1;
-
-    if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
-    {
-        sum =       0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
-        sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
-        sum = sum + 0.375f  * ((__global ushort*)((__global char*)srcData + (src_y    ) * srcStep))[x];
-        sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
-        sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            sum =       0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
-            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
-            sum = sum + 0.375f  * ((__global ushort*)((__global char*)srcData + (src_y    ) * srcStep))[left_x];
-            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
-            sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            sum =       0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
-            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
-            sum = sum + 0.375f  * ((__global ushort*)((__global char*)srcData + (src_y    ) * srcStep))[right_x];
-            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
-            sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-    else
-    {
-        int col = idx_col(x, last_col);
-
-        sum =       0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
-        sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
-        sum = sum + 0.375f  * ((__global ushort*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
-        sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
-        sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            col = idx_col(left_x, last_col);
-
-            sum =       0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
-            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
-            sum = sum + 0.375f  * ((__global ushort*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
-            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
-            sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            col = idx_col(right_x, last_col);
-
-            sum =       0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
-            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
-            sum = sum + 0.375f  * ((__global ushort*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
-            sum = sum + 0.25f   * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
-            sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (get_local_id(0) < 128)
-    {
-        const int tid2 = get_local_id(0) * 2;
-
-        sum =       0.0625f * smem[2 + tid2 - 2];
-        sum = sum + 0.25f   * smem[2 + tid2 - 1];
-        sum = sum + 0.375f  * smem[2 + tid2    ];
-        sum = sum + 0.25f   * smem[2 + tid2 + 1];
-        sum = sum + 0.0625f * smem[2 + tid2 + 2];
-
-        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
-        if (dst_x < dstCols)
-            dst[y * dstStep / 2 + dst_x] = convert_ushort_sat_rte(sum);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_16UC4 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C4_D2(__global ushort4 * srcData, int srcStep, int srcRows, int srcCols, __global ushort4 *dst, int dstStep, int dstCols)
-{
-    const int x = get_global_id(0);
-    const int y = get_group_id(1);
-
-    __local float4 smem[256 + 4];
-
-    float4 sum;
-
-    const int src_y = 2*y;
-    const int last_row = srcRows - 1;
-    const int last_col = srcCols - 1;
-
-    float4 co1 = 0.375f;
-    float4 co2 = 0.25f;
-    float4 co3 = 0.0625f;
-
-    if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
-    {
-        sum =       co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]);
-        sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]);
-        sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[x]);
-        sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]);
-        sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]);
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            sum =       co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]);
-            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]);
-            sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[left_x]);
-            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]);
-            sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]);
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            sum =       co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]);
-            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]);
-            sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[right_x]);
-            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]);
-            sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]);
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-    else
-    {
-        int col = idx_col(x, last_col);
-
-        sum =       co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
-        sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
-        sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]);
-        sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
-        sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            col = idx_col(left_x, last_col);
-
-            sum =       co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
-            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
-            sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]);
-            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
-            sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            col = idx_col(right_x, last_col);
-
-            sum =       co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
-            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
-            sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]);
-            sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
-            sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (get_local_id(0) < 128)
-    {
-        const int tid2 = get_local_id(0) * 2;
-
-        sum =       co3 * smem[2 + tid2 - 2];
-        sum = sum + co2 * smem[2 + tid2 - 1];
-        sum = sum + co1 * smem[2 + tid2    ];
-        sum = sum + co2 * smem[2 + tid2 + 1];
-        sum = sum + co3 * smem[2 + tid2 + 2];
-
-        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
-        if (dst_x < dstCols)
-            dst[y * dstStep / 8 + dst_x] = convert_ushort4_sat_rte(sum);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_16SC1 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C1_D3(__global short * srcData, int srcStep, int srcRows, int srcCols, __global short *dst, int dstStep, int dstCols)
-{
-    const int x = get_global_id(0);
-    const int y = get_group_id(1);
-
-    __local float smem[256 + 4];
-
-    float sum;
-
-    const int src_y = 2*y;
-    const int last_row = srcRows - 1;
-    const int last_col = srcCols - 1;
-
-    if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
-    {
-        sum =       0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
-        sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
-        sum = sum + 0.375f  * ((__global short*)((__global char*)srcData + (src_y    ) * srcStep))[x];
-        sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
-        sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            sum =       0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
-            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
-            sum = sum + 0.375f  * ((__global short*)((__global char*)srcData + (src_y    ) * srcStep))[left_x];
-            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
-            sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            sum =       0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
-            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
-            sum = sum + 0.375f  * ((__global short*)((__global char*)srcData + (src_y    ) * srcStep))[right_x];
-            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
-            sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-    else
-    {
-        int col = idx_col(x, last_col);
-
-        sum =       0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
-        sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
-        sum = sum + 0.375f  * ((__global short*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
-        sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
-        sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            col = idx_col(left_x, last_col);
-
-            sum =       0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
-            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
-            sum = sum + 0.375f  * ((__global short*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
-            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
-            sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            col = idx_col(right_x, last_col);
-
-            sum =       0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
-            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
-            sum = sum + 0.375f  * ((__global short*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
-            sum = sum + 0.25f   * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
-            sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (get_local_id(0) < 128)
-    {
-        const int tid2 = get_local_id(0) * 2;
-
-        sum =       0.0625f * smem[2 + tid2 - 2];
-        sum = sum + 0.25f   * smem[2 + tid2 - 1];
-        sum = sum + 0.375f  * smem[2 + tid2    ];
-        sum = sum + 0.25f   * smem[2 + tid2 + 1];
-        sum = sum + 0.0625f * smem[2 + tid2 + 2];
-
-        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
-        if (dst_x < dstCols)
-            dst[y * dstStep / 2 + dst_x] = convert_short_sat_rte(sum);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_16SC4 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C4_D3(__global short4 * srcData, int srcStep, int srcRows, int srcCols, __global short4 *dst, int dstStep, int dstCols)
-{
-    const int x = get_global_id(0);
-    const int y = get_group_id(1);
-
-    __local float4 smem[256 + 4];
-
-    float4 sum;
-
-    const int src_y = 2*y;
-    const int last_row = srcRows - 1;
-    const int last_col = srcCols - 1;
-
-    float4 co1 = 0.375f;
-    float4 co2 = 0.25f;
-    float4 co3 = 0.0625f;
-
-    if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
-    {
-        sum =       co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]);
-        sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]);
-        sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[x]);
-        sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]);
-        sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]);
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            sum =       co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]);
-            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]);
-            sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[left_x]);
-            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]);
-            sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]);
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            sum =       co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]);
-            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]);
-            sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[right_x]);
-            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]);
-            sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]);
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-    else
-    {
-        int col = idx_col(x, last_col);
-
-        sum =       co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
-        sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
-        sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]);
-        sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
-        sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            col = idx_col(left_x, last_col);
-
-            sum =       co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
-            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
-            sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]);
-            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
-            sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            col = idx_col(right_x, last_col);
-
-            sum =       co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
-            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
-            sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col]);
-            sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
-            sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (get_local_id(0) < 128)
-    {
-        const int tid2 = get_local_id(0) * 2;
-
-        sum =       co3 * smem[2 + tid2 - 2];
-        sum = sum + co2 * smem[2 + tid2 - 1];
-        sum = sum + co1 * smem[2 + tid2    ];
-        sum = sum + co2 * smem[2 + tid2 + 1];
-        sum = sum + co3 * smem[2 + tid2 + 2];
-
-        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
-        if (dst_x < dstCols)
-            dst[y * dstStep / 8 + dst_x] = convert_short4_sat_rte(sum);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_32FC1 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcRows, int srcCols, __global float *dst, int dstStep, int dstCols)
-{
-    const int x = get_global_id(0);
-    const int y = get_group_id(1);
-
-    __local float smem[256 + 4];
-
-    float sum;
-
-    const int src_y = 2*y;
-    const int last_row = srcRows - 1;
-    const int last_col = srcCols - 1;
-
-    if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
-    {
-        sum =       0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
-        sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
-        sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + (src_y    ) * srcStep))[x];
-        sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
-        sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            sum =       0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
-            sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
-            sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + (src_y    ) * srcStep))[left_x];
-            sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
-            sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            sum =       0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
-            sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
-            sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + (src_y    ) * srcStep))[right_x];
-            sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
-            sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-    else
-    {
-        int col = idx_col(x, last_col);
-
-        sum =       0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
-        sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
-        sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
-        sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
-        sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            col = idx_col(left_x, last_col);
-
-            sum =       0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
-            sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
-            sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
-            sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
-            sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            col = idx_col(right_x, last_col);
-
-            sum =       0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
-            sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
-            sum = sum + 0.375f  * ((__global float*)((__global char*)srcData + idx_row(src_y    , last_row) * srcStep))[col];
-            sum = sum + 0.25f   * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
-            sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (get_local_id(0) < 128)
-    {
-        const int tid2 = get_local_id(0) * 2;
-
-        sum =       0.0625f * smem[2 + tid2 - 2];
-        sum = sum + 0.25f   * smem[2 + tid2 - 1];
-        sum = sum + 0.375f  * smem[2 + tid2    ];
-        sum = sum + 0.25f   * smem[2 + tid2 + 1];
-        sum = sum + 0.0625f * smem[2 + tid2 + 2];
-
-        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
-        if (dst_x < dstCols)
-            dst[y * dstStep / 4 + dst_x] = sum;
-    }
-}
-
-///////////////////////////////////////////////////////////////////////
-//////////////////////////  CV_32FC4 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstCols)
-{
-    const int x = get_global_id(0);
-    const int y = get_group_id(1);
-
-    __local float4 smem[256 + 4];
-
-    float4 sum;
-
-    const int src_y = 2*y;
-    const int last_row = srcRows - 1;
-    const int last_col = srcCols - 1;
-
-    float4 co1 = 0.375f;
-    float4 co2 = 0.25f;
-    float4 co3 = 0.0625f;
-
-    if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
-    {
-        sum =       co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x];
-        sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x];
-        sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[x];
-        sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x];
-        sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x];
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            sum =       co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x];
-            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x];
-            sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[left_x];
-            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x];
-            sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x];
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            sum =       co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x];
-            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x];
-            sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y    ) * srcStep / 4))[right_x];
-            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x];
-            sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x];
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-    else
-    {
-        int col = idx_col(x, last_col);
-
-        sum =       co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
-        sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
-        sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col];
-        sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
-        sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
-
-        smem[2 + get_local_id(0)] = sum;
-
-        if (get_local_id(0) < 2)
-        {
-            const int left_x = x - 2;
-
-            col = idx_col(left_x, last_col);
-
-            sum =       co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
-            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
-            sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col];
-            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
-            sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
-
-            smem[get_local_id(0)] = sum;
-        }
-
-        if (get_local_id(0) > 253)
-        {
-            const int right_x = x + 2;
-
-            col = idx_col(right_x, last_col);
-
-            sum =       co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
-            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
-            sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y    , last_row) * srcStep / 4))[col];
-            sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
-            sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
-
-            smem[4 + get_local_id(0)] = sum;
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (get_local_id(0) < 128)
-    {
-        const int tid2 = get_local_id(0) * 2;
-
-        sum =       co3 * smem[2 + tid2 - 2];
-        sum = sum + co2 * smem[2 + tid2 - 1];
-        sum = sum + co1 * smem[2 + tid2    ];
-        sum = sum + co2 * smem[2 + tid2 + 1];
-        sum = sum + co3 * smem[2 + tid2 + 2];
-
-        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
-        if (dst_x < dstCols)
-            dst[y * dstStep / 16 + dst_x] = sum;
-    }
-}
diff --git a/modules/ocl/src/opencl/pyr_up.cl b/modules/ocl/src/opencl/pyr_up.cl
deleted file mode 100644
index 157fee894..000000000
--- a/modules/ocl/src/opencl/pyr_up.cl
+++ /dev/null
@@ -1,146 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Zhang Chunpeng	chunpeng@multicorewareinc.com
-//    Dachuan Zhao, dachuan@multicorewareinc.com
-//    Yao Wang, yao@multicorewareinc.com
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-///////////////////////////////////////////////////////////////////////
-////////////////////////  Generic PyrUp  //////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrUp(__global Type* src, __global Type* dst,
-                          int srcRows, int dstRows, int srcCols, int dstCols,
-                          int srcOffset, int dstOffset, int srcStep, int dstStep)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    const int lsizex = get_local_size(0);
-    const int lsizey = get_local_size(1);
-
-    const int tidx = get_local_id(0);
-    const int tidy = get_local_id(1);
-
-    __local floatType s_srcPatch[10][10];
-    __local floatType s_dstPatch[20][16];
-
-    if( tidx < 10 && tidy < 10 )
-    {
-        int srcx = mad24((int)get_group_id(0), lsizex>>1, tidx) - 1;
-        int srcy = mad24((int)get_group_id(1), lsizey>>1, tidy) - 1;
-
-        srcx = abs(srcx);
-        srcx = min(srcCols - 1,srcx);
-
-        srcy = abs(srcy);
-        srcy = min(srcRows -1 ,srcy);
-
-        s_srcPatch[tidy][tidx] = convertToFloat(src[srcx + srcy * srcStep]);
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    floatType sum = (floatType)0;
-    const floatType evenFlag = (floatType)((tidx & 1) == 0);
-    const floatType oddFlag = (floatType)((tidx & 1) != 0);
-    const bool  eveny = ((tidy & 1) == 0);
-
-    const floatType co1 = (floatType)0.375f;
-    const floatType co2 = (floatType)0.25f;
-    const floatType co3 = (floatType)0.0625f;
-
-    if(eveny)
-    {
-        sum =       ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 2) >> 1)];
-        sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx - 1) >> 1)];
-        sum = sum + ( evenFlag* co1 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx    ) >> 1)];
-        sum = sum + ( oddFlag * co2 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 1) >> 1)];
-        sum = sum + ( evenFlag* co3 ) * s_srcPatch[1 + (tidy >> 1)][1 + ((tidx + 2) >> 1)];
-    }
-
-    s_dstPatch[2 + tidy][tidx] = sum;
-
-    if (tidy < 2)
-    {
-        sum = 0;
-
-        if (eveny)
-        {
-            sum =       (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
-        }
-
-        s_dstPatch[tidy][tidx] = sum;
-    }
-
-    if (tidy > 13)
-    {
-        sum = 0;
-
-        if (eveny)
-        {
-            sum =       (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx - 2) >> 1)];
-            sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx - 1) >> 1)];
-            sum = sum + (evenFlag * co1) * s_srcPatch[lsizey-7][1 + ((tidx    ) >> 1)];
-            sum = sum + ( oddFlag * co2) * s_srcPatch[lsizey-7][1 + ((tidx + 1) >> 1)];
-            sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-7][1 + ((tidx + 2) >> 1)];
-        }
-        s_dstPatch[4 + tidy][tidx] = sum;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    sum =       co3 * s_dstPatch[2 + tidy - 2][tidx];
-    sum = sum + co2 * s_dstPatch[2 + tidy - 1][tidx];
-    sum = sum + co1 * s_dstPatch[2 + tidy    ][tidx];
-    sum = sum + co2 * s_dstPatch[2 + tidy + 1][tidx];
-    sum = sum + co3 * s_dstPatch[2 + tidy + 2][tidx];
-
-    if ((x < dstCols) && (y < dstRows))
-        dst[x + y * dstStep] = convertToType(4.0f * sum);
-}
diff --git a/modules/ocl/src/opencl/split_mat.cl b/modules/ocl/src/opencl/split_mat.cl
deleted file mode 100644
index b52b3c206..000000000
--- a/modules/ocl/src/opencl/split_mat.cl
+++ /dev/null
@@ -1,217 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#if DATA_DEPTH == 0
-#define BASE_TYPE uchar
-#elif DATA_DEPTH == 1
-#error data_depth char, use uchar datatype instead
-#elif DATA_DEPTH == 2
-#define BASE_TYPE ushort
-#elif DATA_DEPTH == 3
-#error data_depth short, use ushort datatype instead
-#elif DATA_DEPTH == 4
-#define BASE_TYPE int
-#elif DATA_DEPTH == 5
-#define BASE_TYPE float
-#elif DATA_DEPTH == 6
-#define BASE_TYPE double
-#else
-#error data_depth
-#endif
-
-#if DATA_CHAN == 2
-#define SRC_VEC_SIZE 2
-#elif DATA_CHAN == 3
-#define SRC_VEC_SIZE 4 // C3 is stored as C4
-#elif DATA_CHAN == 4
-#define SRC_VEC_SIZE 4
-#else
-#error data_chan
-#endif
-
-#define __CAT(x, y) x##y
-#define CAT(x, y) __CAT(x, y)
-
-#define uchar1 uchar
-#define char1 char
-#define ushort1 ushort
-#define short1 short
-#define int1 int
-#define float1 float
-#define double1 double
-
-#define TYPE BASE_TYPE
-
-#define SRC_TYPE CAT(BASE_TYPE, SRC_VEC_SIZE)
-
-#define DST_VEC_TYPE CAT(BASE_TYPE, VEC_SIZE)
-
-#define vstore1 vstore
-#define VSTORE CAT(vstore, VEC_SIZE)
-#define VSTORE_ALIGNED(ptr, v) *((__global DST_VEC_TYPE*)(ptr)) = (v)
-#define VSTORE_UNALIGNED(ptr, v) VSTORE((v), 0, (__global TYPE*)(ptr))
-
-#ifdef DST0_ALIGNED
-#define VSTORE_dst0 VSTORE_ALIGNED
-#else
-#define VSTORE_dst0 VSTORE_UNALIGNED
-#endif
-#ifdef DST1_ALIGNED
-#define VSTORE_dst1 VSTORE_ALIGNED
-#else
-#define VSTORE_dst1 VSTORE_UNALIGNED
-#endif
-#ifdef DST2_ALIGNED
-#define VSTORE_dst2 VSTORE_ALIGNED
-#else
-#define VSTORE_dst2 VSTORE_UNALIGNED
-#endif
-#ifdef DST3_ALIGNED
-#define VSTORE_dst3 VSTORE_ALIGNED
-#else
-#define VSTORE_dst3 VSTORE_UNALIGNED
-#endif
-
-__kernel void split_vector(
-        __global SRC_TYPE* src, int srcStepBytes, int2 srcOffset, // offset.x in bytes
-        __global TYPE* dst0, int dst0StepBytes, int2 dst0Offset,
-        __global TYPE* dst1, int dst1StepBytes, int2 dst1Offset,
-#if DATA_CHAN > 2
-        __global TYPE* dst2, int dst2StepBytes, int2 dst2Offset,
-#endif
-#if DATA_CHAN > 3
-        __global TYPE* dst3, int dst3StepBytes, int2 dst3Offset,
-#endif
-        int2 size)
-
-{
-    int x = get_global_id(0) * VEC_SIZE;
-    int y = get_global_id(1);
-
-    if (x < size.x && y < size.y)
-    {
-        SRC_TYPE srcData[VEC_SIZE];
-        int xOffsetLimitBytes = srcOffset.x + size.x * sizeof(SRC_TYPE);
-        int xOffsetBytes = srcOffset.x + x * sizeof(SRC_TYPE);
-        int yOffsetBytes = (srcOffset.y + y) * srcStepBytes;
-#pragma unroll
-        for (int i = 0; i < VEC_SIZE; i++, xOffsetBytes += sizeof(SRC_TYPE))
-        {
-            srcData[i] = (xOffsetBytes >= xOffsetLimitBytes) ? (SRC_TYPE)0 :
-                    *(__global SRC_TYPE*)((__global char*)src + yOffsetBytes + xOffsetBytes);
-        }
-
-#if VEC_SIZE == 1
-        TYPE dstC0 = srcData[0].s0;
-        TYPE dstC1 = srcData[0].s1;
-#if DATA_CHAN > 2
-        TYPE dstC2 = srcData[0].s2;
-#endif
-#if DATA_CHAN > 3
-        TYPE dstC3 = srcData[0].s3;
-#endif
-# define VEC_TO_ARRAY(v, a) TYPE a[1] = {v};
-#elif VEC_SIZE == 2
-        DST_VEC_TYPE dstC0 = (DST_VEC_TYPE)(srcData[0].s0, srcData[1].s0);
-        DST_VEC_TYPE dstC1 = (DST_VEC_TYPE)(srcData[0].s1, srcData[1].s1);
-#if DATA_CHAN > 2
-        DST_VEC_TYPE dstC2 = (DST_VEC_TYPE)(srcData[0].s2, srcData[1].s2);
-#endif
-#if DATA_CHAN > 3
-        DST_VEC_TYPE dstC3 = (DST_VEC_TYPE)(srcData[0].s3, srcData[1].s3);
-#endif
-# define VEC_TO_ARRAY(v, a) TYPE a[2] = {v.s0, v.s1};
-#elif VEC_SIZE == 4
-        DST_VEC_TYPE dstC0 = (DST_VEC_TYPE)(srcData[0].s0, srcData[1].s0, srcData[2].s0, srcData[3].s0);
-        DST_VEC_TYPE dstC1 = (DST_VEC_TYPE)(srcData[0].s1, srcData[1].s1, srcData[2].s1, srcData[3].s1);
-#if DATA_CHAN > 2
-        DST_VEC_TYPE dstC2 = (DST_VEC_TYPE)(srcData[0].s2, srcData[1].s2, srcData[2].s2, srcData[3].s2);
-#endif
-#if DATA_CHAN > 3
-        DST_VEC_TYPE dstC3 = (DST_VEC_TYPE)(srcData[0].s3, srcData[1].s3, srcData[2].s3, srcData[3].s3);
-#endif
-# define VEC_TO_ARRAY(v, a) TYPE a[4] = {v.s0, v.s1, v.s2, v.s3};
-#endif
-
-#ifndef BYPASS_VSTORE
-#define BYPASS_VSTORE false
-#endif
-
-#define WRITE_VEC_DST(dst, vecValue) \
-{ \
-        int dst ## xOffsetLimitBytes = dst ## Offset.x + size.x * sizeof(TYPE); \
-        int dst ## xOffsetBytes = dst ## Offset.x + x * sizeof(TYPE); \
-        int dst ## yOffsetBytes = (dst ## Offset.y + y) * dst ## StepBytes; \
-        if (!BYPASS_VSTORE && dst ## xOffsetBytes + (int)sizeof(DST_VEC_TYPE) <= dst ## xOffsetLimitBytes) \
-        { \
-            VSTORE_ ## dst(((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes), vecValue); \
-        } \
-        else \
-        { \
-            VEC_TO_ARRAY(vecValue, vecValue##Array); \
-            for (int i = 0; i < VEC_SIZE; i++, dst ## xOffsetBytes += sizeof(TYPE)) \
-            { \
-                if (dst ## xOffsetBytes + (int)sizeof(TYPE) <= dst ## xOffsetLimitBytes) \
-                    *(__global TYPE*)((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes) = vecValue##Array[i]; \
-                else \
-                    break; \
-            } \
-        } \
-}
-
-        WRITE_VEC_DST(dst0, dstC0);
-        WRITE_VEC_DST(dst1, dstC1);
-#if DATA_CHAN > 2
-        WRITE_VEC_DST(dst2, dstC2);
-#endif
-#if DATA_CHAN > 3
-        WRITE_VEC_DST(dst3, dstC3);
-#endif
-    }
-}
diff --git a/modules/ocl/src/opencl/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl
deleted file mode 100644
index d3efb5eb4..000000000
--- a/modules/ocl/src/opencl/stereobm.cl
+++ /dev/null
@@ -1,338 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Sen Liu, swjtuls1987@126.com
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#define ROWSperTHREAD 21     // the number of rows a thread will process
-#define BLOCK_W       128    // the thread block width (464)
-#define N_DISPARITIES 8
-
-#define STEREO_MIND 0                    // The minimum d range to check
-#define STEREO_DISP_STEP N_DISPARITIES   // the d step, must be <= 1 to avoid aliasing
-
-#ifndef radius
-#define radius 64
-#endif
-
-inline unsigned int CalcSSD(__local unsigned int *col_ssd)
-{
-    unsigned int cache = col_ssd[0];
-
-#pragma unroll
-    for(int i = 1; i <= (radius << 1); i++)
-        cache += col_ssd[i];
-
-    return cache;
-}
-
-inline uint2 MinSSD(__local unsigned int *col_ssd)
-{
-    unsigned int ssd[N_DISPARITIES];
-    const int win_size = (radius << 1);
-
-    //See above:  #define COL_SSD_SIZE (BLOCK_W + WIN_SIZE)
-    ssd[0] = CalcSSD(col_ssd + 0 * (BLOCK_W + win_size));
-    ssd[1] = CalcSSD(col_ssd + 1 * (BLOCK_W + win_size));
-    ssd[2] = CalcSSD(col_ssd + 2 * (BLOCK_W + win_size));
-    ssd[3] = CalcSSD(col_ssd + 3 * (BLOCK_W + win_size));
-    ssd[4] = CalcSSD(col_ssd + 4 * (BLOCK_W + win_size));
-    ssd[5] = CalcSSD(col_ssd + 5 * (BLOCK_W + win_size));
-    ssd[6] = CalcSSD(col_ssd + 6 * (BLOCK_W + win_size));
-    ssd[7] = CalcSSD(col_ssd + 7 * (BLOCK_W + win_size));
-
-    unsigned int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));
-
-    int bestIdx = 0;
-
-    for (int i = 0; i < N_DISPARITIES; i++)
-    {
-        if (mssd == ssd[i])
-            bestIdx = i;
-    }
-
-    return (uint2)(mssd, bestIdx);
-}
-
-inline void StepDown(int idx1, int idx2, __global unsigned char* imageL,
-              __global unsigned char* imageR, int d,   __local unsigned int *col_ssd)
-{
-    uint8 imgR1 = convert_uint8(vload8(0, imageR + (idx1 - d - 7)));
-    uint8 imgR2 = convert_uint8(vload8(0, imageR + (idx2 - d - 7)));
-    uint8 diff1 = (uint8)(imageL[idx1]) - imgR1;
-    uint8 diff2 = (uint8)(imageL[idx2]) - imgR2;
-    uint8 res = diff2 * diff2 - diff1 * diff1;
-    const int win_size = (radius << 1);
-    col_ssd[0 * (BLOCK_W + win_size)] += res.s7;
-    col_ssd[1 * (BLOCK_W + win_size)] += res.s6;
-    col_ssd[2 * (BLOCK_W + win_size)] += res.s5;
-    col_ssd[3 * (BLOCK_W + win_size)] += res.s4;
-    col_ssd[4 * (BLOCK_W + win_size)] += res.s3;
-    col_ssd[5 * (BLOCK_W + win_size)] += res.s2;
-    col_ssd[6 * (BLOCK_W + win_size)] += res.s1;
-    col_ssd[7 * (BLOCK_W + win_size)] += res.s0;
-}
-
-inline void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
-                __global unsigned char* imageR, int d,
-                 __local unsigned int *col_ssd)
-{
-    uint8 leftPixel1;
-    uint8 diffa = 0;
-    int idx = y_tex * im_pitch + x_tex;
-    const int win_size = (radius << 1);
-    for(int i = 0; i < (win_size + 1); i++)
-    {
-        leftPixel1 = (uint8)(imageL[idx]);
-        uint8 imgR = convert_uint8(vload8(0, imageR + (idx - d - 7)));
-        uint8 res = leftPixel1 - imgR;
-        diffa += res * res;
-
-        idx += im_pitch;
-    }
-    //See above:  #define COL_SSD_SIZE (BLOCK_W + WIN_SIZE)
-    col_ssd[0 * (BLOCK_W + win_size)] = diffa.s7;
-    col_ssd[1 * (BLOCK_W + win_size)] = diffa.s6;
-    col_ssd[2 * (BLOCK_W + win_size)] = diffa.s5;
-    col_ssd[3 * (BLOCK_W + win_size)] = diffa.s4;
-    col_ssd[4 * (BLOCK_W + win_size)] = diffa.s3;
-    col_ssd[5 * (BLOCK_W + win_size)] = diffa.s2;
-    col_ssd[6 * (BLOCK_W + win_size)] = diffa.s1;
-    col_ssd[7 * (BLOCK_W + win_size)] = diffa.s0;
-}
-
-__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right,
-                           __global unsigned int *cminSSDImage, int cminSSD_step,
-                           __global unsigned char *disp, int disp_step,int cwidth, int cheight,
-                           int img_step, int maxdisp,
-                           __local unsigned int *col_ssd_cache)
-{
-    __local unsigned int *col_ssd = col_ssd_cache + get_local_id(0);
-    __local unsigned int *col_ssd_extra = get_local_id(0) < (radius << 1) ? col_ssd + BLOCK_W : 0;
-
-    int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius;
-
-#define Y (int)(get_group_id(1) * ROWSperTHREAD + radius)
-
-    __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
-    __global unsigned char* disparImage = disp + X + Y * disp_step;
-
-    int end_row = ROWSperTHREAD < (cheight - Y) ? ROWSperTHREAD:(cheight - Y);
-    int y_tex;
-    int x_tex = X - radius;
-
-    //if (x_tex >= cwidth)
-    //    return;
-
-    for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
-    {
-        y_tex = Y - radius;
-
-        InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd);
-        if (col_ssd_extra > 0)
-            if (x_tex + BLOCK_W < cwidth)
-                InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
-
-        barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function
-
-        uint2 minSSD = MinSSD(col_ssd);
-        if (X < cwidth - radius && Y < cheight - radius)
-        {
-            if (minSSD.x < minSSDImage[0])
-            {
-                disparImage[0] = (unsigned char)(d + minSSD.y);
-                minSSDImage[0] = minSSD.x;
-            }
-        }
-
-        for(int row = 1; row < end_row; row++)
-        {
-            int idx1 = y_tex * img_step + x_tex;
-            int idx2 = min(y_tex + ((radius << 1) + 1), cheight - 1) * img_step + x_tex;
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            StepDown(idx1, idx2, left, right, d, col_ssd);
-            if (col_ssd_extra > 0)
-                if (x_tex + BLOCK_W < cwidth)
-                    StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
-
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            uint2 minSSD = MinSSD(col_ssd);
-            if (X < cwidth - radius && row < cheight - radius - Y)
-            {
-                int idx = row * cminSSD_step;
-                if (minSSD.x < minSSDImage[idx])
-                {
-                    disparImage[disp_step * row] = (unsigned char)(d + minSSD.y);
-                    minSSDImage[idx] = minSSD.x;
-                }
-            }
-
-            y_tex++;
-        } // for row loop
-    } // for d loop
-}
-//////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////// Sobel Prefiler (signal channel)//////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output,
-                               int rows, int cols, int prefilterCap)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < cols && y < rows)
-    {
-        int cov = input[(y-1) * cols + (x-1)] * (-1) + input[(y-1) * cols + (x+1)] * (1) +
-                  input[(y)   * cols + (x-1)] * (-2) + input[(y)   * cols + (x+1)] * (2) +
-                  input[(y+1) * cols + (x-1)] * (-1) + input[(y+1) * cols + (x+1)] * (1);
-
-        cov = min(min(max(-prefilterCap, cov), prefilterCap) + prefilterCap, 255);
-        output[y * cols + x] = cov & 0xFF;
-    }
-}
-
-
-//////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////// Textureness filtering ////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////
-
-inline float sobel(__global unsigned char *input, int x, int y, int rows, int cols)
-{
-    float conv = 0;
-    int y1 = y==0? 0 : y-1;
-    int x1 = x==0? 0 : x-1;
-    if(x < cols && y < rows && x > 0 && y > 0)
-    {
-        conv = (float)input[(y1)  * cols + (x1)] * (-1) + (float)input[(y1)  * cols + (x+1)] * (1) +
-               (float)input[(y)   * cols + (x1)] * (-2) + (float)input[(y)   * cols + (x+1)] * (2) +
-               (float)input[(y+1) * cols + (x1)] * (-1) + (float)input[(y+1) * cols + (x+1)] * (1);
-
-    }
-    return fabs(conv);
-}
-
-inline float CalcSums(__local float *cols, __local float *cols_cache, int winsz)
-{
-    unsigned int cache = cols[0];
-
-    for(int i = 1; i <= winsz; i++)
-        cache += cols[i];
-
-    return cache;
-}
-
-#define RpT (2 * ROWSperTHREAD)  // got experimentally
-__kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, int disp_cols,
-                                 int disp_step, __global unsigned char *input, int input_rows,
-                                 int input_cols,int winsz, float threshold,
-                                 __local float *cols_cache)
-{
-    int winsz2 = winsz/2;
-    int n_dirty_pixels = (winsz2) * 2;
-
-    int local_id_x = get_local_id(0);
-    int group_size_x = get_local_size(0);
-    int group_id_y = get_group_id(1);
-
-    __local float *cols = cols_cache + group_size_x + local_id_x;
-    __local float *cols_extra = local_id_x < n_dirty_pixels ? cols + group_size_x : 0;
-
-    int x = get_global_id(0);
-    int beg_row = group_id_y * RpT;
-    int end_row = min(beg_row + RpT, disp_rows);
-
-
-    int y = beg_row;
-
-    float sum = 0;
-    float sum_extra = 0;
-
-    for(int i = y - winsz2; i <= y + winsz2; ++i)
-    {
-        sum += sobel(input, x - winsz2, i, input_rows, input_cols);
-        if (cols_extra)
-            sum_extra += sobel(input, x + group_size_x - winsz2, i, input_rows, input_cols);
-    }
-    *cols = sum;
-    if (cols_extra)
-        *cols_extra = sum_extra;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255;
-    if (sum_win < threshold)
-        disp[y * disp_step + x] = 0;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for(int y = beg_row + 1; y < end_row; ++y)
-    {
-        sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) +
-              sobel(input, x - winsz2, y + winsz2, input_rows, input_cols);
-        *cols = sum;
-
-        if (cols_extra)
-        {
-            sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols)
-                        + sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols);
-            *cols_extra = sum_extra;
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        if (x < disp_cols)
-        {
-            float sum_win = CalcSums(cols, cols_cache + local_id_x, winsz) * 255;
-            if (sum_win < threshold)
-                disp[y * disp_step + x] = 0;
-        }
-
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-}
diff --git a/modules/ocl/src/opencl/stereobp.cl b/modules/ocl/src/opencl/stereobp.cl
deleted file mode 100644
index 4b5864f4c..000000000
--- a/modules/ocl/src/opencl/stereobp.cl
+++ /dev/null
@@ -1,393 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Peng Xiao,   pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-
-#ifdef T_FLOAT
-#define T float
-#define T4 float4
-#else
-#define T short
-#define T4 short4
-#endif
-
-///////////////////////////////////////////////////////////////
-/////////////////common///////////////////////////////////////
-/////////////////////////////////////////////////////////////
-inline T saturate_cast(float v){
-#ifdef T_SHORT
-    return convert_short_sat_rte(v);
-#else
-    return v;
-#endif
-}
-
-inline T4 saturate_cast4(float4 v){
-#ifdef T_SHORT
-    return convert_short4_sat_rte(v);
-#else
-    return v;
-#endif
-}
-
-#define FLOAT_MAX 3.402823466e+38f
-typedef struct
-{
-    int   cndisp;
-    float cmax_data_term;
-    float cdata_weight;
-    float cmax_disc_term;
-    float cdisc_single_jump;
-}con_srtuct_t;
-///////////////////////////////////////////////////////////////
-////////////////////////// comp data //////////////////////////
-///////////////////////////////////////////////////////////////
-
-inline float pix_diff_1(const uchar4 l, __global const uchar *rs)
-{
-    return abs((int)(l.x) - *rs);
-}
-
-static float pix_diff_4(const uchar4 l, __global const uchar *rs)
-{
-    uchar4 r;
-    r = *((__global uchar4 *)rs);
-
-    const float tr = 0.299f;
-    const float tg = 0.587f;
-    const float tb = 0.114f;
-
-    float val;
-
-    val  = tb * abs((int)l.x - r.x);
-    val += tg * abs((int)l.y - r.y);
-    val += tr * abs((int)l.z - r.z);
-
-    return val;
-}
-
-inline float pix_diff_3(const uchar4 l, __global const uchar *rs)
-{
-    return pix_diff_4(l, rs);
-}
-
-#ifndef CN
-#define CN 4
-#endif
-
-#ifndef CNDISP
-#define CNDISP 64
-#endif
-
-#define CAT(X,Y) X##Y
-#define CAT2(X,Y) CAT(X,Y)
-
-#define PIX_DIFF CAT2(pix_diff_, CN)
-
-__kernel void comp_data(__global uchar *left,  int left_rows,  int left_cols,  int left_step,
-                        __global uchar *right, int right_step,
-                        __global T *data, int data_step,
-                        __constant con_srtuct_t *con_st)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y > 0 && y < (left_rows - 1) && x > 0 && x < (left_cols - 1))
-    {
-        data_step /= sizeof(T);
-        const __global uchar* ls = left  + y * left_step  + x * CN;
-        const __global uchar* rs = right + y * right_step + x * CN;
-
-        __global T *ds = data + y * data_step + x;
-
-        const unsigned int disp_step = data_step * left_rows;
-        const float weightXterm = con_st -> cdata_weight * con_st -> cmax_data_term;
-        const uchar4 ls_data = vload4(0, ls);
-
-        for (int disp = 0; disp < con_st -> cndisp; disp++)
-        {
-            if (x - disp >= 1)
-            {
-                float val = 0;
-                val = PIX_DIFF(ls_data, rs - disp * CN);
-                ds[disp * disp_step] =  saturate_cast(fmin(con_st -> cdata_weight * val, weightXterm));
-            }
-            else
-            {
-                ds[disp * disp_step] =  saturate_cast(weightXterm);
-            }
-        }
-    }
-}
-
-///////////////////////////////////////////////////////////////
-//////////////////////// data step down ///////////////////////
-///////////////////////////////////////////////////////////////
-__kernel void data_step_down(__global T *src, int src_rows,
-                             __global T *dst, int dst_rows, int dst_cols,
-                             int src_step, int dst_step,
-                             int cndisp)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (x < dst_cols && y < dst_rows)
-    {
-        src_step /= sizeof(T);
-        dst_step /= sizeof(T);
-        int4 coor_step = (int4)(src_rows * src_step);
-        int4 coor = (int4)(min(2*y+0, src_rows-1) * src_step + 2*x+0,
-                           min(2*y+1, src_rows-1) * src_step + 2*x+0,
-                           min(2*y+0, src_rows-1) * src_step + 2*x+1,
-                           min(2*y+1, src_rows-1) * src_step + 2*x+1);
-
-        for (int d = 0; d < cndisp; ++d)
-        {
-            float dst_reg;
-            dst_reg  = src[coor.x];
-            dst_reg += src[coor.y];
-            dst_reg += src[coor.z];
-            dst_reg += src[coor.w];
-            coor += coor_step;
-
-            dst[(d * dst_rows + y) * dst_step + x] = saturate_cast(dst_reg);
-        }
-    }
-}
-
-///////////////////////////////////////////////////////////////
-/////////////////// level up messages  ////////////////////////
-///////////////////////////////////////////////////////////////
-__kernel void level_up_message(__global T *src, int src_rows, int src_step,
-                               __global T *dst, int dst_rows, int dst_cols, int dst_step,
-                               int cndisp)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (x < dst_cols && y < dst_rows)
-    {
-        src_step /= sizeof(T);
-        dst_step /= sizeof(T);
-
-        const int dst_disp_step = dst_step * dst_rows;
-        const int src_disp_step = src_step * src_rows;
-
-        __global T       *dstr = dst + y * dst_step + x;
-        __global const T *srcr = src + (y / 2 * src_step) + (x / 2);
-
-        for (int d = 0; d < cndisp; ++d)
-            dstr[d * dst_disp_step] = srcr[d * src_disp_step];
-    }
-}
-
-///////////////////////////////////////////////////////////////
-////////////////////  calc all iterations /////////////////////
-///////////////////////////////////////////////////////////////
-static void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_,
-              const __global T *dt,
-              int u_step, int msg_disp_step, int data_disp_step,
-              float4 cmax_disc_term, float4 cdisc_single_jump)
-{
-    __global T *us = us_ + u_step;
-    __global T *ds = ds_ - u_step;
-    __global T *ls = ls_ + 1;
-    __global T *rs = rs_ - 1;
-
-    float4 minimum = (float4)(FLOAT_MAX);
-
-    T4 t_dst[CNDISP];
-    float4 dst_reg;
-    float4 prev;
-    float4 cur;
-
-    T t_us = us[0];
-    T t_ds = ds[0];
-    T t_ls = ls[0];
-    T t_rs = rs[0];
-    T t_dt = dt[0];
-
-    prev = (float4)(t_us + t_ls + t_rs + t_dt,
-                    t_ds + t_ls + t_rs + t_dt,
-                    t_us + t_ds + t_rs + t_dt,
-                    t_us + t_ds + t_ls + t_dt);
-
-    minimum = min(prev, minimum);
-
-    t_dst[0] = saturate_cast4(prev);
-
-    for(int i = 1, idx = msg_disp_step; i < CNDISP; ++i, idx+=msg_disp_step)
-    {
-        t_us = us[idx];
-        t_ds = ds[idx];
-        t_ls = ls[idx];
-        t_rs = rs[idx];
-        t_dt = dt[data_disp_step * i];
-
-        dst_reg = (float4)(t_us + t_ls + t_rs + t_dt,
-                           t_ds + t_ls + t_rs + t_dt,
-                           t_us + t_ds + t_rs + t_dt,
-                           t_us + t_ds + t_ls + t_dt);
-
-        minimum = min(dst_reg, minimum);
-
-        prev += cdisc_single_jump;
-        prev = min(prev, dst_reg);
-
-        t_dst[i] = saturate_cast4(prev);
-    }
-
-    minimum += cmax_disc_term;
-
-    float4 sum = (float4)(0);
-    prev = convert_float4(t_dst[CNDISP - 1]);
-    for (int disp = CNDISP - 2; disp >= 0; disp--)
-    {
-        prev += cdisc_single_jump;
-        cur = convert_float4(t_dst[disp]);
-        prev = min(prev, cur);
-        cur = min(prev, minimum);
-        sum += cur;
-
-        t_dst[disp] = saturate_cast4(cur);
-    }
-
-    dst_reg = convert_float4(t_dst[CNDISP - 1]);
-    dst_reg = min(dst_reg, minimum);
-    t_dst[CNDISP - 1] = saturate_cast4(dst_reg);
-    sum += dst_reg;
-
-    sum /= (float4)(CNDISP);
-#pragma unroll
-    for(int i = 0, idx = 0; i < CNDISP; ++i, idx+=msg_disp_step)
-    {
-        T4 dst = t_dst[i];
-        us_[idx] = dst.x - sum.x;
-        ds_[idx] = dst.y - sum.y;
-        rs_[idx] = dst.z - sum.z;
-        ls_[idx] = dst.w - sum.w;
-    }
-}
-__kernel void one_iteration(__global T *u,    int u_step,
-                            __global T *data, int data_step,
-                            __global T *d,    __global T *l, __global T *r,
-                            int t, int cols, int rows,
-                            float cmax_disc_term, float cdisc_single_jump)
-{
-    const int y = get_global_id(1);
-    const int x = ((get_global_id(0)) << 1) + ((y + t) & 1);
-
-    if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
-    {
-        u_step    /= sizeof(T);
-        data_step /= sizeof(T);
-
-        __global T *us = u + y * u_step + x;
-        __global T *ds = d + y * u_step + x;
-        __global T *ls = l + y * u_step + x;
-        __global T *rs = r + y * u_step + x;
-        const __global  T *dt = data + y * data_step + x;
-
-        int msg_disp_step = u_step * rows;
-        int data_disp_step = data_step * rows;
-
-        message(us, ds, ls, rs, dt,
-                u_step, msg_disp_step, data_disp_step,
-                (float4)(cmax_disc_term), (float4)(cdisc_single_jump));
-    }
-}
-
-///////////////////////////////////////////////////////////////
-/////////////////////////// output ////////////////////////////
-///////////////////////////////////////////////////////////////
-__kernel void output(const __global T *u, int u_step,
-                     const __global T *d, const __global T *l,
-                     const __global T *r, const __global T *data,
-                     __global T *disp, int disp_rows, int disp_cols, int disp_step,
-                     int cndisp)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (y > 0 && y < disp_rows - 1 && x > 0 && x < disp_cols - 1)
-    {
-        u_step    /= sizeof(T);
-        disp_step /= sizeof(T);
-        const __global T *us = u + (y + 1) * u_step + x;
-        const __global T *ds = d + (y - 1) * u_step + x;
-        const __global T *ls = l + y * u_step + (x + 1);
-        const __global T *rs = r + y * u_step + (x - 1);
-        const __global T *dt = data + y * u_step + x;
-
-        int disp_steps = disp_rows * u_step;
-
-        int best = 0;
-        float best_val = FLOAT_MAX;
-        for (int d = 0; d < cndisp; ++d)
-        {
-            float val;
-            val  = us[d * disp_steps];
-            val += ds[d * disp_steps];
-            val += ls[d * disp_steps];
-            val += rs[d * disp_steps];
-            val += dt[d * disp_steps];
-
-            if (val < best_val)
-            {
-                best_val = val;
-                best = d;
-            }
-        }
-
-        (disp + y * disp_step)[x] = convert_short_sat(best);
-    }
-}
diff --git a/modules/ocl/src/opencl/stereocsbp.cl b/modules/ocl/src/opencl/stereocsbp.cl
deleted file mode 100644
index 23fc81481..000000000
--- a/modules/ocl/src/opencl/stereocsbp.cl
+++ /dev/null
@@ -1,1382 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Jin Ma, jin@multicorewareinc.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////get_first_k_initial_global//////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void get_first_k_initial_global_0(__global short *data_cost_selected_, __global short *selected_disp_pyr,
-    __global short *ctemp, int h, int w, int nr_plane,
-    int cmsg_step1, int cdisp_step1, int cndisp)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < h && x < w)
-    {
-        __global short *selected_disparity = selected_disp_pyr      + y * cmsg_step1 + x;
-        __global short *data_cost_selected = data_cost_selected_    + y * cmsg_step1 + x;
-        __global short *data_cost          = ctemp + y * cmsg_step1 + x;
-
-        for(int i = 0; i < nr_plane; i++)
-        {
-            short minimum = SHRT_MAX;
-            int id = 0;
-
-            for(int d = 0; d < cndisp; d++)
-            {
-                short cur = data_cost[d * cdisp_step1];
-                if(cur < minimum)
-                {
-                    minimum = cur;
-                    id = d;
-                }
-            }
-
-            data_cost_selected[i  * cdisp_step1] = minimum;
-            selected_disparity[i  * cdisp_step1] = id;
-            data_cost         [id * cdisp_step1] = SHRT_MAX;
-        }
-    }
-}
-
-__kernel void get_first_k_initial_global_1(__global  float *data_cost_selected_, __global float *selected_disp_pyr,
-    __global  float *ctemp, int h, int w, int nr_plane,
-    int cmsg_step1, int cdisp_step1, int cndisp)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < h && x < w)
-    {
-        __global   float *selected_disparity = selected_disp_pyr      + y * cmsg_step1 + x;
-        __global   float *data_cost_selected = data_cost_selected_    + y * cmsg_step1 + x;
-        __global   float *data_cost          = ctemp + y * cmsg_step1 + x;
-
-        for(int i = 0; i < nr_plane; i++)
-        {
-            float minimum = FLT_MAX;
-            int id = 0;
-
-            for(int d = 0; d < cndisp; d++)
-            {
-                float cur = data_cost[d * cdisp_step1];
-                if(cur < minimum)
-                {
-                    minimum = cur;
-                    id = d;
-                }
-            }
-
-            data_cost_selected[i  * cdisp_step1] = minimum;
-            selected_disparity[i  * cdisp_step1] = id;
-            data_cost         [id * cdisp_step1] = FLT_MAX;
-        }
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////get_first_k_initial_local////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void get_first_k_initial_local_0(__global  short *data_cost_selected_, __global short *selected_disp_pyr,
-    __global  short *ctemp,int h, int w, int nr_plane,
-    int cmsg_step1, int cdisp_step1, int cndisp)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < h && x < w)
-    {
-        __global short *selected_disparity = selected_disp_pyr   + y * cmsg_step1 + x;
-        __global short *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
-        __global short *data_cost = ctemp + y * cmsg_step1 + x;
-
-        int nr_local_minimum = 0;
-
-        short prev = data_cost[0 * cdisp_step1];
-        short cur  = data_cost[1 * cdisp_step1];
-        short next = data_cost[2 * cdisp_step1];
-
-        for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)
-        {
-
-            if (cur < prev && cur < next)
-            {
-                data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
-                selected_disparity[nr_local_minimum * cdisp_step1] = d;
-                data_cost[d * cdisp_step1] = SHRT_MAX;
-
-                nr_local_minimum++;
-            }
-
-            prev = cur;
-            cur = next;
-            next = data_cost[(d + 1) * cdisp_step1];
-        }
-
-        for (int i = nr_local_minimum; i < nr_plane; i++)
-        {
-            short minimum = SHRT_MAX;
-            int id = 0;
-
-            for (int d = 0; d < cndisp; d++)
-            {
-                cur = data_cost[d * cdisp_step1];
-                if (cur < minimum)
-                {
-                    minimum = cur;
-                    id = d;
-                }
-            }
-
-            data_cost_selected[i * cdisp_step1] = minimum;
-            selected_disparity[i * cdisp_step1] = id;
-            data_cost[id * cdisp_step1] = SHRT_MAX;
-        }
-    }
-}
-
-__kernel void get_first_k_initial_local_1(__global float *data_cost_selected_, __global float *selected_disp_pyr,
-    __global float *ctemp,int h, int w, int nr_plane,
-    int cmsg_step1,  int cdisp_step1, int cndisp)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < h && x < w)
-    {
-        __global float *selected_disparity = selected_disp_pyr   + y * cmsg_step1 + x;
-        __global float *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
-        __global float *data_cost = ctemp + y * cmsg_step1 + x;
-
-        int nr_local_minimum = 0;
-
-        float prev = data_cost[0 * cdisp_step1];
-        float cur  = data_cost[1 * cdisp_step1];
-        float next = data_cost[2 * cdisp_step1];
-
-        for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)
-        {
-            if (cur < prev && cur < next)
-            {
-                data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
-                selected_disparity[nr_local_minimum * cdisp_step1] = d;
-                data_cost[d * cdisp_step1] = FLT_MAX ;
-
-                nr_local_minimum++;
-            }
-
-            prev = cur;
-            cur = next;
-            next = data_cost[(d + 1) * cdisp_step1];
-        }
-
-
-        for (int i = nr_local_minimum; i < nr_plane; i++)
-        {
-            float minimum = FLT_MAX;
-            int id = 0;
-
-            for (int d = 0; d < cndisp; d++)
-            {
-                cur = data_cost[d * cdisp_step1];
-                if (cur < minimum)
-                {
-                    minimum = cur;
-                    id = d;
-                }
-            }
-
-            data_cost_selected[i * cdisp_step1] = minimum;
-            selected_disparity[i * cdisp_step1] = id;
-            data_cost[id * cdisp_step1] = FLT_MAX;
-        }
-    }
-}
-
-///////////////////////////////////////////////////////////////
-/////////////////////// init data cost ////////////////////////
-///////////////////////////////////////////////////////////////
-
-inline float compute_3(__global uchar* left, __global uchar* right,
-    float cdata_weight,  float cmax_data_term)
-{
-    float tb = 0.114f * abs((int)left[0] - right[0]);
-    float tg = 0.587f * abs((int)left[1] - right[1]);
-    float tr = 0.299f * abs((int)left[2] - right[2]);
-
-    return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
-}
-
-inline float compute_1(__global uchar* left, __global uchar* right,
-    float cdata_weight,  float cmax_data_term)
-{
-    return fmin(cdata_weight * abs((int)*left - (int)*right), cdata_weight * cmax_data_term);
-}
-
-inline short round_short(float v)
-{
-    return convert_short_sat_rte(v);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////init_data_cost///////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void init_data_cost_0(__global short *ctemp, __global uchar *cleft, __global uchar *cright,
-    int h, int w, int level, int channels,
-    int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1,
-    int cth, int cimg_step, int cndisp)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < h && x < w)
-    {
-        int y0 = y << level;
-        int yt = (y + 1) << level;
-
-        int x0 = x << level;
-        int xt = (x + 1) << level;
-
-        __global short *data_cost = ctemp + y * cmsg_step1 + x;
-
-        for(int d = 0; d < cndisp; ++d)
-        {
-            float val = 0.0f;
-            for(int yi = y0; yi < yt; yi++)
-            {
-                for(int xi = x0; xi < xt; xi++)
-                {
-                    int xr = xi - d;
-                    if(d < cth || xr < 0)
-                        val += cdata_weight * cmax_data_term;
-                    else
-                    {
-                        __global uchar *lle = cleft  + yi * cimg_step + xi * channels;
-                        __global uchar *lri = cright + yi * cimg_step + xr * channels;
-
-                        if(channels == 1)
-                            val += compute_1(lle, lri, cdata_weight, cmax_data_term);
-                        else
-                            val += compute_3(lle, lri, cdata_weight, cmax_data_term);
-                    }
-                }
-            }
-            data_cost[cdisp_step1 * d] = round_short(val);
-        }
-    }
-}
-
-__kernel void init_data_cost_1(__global float *ctemp, __global uchar *cleft, __global uchar *cright,
-    int h, int w, int level, int channels,
-    int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1,
-    int cth, int cimg_step, int cndisp)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < h && x < w)
-    {
-        int y0 = y << level;
-        int yt = (y + 1) << level;
-
-        int x0 = x << level;
-        int xt = (x + 1) << level;
-
-        __global float *data_cost = ctemp + y * cmsg_step1 + x;
-
-        for(int d = 0; d < cndisp; ++d)
-        {
-            float val = 0.0f;
-            for(int yi = y0; yi < yt; yi++)
-            {
-                for(int xi = x0; xi < xt; xi++)
-                {
-                    int xr = xi - d;
-                    if(d < cth || xr < 0)
-                        val += cdata_weight * cmax_data_term;
-                    else
-                    {
-                        __global uchar* lle = cleft  + yi * cimg_step + xi * channels;
-                        __global uchar* lri = cright + yi * cimg_step + xr * channels;
-
-                        if(channels == 1)
-                            val += compute_1(lle, lri, cdata_weight, cmax_data_term);
-                        else
-                            val += compute_3(lle, lri, cdata_weight, cmax_data_term);
-                    }
-                }
-            }
-            data_cost[cdisp_step1 * d] = val;
-        }
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////init_data_cost_reduce//////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void init_data_cost_reduce_0(__global short *ctemp, __global uchar *cleft, __global uchar *cright,
-    __local float *smem, int level, int rows, int cols, int h, int winsz, int channels,
-    int cndisp,int cimg_step, float cdata_weight, float cmax_data_term, int cth,
-    int cdisp_step1, int cmsg_step1)
-{
-    int x_out = get_group_id(0);
-    int y_out = get_group_id(1) % h;
-    //int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
-    int d = (get_group_id(1) / h ) * get_local_size(2) + get_local_id(2);
-
-    int tid = get_local_id(0);
-
-    if (d < cndisp)
-    {
-        int x0 = x_out << level;
-        int y0 = y_out << level;
-
-        int len = min(y0 + winsz, rows) - y0;
-
-        float val = 0.0f;
-        if (x0 + tid < cols)
-        {
-            if (x0 + tid - d < 0 || d < cth)
-                val = cdata_weight * cmax_data_term * len;
-            else
-            {
-                __global uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
-                __global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);
-
-                for(int y = 0; y < len; ++y)
-                {
-                    if(channels == 1)
-                        val += compute_1(lle, lri, cdata_weight, cmax_data_term);
-                    else
-                        val += compute_3(lle, lri, cdata_weight, cmax_data_term);
-
-                    lle += cimg_step;
-                    lri += cimg_step;
-                }
-            }
-        }
-
-        __local float* dline = smem + winsz * get_local_id(2);
-
-        dline[tid] = val;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local float* dline = smem + winsz * get_local_id(2);
-        if (winsz >= 256)
-        {
-            if (tid < 128)
-                dline[tid] += dline[tid + 128];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local float* dline = smem + winsz * get_local_id(2);
-        if (winsz >= 128)
-        {
-            if (tid <  64)
-                dline[tid] += dline[tid + 64];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 64)
-            if (tid < 32)
-                vdline[tid] += vdline[tid + 32];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 32)
-            if (tid < 16)
-                vdline[tid] += vdline[tid + 16];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d<cndisp)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 16)
-            if (tid <  8)
-                vdline[tid] += vdline[tid + 8];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d<cndisp)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 8)
-            if (tid <  4)
-                vdline[tid] += vdline[tid + 4];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d<cndisp)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 4)
-            if (tid <  2)
-                vdline[tid] += vdline[tid + 2];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d<cndisp)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 2)
-            if (tid <  1)
-                vdline[tid] += vdline[tid + 1];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local float* dline = smem + winsz * get_local_id(2);
-        __global short* data_cost = ctemp + y_out * cmsg_step1 + x_out;
-        if (tid == 0)
-            data_cost[cdisp_step1 * d] = convert_short_sat_rte(dline[0]);
-    }
-}
-
-__kernel void init_data_cost_reduce_1(__global float *ctemp, __global uchar *cleft, __global uchar *cright,
-    __local float *smem, int level, int rows, int cols, int h, int winsz, int channels,
-    int cndisp,int cimg_step, float cdata_weight, float cmax_data_term, int cth,
-    int cdisp_step1, int cmsg_step1)
-{
-    int x_out = get_group_id(0);
-    int y_out = get_group_id(1) % h;
-    int d = (get_group_id(1) / h ) * get_local_size(2) + get_local_id(2);
-
-    int tid = get_local_id(0);
-
-    if (d < cndisp)
-    {
-        int x0 = x_out << level;
-        int y0 = y_out << level;
-
-        int len = min(y0 + winsz, rows) - y0;
-
-        float val = 0.0f;
-        //float val = 528.0f;
-
-        if (x0 + tid < cols)
-        {
-            if (x0 + tid - d < 0 || d < cth)
-                val = cdata_weight * cmax_data_term * len;
-            else
-            {
-                __global uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
-                __global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);
-
-                for(int y = 0; y < len; ++y)
-                {
-                    if(channels == 1)
-                        val += compute_1(lle, lri, cdata_weight, cmax_data_term);
-                    else
-                        val += compute_3(lle, lri, cdata_weight, cmax_data_term);
-
-                    lle += cimg_step;
-                    lri += cimg_step;
-                }
-            }
-        }
-
-        __local float* dline = smem + winsz * get_local_id(2);
-
-        dline[tid] = val;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local float* dline = smem + winsz * get_local_id(2);
-        if (winsz >= 256)
-            if (tid < 128)
-                dline[tid] += dline[tid + 128];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local float* dline = smem + winsz * get_local_id(2);
-        if (winsz >= 128)
-            if (tid < 64)
-                dline[tid] += dline[tid + 64];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 64)
-            if (tid < 32)
-                vdline[tid] += vdline[tid + 32];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 32)
-            if (tid < 16)
-                vdline[tid] += vdline[tid + 16];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 16)
-            if (tid < 8)
-                vdline[tid] += vdline[tid + 8];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 8)
-            if (tid < 4)
-                vdline[tid] += vdline[tid + 4];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 4)
-            if (tid < 2)
-                vdline[tid] += vdline[tid + 2];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 2)
-            if (tid < 1)
-                vdline[tid] += vdline[tid + 1];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < cndisp)
-    {
-        __global float *data_cost = ctemp + y_out * cmsg_step1 + x_out;
-        __local float* dline = smem + winsz * get_local_id(2);
-        if (tid == 0)
-            data_cost[cdisp_step1 * d] =  dline[0];
-    }
-}
-
-///////////////////////////////////////////////////////////////
-////////////////////// compute data cost //////////////////////
-///////////////////////////////////////////////////////////////
-
-__kernel void compute_data_cost_0(__global const short *selected_disp_pyr, __global short *data_cost_,
-    __global uchar *cleft, __global uchar *cright,
-    int h, int w, int level, int nr_plane, int channels,
-    int cmsg_step1, int cmsg_step2, int cdisp_step1, int cdisp_step2, float cdata_weight,
-    float cmax_data_term, int cimg_step, int cth)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < h && x < w)
-    {
-        int y0 = y << level;
-        int yt = (y + 1) << level;
-
-        int x0 = x << level;
-        int xt = (x + 1) << level;
-
-        __global const short *selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2;
-        __global       short *data_cost          = data_cost_ + y * cmsg_step1 + x;
-
-        for(int d = 0; d < nr_plane; d++)
-        {
-            float val = 0.0f;
-            for(int yi = y0; yi < yt; yi++)
-            {
-                for(int xi = x0; xi < xt; xi++)
-                {
-                    int sel_disp = selected_disparity[d * cdisp_step2];
-                    int xr = xi - sel_disp;
-
-                    if (xr < 0 || sel_disp < cth)
-                        val += cdata_weight * cmax_data_term;
-
-                    else
-                    {
-                        __global uchar* left_x  = cleft + yi * cimg_step + xi * channels;
-                        __global uchar* right_x = cright + yi * cimg_step + xr * channels;
-
-                        if(channels == 1)
-                            val += compute_1(left_x, right_x, cdata_weight, cmax_data_term);
-                        else
-                            val += compute_3(left_x, right_x, cdata_weight, cmax_data_term);
-                    }
-                }
-            }
-            data_cost[cdisp_step1 * d] = convert_short_sat_rte(val);
-        }
-    }
-}
-
-__kernel void compute_data_cost_1(__global const float *selected_disp_pyr, __global float *data_cost_,
-    __global uchar *cleft, __global uchar *cright,
-    int h, int w, int level, int nr_plane, int channels,
-    int cmsg_step1, int cmsg_step2, int cdisp_step1, int cdisp_step2, float cdata_weight,
-    float cmax_data_term, int cimg_step, int cth)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < h && x < w)
-    {
-        int y0 = y << level;
-        int yt = (y + 1) << level;
-
-        int x0 = x << level;
-        int xt = (x + 1) << level;
-
-        __global const float *selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2;
-        __global       float *data_cost          = data_cost_ + y * cmsg_step1 + x;
-
-        for(int d = 0; d < nr_plane; d++)
-        {
-            float val = 0.0f;
-            for(int yi = y0; yi < yt; yi++)
-            {
-                for(int xi = x0; xi < xt; xi++)
-                {
-                    int sel_disp = selected_disparity[d * cdisp_step2];
-                    int xr = xi - sel_disp;
-
-                    if (xr < 0 || sel_disp < cth)
-                        val += cdata_weight * cmax_data_term;
-                    else
-                    {
-                        __global uchar* left_x  = cleft + yi * cimg_step + xi * channels;
-                        __global uchar* right_x = cright + yi * cimg_step + xr * channels;
-
-                        if(channels == 1)
-                            val += compute_1(left_x, right_x, cdata_weight, cmax_data_term);
-                        else
-                            val += compute_3(left_x, right_x, cdata_weight, cmax_data_term);
-                    }
-                }
-            }
-            data_cost[cdisp_step1 * d] = val;
-        }
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////compute_data_cost_reduce//////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-__kernel void compute_data_cost_reduce_0(__global const short* selected_disp_pyr, __global short* data_cost_,
-    __global uchar *cleft, __global uchar *cright,__local float *smem,
-    int level, int rows, int cols, int h, int nr_plane,
-    int channels, int winsz,
-    int cmsg_step1, int cmsg_step2, int cdisp_step1, int cdisp_step2,
-    float cdata_weight,  float cmax_data_term, int cimg_step,int cth)
-
-{
-    int x_out = get_group_id(0);
-    int y_out = get_group_id(1) % h;
-    int d = (get_group_id(1)/ h) * get_local_size(2) + get_local_id(2);
-
-    int tid = get_local_id(0);
-
-    __global const short* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2;
-    __global short* data_cost = data_cost_ + y_out * cmsg_step1 + x_out;
-
-    if (d < nr_plane)
-    {
-        int sel_disp = selected_disparity[d * cdisp_step2];
-
-        int x0 = x_out << level;
-        int y0 = y_out << level;
-
-        int len = min(y0 + winsz, rows) - y0;
-
-        float val = 0.0f;
-        if (x0 + tid < cols)
-        {
-            if (x0 + tid - sel_disp < 0 || sel_disp < cth)
-                val = cdata_weight * cmax_data_term * len;
-            else
-            {
-                __global uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
-                __global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);
-
-                for(int y = 0; y < len; ++y)
-                {
-                    if(channels == 1)
-                        val += compute_1(lle, lri, cdata_weight, cmax_data_term);
-                    else
-                        val += compute_3(lle, lri, cdata_weight, cmax_data_term);
-
-                    lle += cimg_step;
-                    lri += cimg_step;
-                }
-            }
-        }
-
-        __local float* dline = smem + winsz * get_local_id(2);
-
-        dline[tid] = val;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-    // if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
-    //if (winsz >= 128) { if (tid <  64) { dline[tid] += dline[tid +  64]; } barrier(CLK_LOCAL_MEM_FENCE); }
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 64)
-        {
-            if (tid < 32)
-                vdline[tid] += vdline[tid + 32];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 32)
-        {
-            if (tid < 16)
-                vdline[tid] += vdline[tid + 16];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 16)
-        {
-            if (tid < 8)
-                vdline[tid] += vdline[tid + 8];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 8)
-        {
-            if (tid < 4)
-                vdline[tid] += vdline[tid + 4];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 4)
-        {
-            if (tid < 2)
-                vdline[tid] += vdline[tid + 2];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 2)
-        {
-            if (tid < 1)
-                vdline[tid] += vdline[tid + 1];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (tid == 0)
-            data_cost[cdisp_step1 * d] = convert_short_sat_rte(vdline[0]);
-    }
-}
-
-__kernel void compute_data_cost_reduce_1(__global const float *selected_disp_pyr, __global float *data_cost_,
-    __global uchar *cleft, __global uchar *cright, __local float *smem,
-    int level, int rows, int cols, int h, int nr_plane,
-    int channels, int winsz,
-    int cmsg_step1, int cmsg_step2, int cdisp_step1,int cdisp_step2, float cdata_weight,
-    float cmax_data_term, int cimg_step, int cth)
-
-{
-    int x_out = get_group_id(0);
-    int y_out = get_group_id(1) % h;
-    int d = (get_group_id(1)/ h) * get_local_size(2) + get_local_id(2);
-
-    int tid = get_local_id(0);
-
-    __global const float *selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2;
-    __global float *data_cost = data_cost_ + y_out * cmsg_step1 + x_out;
-
-    if (d < nr_plane)
-    {
-        int sel_disp = selected_disparity[d * cdisp_step2];
-
-        int x0 = x_out << level;
-        int y0 = y_out << level;
-
-        int len = min(y0 + winsz, rows) - y0;
-
-        float val = 0.0f;
-        if (x0 + tid < cols)
-        {
-            if (x0 + tid - sel_disp < 0 || sel_disp < cth)
-                val = cdata_weight * cmax_data_term * len;
-            else
-            {
-                __global uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
-                __global uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);
-
-                for(int y = 0; y < len; ++y)
-                {
-                    if(channels == 1)
-                        val += compute_1(lle, lri, cdata_weight, cmax_data_term);
-                    else
-                        val += compute_3(lle, lri, cdata_weight, cmax_data_term);
-
-                    lle += cimg_step;
-                    lri += cimg_step;
-                }
-            }
-        }
-
-        __local float* dline = smem + winsz * get_local_id(2);
-
-        dline[tid] = val;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 64)
-        {
-            if (tid < 32)
-                vdline[tid] += vdline[tid + 32];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 32)
-        {
-            if (tid < 16)
-                vdline[tid] += vdline[tid + 16];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >= 16)
-        {
-            if (tid <  8)
-                vdline[tid] += vdline[tid + 8];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >=  8)
-        {
-            if (tid <  4)
-                vdline[tid] += vdline[tid + 4];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >=  4)
-        {
-            if (tid <  2)
-                vdline[tid] += vdline[tid + 2];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (winsz >=  2)
-        {
-            if (tid <  1)
-                vdline[tid] += vdline[tid + 1];
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(d < nr_plane)
-    {
-        __local volatile float* vdline = smem + winsz * get_local_id(2);
-        if (tid == 0)
-            data_cost[cdisp_step1 * d] = vdline[0];
-    }
-}
-
-///////////////////////////////////////////////////////////////
-//////////////////////// init message /////////////////////////
-///////////////////////////////////////////////////////////////
-
-inline void get_first_k_element_increase_0(__global short* u_new, __global short *d_new, __global short *l_new,
-    __global short *r_new, __global const short *u_cur, __global const short *d_cur,
-    __global const short *l_cur, __global const short *r_cur,
-    __global short *data_cost_selected, __global short *disparity_selected_new,
-    __global short *data_cost_new, __global const short* data_cost_cur,
-    __global const short *disparity_selected_cur,
-    int nr_plane, int nr_plane2,
-    int cdisp_step1, int cdisp_step2)
-{
-    for(int i = 0; i < nr_plane; i++)
-    {
-        short minimum = SHRT_MAX;
-        int id = 0;
-        for(int j = 0; j < nr_plane2; j++)
-        {
-            short cur = data_cost_new[j * cdisp_step1];
-            if(cur < minimum)
-            {
-                minimum = cur;
-                id = j;
-            }
-        }
-
-        data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];
-        disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];
-
-        u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
-        d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
-        l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
-        r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
-
-        data_cost_new[id * cdisp_step1] = SHRT_MAX;
-    }
-}
-
-__kernel void init_message_0(__global short *u_new_, __global short *d_new_, __global short *l_new_,
-    __global short *r_new_, __global  short *u_cur_, __global const short *d_cur_,
-    __global const short *l_cur_, __global const short *r_cur_, __global short *ctemp,
-    __global short *selected_disp_pyr_new, __global const short *selected_disp_pyr_cur,
-    __global short *data_cost_selected_, __global const short *data_cost_,
-    int h, int w, int nr_plane, int h2, int w2, int nr_plane2,
-    int cdisp_step1, int cdisp_step2, int cmsg_step1, int cmsg_step2)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < h && x < w)
-    {
-        __global const short *u_cur = u_cur_ + min(h2-1, y/2 + 1) * cmsg_step2 + x/2;
-        __global const short *d_cur = d_cur_ + max(0, y/2 - 1)    * cmsg_step2 + x/2;
-        __global const short *l_cur = l_cur_ + y/2                * cmsg_step2 + min(w2-1, x/2 + 1);
-        __global const short *r_cur = r_cur_ + y/2                * cmsg_step2 + max(0, x/2 - 1);
-
-        __global short *data_cost_new = ctemp + y * cmsg_step1 + x;
-
-        __global const short *disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2;
-        __global const short *data_cost = data_cost_ + y * cmsg_step1 + x;
-
-        for(int d = 0; d < nr_plane2; d++)
-        {
-            int idx2 = d * cdisp_step2;
-
-            short val  = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
-            data_cost_new[d * cdisp_step1] = val;
-        }
-
-        __global short *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
-        __global short *disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x;
-
-        __global short *u_new = u_new_ + y * cmsg_step1 + x;
-        __global short *d_new = d_new_ + y * cmsg_step1 + x;
-        __global short *l_new = l_new_ + y * cmsg_step1 + x;
-        __global short *r_new = r_new_ + y * cmsg_step1 + x;
-
-        u_cur = u_cur_ + y/2 * cmsg_step2 + x/2;
-        d_cur = d_cur_ + y/2 * cmsg_step2 + x/2;
-        l_cur = l_cur_ + y/2 * cmsg_step2 + x/2;
-        r_cur = r_cur_ + y/2 * cmsg_step2 + x/2;
-
-        get_first_k_element_increase_0(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,
-            data_cost_selected, disparity_selected_new, data_cost_new,
-            data_cost, disparity_selected_cur, nr_plane, nr_plane2,
-            cdisp_step1, cdisp_step2);
-    }
-}
-
-__kernel void init_message_1(__global float *u_new_, __global float *d_new_, __global float *l_new_,
-    __global float *r_new_, __global const float *u_cur_, __global const float *d_cur_,
-    __global const float *l_cur_, __global const float *r_cur_, __global float *ctemp,
-    __global float *selected_disp_pyr_new, __global const float *selected_disp_pyr_cur,
-    __global float *data_cost_selected_, __global const float *data_cost_,
-    int h, int w, int nr_plane, int h2, int w2, int nr_plane2,
-    int cdisp_step1, int cdisp_step2, int cmsg_step1, int cmsg_step2)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-
-    __global const float *u_cur = u_cur_ + min(h2-1, y/2 + 1) * cmsg_step2 + x/2;
-    __global const float *d_cur = d_cur_ + max(0, y/2 - 1)    * cmsg_step2 + x/2;
-    __global const float *l_cur = l_cur_ + y/2                * cmsg_step2 + min(w2-1, x/2 + 1);
-    __global const float *r_cur = r_cur_ + y/2                * cmsg_step2 + max(0, x/2 - 1);
-
-    __global float *data_cost_new = ctemp + y * cmsg_step1 + x;
-
-    __global const float *disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2;
-    __global const float *data_cost = data_cost_ + y * cmsg_step1 + x;
-
-    if (y < h && x < w)
-    {
-        for(int d = 0; d < nr_plane2; d++)
-        {
-            int idx2 = d * cdisp_step2;
-
-            float val  = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
-            data_cost_new[d * cdisp_step1] = val;
-        }
-    }
-
-    __global float *data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
-    __global float *disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x;
-
-    __global float *u_new = u_new_ + y * cmsg_step1 + x;
-    __global float *d_new = d_new_ + y * cmsg_step1 + x;
-    __global float *l_new = l_new_ + y * cmsg_step1 + x;
-    __global float *r_new = r_new_ + y * cmsg_step1 + x;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(y < h && x < w)
-    {
-        u_cur = u_cur_ + y/2 * cmsg_step2 + x/2;
-        d_cur = d_cur_ + y/2 * cmsg_step2 + x/2;
-        l_cur = l_cur_ + y/2 * cmsg_step2 + x/2;
-        r_cur = r_cur_ + y/2 * cmsg_step2 + x/2;
-
-        for(int i = 0; i < nr_plane; i++)
-        {
-            float minimum = FLT_MAX;
-            int id = 0;
-
-            for(int j = 0; j < nr_plane2; j++)
-            {
-                float cur = data_cost_new[j * cdisp_step1];
-                if(cur < minimum)
-                {
-                    minimum = cur;
-                    id = j;
-                }
-            }
-            data_cost_selected[i * cdisp_step1] = data_cost[id * cdisp_step1];
-            disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];
-            u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
-            d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
-            l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
-            r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
-            data_cost_new[id * cdisp_step1] = FLT_MAX;
-        }
-    }
-}
-
-///////////////////////////////////////////////////////////////
-////////////////////  calc all iterations /////////////////////
-///////////////////////////////////////////////////////////////
-
-inline void message_per_pixel_0(__global const short *data, __global short *msg_dst, __global const short *msg1,
-    __global const short *msg2, __global const short *msg3,
-    __global const short *dst_disp, __global const short *src_disp,
-    int nr_plane, __global short *temp,
-    float cmax_disc_term, int cdisp_step1, float cdisc_single_jump)
-{
-    short minimum = SHRT_MAX;
-    for(int d = 0; d < nr_plane; d++)
-    {
-        int idx = d * cdisp_step1;
-        short val  = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];
-
-        if(val < minimum)
-            minimum = val;
-
-        msg_dst[idx] = val;
-    }
-
-    float sum = 0;
-    for(int d = 0; d < nr_plane; d++)
-    {
-        float cost_min = minimum + cmax_disc_term;
-        short src_disp_reg = src_disp[d * cdisp_step1];
-
-        for(int d2 = 0; d2 < nr_plane; d2++)
-            cost_min = fmin(cost_min, (msg_dst[d2 * cdisp_step1] +
-            cdisc_single_jump * abs(dst_disp[d2 * cdisp_step1] - src_disp_reg)));
-
-        temp[d * cdisp_step1] = convert_short_sat_rte(cost_min);
-        sum += cost_min;
-    }
-    sum /= nr_plane;
-
-    for(int d = 0; d < nr_plane; d++)
-        msg_dst[d * cdisp_step1] = convert_short_sat_rte(temp[d * cdisp_step1] - sum);
-}
-
-inline void message_per_pixel_1(__global const float *data, __global float *msg_dst, __global const float *msg1,
-    __global const float *msg2, __global const float *msg3,
-    __global const float *dst_disp, __global const float *src_disp,
-    int nr_plane, __global float *temp,
-    float cmax_disc_term, int cdisp_step1, float cdisc_single_jump)
-{
-    float minimum = FLT_MAX;
-    for(int d = 0; d < nr_plane; d++)
-    {
-        int idx = d * cdisp_step1;
-        float val  = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];
-
-        if(val < minimum)
-            minimum = val;
-
-        msg_dst[idx] = val;
-    }
-
-    float sum = 0;
-    for(int d = 0; d < nr_plane; d++)
-    {
-        float cost_min = minimum + cmax_disc_term;
-        float src_disp_reg = src_disp[d * cdisp_step1];
-
-        for(int d2 = 0; d2 < nr_plane; d2++)
-            cost_min = fmin(cost_min, (msg_dst[d2 * cdisp_step1] +
-            cdisc_single_jump * fabs(dst_disp[d2 * cdisp_step1] - src_disp_reg)));
-
-        temp[d * cdisp_step1] = cost_min;
-        sum += cost_min;
-    }
-    sum /= nr_plane;
-
-    for(int d = 0; d < nr_plane; d++)
-        msg_dst[d * cdisp_step1] = temp[d * cdisp_step1] - sum;
-}
-
-__kernel void compute_message_0(__global short *u_, __global short *d_, __global short *l_, __global short *r_,
-    __global const short *data_cost_selected, __global const short *selected_disp_pyr_cur,
-    __global short *ctemp, int h, int w, int nr_plane, int i,
-    float cmax_disc_term, int cdisp_step1, int cmsg_step1, float cdisc_single_jump)
-{
-    int y = get_global_id(1);
-    int x = ((get_global_id(0)) << 1) + ((y + i) & 1);
-
-    if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
-    {
-        __global const short *data = data_cost_selected + y * cmsg_step1 + x;
-
-        __global short *u = u_ + y * cmsg_step1 + x;
-        __global short *d = d_ + y * cmsg_step1 + x;
-        __global short *l = l_ + y * cmsg_step1 + x;
-        __global short *r = r_ + y * cmsg_step1 + x;
-
-        __global const short *disp = selected_disp_pyr_cur + y * cmsg_step1 + x;
-
-        __global short *temp = ctemp + y * cmsg_step1 + x;
-
-        message_per_pixel_0(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp,
-            cmax_disc_term, cdisp_step1, cdisc_single_jump);
-        message_per_pixel_0(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp,
-            cmax_disc_term, cdisp_step1, cdisc_single_jump);
-        message_per_pixel_0(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp,
-            cmax_disc_term, cdisp_step1, cdisc_single_jump);
-        message_per_pixel_0(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp,
-            cmax_disc_term, cdisp_step1, cdisc_single_jump);
-    }
-}
-
-__kernel void compute_message_1(__global float *u_, __global float *d_, __global float *l_, __global float *r_,
-    __global const float *data_cost_selected, __global const float *selected_disp_pyr_cur,
-    __global float *ctemp, int h, int w, int nr_plane, int i,
-    float cmax_disc_term, int cdisp_step1, int cmsg_step1, float cdisc_single_jump)
-{
-    int y = get_global_id(1);
-    int x = ((get_global_id(0)) << 1) + ((y + i) & 1);
-
-    if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
-    {
-        __global const float *data = data_cost_selected + y * cmsg_step1 + x;
-
-        __global float *u = u_ + y * cmsg_step1 + x;
-        __global float *d = d_ + y * cmsg_step1 + x;
-        __global float *l = l_ + y * cmsg_step1 + x;
-        __global float *r = r_ + y * cmsg_step1 + x;
-
-        __global const float *disp = selected_disp_pyr_cur + y * cmsg_step1 + x;
-        __global float *temp = ctemp + y * cmsg_step1 + x;
-
-        message_per_pixel_1(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp,
-            cmax_disc_term, cdisp_step1, cdisc_single_jump);
-        message_per_pixel_1(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp,
-            cmax_disc_term, cdisp_step1, cdisc_single_jump);
-        message_per_pixel_1(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp,
-            cmax_disc_term, cdisp_step1, cdisc_single_jump);
-        message_per_pixel_1(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp,
-            cmax_disc_term, cdisp_step1, cdisc_single_jump);
-    }
-}
-
-///////////////////////////////////////////////////////////////
-/////////////////////////// output ////////////////////////////
-///////////////////////////////////////////////////////////////
-
-__kernel void compute_disp_0(__global const short *u_, __global const short *d_, __global const short *l_,
-    __global const short *r_, __global const short * data_cost_selected,
-    __global const short *disp_selected_pyr,
-    __global short* disp,
-    int res_step, int cols, int rows, int nr_plane,
-    int cmsg_step1, int cdisp_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)
-    {
-        __global const short *data = data_cost_selected + y * cmsg_step1 + x;
-        __global const short *disp_selected = disp_selected_pyr + y * cmsg_step1 + x;
-
-        __global const short *u = u_ + (y+1) * cmsg_step1 + (x+0);
-        __global const short *d = d_ + (y-1) * cmsg_step1 + (x+0);
-        __global const short *l = l_ + (y+0) * cmsg_step1 + (x+1);
-        __global const short *r = r_ + (y+0) * cmsg_step1 + (x-1);
-
-        short best = 0;
-        short best_val = SHRT_MAX;
-
-        for (int i = 0; i < nr_plane; ++i)
-        {
-            int idx = i * cdisp_step1;
-            short val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];
-
-            if (val < best_val)
-            {
-                best_val = val;
-                best = disp_selected[idx];
-            }
-        }
-        disp[res_step * y + x] = best;
-    }
-}
-
-__kernel void compute_disp_1(__global const float *u_, __global const float *d_, __global const float *l_,
-    __global const float *r_, __global const float *data_cost_selected,
-    __global const float *disp_selected_pyr,
-    __global short *disp,
-    int res_step, int cols, int rows, int nr_plane,
-    int cmsg_step1, int cdisp_step1)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)
-    {
-        __global const float *data = data_cost_selected + y * cmsg_step1 + x;
-        __global const float *disp_selected = disp_selected_pyr + y * cmsg_step1 + x;
-
-        __global const float *u = u_ + (y+1) * cmsg_step1 + (x+0);
-        __global const float *d = d_ + (y-1) * cmsg_step1 + (x+0);
-        __global const float *l = l_ + (y+0) * cmsg_step1 + (x+1);
-        __global const float *r = r_ + (y+0) * cmsg_step1 + (x-1);
-
-        short best = 0;
-        short best_val = SHRT_MAX;
-        for (int i = 0; i < nr_plane; ++i)
-        {
-            int idx = i * cdisp_step1;
-            float val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];
-
-            if (val < best_val)
-            {
-                best_val = val;
-                best = convert_short_sat_rte(disp_selected[idx]);
-            }
-        }
-        disp[res_step * y + x] = best;
-    }
-}
diff --git a/modules/ocl/src/opencl/svm.cl b/modules/ocl/src/opencl/svm.cl
deleted file mode 100644
index c10494070..000000000
--- a/modules/ocl/src/opencl/svm.cl
+++ /dev/null
@@ -1,211 +0,0 @@
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Erping Pang, erping@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-#ifdef DOUBLE_SUPPORT
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#define TYPE double
-#else
-#define TYPE float
-#endif
-#if defined ADDEXP
-#define EXP(X) exp(X)
-#else
-#define EXP(X) X
-#endif
-#if defined ADDPOW
-#define POW(X,Y) pow(fabs(X),(Y))
-#else
-#define POW(X,Y) X
-#endif
-#define MAX_VAL   (FLT_MAX*1e-3)
-
-#define BLOCK_SIZE 16
-
-__kernel void svm_linear(__global float* src, int src_step, __global float* src2, int src2_step, __global TYPE* dst, int dst_step, int src_rows, int src2_cols,
-                         int width, TYPE alpha, TYPE beta)
-{
-    const int  col = get_global_id(0);
-    const int  row = get_global_id(1);
-
-    if(row < src_rows && col < src2_cols)
-    {
-        int t = 0;
-        TYPE temp = 0.0;
-        for(t = 0; t < width - BLOCK_SIZE; t += BLOCK_SIZE)
-        {
-            float16 t0 = vload16(0, src + row * src_step + t);
-            float16 t1 = vload16(0, src2 + col * src2_step + t);
-            t0 *= t1;
-            temp += t0.s0 + t0.s1 + t0.s2 + t0.s3 + t0.s4 + t0.s5 + t0.s6 + t0.s7 +
-                    t0.s8 + t0.s9 + t0.sa + t0.sb + t0.sc + t0.sd + t0.se + t0.sf;
-        }
-        for(; t < width; t++)
-        {
-            temp += src[row * src_step + t] * src2[col * src2_step + t];
-        }
-
-        TYPE temp1 = (TYPE) (temp * alpha + beta);
-
-        if( temp1 > MAX_VAL )
-        {
-            dst[row * dst_step + col] = MAX_VAL;
-        }
-        else
-        {
-            dst[row * dst_step + col] = temp1;
-        }
-
-    }
-
-}
-__kernel void svm_sigmod(__global float* src, int src_step, __global float* src2, int src2_step, __global TYPE* dst, int dst_step, int src_rows, int src2_cols,
-                         int width, TYPE alpha, TYPE beta)
-{
-    const int  col = get_global_id(0);
-    const int  row = get_global_id(1);
-
-    if(row < src_rows && col < src2_cols)
-    {
-        int t = 0;
-        TYPE temp = 0.0;
-        for(t = 0; t < width - BLOCK_SIZE; t += BLOCK_SIZE)
-        {
-            float16 t0 = vload16(0, src + row * src_step + t);
-            float16 t1 = vload16(0, src2 + col * src2_step + t);
-            t0 *= t1;
-            temp += t0.s0 + t0.s1 + t0.s2 + t0.s3 + t0.s4 + t0.s5 + t0.s6 + t0.s7 +
-                    t0.s8 + t0.s9 + t0.sa + t0.sb + t0.sc + t0.sd + t0.se + t0.sf;
-        }
-        for(; t < width; t++)
-        {
-            temp += src[row * src_step + t] * src2[col * src2_step + t];
-        }
-        TYPE tp = (TYPE) (temp * alpha + beta);
-        TYPE e = exp(-fabs(tp));
-        TYPE temp1;
-        if(tp > 0)
-        {
-            temp1 = (TYPE)((1. - e) / (1. + e));
-        }
-        else
-        {
-            temp1 = (TYPE)((e - 1.) / (e + 1.));
-        }
-
-        if( temp1 > MAX_VAL )
-        {
-            dst[row * dst_step + col] = MAX_VAL;
-        }
-        else
-        {
-            dst[row * dst_step + col] = temp1;
-        }
-    }
-
-}
-__kernel void svm_poly(__global float* src, int src_step, __global float* src2, int src2_step, __global TYPE* dst, int dst_step, int src_rows, int src2_cols,
-                       int width, TYPE alpha, TYPE beta, TYPE degree)
-{
-    const int  col = get_global_id(0);
-    const int  row = get_global_id(1);
-
-    if(row < src_rows && col < src2_cols)
-    {
-        int t = 0;
-        TYPE temp = 0.0;
-        for(t = 0; t < width - BLOCK_SIZE; t += BLOCK_SIZE)
-        {
-            float16 t0 = vload16(0, src + row * src_step + t);
-            float16 t1 = vload16(0, src2 + col * src2_step + t);
-            t0 *= t1;
-            temp += t0.s0 + t0.s1 + t0.s2 + t0.s3 + t0.s4 + t0.s5 + t0.s6 + t0.s7 +
-                    t0.s8 + t0.s9 + t0.sa + t0.sb + t0.sc + t0.sd + t0.se + t0.sf;
-        }
-        for(; t < width; t++)
-        {
-            temp += src[row * src_step + t] * src2[col * src2_step + t];
-        }
-        TYPE temp1 = (TYPE)(POW((temp * alpha + beta), degree));
-
-        if( temp1 > MAX_VAL )
-        {
-            dst[row * dst_step + col] = MAX_VAL;
-        }
-        else
-        {
-            dst[row * dst_step + col] = temp1;
-        }
-    }
-
-}
-__kernel void svm_rbf(__global float* src, int src_step, __global float* src2, int src2_step, __global TYPE* dst, int dst_step, int src_rows, int src2_cols,
-                      int width, TYPE gamma)
-{
-    const int  col = get_global_id(0);
-    const int  row = get_global_id(1);
-
-    if(row < src_rows && col < src2_cols)
-    {
-        int t = 0;
-        TYPE temp = 0.0;
-        for(t = 0; t < width - BLOCK_SIZE; t += BLOCK_SIZE)
-        {
-            float16 t0 = vload16(0, src + row * src_step + t);
-            float16 t1 = vload16(0, src2 + col * src2_step + t);
-            t0 = (t0 - t1) * (t0 - t1);
-            temp += t0.s0 + t0.s1 + t0.s2 + t0.s3 + t0.s4 + t0.s5 + t0.s6 + t0.s7 +
-                    t0.s8 + t0.s9 + t0.sa + t0.sb + t0.sc + t0.sd + t0.se + t0.sf;
-        }
-        for(; t < width; t++)
-        {
-            temp += (src[row * src_step + t] - src2[col * src2_step + t]) * (src[row * src_step + t] - src2[col * src2_step + t]);
-        }
-        TYPE temp1 = EXP((TYPE)(temp * gamma));
-
-        if( temp1 > MAX_VAL )
-        {
-            dst[row * dst_step + col] = MAX_VAL;
-        }
-        else
-        {
-            dst[row * dst_step + col] = temp1;
-        }
-    }
-}
diff --git a/modules/ocl/src/opencl/tvl1flow.cl b/modules/ocl/src/opencl/tvl1flow.cl
deleted file mode 100644
index 6111a4a38..000000000
--- a/modules/ocl/src/opencl/tvl1flow.cl
+++ /dev/null
@@ -1,386 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jin Ma jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-__kernel void centeredGradientKernel(__global const float* src, int src_col, int src_row, int src_step,
-                                     __global float* dx, __global float* dy, int dx_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if((x < src_col)&&(y < src_row))
-    {
-        int src_x1 = (x + 1) < (src_col -1)? (x + 1) : (src_col - 1);
-        int src_x2 = (x - 1) > 0 ? (x -1) : 0;
-        dx[y * dx_step+ x] = 0.5f * (src[y * src_step + src_x1] - src[y * src_step+ src_x2]);
-
-        int src_y1 = (y+1) < (src_row - 1) ? (y + 1) : (src_row - 1);
-        int src_y2 = (y - 1) > 0 ? (y - 1) : 0;
-        dy[y * dx_step+ x] = 0.5f * (src[src_y1 * src_step + x] - src[src_y2 * src_step+ x]);
-    }
-
-}
-
-static float bicubicCoeff(float x_)
-{
-
-    float x = fabs(x_);
-    if (x <= 1.0f)
-        return x * x * (1.5f * x - 2.5f) + 1.0f;
-    else if (x < 2.0f)
-        return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
-    else
-        return 0.0f;
-}
-
-__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row,
-    image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y,
-    __global const float* u1, int u1_step,
-    __global const float* u2,
-    __global float* I1w,
-    __global float* I1wx, /*int I1wx_step,*/
-    __global float* I1wy, /*int I1wy_step,*/
-    __global float* grad, /*int grad_step,*/
-    __global float* rho,
-    int I1w_step,
-    int u2_step,
-    int u1_offset_x,
-    int u1_offset_y,
-    int u2_offset_x,
-    int u2_offset_y)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < I0_col&&y < I0_row)
-    {
-        //float u1Val = u1(y, x);
-        float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
-        //float u2Val = u2(y, x);
-        float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
-
-        float wx = x + u1Val;
-        float wy = y + u2Val;
-
-        int xmin = ceil(wx - 2.0f);
-        int xmax = floor(wx + 2.0f);
-
-        int ymin = ceil(wy - 2.0f);
-        int ymax = floor(wy + 2.0f);
-
-        float sum  = 0.0f;
-        float sumx = 0.0f;
-        float sumy = 0.0f;
-        float wsum = 0.0f;
-        sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
-
-        for (int cy = ymin; cy <= ymax; ++cy)
-        {
-            for (int cx = xmin; cx <= xmax; ++cx)
-            {
-                float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
-
-                //sum  += w * tex2D(tex_I1 , cx, cy);
-                int2 cood = (int2)(cx, cy);
-                sum += w * read_imagef(tex_I1, sampleri, cood).x;
-                //sumx += w * tex2D(tex_I1x, cx, cy);
-                sumx += w * read_imagef(tex_I1x, sampleri, cood).x;
-                //sumy += w * tex2D(tex_I1y, cx, cy);
-                sumy += w * read_imagef(tex_I1y, sampleri, cood).x;
-
-                wsum += w;
-            }
-        }
-
-        float coeff = 1.0f / wsum;
-
-        float I1wVal  = sum  * coeff;
-        float I1wxVal = sumx * coeff;
-        float I1wyVal = sumy * coeff;
-
-        I1w[y * I1w_step + x]  = I1wVal;
-        I1wx[y * I1w_step + x] = I1wxVal;
-        I1wy[y * I1w_step + x] = I1wyVal;
-
-        float Ix2 = I1wxVal * I1wxVal;
-        float Iy2 = I1wyVal * I1wyVal;
-
-        // store the |Grad(I1)|^2
-        grad[y * I1w_step + x] = Ix2 + Iy2;
-
-        // compute the constant part of the rho function
-        float I0Val = I0[y * I0_step + x];
-        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
-    }
-
-}
-
-static float readImage(__global float *image,  int x,  int y,  int rows,  int cols, int elemCntPerRow)
-{
-    int i0 = clamp(x, 0, cols - 1);
-    int j0 = clamp(y, 0, rows - 1);
-
-    return image[j0 * elemCntPerRow + i0];
-}
-
-__kernel void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row,
-    __global const float* tex_I1, __global const float* tex_I1x, __global const float* tex_I1y,
-    __global const float* u1, int u1_step,
-    __global const float* u2,
-    __global float* I1w,
-    __global float* I1wx, /*int I1wx_step,*/
-    __global float* I1wy, /*int I1wy_step,*/
-    __global float* grad, /*int grad_step,*/
-    __global float* rho,
-    int I1w_step,
-    int u2_step,
-    int I1_step,
-    int I1x_step)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < I0_col&&y < I0_row)
-    {
-        //float u1Val = u1(y, x);
-        float u1Val = u1[y * u1_step + x];
-        //float u2Val = u2(y, x);
-        float u2Val = u2[y * u2_step + x];
-
-        float wx = x + u1Val;
-        float wy = y + u2Val;
-
-        int xmin = ceil(wx - 2.0f);
-        int xmax = floor(wx + 2.0f);
-
-        int ymin = ceil(wy - 2.0f);
-        int ymax = floor(wy + 2.0f);
-
-        float sum  = 0.0f;
-        float sumx = 0.0f;
-        float sumy = 0.0f;
-        float wsum = 0.0f;
-
-        for (int cy = ymin; cy <= ymax; ++cy)
-        {
-            for (int cx = xmin; cx <= xmax; ++cx)
-            {
-                float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
-
-                int2 cood = (int2)(cx, cy);
-                sum += w * readImage(tex_I1, cood.x, cood.y, I0_col, I0_row, I1_step);
-                sumx += w * readImage(tex_I1x, cood.x, cood.y, I0_col, I0_row, I1x_step);
-                sumy += w * readImage(tex_I1y, cood.x, cood.y, I0_col, I0_row, I1x_step);
-                wsum += w;
-            }
-        }
-
-        float coeff = 1.0f / wsum;
-
-        float I1wVal  = sum  * coeff;
-        float I1wxVal = sumx * coeff;
-        float I1wyVal = sumy * coeff;
-
-        I1w[y * I1w_step + x]  = I1wVal;
-        I1wx[y * I1w_step + x] = I1wxVal;
-        I1wy[y * I1w_step + x] = I1wyVal;
-
-        float Ix2 = I1wxVal * I1wxVal;
-        float Iy2 = I1wyVal * I1wyVal;
-
-        // store the |Grad(I1)|^2
-        grad[y * I1w_step + x] = Ix2 + Iy2;
-
-        // compute the constant part of the rho function
-        float I0Val = I0[y * I0_step + x];
-        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
-    }
-
-}
-
-
-__kernel void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step,
-    __global const float* u2,
-    __global float* p11, int p11_step,
-    __global float* p12,
-    __global float* p21,
-    __global float* p22,
-    float taut,
-    int u2_step,
-    int u1_offset_x,
-    int u1_offset_y,
-    int u2_offset_x,
-    int u2_offset_y)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < u1_col && y < u1_row)
-    {
-        int src_x1 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
-        float u1x = u1[(y + u1_offset_y) * u1_step + src_x1 + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
-
-        int src_y1 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);
-        float u1y = u1[(src_y1 + u1_offset_y) * u1_step + x + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
-
-        int src_x2 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
-        float u2x = u2[(y + u2_offset_y) * u2_step + src_x2 + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
-
-        int src_y2 = (y + 1) <  (u1_row - 1) ? (y + 1) : (u1_row - 1);
-        float u2y = u2[(src_y2 + u2_offset_y) * u2_step + x + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
-
-        float g1 = hypot(u1x, u1y);
-        float g2 = hypot(u2x, u2y);
-
-        float ng1 = 1.0f + taut * g1;
-        float ng2 = 1.0f + taut * g2;
-
-        p11[y * p11_step + x] = (p11[y * p11_step + x] + taut * u1x) / ng1;
-        p12[y * p11_step + x] = (p12[y * p11_step + x] + taut * u1y) / ng1;
-        p21[y * p11_step + x] = (p21[y * p11_step + x] + taut * u2x) / ng2;
-        p22[y * p11_step + x] = (p22[y * p11_step + x] + taut * u2y) / ng2;
-    }
-
-}
-
-static float divergence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step)
-{
-
-    if (x > 0 && y > 0)
-    {
-        float v1x = v1[y * v1_step + x] - v1[y * v1_step + x - 1];
-        float v2y = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x];
-        return v1x + v2y;
-    }
-    else
-    {
-        if (y > 0)
-            return v1[y * v1_step + 0] + v2[y * v2_step + 0] - v2[(y - 1) * v2_step + 0];
-        else
-        {
-            if (x > 0)
-                return v1[0 * v1_step + x] - v1[0 * v1_step + x - 1] + v2[0 * v2_step + x];
-            else
-                return v1[0 * v1_step + 0] + v2[0 * v2_step + 0];
-        }
-    }
-
-}
-
-__kernel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step,
-    __global const float* I1wy, /*int I1wy_step,*/
-    __global const float* grad, /*int grad_step,*/
-    __global const float* rho_c, /*int rho_c_step,*/
-    __global const float* p11, /*int p11_step,*/
-    __global const float* p12, /*int p12_step,*/
-    __global const float* p21, /*int p21_step,*/
-    __global const float* p22, /*int p22_step,*/
-    __global float* u1, int u1_step,
-    __global float* u2,
-    __global float* error, float l_t, float theta, int u2_step,
-    int u1_offset_x,
-    int u1_offset_y,
-    int u2_offset_x,
-    int u2_offset_y,
-    char calc_error)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if(x < I1wx_col && y < I1wx_row)
-    {
-        float I1wxVal = I1wx[y * I1wx_step + x];
-        float I1wyVal = I1wy[y * I1wx_step + x];
-        float gradVal = grad[y * I1wx_step + x];
-        float u1OldVal = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
-        float u2OldVal = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];
-
-        float rho = rho_c[y * I1wx_step + x] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
-
-        // estimate the values of the variable (v1, v2) (thresholding operator TH)
-
-        float d1 = 0.0f;
-        float d2 = 0.0f;
-
-        if (rho < -l_t * gradVal)
-        {
-            d1 = l_t * I1wxVal;
-            d2 = l_t * I1wyVal;
-        }
-        else if (rho > l_t * gradVal)
-        {
-            d1 = -l_t * I1wxVal;
-            d2 = -l_t * I1wyVal;
-        }
-        else if (gradVal > 1.192092896e-07f)
-        {
-            float fi = -rho / gradVal;
-            d1 = fi * I1wxVal;
-            d2 = fi * I1wyVal;
-        }
-
-        float v1 = u1OldVal + d1;
-        float v2 = u2OldVal + d2;
-
-        // compute the divergence of the dual variable (p1, p2)
-
-        float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step);
-        float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step);
-
-        // estimate the values of the optical flow (u1, u2)
-
-        float u1NewVal = v1 + theta * div_p1;
-        float u2NewVal = v2 + theta * div_p2;
-
-        u1[(y + u1_offset_y) * u1_step + x + u1_offset_x] = u1NewVal;
-        u2[(y + u2_offset_y) * u2_step + x + u2_offset_x] = u2NewVal;
-
-        if(calc_error)
-        {
-            float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
-            float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
-            error[y * I1wx_step + x] = n1 + n2;
-        }
-    }
-}
diff --git a/modules/ocl/src/optical_flow_farneback.cpp b/modules/ocl/src/optical_flow_farneback.cpp
deleted file mode 100644
index 198f9106b..000000000
--- a/modules/ocl/src/optical_flow_farneback.cpp
+++ /dev/null
@@ -1,542 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//      Sen Liu, swjtuls1987@126.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-#include "opencv2/video/tracking.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-#define MIN_SIZE 32
-
-namespace cv {
-namespace ocl {
-namespace optflow_farneback
-{
-oclMat g;
-oclMat xg;
-oclMat xxg;
-oclMat gKer;
-
-float ig[4];
-
-inline void setGaussianBlurKernel(const float *c_gKer, int ksizeHalf)
-{
-    cv::Mat t_gKer(1, ksizeHalf + 1, CV_32FC1, const_cast<float *>(c_gKer));
-    gKer.upload(t_gKer);
-}
-
-static void gaussianBlurOcl(const oclMat &src, int ksizeHalf, oclMat &dst)
-{
-    String kernelName("gaussianBlur");
-#ifdef ANDROID
-    size_t localThreads[3] = { 128, 1, 1 };
-#else
-    size_t localThreads[3] = { 256, 1, 1 };
-#endif
-    size_t globalThreads[3] = { src.cols, src.rows, 1 };
-    int smem_size = (localThreads[0] + 2*ksizeHalf) * sizeof(float);
-
-    CV_Assert(dst.size() == src.size());
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&gKer.data));
-    args.push_back(std::make_pair(smem_size, (void *)NULL));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&ksizeHalf));
-
-    openCLExecuteKernel(Context::getContext(), &optical_flow_farneback, kernelName,
-                        globalThreads, localThreads, args, -1, -1);
-}
-
-static void polynomialExpansionOcl(const oclMat &src, int polyN, oclMat &dst)
-{
-    String kernelName("polynomialExpansion");
-
-#ifdef ANDROID
-    size_t localThreads[3] = { 128, 1, 1 };
-#else
-    size_t localThreads[3] = { 256, 1, 1 };
-#endif
-    size_t globalThreads[3] = { divUp(src.cols, localThreads[0] - 2*polyN) * localThreads[0], src.rows, 1 };
-    int smem_size = 3 * localThreads[0] * sizeof(float);
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&g.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&xg.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&xxg.data));
-    args.push_back(std::make_pair(smem_size, (void *)NULL));
-    args.push_back(std::make_pair(sizeof(cl_float4), (void *)&ig));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
-
-    char opt [128];
-    sprintf(opt, "-D polyN=%d", polyN);
-
-    openCLExecuteKernel(Context::getContext(), &optical_flow_farneback, kernelName,
-                        globalThreads, localThreads, args, -1, -1, opt);
-}
-
-static void updateMatricesOcl(const oclMat &flowx, const oclMat &flowy, const oclMat &R0, const oclMat &R1, oclMat &M)
-{
-    String kernelName("updateMatrices");
-#ifdef ANDROID
-    size_t localThreads[3] = { 32, 4, 1 };
-#else
-    size_t localThreads[3] = { 32, 8, 1 };
-#endif
-    size_t globalThreads[3] = { flowx.cols, flowx.rows, 1 };
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&M.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&flowx.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&flowy.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&R0.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&R1.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&flowx.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&flowx.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&M.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&flowx.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&flowy.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&R0.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&R1.step));
-
-    openCLExecuteKernel(Context::getContext(), &optical_flow_farneback, kernelName,
-                        globalThreads, localThreads, args, -1, -1);
-}
-
-static void boxFilter5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst)
-{
-    String kernelName("boxFilter5");
-    int height = src.rows / 5;
-#ifdef ANDROID
-    size_t localThreads[3] = { 128, 1, 1 };
-#else
-    size_t localThreads[3] = { 256, 1, 1 };
-#endif
-    size_t globalThreads[3] = { src.cols, height, 1 };
-    int smem_size = (localThreads[0] + 2*ksizeHalf) * 5 * sizeof(float);
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
-    args.push_back(std::make_pair(smem_size, (void *)NULL));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&height));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&ksizeHalf));
-
-    openCLExecuteKernel(Context::getContext(), &optical_flow_farneback, kernelName,
-                        globalThreads, localThreads, args, -1, -1);
-}
-
-static void updateFlowOcl(const oclMat &M, oclMat &flowx, oclMat &flowy)
-{
-    String kernelName("updateFlow");
-    int cols = divUp(flowx.cols, 4);
-#ifdef ANDROID
-    size_t localThreads[3] = { 32, 4, 1 };
-#else
-    size_t localThreads[3] = { 32, 8, 1 };
-#endif
-    size_t globalThreads[3] = { cols, flowx.rows, 1 };
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&flowx.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&flowy.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&M.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&flowx.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&flowx.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&flowy.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&M.step));
-
-    openCLExecuteKernel(Context::getContext(), &optical_flow_farneback, kernelName,
-                        globalThreads, localThreads, args, -1, -1);
-}
-
-static void gaussianBlur5Ocl(const oclMat &src, int ksizeHalf, oclMat &dst)
-{
-    String kernelName("gaussianBlur5");
-    int height = src.rows / 5;
-#ifdef ANDROID
-    size_t localThreads[3] = { 128, 1, 1 };
-#else
-    size_t localThreads[3] = { 256, 1, 1 };
-#endif
-    size_t globalThreads[3] = { src.cols, height, 1 };
-    int smem_size = (localThreads[0] + 2*ksizeHalf) * 5 * sizeof(float);
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&gKer.data));
-    args.push_back(std::make_pair(smem_size, (void *)NULL));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&height));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&ksizeHalf));
-
-    openCLExecuteKernel(Context::getContext(), &optical_flow_farneback, kernelName,
-                        globalThreads, localThreads, args, -1, -1);
-}
-}
-}
-} // namespace cv { namespace ocl { namespace optflow_farneback
-
-static oclMat allocMatFromBuf(int rows, int cols, int type, oclMat &mat)
-{
-    if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
-        return mat(Rect(0, 0, cols, rows));
-    return mat = oclMat(rows, cols, type);
-}
-
-cv::ocl::FarnebackOpticalFlow::FarnebackOpticalFlow()
-{
-    numLevels = 5;
-    pyrScale = 0.5;
-    fastPyramids = false;
-    winSize = 13;
-    numIters = 10;
-    polyN = 5;
-    polySigma = 1.1;
-    flags = 0;
-}
-
-void cv::ocl::FarnebackOpticalFlow::releaseMemory()
-{
-    frames_[0].release();
-    frames_[1].release();
-    pyrLevel_[0].release();
-    pyrLevel_[1].release();
-    M_.release();
-    bufM_.release();
-    R_[0].release();
-    R_[1].release();
-    blurredFrame_[0].release();
-    blurredFrame_[1].release();
-    pyramid0_.clear();
-    pyramid1_.clear();
-}
-
-void cv::ocl::FarnebackOpticalFlow::prepareGaussian(
-    int n, double sigma, float *g, float *xg, float *xxg,
-    double &ig11, double &ig03, double &ig33, double &ig55)
-{
-    double s = 0.;
-    for (int x = -n; x <= n; x++)
-    {
-        g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
-        s += g[x];
-    }
-
-    s = 1./s;
-    for (int x = -n; x <= n; x++)
-    {
-        g[x] = (float)(g[x]*s);
-        xg[x] = (float)(x*g[x]);
-        xxg[x] = (float)(x*x*g[x]);
-    }
-
-    Mat_<double> G(6, 6);
-    G.setTo(0);
-
-    for (int y = -n; y <= n; y++)
-    {
-        for (int x = -n; x <= n; x++)
-        {
-            G(0,0) += g[y]*g[x];
-            G(1,1) += g[y]*g[x]*x*x;
-            G(3,3) += g[y]*g[x]*x*x*x*x;
-            G(5,5) += g[y]*g[x]*x*x*y*y;
-        }
-    }
-
-    //G[0][0] = 1.;
-    G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
-    G(4,4) = G(3,3);
-    G(3,4) = G(4,3) = G(5,5);
-
-    // invG:
-    // [ x        e  e    ]
-    // [    y             ]
-    // [       y          ]
-    // [ e        z       ]
-    // [ e           z    ]
-    // [                u ]
-    Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
-
-    ig11 = invG(1,1);
-    ig03 = invG(0,3);
-    ig33 = invG(3,3);
-    ig55 = invG(5,5);
-}
-
-void cv::ocl::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double sigma)
-{
-    std::vector<float> buf(n*6 + 3);
-    float* g = &buf[0] + n;
-    float* xg = g + n*2 + 1;
-    float* xxg = xg + n*2 + 1;
-
-    if (sigma < FLT_EPSILON)
-        sigma = n*0.3;
-
-    double ig11, ig03, ig33, ig55;
-    prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
-
-    cv::Mat t_g(1, n + 1, CV_32FC1, g);
-    cv::Mat t_xg(1, n + 1, CV_32FC1, xg);
-    cv::Mat t_xxg(1, n + 1, CV_32FC1, xxg);
-
-    optflow_farneback::g.upload(t_g);
-    optflow_farneback::xg.upload(t_xg);
-    optflow_farneback::xxg.upload(t_xxg);
-
-    optflow_farneback::ig[0] = static_cast<float>(ig11);
-    optflow_farneback::ig[1] = static_cast<float>(ig03);
-    optflow_farneback::ig[2] = static_cast<float>(ig33);
-    optflow_farneback::ig[3] = static_cast<float>(ig55);
-}
-
-void cv::ocl::FarnebackOpticalFlow::updateFlow_boxFilter(
-    const oclMat& R0, const oclMat& R1, oclMat& flowx, oclMat &flowy,
-    oclMat& M, oclMat &bufM, int blockSize, bool updateMatrices)
-{
-    optflow_farneback::boxFilter5Ocl(M, blockSize/2, bufM);
-
-    swap(M, bufM);
-
-    optflow_farneback::updateFlowOcl(M, flowx, flowy);
-
-    if (updateMatrices)
-        optflow_farneback::updateMatricesOcl(flowx, flowy, R0, R1, M);
-}
-
-
-void cv::ocl::FarnebackOpticalFlow::updateFlow_gaussianBlur(
-    const oclMat& R0, const oclMat& R1, oclMat& flowx, oclMat& flowy,
-    oclMat& M, oclMat &bufM, int blockSize, bool updateMatrices)
-{
-    optflow_farneback::gaussianBlur5Ocl(M, blockSize/2, bufM);
-
-    swap(M, bufM);
-
-    optflow_farneback::updateFlowOcl(M, flowx, flowy);
-
-    if (updateMatrices)
-        optflow_farneback::updateMatricesOcl(flowx, flowy, R0, R1, M);
-}
-
-
-void cv::ocl::FarnebackOpticalFlow::operator ()(
-    const oclMat &frame0, const oclMat &frame1, oclMat &flowx, oclMat &flowy)
-{
-    CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
-    CV_Assert(frame0.size() == frame1.size());
-    CV_Assert(polyN == 5 || polyN == 7);
-    CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);
-
-    Size size = frame0.size();
-    oclMat prevFlowX, prevFlowY, curFlowX, curFlowY;
-
-    flowx.create(size, CV_32F);
-    flowy.create(size, CV_32F);
-    oclMat flowx0 = flowx;
-    oclMat flowy0 = flowy;
-
-    // Crop unnecessary levels
-    double scale = 1;
-    int numLevelsCropped = 0;
-    for (; numLevelsCropped < numLevels; numLevelsCropped++)
-    {
-        scale *= pyrScale;
-        if (size.width*scale < MIN_SIZE || size.height*scale < MIN_SIZE)
-            break;
-    }
-
-    frame0.convertTo(frames_[0], CV_32F);
-    frame1.convertTo(frames_[1], CV_32F);
-
-    if (fastPyramids)
-    {
-        // Build Gaussian pyramids using pyrDown()
-        pyramid0_.resize(numLevelsCropped + 1);
-        pyramid1_.resize(numLevelsCropped + 1);
-        pyramid0_[0] = frames_[0];
-        pyramid1_[0] = frames_[1];
-        for (int i = 1; i <= numLevelsCropped; ++i)
-        {
-            pyrDown(pyramid0_[i - 1], pyramid0_[i]);
-            pyrDown(pyramid1_[i - 1], pyramid1_[i]);
-        }
-    }
-
-    setPolynomialExpansionConsts(polyN, polySigma);
-
-    for (int k = numLevelsCropped; k >= 0; k--)
-    {
-        scale = 1;
-        for (int i = 0; i < k; i++)
-            scale *= pyrScale;
-
-        double sigma = (1./scale - 1) * 0.5;
-        int smoothSize = cvRound(sigma*5) | 1;
-        smoothSize = std::max(smoothSize, 3);
-
-        int width = cvRound(size.width*scale);
-        int height = cvRound(size.height*scale);
-
-        if (fastPyramids)
-        {
-            width = pyramid0_[k].cols;
-            height = pyramid0_[k].rows;
-        }
-
-        if (k > 0)
-        {
-            curFlowX.create(height, width, CV_32F);
-            curFlowY.create(height, width, CV_32F);
-        }
-        else
-        {
-            curFlowX = flowx0;
-            curFlowY = flowy0;
-        }
-
-        if (!prevFlowX.data)
-        {
-            if (flags & cv::OPTFLOW_USE_INITIAL_FLOW)
-            {
-                resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR);
-                resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR);
-                multiply(scale, curFlowX, curFlowX);
-                multiply(scale, curFlowY, curFlowY);
-            }
-            else
-            {
-                curFlowX.setTo(0);
-                curFlowY.setTo(0);
-            }
-        }
-        else
-        {
-            resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR);
-            resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR);
-            multiply(1./pyrScale, curFlowX, curFlowX);
-            multiply(1./pyrScale, curFlowY, curFlowY);
-        }
-
-        oclMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
-        oclMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
-        oclMat R[2] =
-        {
-            allocMatFromBuf(5*height, width, CV_32F, R_[0]),
-            allocMatFromBuf(5*height, width, CV_32F, R_[1])
-        };
-
-        if (fastPyramids)
-        {
-            optflow_farneback::polynomialExpansionOcl(pyramid0_[k], polyN, R[0]);
-            optflow_farneback::polynomialExpansionOcl(pyramid1_[k], polyN, R[1]);
-        }
-        else
-        {
-            oclMat blurredFrame[2] =
-            {
-                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
-                allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
-            };
-            oclMat pyrLevel[2] =
-            {
-                allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
-                allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
-            };
-
-            Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
-            optflow_farneback::setGaussianBlurKernel(g.ptr<float>(smoothSize/2), smoothSize/2);
-
-            for (int i = 0; i < 2; i++)
-            {
-                optflow_farneback::gaussianBlurOcl(frames_[i], smoothSize/2, blurredFrame[i]);
-                resize(blurredFrame[i], pyrLevel[i], Size(width, height), INTER_LINEAR);
-                optflow_farneback::polynomialExpansionOcl(pyrLevel[i], polyN, R[i]);
-            }
-        }
-
-        optflow_farneback::updateMatricesOcl(curFlowX, curFlowY, R[0], R[1], M);
-
-        if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
-        {
-            Mat g = getGaussianKernel(winSize, winSize/2*0.3f, CV_32F);
-            optflow_farneback::setGaussianBlurKernel(g.ptr<float>(winSize/2), winSize/2);
-        }
-        for (int i = 0; i < numIters; i++)
-        {
-            if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
-                updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1);
-            else
-                updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1);
-        }
-
-        prevFlowX = curFlowX;
-        prevFlowY = curFlowY;
-    }
-
-    flowx = curFlowX;
-    flowy = curFlowY;
-}
diff --git a/modules/ocl/src/pyrdown.cpp b/modules/ocl/src/pyrdown.cpp
deleted file mode 100644
index 7e5e35a3e..000000000
--- a/modules/ocl/src/pyrdown.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//        Dachuan Zhao, dachuan@multicorewareinc.com
-//        Yao Wang, yao@multicorewareinc.com
-//
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-//////////////////////////////////////////////////////////////////////////////
-/////////////////////// add subtract multiply divide /////////////////////////
-//////////////////////////////////////////////////////////////////////////////
-static void pyrdown_run(const oclMat &src, const oclMat &dst)
-{
-
-    CV_Assert(src.type() == dst.type());
-    CV_Assert(src.depth() != CV_8S);
-
-    Context  *clCxt = src.clCxt;
-    String kernelName = "pyrDown";
-
-    size_t localThreads[3]  = { 256, 1, 1 };
-    size_t globalThreads[3] = { src.cols, dst.rows, 1};
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols));
-
-    openCLExecuteKernel(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
-}
-//////////////////////////////////////////////////////////////////////////////
-// pyrDown
-
-void cv::ocl::pyrDown(const oclMat &src, oclMat &dst)
-{
-    int depth = src.depth(), channels = src.channels();
-    CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F);
-    CV_Assert(channels == 1 || channels == 3 || channels == 4);
-
-    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
-
-    pyrdown_run(src, dst);
-}
diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp
deleted file mode 100644
index 2ff3dcd1b..000000000
--- a/modules/ocl/src/pyrlk.cpp
+++ /dev/null
@@ -1,338 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//      Dachuan Zhao, dachuan@multicorewareinc.com
-//      Yao Wang, yao@multicorewareinc.com
-//      Nathan, liujun@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-struct dim3
-{
-    unsigned int x, y, z;
-};
-
-static void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDeviceArch11)
-{
-    winSize.width *= cn;
-
-    if (winSize.width > 32 && winSize.width > 2 * winSize.height)
-    {
-        block.x = isDeviceArch11 ? 16 : 32;
-        block.y = 8;
-    }
-    else
-    {
-        block.x = 16;
-        block.y = isDeviceArch11 ? 8 : 16;
-    }
-
-    patch.x = (winSize.width  + block.x - 1) / block.x;
-    patch.y = (winSize.height + block.y - 1) / block.y;
-
-    block.z = patch.z = 1;
-}
-
-static void pyrdown_run_cus(const oclMat &src, const oclMat &dst)
-{
-
-    CV_Assert(src.type() == dst.type());
-    CV_Assert(src.depth() != CV_8S);
-
-    Context  *clCxt = src.clCxt;
-
-    String kernelName = "pyrDown";
-
-    size_t localThreads[3]  = { 256, 1, 1 };
-    size_t globalThreads[3] = { src.cols, dst.rows, 1};
-
-    std::vector<std::pair<size_t , const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols));
-
-    openCLExecuteKernel2(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth(), CLFLUSH);
-}
-
-static void pyrDown_cus(const oclMat &src, oclMat &dst)
-{
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-
-    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
-    pyrdown_run_cus(src, dst);
-}
-
-static void lkSparse_run(oclMat &I, oclMat &J,
-                  const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount,
-                  int level, /*dim3 block, */dim3 patch, Size winSize, int iters)
-{
-    Context  *clCxt = I.clCxt;
-    String kernelName = "lkSparse";
-    size_t localThreads[3]  = { 8, 8, 1 };
-    size_t globalThreads[3] = { 8 * ptcount, 8, 1};
-    int cn = I.oclchannels();
-    char calcErr = level==0?1:0;
-
-    std::vector<std::pair<size_t , const void *> > args;
-
-    cl_mem ITex = bindTexture(I);
-    cl_mem JTex = bindTexture(J);
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ITex ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&JTex ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&prevPts.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&prevPts.step ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&nextPts.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&nextPts.step ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&status.data ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&err.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&level ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&patch.x ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&patch.y ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&cn ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&winSize.width ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&winSize.height ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&iters ));
-    args.push_back( std::make_pair( sizeof(cl_char), (void *)&calcErr ));
-
-    bool is_cpu = isCpuDevice();
-    if (is_cpu)
-    {
-        openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), (char*)" -D CPU");
-    }
-    else
-    {
-        std::stringstream idxStr;
-        idxStr << kernelName << "_C" << I.oclchannels() << "_D" << I.depth();
-        cl_kernel kernel = openCLGetKernelFromSource(clCxt, &pyrlk, idxStr.str());
-        int wave_size = (int)queryWaveFrontSize(kernel);
-        openCLSafeCall(clReleaseKernel(kernel));
-
-        static char opt[32] = {0};
-        sprintf(opt, "-D WAVE_SIZE=%d", wave_size);
-
-        openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads,
-                            args, I.oclchannels(), I.depth(), opt);
-    }
-    releaseTexture(ITex);
-    releaseTexture(JTex);
-}
-
-void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &nextImg, const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat *err)
-{
-    if (prevPts.empty())
-    {
-        nextPts.release();
-        status.release();
-        return;
-    }
-
-    derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);
-
-    iters = std::min(std::max(iters, 0), 100);
-
-    const int cn = prevImg.oclchannels();
-
-    dim3 block, patch;
-    calcPatchSize(winSize, cn, block, patch, isDeviceArch11_);
-
-    CV_Assert(derivLambda >= 0);
-    CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2);
-    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
-    CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
-    CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
-
-    if (useInitialFlow)
-        CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
-    else
-        ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
-
-    oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
-    oclMat temp2 = nextPts.reshape(1);
-    multiply(1.0f/(1<<maxLevel)/2.0f, temp1, temp2);
-
-    ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
-    status.setTo(Scalar::all(1));
-
-    bool errMat = false;
-    if (!err)
-    {
-        err = new oclMat(1, prevPts.cols, CV_32FC1);
-        errMat = true;
-    }
-    else
-        ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
-
-    // build the image pyramids.
-    prevPyr_.resize(maxLevel + 1);
-    nextPyr_.resize(maxLevel + 1);
-
-    if (cn == 1 || cn == 4)
-    {
-        prevImg.convertTo(prevPyr_[0], CV_32F);
-        nextImg.convertTo(nextPyr_[0], CV_32F);
-    }
-
-    for (int level = 1; level <= maxLevel; ++level)
-    {
-        pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]);
-        pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]);
-    }
-
-    // dI/dx ~ Ix, dI/dy ~ Iy
-    for (int level = maxLevel; level >= 0; level--)
-    {
-        lkSparse_run(prevPyr_[level], nextPyr_[level],
-                     prevPts, nextPts, status, *err, getMinEigenVals, prevPts.cols,
-                     level, /*block, */patch, winSize, iters);
-    }
-
-    clFinish(*(cl_command_queue*)prevImg.clCxt->getOpenCLCommandQueuePtr());
-
-    if(errMat)
-        delete err;
-}
-
-static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v,
-                 oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters)
-{
-    Context  *clCxt = I.clCxt;
-
-    String kernelName = "lkDense";
-
-    size_t localThreads[3]  = { 16, 16, 1 };
-    size_t globalThreads[3] = { I.cols, I.rows, 1};
-
-    cl_char calcErr = err ? 1 : 0;
-
-    cl_mem ITex;
-    cl_mem JTex;
-
-    ITex = bindTexture(I);
-    JTex = bindTexture(J);
-
-    std::vector<std::pair<size_t , const void *> > args;
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&ITex ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&JTex ));
-
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&u.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&u.step ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&v.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&v.step ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&prevU.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&prevU.step ));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&prevV.data ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&prevV.step ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.rows ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&I.cols ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&winSize.width ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&winSize.height ));
-    args.push_back( std::make_pair( sizeof(cl_int), (void *)&iters ));
-    args.push_back( std::make_pair( sizeof(cl_char), (void *)&calcErr ));
-
-    openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth());
-
-    releaseTexture(ITex);
-    releaseTexture(JTex);
-}
-
-void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextImg, oclMat &u, oclMat &v, oclMat *err)
-{
-    CV_Assert(prevImg.type() == CV_8UC1);
-    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
-    CV_Assert(maxLevel >= 0);
-    CV_Assert(winSize.width > 2 && winSize.height > 2);
-
-    if (err)
-        err->create(prevImg.size(), CV_32FC1);
-
-    prevPyr_.resize(maxLevel + 1);
-    nextPyr_.resize(maxLevel + 1);
-
-    prevPyr_[0] = prevImg;
-    nextImg.convertTo(nextPyr_[0], CV_32F);
-
-    for (int level = 1; level <= maxLevel; ++level)
-    {
-        pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]);
-        pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]);
-    }
-
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
-    uPyr_[1].setTo(Scalar::all(0));
-    vPyr_[1].setTo(Scalar::all(0));
-
-    Size winSize2i(winSize.width, winSize.height);
-
-    int idx = 0;
-
-    for (int level = maxLevel; level >= 0; level--)
-    {
-        int idx2 = (idx + 1) & 1;
-
-        lkDense_run(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
-                    level == 0 ? err : 0, winSize2i, iters);
-
-        if (level > 0)
-            idx = idx2;
-    }
-
-    uPyr_[idx].copyTo(u);
-    vPyr_[idx].copyTo(v);
-
-    clFinish(*(cl_command_queue*)prevImg.clCxt->getOpenCLCommandQueuePtr());
-}
diff --git a/modules/ocl/src/pyrup.cpp b/modules/ocl/src/pyrup.cpp
deleted file mode 100644
index 95d3a3443..000000000
--- a/modules/ocl/src/pyrup.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//        Zhang Chunpeng chunpeng@multicorewareinc.com
-//        Yao Wang, yao@multicorewareinc.com
-//
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-/* Haar features calculation */
-//#define EMU
-
-namespace cv
-{
-    namespace ocl
-    {
-        void pyrUp(const cv::ocl::oclMat &src, cv::ocl::oclMat &dst)
-        {
-            int depth = src.depth(), channels = src.channels(), oclChannels = src.oclchannels();
-
-            CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F);
-            CV_Assert(channels == 1 || channels == 3 || channels == 4);
-
-            dst.create(src.rows * 2, src.cols * 2, src.type());
-
-            Context *clCxt = src.clCxt;
-
-            const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float" };
-            char buildOptions[250], convertString[50];
-            const char * const channelsString = oclChannels == 1 ? "" : "4";
-            sprintf(convertString, "convert_%s%s_sat_rte", typeMap[depth], channelsString);
-            sprintf(buildOptions, "-D Type=%s%s -D floatType=float%s -D convertToType=%s -D convertToFloat=%s",
-                    typeMap[depth], channelsString, channelsString,
-                    depth == CV_32F ? "" : convertString,
-                    oclChannels == 4 ? "convert_float4" : "(float)");
-
-            const String kernelName = "pyrUp";
-            int dststep = dst.step / dst.elemSize(), srcstep = src.step / src.elemSize();
-
-            std::vector< std::pair<size_t, const void *> > args;
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
-            args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcstep));
-            args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep));
-
-            size_t globalThreads[3] = {dst.cols, dst.rows, 1};
-            size_t localThreads[3]  = {16, 16, 1};
-
-
-            openCLExecuteKernel(clCxt, &pyr_up, kernelName, globalThreads, localThreads, args, -1, -1,
-                                buildOptions);
-        }
-    }
-}
diff --git a/modules/ocl/src/sort_by_key.cpp b/modules/ocl/src/sort_by_key.cpp
deleted file mode 100644
index 596f94e1c..000000000
--- a/modules/ocl/src/sort_by_key.cpp
+++ /dev/null
@@ -1,472 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-namespace cv
-{
-namespace ocl
-{
-void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan);
-
-#ifndef ANDROID
-//TODO(pengx17): change this value depending on device other than a constant
-const static unsigned int GROUP_SIZE = 256;
-#endif
-
-const char * depth_strings[] =
-{
-    "uchar",   //CV_8U
-    "char",    //CV_8S
-    "ushort",  //CV_16U
-    "short",   //CV_16S
-    "int",     //CV_32S
-    "float",   //CV_32F
-    "double"   //CV_64F
-};
-
-void static genSortBuildOption(const oclMat& keys, const oclMat& vals, bool isGreaterThan, char * build_opt_buf)
-{
-    sprintf(build_opt_buf, "-D IS_GT=%d -D K_T=%s -D V_T=%s",
-            isGreaterThan?1:0, depth_strings[keys.depth()], depth_strings[vals.depth()]);
-    if(vals.oclchannels() > 1)
-    {
-        sprintf( build_opt_buf + strlen(build_opt_buf), "%d", vals.oclchannels());
-    }
-}
-inline bool isSizePowerOf2(size_t size)
-{
-    return ((size - 1) & (size)) == 0;
-}
-
-namespace bitonic_sort
-{
-static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
-{
-    CV_Assert(isSizePowerOf2(vecSize));
-
-    Context * cxt = Context::getContext();
-    size_t globalThreads[3] = {vecSize / 2, 1, 1};
-
-    // 2^numStages should be equal to vecSize or the output is invalid
-    int numStages = 0;
-    for(int i = vecSize; i > 1; i >>= 1)
-    {
-        ++numStages;
-    }
-    char build_opt_buf [100];
-    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
-    const int argc = 5;
-    std::vector< std::pair<size_t, const void *> > args(argc);
-    String kernelname = "bitonicSort";
-
-    args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
-    args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
-    args[2] = std::make_pair(sizeof(cl_int), (void *)&vecSize);
-
-    for(int stage = 0; stage < numStages; ++stage)
-    {
-        args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
-        for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
-        {
-            args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
-#ifdef ANDROID
-            openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, NULL, args, -1, -1, build_opt_buf);
-#else
-            size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
-            openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
-#endif
-        }
-    }
-}
-}  /* bitonic_sort */
-
-namespace selection_sort
-{
-// FIXME:
-// This function cannot sort arrays with duplicated keys
-static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
-{
-    CV_Error(-1, "This function is incorrect at the moment.");
-    Context * cxt = Context::getContext();
-
-    size_t globalThreads[3] = {vecSize, 1, 1};
-
-    std::vector< std::pair<size_t, const void *> > args;
-    char build_opt_buf [100];
-    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
-
-    //local
-    String kernelname = "selectionSortLocal";
-#ifdef ANDROID
-    int lds_size = cxt->getDeviceInfo().maxWorkGroupSize * keys.elemSize();
-#else
-    int lds_size = GROUP_SIZE * keys.elemSize();
-#endif
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
-    args.push_back(std::make_pair(lds_size,       (void*)NULL));
-
-#ifdef ANDROID
-    openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, NULL, args, -1, -1, build_opt_buf);
-#else
-    size_t localThreads[3] = {GROUP_SIZE, 1, 1};
-    openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
-#endif
-
-    //final
-    kernelname = "selectionSortFinal";
-    args.pop_back();
-#ifdef ANDROID
-    openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, NULL, args, -1, -1, build_opt_buf);
-#else
-    openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
-#endif
-}
-
-}  /* selection_sort */
-
-
-namespace radix_sort
-{
-//FIXME(pengx17):
-// exclusive scan, need to be optimized as this is too naive...
-//void naive_scan_addition(oclMat& input, oclMat& output)
-//{
-//    Context * cxt = Context::getContext();
-//    size_t vecSize = input.cols;
-//    size_t globalThreads[3] = {1, 1, 1};
-//    size_t localThreads[3]  = {1, 1, 1};
-//
-//    String kernelname = "naiveScanAddition";
-//
-//    std::vector< std::pair<size_t, const void *> > args;
-//    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
-//    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
-//    args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
-//    openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1);
-//}
-
-void static naive_scan_addition_cpu(oclMat& input, oclMat& output)
-{
-    Mat m_input = input, m_output(output.size(), output.type());
-    MatIterator_<int> i_mit = m_input.begin<int>();
-    MatIterator_<int> o_mit = m_output.begin<int>();
-    *o_mit = 0;
-    ++i_mit;
-    ++o_mit;
-    for(; i_mit != m_input.end<int>(); ++i_mit, ++o_mit)
-    {
-        *o_mit = *(o_mit - 1) + *(i_mit - 1);
-    }
-    output = m_output;
-}
-
-
-//radix sort ported from Bolt
-static void sortByKey(oclMat& keys, oclMat& vals, size_t origVecSize, bool isGreaterThan)
-{
-    CV_Assert(keys.depth() == CV_32S || keys.depth() == CV_32F); // we assume keys are 4 bytes
-
-    bool isKeyFloat = keys.type() == CV_32F;
-
-    const int RADIX = 4; //Now you cannot replace this with Radix 8 since there is a
-                         //local array of 16 elements in the histogram kernel.
-    const int RADICES = (1 << RADIX); //Values handeled by each work-item?
-
-    bool  newBuffer = false;
-    size_t vecSize = origVecSize;
-
-    unsigned int groupSize  = RADICES;
-
-    size_t mulFactor = groupSize * RADICES;
-
-    oclMat buffer_keys, buffer_vals;
-
-    if(origVecSize % mulFactor != 0)
-    {
-        vecSize = ((vecSize + mulFactor) / mulFactor) * mulFactor;
-        buffer_keys.create(1, vecSize, keys.type());
-        buffer_vals.create(1, vecSize, vals.type());
-        Scalar padding_value;
-        oclMat roi_buffer_vals = buffer_vals(Rect(0,0,origVecSize,1));
-
-        if(isGreaterThan)
-        {
-            switch(buffer_keys.depth())
-            {
-            case CV_32F:
-                padding_value = Scalar::all(-FLT_MAX);
-                break;
-            case CV_32S:
-                padding_value = Scalar::all(INT_MIN);
-                break;
-            }
-        }
-        else
-        {
-            switch(buffer_keys.depth())
-            {
-            case CV_32F:
-                padding_value = Scalar::all(FLT_MAX);
-                break;
-            case CV_32S:
-                padding_value = Scalar::all(INT_MAX);
-                break;
-            }
-        }
-        ocl::copyMakeBorder(
-            keys(Rect(0,0,origVecSize,1)), buffer_keys,
-            0, 0, 0, vecSize - origVecSize,
-            BORDER_CONSTANT, padding_value);
-        vals(Rect(0,0,origVecSize,1)).copyTo(roi_buffer_vals);
-        newBuffer = true;
-    }
-    else
-    {
-        buffer_keys = keys;
-        buffer_vals = vals;
-        newBuffer = false;
-    }
-    oclMat swap_input_keys(1, vecSize, keys.type());
-    oclMat swap_input_vals(1, vecSize, vals.type());
-    oclMat hist_bin_keys(1, vecSize, CV_32SC1);
-    oclMat hist_bin_dest_keys(1, vecSize, CV_32SC1);
-
-    Context * cxt = Context::getContext();
-
-    size_t globalThreads[3] = {vecSize / RADICES, 1, 1};
-    size_t localThreads[3]  = {groupSize, 1, 1};
-
-    std::vector< std::pair<size_t, const void *> > args;
-    char build_opt_buf [100];
-    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
-
-    //additional build option for radix sort
-    sprintf(build_opt_buf + strlen(build_opt_buf), " -D K_%s", isKeyFloat?"FLT":"INT");
-
-    String kernelnames[2] = {String("histogramRadixN"), String("permuteRadixN")};
-
-    int swap = 0;
-    for(int bits = 0; bits < (static_cast<int>(keys.elemSize()) * 8); bits += RADIX)
-    {
-        args.clear();
-        //Do a histogram pass locally
-        if(swap == 0)
-        {
-            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
-        }
-        else
-        {
-            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
-        }
-        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_keys.data));
-        args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
-        openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[0], globalThreads, localThreads,
-            args, -1, -1, build_opt_buf);
-
-        args.clear();
-        //Perform a global scan
-        naive_scan_addition_cpu(hist_bin_keys, hist_bin_dest_keys);
-        // end of scan
-        if(swap == 0)
-        {
-            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
-            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
-        }
-        else
-        {
-            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
-            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
-        }
-        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_dest_keys.data));
-        args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
-
-        if(swap == 0)
-        {
-            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
-            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
-        }
-        else
-        {
-            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
-            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
-        }
-        openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[1], globalThreads, localThreads,
-            args, -1, -1, build_opt_buf);
-        swap = swap ? 0 : 1;
-    }
-    if(newBuffer)
-    {
-        buffer_keys(Rect(0,0,origVecSize,1)).copyTo(keys);
-        buffer_vals(Rect(0,0,origVecSize,1)).copyTo(vals);
-    }
-}
-
-}  /* radix_sort */
-
-namespace merge_sort
-{
-static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
-{
-    Context * cxt = Context::getContext();
-
-    const size_t GROUP_SIZE = cxt->getDeviceInfo().maxWorkGroupSize >= 256 ? 256: 128;
-
-    size_t globalThreads[3] = {vecSize, 1, 1};
-    size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
-
-    std::vector< std::pair<size_t, const void *> > args;
-    char build_opt_buf [100];
-    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
-
-    String kernelname[] = {String("blockInsertionSort"), String("merge")};
-    int keylds_size = GROUP_SIZE * keys.elemSize();
-    int vallds_size = GROUP_SIZE * vals.elemSize();
-    args.push_back(std::make_pair(sizeof(cl_mem),  (void *)&keys.data));
-    args.push_back(std::make_pair(sizeof(cl_mem),  (void *)&vals.data));
-    args.push_back(std::make_pair(sizeof(cl_uint), (void *)&vecSize));
-    args.push_back(std::make_pair(keylds_size,     (void*)NULL));
-    args.push_back(std::make_pair(vallds_size,     (void*)NULL));
-
-    openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[0], globalThreads, localThreads, args, -1, -1, build_opt_buf);
-
-    //  Early exit for the case of no merge passes, values are already in destination vector
-    if(vecSize <= GROUP_SIZE)
-    {
-        return;
-    }
-
-    //  An odd number of elements requires an extra merge pass to sort
-    size_t numMerges = 0;
-    //  Calculate the log2 of vecSize, taking into acvecSize our block size from kernel 1 is 64
-    //  this is how many merge passes we want
-    size_t log2BlockSize = vecSize >> 6;
-    for( ; log2BlockSize > 1; log2BlockSize >>= 1 )
-    {
-        ++numMerges;
-    }
-    //  Check to see if the input vector size is a power of 2, if not we will need last merge pass
-    numMerges += isSizePowerOf2(vecSize)? 1: 0;
-
-    //  Allocate a flipflop buffer because the merge passes are out of place
-    oclMat tmpKeyBuffer(keys.size(), keys.type());
-    oclMat tmpValBuffer(vals.size(), vals.type());
-    args.resize(8);
-
-    args[4] = std::make_pair(sizeof(cl_uint), (void *)&vecSize);
-    args[6] = std::make_pair(keylds_size,    (void*)NULL);
-    args[7] = std::make_pair(vallds_size,    (void*)NULL);
-
-    for(size_t pass = 1; pass <= numMerges; ++pass )
-    {
-        //  For each pass, flip the input-output buffers
-        if( pass & 0x1 )
-        {
-            args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
-            args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
-            args[2] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
-            args[3] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
-        }
-        else
-        {
-            args[0] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
-            args[1] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
-            args[2] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
-            args[3] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
-        }
-        //  For each pass, the merge window doubles
-        unsigned int srcLogicalBlockSize = static_cast<unsigned int>( localThreads[0] << (pass-1) );
-        args[5] = std::make_pair(sizeof(cl_uint), (void *)&srcLogicalBlockSize);
-        openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[1], globalThreads, localThreads, args, -1, -1, build_opt_buf);
-    }
-    //  If there are an odd number of merges, then the output data is sitting in the temp buffer.  We need to copy
-    //  the results back into the input array
-    if( numMerges & 1 )
-    {
-        tmpKeyBuffer.copyTo(keys);
-        tmpValBuffer.copyTo(vals);
-    }
-}
-}  /* merge_sort */
-
-}
-} /* namespace cv { namespace ocl */
-
-
-void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan)
-{
-    CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
-    CV_Assert( keys.channels() == 1 ); // we only allow one channel keys
-    CV_Assert( vecSize <= static_cast<size_t>(keys.cols) );
-    switch(method)
-    {
-    case SORT_BITONIC:
-        bitonic_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
-        break;
-    case SORT_SELECTION:
-        selection_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
-        break;
-    case SORT_RADIX:
-        radix_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
-        break;
-    case SORT_MERGE:
-        merge_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
-        break;
-    }
-}
-
-void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, int method, bool isGreaterThan)
-{
-    CV_Assert( keys.size() == vals.size() );
-    CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
-    size_t vecSize = static_cast<size_t>(keys.cols);
-    sortByKey(keys, vals, vecSize, method, isGreaterThan);
-}
diff --git a/modules/ocl/src/split_merge.cpp b/modules/ocl/src/split_merge.cpp
deleted file mode 100644
index 583869747..000000000
--- a/modules/ocl/src/split_merge.cpp
+++ /dev/null
@@ -1,300 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-namespace cv
-{
-    namespace ocl
-    {
-        namespace split_merge
-        {
-            static void merge_vector_run(const oclMat *mat_src, size_t n, oclMat &mat_dst)
-            {
-                if(!mat_dst.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_dst.type() == CV_64F)
-                {
-                    CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-                    return;
-                }
-
-                Context  *clCxt = mat_dst.clCxt;
-                int channels = mat_dst.oclchannels();
-                int depth = mat_dst.depth();
-
-                String kernelName = "merge_vector";
-
-                int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0},
-                    {2, 2, 1, 1, 1, 1, 1},
-                    {4, 4, 2, 2 , 1, 1, 1},
-                    {1, 1, 1, 1, 1, 1, 1}
-                };
-
-                size_t vector_length = vector_lengths[channels - 1][depth];
-                int offset_cols = (mat_dst.offset / mat_dst.elemSize()) & (vector_length - 1);
-                int cols = divUp(mat_dst.cols + offset_cols, vector_length);
-
-                size_t localThreads[3]  = { 64, 4, 1 };
-                size_t globalThreads[3] = { cols, mat_dst.rows, 1 };
-
-                int dst_step1 = mat_dst.cols * mat_dst.elemSize();
-                std::vector<std::pair<size_t , const void *> > args;
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst.data));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst.step));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst.offset));
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[0].data));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[0].step));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[0].offset));
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[1].data));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[1].step));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[1].offset));
-
-                if(channels == 4)
-                {
-                    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
-                    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
-                    args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].offset));
-
-                    if(n == 3)
-                    {
-                        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[2].data));
-                        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].step));
-                        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[2].offset));
-                    }
-                    else if( n == 4)
-                    {
-                        args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src[3].data));
-                        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[3].step));
-                        args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src[3].offset));
-                    }
-                }
-
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst.rows));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1));
-
-                openCLExecuteKernel(clCxt, &merge_mat, kernelName, globalThreads, localThreads, args, channels, depth);
-            }
-            static void merge(const oclMat *mat_src, size_t n, oclMat &mat_dst)
-            {
-                CV_Assert(mat_src);
-                CV_Assert(n > 0);
-
-                int depth = mat_src[0].depth();
-                Size size = mat_src[0].size();
-
-                int total_channels = 0;
-
-                for(size_t i = 0; i < n; ++i)
-                {
-                    CV_Assert(depth == mat_src[i].depth());
-                    CV_Assert(size == mat_src[i].size());
-
-                    total_channels += mat_src[i].oclchannels();
-                }
-
-                CV_Assert(total_channels <= 4);
-
-                if(total_channels == 1)
-                {
-                    mat_src[0].copyTo(mat_dst);
-                    return;
-                }
-
-                mat_dst.create(size, CV_MAKETYPE(depth, total_channels));
-                merge_vector_run(mat_src, n, mat_dst);
-            }
-            static void split_vector_run(const oclMat &src, oclMat *dst)
-            {
-
-                if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
-                {
-                    CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
-                    return;
-                }
-
-                Context  *clCtx = src.clCxt;
-                int channels = src.channels();
-                int depth = src.depth();
-                depth = (depth == CV_8S) ? CV_8U : depth;
-                depth = (depth == CV_16S) ? CV_16U : depth;
-
-                String kernelName = "split_vector";
-
-                size_t VEC_SIZE = 4;
-
-                std::vector<std::pair<size_t , const void *> > args;
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step));
-                int srcOffsetXBytes = src.offset % src.step;
-                int srcOffsetY = src.offset / src.step;
-                cl_int2 srcOffset = {{srcOffsetXBytes, srcOffsetY}};
-                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&srcOffset));
-
-                bool dst0Aligned = false, dst1Aligned = false, dst2Aligned = false, dst3Aligned = false;
-                int alignSize = dst[0].elemSize1() * VEC_SIZE;
-                int alignMask = alignSize - 1;
-
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[0].data));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[0].step));
-                int dst0OffsetXBytes = dst[0].offset % dst[0].step;
-                int dst0OffsetY = dst[0].offset / dst[0].step;
-                cl_int2 dst0Offset = {{dst0OffsetXBytes, dst0OffsetY}};
-                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst0Offset));
-                if ((dst0OffsetXBytes & alignMask) == 0)
-                    dst0Aligned = true;
-
-                args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[1].data));
-                args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[1].step));
-                int dst1OffsetXBytes = dst[1].offset % dst[1].step;
-                int dst1OffsetY = dst[1].offset / dst[1].step;
-                cl_int2 dst1Offset = {{dst1OffsetXBytes, dst1OffsetY}};
-                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst1Offset));
-                if ((dst1OffsetXBytes & alignMask) == 0)
-                    dst1Aligned = true;
-
-                // DON'T MOVE VARIABLES INTO 'IF' BODY
-                int dst2OffsetXBytes, dst2OffsetY;
-                cl_int2 dst2Offset;
-                int dst3OffsetXBytes, dst3OffsetY;
-                cl_int2 dst3Offset;
-                if (channels >= 3)
-                {
-                    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[2].data));
-                    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[2].step));
-                    dst2OffsetXBytes = dst[2].offset % dst[2].step;
-                    dst2OffsetY = dst[2].offset / dst[2].step;
-                    dst2Offset.s[0] = dst2OffsetXBytes; dst2Offset.s[1] = dst2OffsetY;
-                    args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst2Offset));
-                    if ((dst2OffsetXBytes & alignMask) == 0)
-                        dst2Aligned = true;
-                }
-
-                if (channels >= 4)
-                {
-                    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[3].data));
-                    args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[3].step));
-                    dst3OffsetXBytes = dst[3].offset % dst[3].step;
-                    dst3OffsetY = dst[3].offset / dst[3].step;
-                    dst3Offset.s[0] = dst3OffsetXBytes; dst3Offset.s[1] = dst3OffsetY;
-                    args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst3Offset));
-                    if ((dst3OffsetXBytes & alignMask) == 0)
-                        dst3Aligned = true;
-                }
-
-                cl_int2 size = {{ src.cols, src.rows }};
-                args.push_back( std::make_pair( sizeof(cl_int2), (void *)&size));
-
-                String build_options =
-                        cv::format("-D VEC_SIZE=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d",
-                                   (int)VEC_SIZE, depth, channels);
-
-                if (dst0Aligned)
-                    build_options += " -D DST0_ALIGNED";
-                if (dst1Aligned)
-                    build_options += " -D DST1_ALIGNED";
-                if (dst2Aligned)
-                    build_options += " -D DST2_ALIGNED";
-                if (dst3Aligned)
-                    build_options += " -D DST3_ALIGNED";
-
-                const DeviceInfo& devInfo = clCtx->getDeviceInfo();
-
-                // TODO Workaround for issues. Need to investigate a problem.
-                if (channels == 2
-                        && devInfo.deviceType == CVCL_DEVICE_TYPE_CPU
-                        && devInfo.platform->platformVendor.find("Intel") != std::string::npos
-                        && (devInfo.deviceVersion.find("Build 56860") != std::string::npos
-                            || devInfo.deviceVersion.find("Build 76921") != std::string::npos
-                            || devInfo.deviceVersion.find("Build 78712") != std::string::npos))
-                    build_options += " -D BYPASS_VSTORE=true";
-
-                size_t globalThreads[3] = { divUp(src.cols, VEC_SIZE), src.rows, 1 };
-                openCLExecuteKernel(clCtx, &split_mat, kernelName, globalThreads, NULL, args, -1, -1, build_options.c_str());
-            }
-            static void split(const oclMat &mat_src, oclMat *mat_dst)
-            {
-                CV_Assert(mat_dst);
-
-                int depth = mat_src.depth();
-                int num_channels = mat_src.channels();
-                Size size = mat_src.size();
-
-                if (num_channels == 1)
-                {
-                    mat_src.copyTo(mat_dst[0]);
-                    return;
-                }
-
-                for (int i = 0; i < mat_src.oclchannels(); i++)
-                    mat_dst[i].create(size, CV_MAKETYPE(depth, 1));
-
-                split_vector_run(mat_src, mat_dst);
-            }
-        }
-    }
-}
-
-void cv::ocl::merge(const oclMat *src, size_t n, oclMat &dst)
-{
-    split_merge::merge(src, n, dst);
-}
-void cv::ocl::merge(const std::vector<oclMat> &src, oclMat &dst)
-{
-    split_merge::merge(&src[0], src.size(), dst);
-}
-
-void cv::ocl::split(const oclMat &src, oclMat *dst)
-{
-    split_merge::split(src, dst);
-}
-void cv::ocl::split(const oclMat &src, std::vector<oclMat> &dst)
-{
-    dst.resize(src.oclchannels()); // TODO Why oclchannels?
-    if(src.oclchannels() > 0)
-        split_merge::split(src, &dst[0]);
-}
diff --git a/modules/ocl/src/stereo_csbp.cpp b/modules/ocl/src/stereo_csbp.cpp
deleted file mode 100644
index 2f9391c00..000000000
--- a/modules/ocl/src/stereo_csbp.cpp
+++ /dev/null
@@ -1,698 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Jin Ma, jin@multicorewareinc.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-namespace cv
-{
-    namespace ocl
-    {
-        namespace stereoCSBP
-        {
-            static inline int divUp(int total, int grain)
-            {
-                return (total + grain - 1) / grain;
-            }
-            static String get_kernel_name(String kernel_name, int data_type)
-            {
-                return kernel_name + (data_type == CV_16S ? "0" : "1");
-            }
-            using cv::ocl::StereoConstantSpaceBP;
-            //////////////////////////////////////////////////////////////////////////////////
-            /////////////////////////////////init_data_cost//////////////////////////////////
-            //////////////////////////////////////////////////////////////////////////////////
-            static void init_data_cost_caller(const oclMat &left, const oclMat &right, oclMat &temp,
-                StereoConstantSpaceBP &rthis,
-                int msg_step, int h, int w, int level)
-            {
-                Context  *clCxt = left.clCxt;
-                int data_type = rthis.msg_type;
-                int channels = left.oclchannels();
-
-                String kernelName = get_kernel_name("init_data_cost_", data_type);
-
-                cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
-
-                //size_t blockSize = 256;
-                size_t localThreads[]  = {32, 8 ,1};
-                size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
-                    divUp(h, localThreads[1]) *localThreads[1],
-                    1
-                };
-
-                int cdisp_step1 = msg_step * h;
-                openCLVerifyKernel(clCxt, kernel,  localThreads);
-                openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem),  (void *)&temp.data));
-                openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem),  (void *)&left.data));
-                openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem),  (void *)&right.data));
-                openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int),  (void *)&h));
-                openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int),  (void *)&w));
-                openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int),  (void *)&level));
-                openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int),  (void *)&channels));
-                openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int),  (void *)&msg_step));
-                openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&rthis.data_weight));
-                openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_float), (void *)&rthis.max_data_term));
-                openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&cdisp_step1));
-                openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&rthis.min_disp_th));
-                openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&left.step));
-                openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&rthis.ndisp));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
-                    globalThreads, localThreads, 0, NULL, NULL));
-
-                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
-                openCLSafeCall(clReleaseKernel(kernel));
-            }
-
-            static void init_data_cost_reduce_caller(const oclMat &left, const oclMat &right, oclMat &temp,
-                StereoConstantSpaceBP &rthis,
-                int msg_step, int h, int w, int level)
-            {
-
-                Context  *clCxt = left.clCxt;
-                int data_type = rthis.msg_type;
-                int channels = left.oclchannels();
-                int win_size = (int)std::pow(2.f, level);
-
-                String kernelName = get_kernel_name("init_data_cost_reduce_", data_type);
-
-                cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
-
-                const int threadsNum = 256;
-                //size_t blockSize = threadsNum;
-                size_t localThreads[3]  = {win_size, 1, threadsNum / win_size};
-                size_t globalThreads[3] = { w *localThreads[0],
-                    h * divUp(rthis.ndisp, localThreads[2]) *localThreads[1], 1 * localThreads[2]
-                };
-
-                int local_mem_size = threadsNum * sizeof(float);
-                int cdisp_step1 = msg_step * h;
-
-                openCLVerifyKernel(clCxt, kernel, localThreads);
-
-                openCLSafeCall(clSetKernelArg(kernel, 0,  sizeof(cl_mem),  (void *)&temp.data));
-                openCLSafeCall(clSetKernelArg(kernel, 1,  sizeof(cl_mem),  (void *)&left.data));
-                openCLSafeCall(clSetKernelArg(kernel, 2,  sizeof(cl_mem),  (void *)&right.data));
-                openCLSafeCall(clSetKernelArg(kernel, 3,  local_mem_size,  (void *)NULL));
-                openCLSafeCall(clSetKernelArg(kernel, 4,  sizeof(cl_int),  (void *)&level));
-                openCLSafeCall(clSetKernelArg(kernel, 5,  sizeof(cl_int),  (void *)&left.rows));
-                openCLSafeCall(clSetKernelArg(kernel, 6,  sizeof(cl_int),  (void *)&left.cols));
-                openCLSafeCall(clSetKernelArg(kernel, 7,  sizeof(cl_int),  (void *)&h));
-                openCLSafeCall(clSetKernelArg(kernel, 8,  sizeof(cl_int),  (void *)&win_size));
-                openCLSafeCall(clSetKernelArg(kernel, 9,  sizeof(cl_int),  (void *)&channels));
-                openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int),  (void *)&rthis.ndisp));
-                openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int),  (void *)&left.step));
-                openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_float), (void *)&rthis.data_weight));
-                openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.max_data_term));
-                openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int),  (void *)&rthis.min_disp_th));
-                openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int),  (void *)&cdisp_step1));
-                openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int),  (void *)&msg_step));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 3, NULL,
-                    globalThreads, localThreads, 0, NULL, NULL));
-                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
-                openCLSafeCall(clReleaseKernel(kernel));
-            }
-
-            static void get_first_initial_local_caller(uchar *data_cost_selected, uchar *disp_selected_pyr,
-                oclMat &temp, StereoConstantSpaceBP &rthis,
-                int h, int w, int nr_plane, int msg_step)
-            {
-                Context  *clCxt = temp.clCxt;
-                int data_type = rthis.msg_type;
-
-                String kernelName = get_kernel_name("get_first_k_initial_local_", data_type);
-
-                cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
-
-                //size_t blockSize = 256;
-                size_t localThreads[]  = {32, 8 ,1};
-                size_t globalThreads[] = { roundUp(w, localThreads[0]), roundUp(h, localThreads[1]), 1 };
-
-                int disp_step = msg_step * h;
-                openCLVerifyKernel(clCxt, kernel, localThreads);
-                openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected));
-                openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr));
-                openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data));
-                openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
-                openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
-                openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane));
-                openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
-                openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
-                openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
-                    globalThreads, localThreads, 0, NULL, NULL));
-
-                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
-                openCLSafeCall(clReleaseKernel(kernel));
-            }
-            static void get_first_initial_global_caller(uchar *data_cost_selected, uchar *disp_selected_pyr,
-                oclMat &temp, StereoConstantSpaceBP &rthis,
-                int h, int w, int nr_plane, int msg_step)
-            {
-                Context  *clCxt = temp.clCxt;
-                int data_type = rthis.msg_type;
-
-                String kernelName = get_kernel_name("get_first_k_initial_global_", data_type);
-
-                cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
-
-                //size_t blockSize = 256;
-                size_t localThreads[]  = {32, 8, 1};
-                size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
-                    divUp(h, localThreads[1]) *localThreads[1],
-                    1
-                };
-
-                int disp_step = msg_step * h;
-                openCLVerifyKernel(clCxt, kernel, localThreads);
-                openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected));
-                openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr));
-                openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data));
-                openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
-                openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
-                openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane));
-                openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
-                openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
-                openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
-                    globalThreads, localThreads, 0, NULL, NULL));
-
-                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
-                openCLSafeCall(clReleaseKernel(kernel));
-            }
-
-            static void init_data_cost(const oclMat &left, const oclMat &right, oclMat &temp, StereoConstantSpaceBP &rthis,
-                uchar *disp_selected_pyr, uchar *data_cost_selected,
-                size_t msg_step, int h, int w, int level, int nr_plane)
-            {
-
-                if(level <= 1)
-                    init_data_cost_caller(left, right, temp, rthis, msg_step, h, w, level);
-                else
-                    init_data_cost_reduce_caller(left, right, temp, rthis, msg_step, h, w, level);
-
-                if(rthis.use_local_init_data_cost == true)
-                {
-                    get_first_initial_local_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, nr_plane, msg_step);
-                }
-                else
-                {
-                    get_first_initial_global_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w,
-                        nr_plane, msg_step);
-                }
-            }
-
-            ///////////////////////////////////////////////////////////////////////////////////////////////////
-            ///////////////////////////////////compute_data_cost//////////////////////////////////////////////
-            ////////////////////////////////////////////////////////////////////////////////////////////////
-            static void compute_data_cost_caller(uchar *disp_selected_pyr, uchar *data_cost,
-                StereoConstantSpaceBP &rthis, int msg_step1,
-                int msg_step2, const oclMat &left, const oclMat &right, int h,
-                int w, int h2, int level, int nr_plane)
-            {
-                Context  *clCxt = left.clCxt;
-                int channels = left.oclchannels();
-                int data_type = rthis.msg_type;
-
-                String kernelName = get_kernel_name("compute_data_cost_", data_type);
-
-                cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
-
-                size_t localThreads[]  = { 32, 8, 1 };
-                size_t globalThreads[] = { roundUp(w, localThreads[0]), roundUp(h, localThreads[1]), 1 };
-
-                int disp_step1 = msg_step1 * h;
-                int disp_step2 = msg_step2 * h2;
-                openCLVerifyKernel(clCxt, kernel, localThreads);
-                openCLSafeCall(clSetKernelArg(kernel, 0,  sizeof(cl_mem),  (void *)&disp_selected_pyr));
-                openCLSafeCall(clSetKernelArg(kernel, 1,  sizeof(cl_mem),  (void *)&data_cost));
-                openCLSafeCall(clSetKernelArg(kernel, 2,  sizeof(cl_mem),  (void *)&left.data));
-                openCLSafeCall(clSetKernelArg(kernel, 3,  sizeof(cl_mem),  (void *)&right.data));
-                openCLSafeCall(clSetKernelArg(kernel, 4,  sizeof(cl_int),  (void *)&h));
-                openCLSafeCall(clSetKernelArg(kernel, 5,  sizeof(cl_int),  (void *)&w));
-                openCLSafeCall(clSetKernelArg(kernel, 6,  sizeof(cl_int),  (void *)&level));
-                openCLSafeCall(clSetKernelArg(kernel, 7,  sizeof(cl_int),  (void *)&nr_plane));
-                openCLSafeCall(clSetKernelArg(kernel, 8,  sizeof(cl_int),  (void *)&channels));
-                openCLSafeCall(clSetKernelArg(kernel, 9,  sizeof(cl_int),  (void *)&msg_step1));
-                openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int),  (void *)&msg_step2));
-                openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int),  (void *)&disp_step1));
-                openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int),  (void *)&disp_step2));
-                openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.data_weight));
-                openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.max_data_term));
-                openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int),  (void *)&left.step));
-                openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int),  (void *)&rthis.min_disp_th));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
-                    globalThreads, localThreads, 0, NULL, NULL));
-
-                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
-                openCLSafeCall(clReleaseKernel(kernel));
-            }
-            static void compute_data_cost_reduce_caller(uchar *disp_selected_pyr, uchar *data_cost,
-                StereoConstantSpaceBP &rthis, int msg_step1,
-                int msg_step2, const oclMat &left, const oclMat &right, int h,
-                int w, int h2, int level, int nr_plane)
-            {
-                Context  *clCxt = left.clCxt;
-                int data_type = rthis.msg_type;
-                int channels = left.oclchannels();
-                int win_size = (int)std::pow(2.f, level);
-
-                String kernelName = get_kernel_name("compute_data_cost_reduce_", data_type);
-
-                cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
-
-                const size_t threadsNum = 256;
-                //size_t blockSize = threadsNum;
-                size_t localThreads[3]  = { win_size, 1, threadsNum / win_size };
-                size_t globalThreads[3] = { w *localThreads[0],
-                    h * divUp(nr_plane, localThreads[2]) *localThreads[1], 1 * localThreads[2]
-                };
-
-                int disp_step1 = msg_step1 * h;
-                int disp_step2 = msg_step2 * h2;
-                size_t local_mem_size = threadsNum * sizeof(float);
-                openCLVerifyKernel(clCxt, kernel, localThreads);
-                openCLSafeCall(clSetKernelArg(kernel, 0,  sizeof(cl_mem),  (void *)&disp_selected_pyr));
-                openCLSafeCall(clSetKernelArg(kernel, 1,  sizeof(cl_mem),  (void *)&data_cost));
-                openCLSafeCall(clSetKernelArg(kernel, 2,  sizeof(cl_mem),  (void *)&left.data));
-                openCLSafeCall(clSetKernelArg(kernel, 3,  sizeof(cl_mem),  (void *)&right.data));
-                openCLSafeCall(clSetKernelArg(kernel, 4, local_mem_size,   (void *)NULL));
-                openCLSafeCall(clSetKernelArg(kernel, 5,  sizeof(cl_int),  (void *)&level));
-                openCLSafeCall(clSetKernelArg(kernel, 6,  sizeof(cl_int),  (void *)&left.rows));
-                openCLSafeCall(clSetKernelArg(kernel, 7,  sizeof(cl_int),  (void *)&left.cols));
-                openCLSafeCall(clSetKernelArg(kernel, 8,  sizeof(cl_int),  (void *)&h));
-                openCLSafeCall(clSetKernelArg(kernel, 9,  sizeof(cl_int),  (void *)&nr_plane));
-                openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int),  (void *)&channels));
-                openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int),  (void *)&win_size));
-                openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int),  (void *)&msg_step1));
-                openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int),  (void *)&msg_step2));
-                openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int),  (void *)&disp_step1));
-                openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int),  (void *)&disp_step2));
-                openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_float), (void *)&rthis.data_weight));
-                openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_float), (void *)&rthis.max_data_term));
-                openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int),  (void *)&left.step));
-                openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int),  (void *)&rthis.min_disp_th));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 3, NULL,
-                    globalThreads, localThreads, 0, NULL, NULL));
-
-                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
-                openCLSafeCall(clReleaseKernel(kernel));
-            }
-            static void compute_data_cost(uchar *disp_selected_pyr, uchar *data_cost, StereoConstantSpaceBP &rthis,
-                int msg_step1, int msg_step2, const oclMat &left, const oclMat &right, int h, int w,
-                int h2, int level, int nr_plane)
-            {
-                if(level <= 1)
-                    compute_data_cost_caller(disp_selected_pyr, data_cost, rthis, msg_step1, msg_step2,
-                    left, right, h, w, h2, level, nr_plane);
-                else
-                    compute_data_cost_reduce_caller(disp_selected_pyr, data_cost, rthis,  msg_step1, msg_step2,
-                    left, right, h, w, h2, level, nr_plane);
-            }
-            ////////////////////////////////////////////////////////////////////////////////////////////////
-            //////////////////////////////////////init message//////////////////////////////////////////////
-            ////////////////////////////////////////////////////////////////////////////////////////////////
-            static void init_message(uchar *u_new, uchar *d_new, uchar *l_new, uchar *r_new,
-                uchar *u_cur, uchar *d_cur, uchar *l_cur, uchar *r_cur,
-                uchar *disp_selected_pyr_new, uchar *disp_selected_pyr_cur,
-                uchar *data_cost_selected, uchar *data_cost, oclMat &temp, StereoConstantSpaceBP rthis,
-                size_t msg_step1, size_t msg_step2, int h, int w, int nr_plane,
-                int h2, int w2, int nr_plane2)
-            {
-                Context  *clCxt = temp.clCxt;
-                int data_type = rthis.msg_type;
-
-                String kernelName = get_kernel_name("init_message_", data_type);
-
-                cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
-
-                //size_t blockSize = 256;
-                size_t localThreads[]  = {32, 8, 1};
-                size_t globalThreads[] = { roundUp(w, localThreads[0]), roundUp(h, localThreads[1]), 1 };
-
-                int disp_step1 = msg_step1 * h;
-                int disp_step2 = msg_step2 * h2;
-                openCLVerifyKernel(clCxt, kernel, localThreads);
-                openCLSafeCall(clSetKernelArg(kernel, 0,  sizeof(cl_mem), (void *)&u_new));
-                openCLSafeCall(clSetKernelArg(kernel, 1,  sizeof(cl_mem), (void *)&d_new));
-                openCLSafeCall(clSetKernelArg(kernel, 2,  sizeof(cl_mem), (void *)&l_new));
-                openCLSafeCall(clSetKernelArg(kernel, 3,  sizeof(cl_mem), (void *)&r_new));
-                openCLSafeCall(clSetKernelArg(kernel, 4,  sizeof(cl_mem), (void *)&u_cur));
-                openCLSafeCall(clSetKernelArg(kernel, 5,  sizeof(cl_mem), (void *)&d_cur));
-                openCLSafeCall(clSetKernelArg(kernel, 6,  sizeof(cl_mem), (void *)&l_cur));
-                openCLSafeCall(clSetKernelArg(kernel, 7,  sizeof(cl_mem), (void *)&r_cur));
-                openCLSafeCall(clSetKernelArg(kernel, 8,  sizeof(cl_mem), (void *)&temp.data));
-                openCLSafeCall(clSetKernelArg(kernel, 9,  sizeof(cl_mem), (void *)&disp_selected_pyr_new));
-                openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *)&disp_selected_pyr_cur));
-                openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *)&data_cost_selected));
-                openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *)&data_cost));
-                openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&h));
-                openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&w));
-                openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&nr_plane));
-                openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&h2));
-                openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_int), (void *)&w2));
-                openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&nr_plane2));
-                openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&disp_step1));
-                openCLSafeCall(clSetKernelArg(kernel, 20, sizeof(cl_int), (void *)&disp_step2));
-                openCLSafeCall(clSetKernelArg(kernel, 21, sizeof(cl_int), (void *)&msg_step1));
-                openCLSafeCall(clSetKernelArg(kernel, 22, sizeof(cl_int), (void *)&msg_step2));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
-                    globalThreads, localThreads, 0, NULL, NULL));
-
-                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
-                openCLSafeCall(clReleaseKernel(kernel));
-            }
-            ////////////////////////////////////////////////////////////////////////////////////////////////
-            ///////////////////////////calc_all_iterations////////////////////////////////////////////////
-            //////////////////////////////////////////////////////////////////////////////////////////////
-            static void calc_all_iterations_caller(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
-                uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis,
-                int msg_step, int h, int w, int nr_plane, int i)
-            {
-                Context  *clCxt = temp.clCxt;
-                int data_type = rthis.msg_type;
-
-                String kernelName = get_kernel_name("compute_message_", data_type);
-
-                cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
-                size_t localThreads[]  = {32, 8, 1};
-                size_t globalThreads[] = {divUp(w, (localThreads[0]) << 1) *localThreads[0],
-                    divUp(h, localThreads[1]) *localThreads[1],
-                    1
-                };
-
-                int disp_step = msg_step * h;
-                openCLVerifyKernel(clCxt, kernel, localThreads);
-                openCLSafeCall(clSetKernelArg(kernel, 0,  sizeof(cl_mem),  (void *)&u));
-                openCLSafeCall(clSetKernelArg(kernel, 1,  sizeof(cl_mem),  (void *)&d));
-                openCLSafeCall(clSetKernelArg(kernel, 2,  sizeof(cl_mem),  (void *)&l));
-                openCLSafeCall(clSetKernelArg(kernel, 3,  sizeof(cl_mem),  (void *)&r));
-                openCLSafeCall(clSetKernelArg(kernel, 4,  sizeof(cl_mem),  (void *)&data_cost_selected));
-                openCLSafeCall(clSetKernelArg(kernel, 5,  sizeof(cl_mem),  (void *)&disp_selected_pyr));
-                openCLSafeCall(clSetKernelArg(kernel, 6,  sizeof(cl_mem),  (void *)&temp.data));
-                openCLSafeCall(clSetKernelArg(kernel, 7,  sizeof(cl_int),  (void *)&h));
-                openCLSafeCall(clSetKernelArg(kernel, 8,  sizeof(cl_int),  (void *)&w));
-                openCLSafeCall(clSetKernelArg(kernel, 9,  sizeof(cl_int),  (void *)&nr_plane));
-                openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int),  (void *)&i));
-                openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_float), (void *)&rthis.max_disc_term));
-                openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int),  (void *)&disp_step));
-                openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int),  (void *)&msg_step));
-                openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.disc_single_jump));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
-                    globalThreads, localThreads, 0, NULL, NULL));
-
-                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
-                openCLSafeCall(clReleaseKernel(kernel));
-            }
-            static void calc_all_iterations(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
-                uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis,
-                int msg_step, int h, int w, int nr_plane)
-            {
-                for(int t = 0; t < rthis.iters; t++)
-                    calc_all_iterations_caller(u, d, l, r, data_cost_selected, disp_selected_pyr, temp, rthis,
-                    msg_step, h, w, nr_plane, t & 1);
-            }
-
-            ///////////////////////////////////////////////////////////////////////////////////////////////
-            //////////////////////////compute_disp////////////////////////////////////////////////////////
-            /////////////////////////////////////////////////////////////////////////////////////////////
-            static void compute_disp(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
-                uchar *disp_selected_pyr, StereoConstantSpaceBP &rthis, size_t msg_step,
-                oclMat &disp, int nr_plane)
-            {
-                Context  *clCxt = disp.clCxt;
-                int data_type = rthis.msg_type;
-
-                String kernelName = get_kernel_name("compute_disp_", data_type);
-
-                cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
-
-                //size_t blockSize = 256;
-                size_t localThreads[]  = { 32, 8, 1 };
-                size_t globalThreads[] = { roundUp(disp.cols, localThreads[0]), roundUp(disp.rows, localThreads[1]), 1 };
-
-                int step_size = disp.step / disp.elemSize();
-                int disp_step = disp.rows * msg_step;
-                openCLVerifyKernel(clCxt, kernel, localThreads);
-                openCLSafeCall(clSetKernelArg(kernel, 0,  sizeof(cl_mem), (void *)&u));
-                openCLSafeCall(clSetKernelArg(kernel, 1,  sizeof(cl_mem), (void *)&d));
-                openCLSafeCall(clSetKernelArg(kernel, 2,  sizeof(cl_mem), (void *)&l));
-                openCLSafeCall(clSetKernelArg(kernel, 3,  sizeof(cl_mem), (void *)&r));
-                openCLSafeCall(clSetKernelArg(kernel, 4,  sizeof(cl_mem), (void *)&data_cost_selected));
-                openCLSafeCall(clSetKernelArg(kernel, 5,  sizeof(cl_mem), (void *)&disp_selected_pyr));
-                openCLSafeCall(clSetKernelArg(kernel, 6,  sizeof(cl_mem), (void *)&disp.data));
-                openCLSafeCall(clSetKernelArg(kernel, 7,  sizeof(cl_int), (void *)&step_size));
-                openCLSafeCall(clSetKernelArg(kernel, 8,  sizeof(cl_int), (void *)&disp.cols));
-                openCLSafeCall(clSetKernelArg(kernel, 9,  sizeof(cl_int), (void *)&disp.rows));
-                openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&nr_plane));
-                openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&msg_step));
-                openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step));
-                openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getClCommandQueuePtr(), kernel, 2, NULL,
-                    globalThreads, localThreads, 0, NULL, NULL));
-
-                clFinish(*(cl_command_queue*)getClCommandQueuePtr());
-                openCLSafeCall(clReleaseKernel(kernel));
-            }
-        }
-    }
-}
-namespace
-{
-    const float DEFAULT_MAX_DATA_TERM = 30.0f;
-    const float DEFAULT_DATA_WEIGHT = 1.0f;
-    const float DEFAULT_MAX_DISC_TERM = 160.0f;
-    const float DEFAULT_DISC_SINGLE_JUMP = 10.0f;
-}
-
-void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane)
-{
-    ndisp = (int) ((float) width / 3.14f);
-    if ((ndisp & 1) != 0)
-        ndisp++;
-
-    int mm = ::max(width, height);
-    iters = mm / 100 + ((mm > 1200) ? - 4 : 4);
-
-    levels = (int)::log(static_cast<double>(mm)) * 2 / 3;
-    if (levels == 0) levels++;
-
-    nr_plane = (int) ((float) ndisp / std::pow(2.0, levels + 1));
-}
-
-cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
-    int msg_type_)
-
-    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
-    max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
-    max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP), min_disp_th(0),
-    msg_type(msg_type_), use_local_init_data_cost(true)
-{
-    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
-}
-
-
-cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
-    float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_,
-    int min_disp_th_, int msg_type_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
-    max_data_term(max_data_term_), data_weight(data_weight_),
-    max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_), min_disp_th(min_disp_th_),
-    msg_type(msg_type_), use_local_init_data_cost(true)
-{
-    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
-}
-
-template<class T>
-static void csbp_operator(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2],
-    oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected,
-    oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp)
-{
-    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
-        && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());
-
-    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3));
-
-    const Scalar zero = Scalar::all(0);
-
-    ////////////////////////////////////Init///////////////////////////////////////////////////
-    int rows = left.rows;
-    int cols = left.cols;
-
-    rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
-    int levels = rthis.levels;
-
-    AutoBuffer<int> buf(levels * 4);
-
-    int *cols_pyr = buf;
-    int *rows_pyr = cols_pyr + levels;
-    int *nr_plane_pyr = rows_pyr + levels;
-    int *step_pyr = nr_plane_pyr + levels;
-
-    cols_pyr[0] = cols;
-    rows_pyr[0] = rows;
-    nr_plane_pyr[0] = rthis.nr_plane;
-
-    const int n = 64;
-    step_pyr[0] = alignSize(cols * sizeof(T), n) / sizeof(T);
-    for (int i = 1; i < levels; i++)
-    {
-        cols_pyr[i] = cols_pyr[i - 1]  / 2;
-        rows_pyr[i] = rows_pyr[i - 1]/ 2;
-
-        nr_plane_pyr[i] = nr_plane_pyr[i - 1] * 2;
-
-        step_pyr[i] = alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T);
-    }
-
-    Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]);
-    Size data_cost_size(step_pyr[0], rows * nr_plane_pyr[0] * 2);
-
-    u[0].create(msg_size, DataType<T>::type);
-    d[0].create(msg_size, DataType<T>::type);
-    l[0].create(msg_size, DataType<T>::type);
-    r[0].create(msg_size, DataType<T>::type);
-
-    u[1].create(msg_size, DataType<T>::type);
-    d[1].create(msg_size, DataType<T>::type);
-    l[1].create(msg_size, DataType<T>::type);
-    r[1].create(msg_size, DataType<T>::type);
-
-    disp_selected_pyr[0].create(msg_size, DataType<T>::type);
-    disp_selected_pyr[1].create(msg_size, DataType<T>::type);
-
-    data_cost.create(data_cost_size, DataType<T>::type);
-    data_cost_selected.create(msg_size, DataType<T>::type);
-
-    Size temp_size = data_cost_size;
-    if (data_cost_size.width * data_cost_size.height < step_pyr[0] * rows_pyr[levels - 1] * rthis.ndisp)
-        temp_size = Size(step_pyr[0], rows_pyr[levels - 1] * rthis.ndisp);
-
-    temp.create(temp_size, DataType<T>::type);
-    temp = zero;
-
-    ///////////////////////////////// Compute////////////////////////////////////////////////
-
-    //csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight,
-    //   rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);
-
-    l[0] = zero;
-    d[0] = zero;
-    r[0] = zero;
-    u[0] = zero;
-    disp_selected_pyr[0] = zero;
-
-    l[1] = zero;
-    d[1] = zero;
-    r[1] = zero;
-    u[1] = zero;
-    disp_selected_pyr[1] = zero;
-
-    data_cost = zero;
-
-    data_cost_selected = zero;
-
-    int cur_idx = 0;
-
-    for (int i = levels - 1; i >= 0; i--)
-    {
-        if (i == levels - 1)
-        {
-            cv::ocl::stereoCSBP::init_data_cost(left, right, temp, rthis, disp_selected_pyr[cur_idx].data,
-                data_cost_selected.data, step_pyr[0], rows_pyr[i], cols_pyr[i],
-                i, nr_plane_pyr[i]);
-        }
-        else
-        {
-            cv::ocl::stereoCSBP::compute_data_cost(
-                disp_selected_pyr[cur_idx].data, data_cost.data, rthis, step_pyr[0],
-                step_pyr[0], left, right, rows_pyr[i], cols_pyr[i], rows_pyr[i + 1], i,
-                nr_plane_pyr[i + 1]);
-
-            int new_idx = (cur_idx + 1) & 1;
-
-            cv::ocl::stereoCSBP::init_message(u[new_idx].data, d[new_idx].data, l[new_idx].data, r[new_idx].data,
-                u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
-                disp_selected_pyr[new_idx].data, disp_selected_pyr[cur_idx].data,
-                data_cost_selected.data, data_cost.data, temp, rthis, step_pyr[0],
-                step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rows_pyr[i + 1],
-                cols_pyr[i + 1], nr_plane_pyr[i + 1]);
-            cur_idx = new_idx;
-        }
-        cv::ocl::stereoCSBP::calc_all_iterations(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
-            data_cost_selected.data, disp_selected_pyr[cur_idx].data, temp,
-            rthis, step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i]);
-    }
-
-    if (disp.empty())
-        disp.create(rows, cols, CV_16S);
-
-    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
-    out = zero;
-
-    stereoCSBP::compute_disp(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
-        data_cost_selected.data, disp_selected_pyr[cur_idx].data, rthis, step_pyr[0],
-        out, nr_plane_pyr[0]);
-    if (disp.type() != CV_16S)
-        out.convertTo(disp, disp.type());
-}
-
-
-typedef void (*csbp_operator_t)(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2],
-    oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected,
-    oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp);
-
-const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator<short>, 0, csbp_operator<float>, 0, 0};
-
-void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &left, const oclMat &right, oclMat &disp)
-{
-
-    CV_Assert(msg_type == CV_32F || msg_type == CV_16S);
-    operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out,
-        left, right, disp);
-}
diff --git a/modules/ocl/src/stereobm.cpp b/modules/ocl/src/stereobm.cpp
deleted file mode 100644
index 4bfa80f8f..000000000
--- a/modules/ocl/src/stereobm.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Xiaopeng Fu, xiaopeng@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-namespace cv
-{
-namespace ocl
-{
-namespace stereoBM
-{
-/////////////////////////////////////////////////////////////////////////
-//////////////////////////prefilter_xsbel////////////////////////////////
-////////////////////////////////////////////////////////////////////////
-static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterCap)
-{
-    String kernelName = "prefilter_xsobel";
-
-    size_t blockSize = 1;
-    size_t globalThreads[3] = { input.cols, input.rows, 1 };
-    size_t localThreads[3]  = { blockSize, blockSize, 1 };
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&prefilterCap));
-
-    openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
-        globalThreads, localThreads, args, -1, -1);
-}
-//////////////////////////////////////////////////////////////////////////
-//////////////////////////////common////////////////////////////////////
-////////////////////////////////////////////////////////////////////////
-#define N_DISPARITIES 8
-#define ROWSperTHREAD 21
-#define BLOCK_W 128
-
-////////////////////////////////////////////////////////////////////////////
-///////////////////////////////stereoBM_GPU////////////////////////////////
-////////////////////////////////////////////////////////////////////////////
-static void stereo_bm(const oclMat &left, const oclMat &right,  oclMat &disp,
-               int maxdisp, int winSize,  oclMat &minSSD_buf)
-{
-    int winsz2 = winSize >> 1;
-
-    String kernelName = "stereoKernel";
-
-    disp.setTo(Scalar_<unsigned char>::all(0));
-    minSSD_buf.setTo(Scalar_<unsigned int>::all(0xFFFFFFFF));
-
-    size_t minssd_step = minSSD_buf.step / minSSD_buf.elemSize();
-    size_t local_mem_size = (N_DISPARITIES * (BLOCK_W + 2 * winsz2)) *
-                            sizeof(cl_uint);
-    //size_t blockSize = 1;
-    size_t localThreads[]  = { BLOCK_W, 1, 1 };
-    size_t globalThreads[] = { left.cols - maxdisp - 2 * winsz2,
-                               divUp(left.rows - 2 * winsz2, ROWSperTHREAD),
-                               1 };
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&right.data));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&minSSD_buf.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&minssd_step));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disp.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&disp.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&maxdisp));
-    args.push_back(std::make_pair(local_mem_size, (void *)NULL));
-
-    char opt [128];
-    sprintf(opt, "-D radius=%d", winsz2);
-    openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
-        globalThreads, localThreads, args, -1, -1, opt);
-}
-////////////////////////////////////////////////////////////////////////////
-///////////////////////////////postfilter_textureness///////////////////////
-////////////////////////////////////////////////////////////////////////////
-static void postfilter_textureness(oclMat &left, int winSize,
-                            float avergeTexThreshold, oclMat &disparity)
-{
-    String kernelName = "textureness_kernel";
-
-    size_t blockSize = 1;
-    size_t localThreads[]  = { BLOCK_W, blockSize ,1};
-    size_t globalThreads[] = { left.cols,
-                               divUp(left.rows, 2 * ROWSperTHREAD),
-                               1 };
-
-    size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float);
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disparity.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.step));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void *)&winSize));
-    args.push_back(std::make_pair(sizeof(cl_float), (void *)&avergeTexThreshold));
-    args.push_back(std::make_pair(local_mem_size, (void*)NULL));
-    openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
-        globalThreads, localThreads, args, -1, -1);
-}
-//////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////operator/////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////
-static void operator_(oclMat &minSSD, oclMat &leBuf, oclMat &riBuf, int preset, int ndisp,
-               int winSize, float avergeTexThreshold, const oclMat &left,
-               const oclMat &right, oclMat &disparity)
-
-{
-    CV_DbgAssert(left.rows == right.rows && left.cols == right.cols);
-    CV_DbgAssert(left.type() == CV_8UC1);
-    CV_DbgAssert(right.type() == CV_8UC1);
-
-    disparity.create(left.size(), CV_8UC1);
-    minSSD.create(left.size(), CV_32SC1);
-
-    oclMat le_for_bm =  left;
-    oclMat ri_for_bm = right;
-
-    if (preset == cv::ocl::StereoBM_OCL::PREFILTER_XSOBEL)
-    {
-        leBuf.create( left.size(),  left.type());
-        riBuf.create(right.size(), right.type());
-
-        prefilter_xsobel( left, leBuf, 31);
-        prefilter_xsobel(right, riBuf, 31);
-
-        le_for_bm = leBuf;
-        ri_for_bm = riBuf;
-    }
-
-    stereo_bm(le_for_bm, ri_for_bm, disparity, ndisp, winSize, minSSD);
-
-    if (avergeTexThreshold)
-    {
-        postfilter_textureness(le_for_bm, winSize, avergeTexThreshold, disparity);
-    }
-}
-}
-}
-}
-const float defaultAvgTexThreshold = 3;
-
-cv::ocl::StereoBM_OCL::StereoBM_OCL()
-    : preset(BASIC_PRESET), ndisp(DEFAULT_NDISP), winSize(DEFAULT_WINSZ),
-      avergeTexThreshold(defaultAvgTexThreshold)  {}
-
-cv::ocl::StereoBM_OCL::StereoBM_OCL(int preset_, int ndisparities_, int winSize_)
-    : preset(preset_), ndisp(ndisparities_), winSize(winSize_),
-      avergeTexThreshold(defaultAvgTexThreshold)
-{
-    const int max_supported_ndisp = 1 << (sizeof(unsigned char) * 8);
-    CV_Assert(0 < ndisp && ndisp <= max_supported_ndisp);
-    CV_Assert(ndisp % 8 == 0);
-    CV_Assert(winSize % 2 == 1);
-}
-
-bool cv::ocl::StereoBM_OCL::checkIfGpuCallReasonable()
-{
-    return true;
-}
-
-void cv::ocl::StereoBM_OCL::operator() ( const oclMat &left, const oclMat &right,
-        oclMat &disparity)
-{
-    cv::ocl::stereoBM::operator_(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity);
-}
diff --git a/modules/ocl/src/stereobp.cpp b/modules/ocl/src/stereobp.cpp
deleted file mode 100644
index a564c3d33..000000000
--- a/modules/ocl/src/stereobp.cpp
+++ /dev/null
@@ -1,502 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Peng Xiao,   pengxiao@outlook.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-namespace cv
-{
-    namespace ocl
-    {
-        namespace stereoBP
-        {
-            //////////////////////////////////////////////////////////////////////////
-            //////////////////////////////common////////////////////////////////////
-            ////////////////////////////////////////////////////////////////////////
-            typedef struct
-            {
-                int   cndisp;
-                float cmax_data_term;
-                float cdata_weight;
-                float cmax_disc_term;
-                float cdisc_single_jump;
-            } con_struct_t;
-
-            cl_mem cl_con_struct =  NULL;
-            static void load_constants(int ndisp, float max_data_term, float data_weight,
-                                float max_disc_term, float disc_single_jump)
-            {
-                con_struct_t *con_struct = new con_struct_t;
-                con_struct -> cndisp            = ndisp;
-                con_struct -> cmax_data_term    = max_data_term;
-                con_struct -> cdata_weight      = data_weight;
-                con_struct -> cmax_disc_term    = max_disc_term;
-                con_struct -> cdisc_single_jump = disc_single_jump;
-
-                Context* clCtx = Context::getContext();
-                cl_context clContext = *(cl_context*)(clCtx->getOpenCLContextPtr());
-                cl_command_queue clCmdQueue = *(cl_command_queue*)(clCtx->getOpenCLCommandQueuePtr());
-                cl_con_struct = load_constant(clContext, clCmdQueue, (void *)con_struct,
-                                              sizeof(con_struct_t));
-
-                delete con_struct;
-            }
-            static void release_constants()
-            {
-                openCLFree(cl_con_struct);
-            }
-
-            /////////////////////////////////////////////////////////////////////////////
-            ///////////////////////////comp data////////////////////////////////////////
-            /////////////////////////////////////////////////////////////////////////
-            static void  comp_data_call(const oclMat &left, const oclMat &right, oclMat &data, int /*disp*/,
-                float /*cmax_data_term*/, float /*cdata_weight*/)
-            {
-                Context  *clCxt = left.clCxt;
-                int channels = left.oclchannels();
-                int data_type = data.type();
-
-                String kernelName = "comp_data";
-
-                std::vector<std::pair<size_t , const void *> > args;
-
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&left.data));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&left.rows));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&left.cols));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&left.step));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&right.data));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&right.step));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&data.step));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&cl_con_struct));
-
-                size_t gt[3] = {left.cols, left.rows, 1}, lt[3] = {16, 16, 1};
-
-                const int OPT_SIZE = 50;
-                char cn_opt [OPT_SIZE] = "";
-                sprintf( cn_opt, "%s -D CN=%d",
-                    (data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT"),
-                    channels
-                    );
-                openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, cn_opt);
-            }
-            ///////////////////////////////////////////////////////////////////////////////////
-            /////////////////////////data set down////////////////////////////////////////////
-            /////////////////////////////////////////////////////////////////////////////////
-            static void data_step_down_call(int dst_cols, int dst_rows, int src_rows,
-                const oclMat &src, oclMat &dst, int disp)
-            {
-                Context  *clCxt = src.clCxt;
-                int data_type = src.type();
-
-                String kernelName = "data_step_down";
-
-                std::vector<std::pair<size_t , const void *> > args;
-
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_rows));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_rows));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_cols));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.step));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&disp));
-
-                size_t gt[3] = {dst_cols, dst_rows, 1}, lt[3] = {16, 16, 1};
-                const char* t_opt  = data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT";
-                openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, t_opt);
-            }
-            /////////////////////////////////////////////////////////////////////////////////
-            ///////////////////////////live up message////////////////////////////////////////
-            /////////////////////////////////////////////////////////////////////////////////
-            static void level_up_message_call(int dst_cols, int dst_rows, int src_rows,
-                oclMat &src, oclMat &dst, int ndisp)
-            {
-                Context  *clCxt = src.clCxt;
-                int data_type = src.type();
-
-                String kernelName = "level_up_message";
-                std::vector<std::pair<size_t , const void *> > args;
-
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_rows));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_rows));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_cols));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.step));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&ndisp));
-
-                size_t gt[3] = {dst_cols, dst_rows, 1}, lt[3] = {16, 16, 1};
-                const char* t_opt  = data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT";
-                openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, t_opt);
-            }
-            static void level_up_messages_calls(int dst_idx, int dst_cols, int dst_rows, int src_rows,
-                                         oclMat *mus, oclMat *mds, oclMat *mls, oclMat *mrs,
-                                         int ndisp)
-            {
-                int src_idx = (dst_idx + 1) & 1;
-
-                level_up_message_call(dst_cols, dst_rows, src_rows,
-                                      mus[src_idx], mus[dst_idx], ndisp);
-
-                level_up_message_call(dst_cols, dst_rows, src_rows,
-                                      mds[src_idx], mds[dst_idx], ndisp);
-
-                level_up_message_call(dst_cols, dst_rows, src_rows,
-                                      mls[src_idx], mls[dst_idx], ndisp);
-
-                level_up_message_call(dst_cols, dst_rows, src_rows,
-                                      mrs[src_idx], mrs[dst_idx], ndisp);
-            }
-            //////////////////////////////////////////////////////////////////////////////////
-            //////////////////////////////cals_all_iterations_call///////////////////////////
-            /////////////////////////////////////////////////////////////////////////////////
-            static void calc_all_iterations_call(int cols, int rows, oclMat &u, oclMat &d,
-                oclMat &l, oclMat &r, oclMat &data,
-                int t, int cndisp, float cmax_disc_term,
-                float cdisc_single_jump)
-            {
-                Context  *clCxt = l.clCxt;
-                int data_type = u.type();
-
-                String kernelName = "one_iteration";
-
-                std::vector<std::pair<size_t , const void *> > args;
-
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&u.data));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&u.step));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&data.step));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&d.data));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&l.data));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&r.data));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&rows));
-                args.push_back( std::make_pair( sizeof(cl_float) , (void *)&cmax_disc_term));
-                args.push_back( std::make_pair( sizeof(cl_float) , (void *)&cdisc_single_jump));
-
-                size_t gt[3] = {cols, rows, 1}, lt[3] = {16, 16, 1};
-                char opt[80] = "";
-                sprintf(opt, "-D %s -D CNDISP=%d", data_type == CV_16S ? "T_SHORT":"T_FLOAT", cndisp);
-                openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, opt);
-            }
-
-            static void calc_all_iterations_calls(int cols, int rows, int iters, oclMat &u,
-                                           oclMat &d, oclMat &l, oclMat &r,
-                                           oclMat &data, int cndisp, float cmax_disc_term,
-                                           float cdisc_single_jump)
-            {
-                for(int t = 0; t < iters; ++t)
-                    calc_all_iterations_call(cols, rows, u, d, l, r, data, t, cndisp,
-                                             cmax_disc_term, cdisc_single_jump);
-            }
-            ///////////////////////////////////////////////////////////////////////////////
-            ///////////////////////output///////////////////////////////////////////////////
-            ////////////////////////////////////////////////////////////////////////////////
-            static void output_call(const oclMat &u, const oclMat &d, const oclMat l, const oclMat &r,
-                const oclMat &data, oclMat &disp, int ndisp)
-            {
-                Context  *clCxt = u.clCxt;
-                int data_type = u.type();
-
-                String kernelName = "output";
-
-                std::vector<std::pair<size_t , const void *> > args;
-
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&u.data));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&u.step));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&d.data));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&l.data));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&r.data));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data));
-                args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&disp.data));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&disp.rows));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&disp.cols));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&disp.step));
-                args.push_back( std::make_pair( sizeof(cl_int) , (void *)&ndisp));
-
-                size_t gt[3] = {disp.cols, disp.rows, 1}, lt[3] = {16, 16, 1};
-                const char* t_opt  = data_type == CV_16S ? "-D T_SHORT":"-D T_FLOAT";
-                openCLExecuteKernel(clCxt, &stereobp, kernelName, gt, lt, args, -1, -1, t_opt);
-            }
-        }
-    }
-}
-namespace
-{
-    const float DEFAULT_MAX_DATA_TERM = 10.0f;
-    const float DEFAULT_DATA_WEIGHT = 0.07f;
-    const float DEFAULT_MAX_DISC_TERM = 1.7f;
-    const float DEFAULT_DISC_SINGLE_JUMP = 1.0f;
-}
-
-void cv::ocl::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels)
-{
-    ndisp = width / 4;
-    if ((ndisp & 1) != 0)
-        ndisp++;
-
-    int mm = ::max(width, height);
-    iters = mm / 100 + 2;
-
-    levels = (int)(::log(static_cast<double>(mm)) + 1) * 4 / 5;
-    if (levels == 0) levels++;
-}
-
-cv::ocl::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp_, int iters_, int levels_, int msg_type_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_),
-      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
-      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP),
-      msg_type(msg_type_), datas(levels_)
-{
-}
-
-cv::ocl::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp_, int iters_, int levels_, float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_, int msg_type_)
-    : ndisp(ndisp_), iters(iters_), levels(levels_),
-      max_data_term(max_data_term_), data_weight(data_weight_),
-      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_),
-      msg_type(msg_type_), datas(levels_)
-{
-}
-
-namespace
-{
-    class StereoBeliefPropagationImpl
-    {
-    public:
-        StereoBeliefPropagationImpl(StereoBeliefPropagation &rthis_,
-                                    oclMat &u_, oclMat &d_, oclMat &l_, oclMat &r_,
-                                    oclMat &u2_, oclMat &d2_, oclMat &l2_, oclMat &r2_,
-                                    std::vector<oclMat> &datas_, oclMat &out_)
-            : rthis(rthis_), u(u_), d(d_), l(l_), r(r_), u2(u2_), d2(d2_), l2(l2_), r2(r2_), datas(datas_), out(out_),
-              zero(Scalar::all(0)), scale(rthis_.msg_type == CV_32F ? 1.0f : 10.0f)
-        {
-            CV_Assert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels);
-            CV_Assert(rthis.msg_type == CV_32F || rthis.msg_type == CV_16S);
-            CV_Assert(rthis.msg_type == CV_32F || (1 << (rthis.levels - 1)) * scale * rthis.max_data_term < std::numeric_limits<short>::max());
-        }
-
-        void operator()(const oclMat &left, const oclMat &right, oclMat &disp)
-        {
-            CV_Assert(left.size() == right.size() && left.type() == right.type());
-            CV_Assert(left.type() == CV_8UC1 || left.type() == CV_8UC3 || left.type() == CV_8UC4);
-
-            rows = left.rows;
-            cols = left.cols;
-
-            int divisor = (int)pow(2.f, rthis.levels - 1.0f);
-            int lowest_cols = cols / divisor;
-            int lowest_rows = rows / divisor;
-            const int min_image_dim_size = 2;
-            CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size);
-
-            init();
-
-            datas[0].create(rows * rthis.ndisp, cols, rthis.msg_type);
-            datas[0].setTo(Scalar_<short>::all(0));
-
-            cv::ocl::stereoBP::comp_data_call(left, right, datas[0], rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight);
-            calcBP(disp);
-        }
-
-        void operator()(const oclMat &data, oclMat &disp)
-        {
-            CV_Assert((data.type() == rthis.msg_type) && (data.rows % rthis.ndisp == 0));
-
-            rows = data.rows / rthis.ndisp;
-            cols = data.cols;
-
-            int divisor = (int)pow(2.f, rthis.levels - 1.0f);
-            int lowest_cols = cols / divisor;
-            int lowest_rows = rows / divisor;
-            const int min_image_dim_size = 2;
-            CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size);
-
-            init();
-
-            datas[0] = data;
-
-            calcBP(disp);
-        }
-    private:
-        void init()
-        {
-            u.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            d.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            l.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            r.create(rows * rthis.ndisp, cols, rthis.msg_type);
-
-            if (rthis.levels & 1)
-            {
-                //can clear less area
-                u = zero;
-                d = zero;
-                l = zero;
-                r = zero;
-            }
-
-            if (rthis.levels > 1)
-            {
-                int less_rows = (rows + 1) / 2;
-                int less_cols = (cols + 1) / 2;
-
-                u2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                d2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                l2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                r2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-
-                if ((rthis.levels & 1) == 0)
-                {
-                    u2 = zero;
-                    d2 = zero;
-                    l2 = zero;
-                    r2 = zero;
-                }
-            }
-
-            cv::ocl::stereoBP::load_constants(rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight,
-                                              scale * rthis.max_disc_term, scale * rthis.disc_single_jump);
-
-            datas.resize(rthis.levels);
-            cols_all.resize(rthis.levels);
-            rows_all.resize(rthis.levels);
-
-            cols_all[0] = cols;
-            rows_all[0] = rows;
-        }
-
-        void calcBP(oclMat &disp)
-        {
-            using namespace cv::ocl::stereoBP;
-
-            for (int i = 1; i < rthis.levels; ++i)
-            {
-                cols_all[i] = (cols_all[i - 1] + 1) / 2;
-                rows_all[i] = (rows_all[i - 1] + 1) / 2;
-
-                datas[i].create(rows_all[i] * rthis.ndisp, cols_all[i], rthis.msg_type);
-                datas[i].setTo(Scalar_<short>::all(0));
-
-                data_step_down_call(cols_all[i], rows_all[i], rows_all[i - 1],
-                                    datas[i - 1], datas[i], rthis.ndisp);
-            }
-
-            oclMat mus[] = {u, u2};
-            oclMat mds[] = {d, d2};
-            oclMat mrs[] = {r, r2};
-            oclMat mls[] = {l, l2};
-
-            int mem_idx = (rthis.levels & 1) ? 0 : 1;
-
-            for (int i = rthis.levels - 1; i >= 0; --i)
-            {
-                // for lower level we have already computed messages by setting to zero
-                if (i != rthis.levels - 1)
-                    level_up_messages_calls(mem_idx, cols_all[i], rows_all[i], rows_all[i + 1],
-                                            mus, mds, mls, mrs, rthis.ndisp);
-
-                calc_all_iterations_calls(cols_all[i], rows_all[i], rthis.iters, mus[mem_idx],
-                                          mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas[i],
-                                          rthis.ndisp, scale * rthis.max_disc_term,
-                                          scale * rthis.disc_single_jump);
-
-                mem_idx = (mem_idx + 1) & 1;
-            }
-            if (disp.empty())
-                disp.create(rows, cols, CV_16S);
-
-            out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
-            out = zero;
-
-            output_call(u, d, l, r, datas.front(), out, rthis.ndisp);
-
-            if (disp.type() != CV_16S)
-                out.convertTo(disp, disp.type());
-
-            release_constants();
-        }
-        StereoBeliefPropagationImpl& operator=(const StereoBeliefPropagationImpl&);
-
-        StereoBeliefPropagation &rthis;
-
-        oclMat &u;
-        oclMat &d;
-        oclMat &l;
-        oclMat &r;
-
-        oclMat &u2;
-        oclMat &d2;
-        oclMat &l2;
-        oclMat &r2;
-
-        std::vector<oclMat> &datas;
-        oclMat &out;
-
-        const Scalar zero;
-        const float scale;
-
-        int rows, cols;
-
-        std::vector<int> cols_all, rows_all;
-    };
-}
-
-void cv::ocl::StereoBeliefPropagation::operator()(const oclMat &left, const oclMat &right, oclMat &disp)
-{
-    ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
-    impl(left, right, disp);
-}
-
-void cv::ocl::StereoBeliefPropagation::operator()(const oclMat &data, oclMat &disp)
-{
-    ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
-    impl(data, disp);
-}
diff --git a/modules/ocl/src/svm.cpp b/modules/ocl/src/svm.cpp
deleted file mode 100644
index 892743888..000000000
--- a/modules/ocl/src/svm.cpp
+++ /dev/null
@@ -1,1136 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Erping Pang, erping@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-// TODO Remove this after HAVE_CLAMDBLAS eliminating
-#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
-#  pragma GCC diagnostic ignored "-Wunused-but-set-variable"
-#endif
-
-using namespace cv;
-using namespace ocl;
-
-namespace cv { namespace ocl {
-
-#if 1
-typedef float Qfloat;
-#define QFLOAT_TYPE CV_32F
-#else
-typedef double Qfloat;
-#define QFLOAT_TYPE CV_64F
-#endif
-
-class CvSVMKernel_ocl: public CvSVMKernel
-{
-public:
-    typedef void (CvSVMKernel_ocl::*Calc_ocl)( int vec_count, const int row_idx, Qfloat* results, Mat& src);
-    CvSVMKernel_ocl(const CvSVMParams* params, Calc_ocl _calc_func , Calc _calc_func1);
-
-    Calc_ocl calc_func_ocl;
-    bool create( const CvSVMParams* params, Calc_ocl _calc_func, Calc _calc_func1);
-
-    void calc( int vcount, const int row_idx, Qfloat* results, Mat& src);
-    void calc_linear( int vec_count, const int row_idx, Qfloat* results, Mat& src);
-
-    void calc_poly( int vec_count, const int row_idx, Qfloat* results, Mat& src);
-    void calc_sigmoid( int vec_count, const int row_idx, Qfloat* results, Mat& src);
-    void calc_non_rbf_base( int vec_count, const int row_idx, Qfloat* results, Mat& src);
-    void calc_rbf( int vec_count, const int row_idx, Qfloat* results, Mat& src);
-};
-
-class CvSVMSolver_ocl: public CvSVMSolver
-{
-public:
-    CvSVMSolver_ocl();
-    CvSVMSolver_ocl(const CvSVMParams *);
-    float* get_row_base( int i, bool* _existed, Mat& src);
-    bool solve_generic( CvSVMSolutionInfo& si );
-    float* get_row( int i, float* dst, Mat& src);
-};
-
-typedef struct CvSparseVecElem32f
-{
-    int idx;
-    float val;
-} CvSparseVecElem32f;
-
-static int icvCmpSparseVecElems( const void* a, const void* b )
-{
-    return ((CvSparseVecElem32f*)a)->idx - ((CvSparseVecElem32f*)b)->idx;
-}
-
-void cvPreparePredictData( const CvArr* sample, int dims_all, const CvMat* comp_idx,
-                           int class_count, const CvMat* prob, float** row_sample,
-                           int as_sparse CV_DEFAULT(0) );
-
-void  cvPreparePredictData( const CvArr* _sample, int dims_all,
-                            const CvMat* comp_idx, int class_count,
-                            const CvMat* prob, float** _row_sample,
-                            int as_sparse )
-{
-    float* row_sample = 0;
-    int* inverse_comp_idx = 0;
-
-    CV_FUNCNAME( "cvPreparePredictData" );
-
-    __CV_BEGIN__;
-
-    const CvMat* sample = (const CvMat*)_sample;
-    float* sample_data;
-    int sample_step;
-    int is_sparse = CV_IS_SPARSE_MAT(sample);
-    int d, sizes[CV_MAX_DIM];
-    int i, dims_selected;
-    int vec_size;
-
-    if( !is_sparse && !CV_IS_MAT(sample) )
-    {
-        CV_ERROR( !sample ? CV_StsNullPtr : CV_StsBadArg, "The sample is not a valid vector" );
-    }
-
-    if( cvGetElemType( sample ) != CV_32FC1 )
-    {
-        CV_ERROR( CV_StsUnsupportedFormat, "Input sample must have 32fC1 type" );
-    }
-
-    CV_CALL( d = cvGetDims( sample, sizes ));
-
-    if( !((is_sparse && d == 1) || (!is_sparse && d == 2 && (sample->rows == 1 || sample->cols == 1))) )
-    {
-        CV_ERROR( CV_StsBadSize, "Input sample must be 1-dimensional vector" );
-    }
-
-    if( d == 1 )
-        sizes[1] = 1;
-
-    if( sizes[0] + sizes[1] - 1 != dims_all )
-        CV_ERROR( CV_StsUnmatchedSizes,
-                  "The sample size is different from what has been used for training" );
-
-    if( !_row_sample )
-    {
-        CV_ERROR( CV_StsNullPtr, "INTERNAL ERROR: The row_sample pointer is NULL" );
-    }
-
-    if( comp_idx && (!CV_IS_MAT(comp_idx) || comp_idx->rows != 1 ||
-                     CV_MAT_TYPE(comp_idx->type) != CV_32SC1) )
-    {
-        CV_ERROR( CV_StsBadArg, "INTERNAL ERROR: invalid comp_idx" );
-    }
-
-    dims_selected = comp_idx ? comp_idx->cols : dims_all;
-
-    if( prob )
-    {
-        if( !CV_IS_MAT(prob) )
-        {
-            CV_ERROR( CV_StsBadArg, "The output matrix of probabilities is invalid" );
-        }
-
-        if( (prob->rows != 1 && prob->cols != 1) ||
-                (CV_MAT_TYPE(prob->type) != CV_32FC1 &&
-                 CV_MAT_TYPE(prob->type) != CV_64FC1) )
-            CV_ERROR( CV_StsBadSize,
-                      "The matrix of probabilities must be 1-dimensional vector of 32fC1 type" );
-
-        if( prob->rows + prob->cols - 1 != class_count )
-            CV_ERROR( CV_StsUnmatchedSizes,
-                      "The vector of probabilities must contain as many elements as "
-                      "the number of classes in the training set" );
-    }
-
-    vec_size = !as_sparse ? dims_selected * sizeof(row_sample[0]) :
-               (dims_selected + 1) * sizeof(CvSparseVecElem32f);
-
-    if( CV_IS_MAT(sample) )
-    {
-        sample_data = sample->data.fl;
-        sample_step = CV_IS_MAT_CONT(sample->type) ? 1 : sample->step / sizeof(row_sample[0]);
-
-        if( !comp_idx && CV_IS_MAT_CONT(sample->type) && !as_sparse )
-            *_row_sample = sample_data;
-        else
-        {
-            CV_CALL( row_sample = (float*)cvAlloc( vec_size ));
-
-            if( !comp_idx )
-                for( i = 0; i < dims_selected; i++ )
-                    row_sample[i] = sample_data[sample_step * i];
-            else
-            {
-                int* comp = comp_idx->data.i;
-                for( i = 0; i < dims_selected; i++ )
-                    row_sample[i] = sample_data[sample_step * comp[i]];
-            }
-
-            *_row_sample = row_sample;
-        }
-
-        if( as_sparse )
-        {
-            const float* src = (const float*)row_sample;
-            CvSparseVecElem32f* dst = (CvSparseVecElem32f*)row_sample;
-
-            dst[dims_selected].idx = -1;
-            for( i = dims_selected - 1; i >= 0; i-- )
-            {
-                dst[i].idx = i;
-                dst[i].val = src[i];
-            }
-        }
-    }
-    else
-    {
-        CvSparseNode* node;
-        CvSparseMatIterator mat_iterator;
-        const CvSparseMat* sparse = (const CvSparseMat*)sample;
-        assert( is_sparse );
-
-        node = cvInitSparseMatIterator( sparse, &mat_iterator );
-        CV_CALL( row_sample = (float*)cvAlloc( vec_size ));
-
-        if( comp_idx )
-        {
-            CV_CALL( inverse_comp_idx = (int*)cvAlloc( dims_all * sizeof(int) ));
-            memset( inverse_comp_idx, -1, dims_all * sizeof(int) );
-            for( i = 0; i < dims_selected; i++ )
-                inverse_comp_idx[comp_idx->data.i[i]] = i;
-        }
-
-        if( !as_sparse )
-        {
-            memset( row_sample, 0, vec_size );
-
-            for( ; node != 0; node = cvGetNextSparseNode(&mat_iterator) )
-            {
-                int idx = *CV_NODE_IDX( sparse, node );
-                if( inverse_comp_idx )
-                {
-                    idx = inverse_comp_idx[idx];
-                    if( idx < 0 )
-                        continue;
-                }
-                row_sample[idx] = *(float*)CV_NODE_VAL( sparse, node );
-            }
-        }
-        else
-        {
-            CvSparseVecElem32f* ptr = (CvSparseVecElem32f*)row_sample;
-
-            for( ; node != 0; node = cvGetNextSparseNode(&mat_iterator) )
-            {
-                int idx = *CV_NODE_IDX( sparse, node );
-                if( inverse_comp_idx )
-                {
-                    idx = inverse_comp_idx[idx];
-                    if( idx < 0 )
-                        continue;
-                }
-                ptr->idx = idx;
-                ptr->val = *(float*)CV_NODE_VAL( sparse, node );
-                ptr++;
-            }
-
-            qsort( row_sample, ptr - (CvSparseVecElem32f*)row_sample,
-                   sizeof(ptr[0]), icvCmpSparseVecElems );
-            ptr->idx = -1;
-        }
-
-        *_row_sample = row_sample;
-    }
-
-    __CV_END__;
-
-    if( inverse_comp_idx )
-        cvFree( &inverse_comp_idx );
-
-    if( cvGetErrStatus() < 0 && _row_sample )
-    {
-        cvFree( &row_sample );
-        *_row_sample = 0;
-    }
-}
-
-float CvSVM_OCL::predict( const int row_index, int row_len, Mat& src, bool returnDFVal ) const
-{
-    assert( kernel );
-
-    (void)row_len;
-
-    int class_count = class_labels ? class_labels->cols :
-                      params.svm_type == ONE_CLASS ? 1 : 0;
-
-    float result = 0;
-    cv::AutoBuffer<float> _buffer(sv_total + (class_count + 1) * 2);
-    float* buffer = _buffer;
-
-    if( params.svm_type == EPS_SVR ||
-            params.svm_type == NU_SVR ||
-            params.svm_type == ONE_CLASS )
-    {
-        CvSVMDecisionFunc* df = (CvSVMDecisionFunc*)decision_func;
-        int i, sv_count = df->sv_count;
-        double sum = -df->rho;
-
-        ((CvSVMKernel_ocl*)kernel)->calc( sv_count, row_index, buffer, src);
-        for( i = 0; i < sv_count; i++ )
-            sum += buffer[i] * df->alpha[i];
-
-        result = params.svm_type == ONE_CLASS ? (float)(sum > 0) : (float)sum;
-    }
-    else if( params.svm_type == C_SVC ||
-             params.svm_type == NU_SVC )
-    {
-        CvSVMDecisionFunc* df = (CvSVMDecisionFunc*)decision_func;
-        int* vote = (int*)(buffer + sv_total);
-        int i, j, k;
-
-        memset( vote, 0, class_count * sizeof(vote[0]));
-        ((CvSVMKernel_ocl*)kernel)->calc( sv_total, row_index, buffer, src);
-        double sum = 0.;
-
-        for( i = 0; i < class_count; i++ )
-            for( j = i + 1; j < class_count; j++, df++ )
-            {
-                sum = -df->rho;
-                int sv_count = df->sv_count;
-                for( k = 0; k < sv_count; k++ )
-                    sum += df->alpha[k] * buffer[df->sv_index[k]];
-
-                vote[sum > 0 ? i : j]++;
-            }
-
-        for( i = 1, k = 0; i < class_count; i++ )
-            if( vote[i] > vote[k] )
-                k = i;
-
-        result = returnDFVal && class_count == 2 ? (float)sum : (float)(class_labels->data.i[k]);
-    }
-    else
-        CV_Error( CV_StsBadArg, "INTERNAL ERROR: Unknown SVM type, "
-                  "the SVM structure is probably corrupted" );
-
-    return result;
-}
-
-float CvSVM_OCL::predict( const Mat& _sample, bool returnDFVal ) const
-{
-    CvMat sample = _sample;
-    return CvSVM::predict(&sample, returnDFVal);
-}
-
-float CvSVM_OCL::predict( const int row_index, Mat& src, bool returnDFVal) const
-{
-    float result = 0;
-
-    result = predict( row_index, get_var_count(), src, returnDFVal);
-
-    return result;
-}
-
-#undef get_C
-#define get_C(i) (C[y[i]>0])
-#undef is_upper_bound
-#define is_upper_bound(i) (alpha_status[i] > 0)
-#undef is_lower_bound
-#define is_lower_bound(i) (alpha_status[i] < 0)
-#undef update_alpha_status
-#define update_alpha_status(i) \
-    alpha_status[i] = (schar)(alpha[i] >= get_C(i) ? 1 : alpha[i] <= 0 ? -1 : 0)
-
-CvSVMSolver_ocl::CvSVMSolver_ocl(const CvSVMParams* _params)
-{
-    params = _params;
-}
-
-float* CvSVMSolver_ocl::get_row( int i, float* dst, Mat& src )
-{
-    bool existed = false;
-    float* row = get_row_base( i, &existed, src);
-    return (this->*get_row_func)( i, row, dst, existed );
-}
-
-float* CvSVMSolver_ocl::get_row_base( int i, bool* _existed, Mat& src )
-{
-    int i1 = i < sample_count ? i : i - sample_count;
-    CvSVMKernelRow* row = rows + i1;
-    bool existed = row->data != 0;
-    Qfloat* data;
-
-    if( existed || cache_size <= 0 )
-    {
-        CvSVMKernelRow* del_row = existed ? row : lru_list.prev;
-        data = del_row->data;
-        assert( data != 0 );
-
-        // delete row from the LRU list
-        del_row->data = 0;
-        del_row->prev->next = del_row->next;
-        del_row->next->prev = del_row->prev;
-    }
-    else
-    {
-        data = (Qfloat*)cvMemStorageAlloc( storage, cache_line_size );
-        cache_size -= cache_line_size;
-    }
-
-    // insert row into the LRU list
-    row->data = data;
-    row->prev = &lru_list;
-    row->next = lru_list.next;
-    row->prev->next = row->next->prev = row;
-
-    if( !existed )
-        ((CvSVMKernel_ocl*)kernel)->calc( sample_count, i1, row->data, src);
-
-    if( _existed )
-        *_existed = existed;
-
-    return row->data;
-}
-
-#ifndef HAVE_CLAMDBLAS
-
-static void matmul_sigmod(oclMat & src, oclMat & src2, oclMat & dst, int src_rows, int src2_cols, int var_count, double alpha1, double beta1)
-{
-    Context *clCxt = Context::getContext();
-    String kernelName = "svm_sigmod";
-    int src_step = (int)src.step / src.elemSize();
-    int src2_step = (int)src2.step / src2.elemSize();
-    int dst_step = (int)dst.step / dst.elemSize();
-    int x = MIN(16, src_rows);
-    int y = MIN(16, src2_cols);
-    size_t localThreads[] = {x, y, 1};
-    size_t globalThreads[] = {src2_cols, src_rows, 1};
-    int width = var_count;
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_step));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src2.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_step));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&dst.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&dst_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&width));
-
-    float alpha = 0.0f, beta = 0.0f;
-    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-    {
-        alpha = (float)alpha1;
-        beta = (float)beta1;
-        args.push_back(std::make_pair(sizeof(cl_float), (void* )&alpha));
-        args.push_back(std::make_pair(sizeof(cl_float), (void* )&beta));
-    }
-    else
-    {
-        args.push_back(std::make_pair(sizeof(cl_double), (void* )&alpha1));
-        args.push_back(std::make_pair(sizeof(cl_double), (void* )&beta1));
-    }
-    openCLExecuteKernel(clCxt, &svm, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-static void matmul_poly(oclMat & src, oclMat & src2, oclMat & dst, int src_rows, int src2_cols, int var_count, double alpha1, double beta1, double degree1, bool flag)
-{
-    Context *clCxt = Context::getContext();
-    String kernelName = "svm_poly";
-    int src_step = (int)src.step / src.elemSize();
-    int src2_step = (int)src2.step / src2.elemSize();
-    int dst_step = (int)dst.step / dst.elemSize();
-    int x = MIN(16, src_rows);
-    int y = MIN(16, src2_cols);
-    size_t localThreads[] = {x, y, 1};
-    size_t globalThreads[] = {src2_cols, src_rows, 1};
-    int width = var_count;
-
-    char build_options[50];
-
-    if(flag)
-    {
-        sprintf(build_options, "-D ADDPOW");
-    }
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_step));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src2.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_step));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&dst.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&dst_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&width));
-
-    float alpha = 0.0f, beta = 0.0f, degree = 0.0f;
-    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-    {
-        alpha = (float)alpha1;
-        beta = (float)beta1;
-        degree = (float)degree1;
-        args.push_back(std::make_pair(sizeof(cl_float), (void* )&alpha));
-        args.push_back(std::make_pair(sizeof(cl_float), (void* )&beta));
-        args.push_back(std::make_pair(sizeof(cl_float), (void* )&degree));
-    }
-    else
-    {
-        args.push_back(std::make_pair(sizeof(cl_double), (void* )&alpha1));
-        args.push_back(std::make_pair(sizeof(cl_double), (void* )&beta1));
-        args.push_back(std::make_pair(sizeof(cl_double), (void* )&degree1));
-    }
-    openCLExecuteKernel(clCxt, &svm, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
-}
-
-static void matmul_linear(oclMat & src, oclMat & src2, oclMat & dst, int src_rows, int src2_cols, int var_count, double alpha1, double beta1)
-{
-    Context *clCxt = Context::getContext();
-    String kernelName = "svm_linear";
-    int src_step = (int)src.step / src.elemSize();
-    int src2_step = (int)src2.step / src2.elemSize();
-    int dst_step = (int)dst.step / dst.elemSize();
-    int x = MIN(16, src_rows);
-    int y = MIN(16, src2_cols);
-    size_t localThreads[] = {x, y, 1};
-    size_t globalThreads[] = {src2_cols, src_rows, 1};
-    int width = var_count;
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_step));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src2.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_step));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&dst.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&dst_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&width));
-
-    float alpha = 0.0f, beta = 0.0f;
-    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-    {
-        alpha = (float)alpha1;
-        beta = (float)beta1;
-        args.push_back(std::make_pair(sizeof(cl_float), (void* )&alpha));
-        args.push_back(std::make_pair(sizeof(cl_float), (void* )&beta));
-    }
-    else
-    {
-        args.push_back(std::make_pair(sizeof(cl_double), (void* )&alpha1));
-        args.push_back(std::make_pair(sizeof(cl_double), (void* )&beta1));
-    }
-    openCLExecuteKernel(clCxt, &svm, kernelName, globalThreads, localThreads, args, -1, -1);
-}
-
-#endif // #ifndef HAVE_CLAMDBLAS
-
-static void matmul_rbf(oclMat& src, oclMat& src_e, oclMat& dst, int src_rows, int src2_cols, int var_count, double gamma1, bool flag)
-{
-
-    Context *clCxt = Context::getContext();
-
-    String kernelName = "svm_rbf";
-
-    int width = var_count;
-    int src_step = (int)src.step / src.elemSize();
-    int src_e_step = (int)src_e.step / src_e.elemSize();
-    int dst_step = (int)dst.step / dst.elemSize();
-
-    int x = MIN(16, src_rows);
-    int y = MIN(16, src2_cols);
-    size_t localThreads[] = {x, y, 1};
-    size_t globalThreads[] = {src2_cols,  src_rows, 1};
-    char build_options[50];
-
-    if(flag)
-        sprintf(build_options, "-D ADDEXP");
-
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_step));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&src_e.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_e_step));
-    args.push_back(std::make_pair(sizeof(cl_mem), (void* )&dst.data));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&dst_step));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src_rows));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&src2_cols));
-    args.push_back(std::make_pair(sizeof(cl_int), (void* )&width));
-    float gamma = 0.0f;
-    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-    {
-        gamma = (float)gamma1;
-        args.push_back(std::make_pair(sizeof(cl_float), (void* )&gamma));
-    }
-    else
-        args.push_back(std::make_pair(sizeof(cl_double), (void* )&gamma1));
-
-    openCLExecuteKernel(clCxt, &svm, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
-}
-
-float CvSVM_OCL::predict(const CvMat* samples, CV_OUT CvMat* results) const
-{
-    int var_count = get_var_count();
-    int sample_count = samples->rows;
-
-    //float* row_sample = 0;
-    Mat src_temp = Mat(sample_count, var_count, CV_32FC1);
-    CV_FUNCNAME( "CvSVM::predict" );
-
-
-    for(int i = 0; i < samples->rows; i++)
-    {
-        __CV_BEGIN__;
-        CvMat sample;
-        float* row_sample = 0;
-        cvGetRow( samples, &sample, i );
-        int class_count;
-        if( !kernel )
-        {
-            CV_ERROR( CV_StsBadArg, "The SVM should be trained first" );
-        }
-
-        class_count = class_labels ? class_labels->cols :
-                      params.svm_type == ONE_CLASS ? 1 : 0;
-
-        CV_CALL( cvPreparePredictData(&sample, var_all, var_idx,
-                                      class_count, 0, &row_sample ));
-        for(int j = 0; j < var_count; ++j)
-            src_temp.at<float>(i, j) = row_sample[j];
-        __CV_END__;
-    }
-
-    Mat dst1;
-    double alpha1 = 0.0, beta1 = 0.0, gamma1 = 0.0;
-    if(params.kernel_type == CvSVM::LINEAR)
-    {
-        alpha1 = 1;
-        beta1 = 0;
-    }
-    if(params.kernel_type == CvSVM::POLY)
-    {
-        alpha1 = params.gamma;
-        beta1 = params.coef0;
-    }
-    if(params.kernel_type == CvSVM::SIGMOID)
-    {
-        alpha1 = - 2 * params.gamma;
-        beta1 = - 2 * params.coef0;
-    }
-    if(params.kernel_type == CvSVM::RBF)
-        gamma1 = - params.gamma;
-
-    Mat sv_temp = Mat(sv_total, var_count, CV_32FC1, Scalar::all(0));
-
-
-    for(int i = 0; i < sv_total; ++i)
-        for(int j = 0; j < var_count; ++j)
-            sv_temp.at<float>(i, j) = sv[i][j];
-
-    oclMat src(sample_count, var_count, CV_32FC1, Scalar::all(0));
-    oclMat sv_;
-
-    src.upload(src_temp);
-    oclMat dst;
-
-#ifdef HAVE_CLAMDBLAS
-
-    dst = oclMat(sample_count, sv_total, CV_32FC1);
-    oclMat src3(sample_count, sv_total, CV_32FC1, Scalar::all(1));
-    if(params.kernel_type != CvSVM::RBF)
-    {
-        Mat sv_temp1;
-        transpose(sv_temp, sv_temp1);
-        sv_.upload(sv_temp1);
-        gemm(src, sv_, alpha1, src3, beta1, dst);
-    }
-
-#else
-    double degree1 = 0.0;
-    if (params.kernel_type == CvSVM::POLY)
-        degree1 = params.degree;
-
-    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-        dst = oclMat(sample_count, sv_total, CV_32FC1);
-    else
-        dst = oclMat(sample_count, sv_total, CV_64FC1);
-
-    if(params.kernel_type == CvSVM::LINEAR)
-    {
-        sv_.upload(sv_temp);
-        matmul_linear(src, sv_, dst, sample_count, sv_total, var_count, alpha1, beta1);
-    }
-    if( params.kernel_type == CvSVM::SIGMOID)
-    {
-        sv_.upload(sv_temp);
-        matmul_sigmod(src, sv_, dst, sample_count, sv_total, var_count, alpha1, beta1);
-    }
-
-    if(params.kernel_type == CvSVM::POLY)
-    {
-        sv_.upload(sv_temp);
-        if(sample_count > 0)
-            matmul_poly(src, sv_, dst, sample_count, sv_total, var_count, alpha1, beta1, degree1, true);
-        else
-            matmul_poly(src, sv_, dst, sample_count, sv_total, var_count, alpha1, beta1, degree1, false);
-    }
-#endif
-
-    if(params.kernel_type == CvSVM::RBF)
-    {
-        sv_.upload(sv_temp);
-        if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-            dst = oclMat(sample_count, sv_total, CV_32FC1);
-        else
-            dst = oclMat(sample_count, sv_total, CV_64FC1);
-
-        if(sample_count > 0)
-            matmul_rbf(src, sv_, dst, sample_count, sv_total, var_count, gamma1, true);
-        else
-            matmul_rbf(src, sv_, dst, sample_count, sv_total, var_count, gamma1, false);
-    }
-    dst.download(dst1);
-
-    float result = 0;
-    for(int i = 0; i < samples->rows; i++ )
-    {
-        int r = (int)this->predict(i, dst1);
-        if (results)
-            results->data.fl[i] = (float)r;
-        if (i == 0)
-            result = (float)r;
-    }
-    return result;
-}
-
-void CvSVM_OCL::predict( cv::InputArray _samples, cv::OutputArray _results ) const
-{
-    _results.create(_samples.size().height, 1, CV_32F);
-    CvMat samples = _samples.getMat(), results = _results.getMat();
-    predict(&samples, &results);
-}
-
-bool CvSVMSolver_ocl::solve_generic( CvSVMSolutionInfo& si )
-{
-    int iter = 0;
-    int i, j, k;
-
-    // 1. initialize gradient and alpha status
-    for( i = 0; i < alpha_count; i++ )
-    {
-        update_alpha_status(i);
-        G[i] = b[i];
-        if( fabs(G[i]) > 1e200 )
-        {
-            return false;
-        }
-    }
-    Mat dst1;
-    double alpha1 = 0.0, beta1 = 0.0, gamma1 = 0.0;
-    if(params->kernel_type == CvSVM::LINEAR)
-    {
-        alpha1 = 1;
-        beta1 = 0;
-    }
-    if(params->kernel_type == CvSVM::POLY)
-    {
-        alpha1 = params->gamma;
-        beta1 = params->coef0;
-    }
-    if(params->kernel_type == CvSVM::SIGMOID)
-    {
-        alpha1 = -2 * params->gamma;
-        beta1 = -2 * params->coef0;
-    }
-    if(params->kernel_type == CvSVM::RBF)
-    {
-        gamma1 = -params->gamma;
-    }
-    Mat src1 = Mat(sample_count, var_count, CV_32FC1);
-
-    for(int i = 0; i < sample_count; ++i)
-    {
-        for(int j = 0; j < var_count; ++j)
-        {
-            src1.at<float>(i, j) = samples[i][j];
-        }
-    }
-    oclMat src, src_e;
-    src.upload(src1);
-    oclMat dst;
-
-#ifdef HAVE_CLAMDBLAS
-
-    dst = oclMat(sample_count, sample_count, CV_32FC1);
-    oclMat src3(sample_count, sample_count, CV_32FC1, Scalar::all(1));
-    if(params->kernel_type != CvSVM::RBF)
-    {
-        ocl::transpose(src, src_e);
-        gemm(src, src_e, alpha1, src3, beta1, dst);
-    }
-
-#else
-    double degree1 = 0.0;
-    if(params->kernel_type == CvSVM::POLY)
-        degree1 = params->degree;
-
-    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-        dst = oclMat(sample_count, sample_count, CV_32FC1);
-    else
-        dst = oclMat(sample_count, sample_count, CV_64FC1);
-
-    if(params->kernel_type == CvSVM::LINEAR )
-    {
-        src_e = src;
-        matmul_linear(src, src_e, dst, sample_count, sample_count, var_count, alpha1, beta1);
-    }
-    if( params->kernel_type == CvSVM::SIGMOID)
-    {
-        src_e = src;
-        matmul_sigmod(src, src_e, dst, sample_count, sample_count, var_count, alpha1, beta1);
-    }
-
-    if(params->kernel_type == CvSVM::POLY)
-    {
-        src_e = src;
-        if(sample_count > 0)
-            matmul_poly(src, src_e, dst, sample_count, sample_count, var_count, alpha1, beta1, degree1, true);
-        else
-            matmul_poly(src, src_e, dst, sample_count, sample_count, var_count, alpha1, beta1, degree1, false);
-    }
-
-#endif
-
-    if(params->kernel_type == CvSVM::RBF)
-    {
-        src_e = src;
-        if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-            dst = oclMat(sample_count, sample_count, CV_32FC1);
-        else
-            dst = oclMat(sample_count, sample_count, CV_64FC1);
-
-        if(sample_count > 0)
-            matmul_rbf(src, src_e, dst, sample_count, sample_count, var_count, gamma1, true);
-        else
-            matmul_rbf(src, src_e, dst, sample_count, sample_count, var_count, gamma1, false);
-    }
-    dst.download(dst1);
-    for( i = 0; i < alpha_count; i++ )
-    {
-        if( !is_lower_bound(i) )
-        {
-            const Qfloat *Q_i = CvSVMSolver::get_row( i, buf[0]);
-            double alpha_i = alpha[i];
-
-            for( j = 0; j < alpha_count; j++ )
-                G[j] += alpha_i * Q_i[j];
-        }
-    }
-
-    // 2. optimization loop
-    for(;;)
-    {
-        const Qfloat *Q_i, *Q_j;
-        double C_i, C_j;
-        double old_alpha_i, old_alpha_j, alpha_i, alpha_j;
-        double delta_alpha_i, delta_alpha_j;
-
-#ifdef _DEBUG
-        for( i = 0; i < alpha_count; i++ )
-        {
-            if( fabs(G[i]) > 1e+300 )
-                return false;
-
-            if( fabs(alpha[i]) > 1e16 )
-                return false;
-        }
-#endif
-
-        if( (this->*select_working_set_func)( i, j ) != 0 || iter++ >= max_iter )
-        {
-            break;
-        }
-        Q_i = get_row( i, buf[0], dst1);
-        Q_j = get_row( j, buf[1], dst1);
-
-        C_i = get_C(i);
-        C_j = get_C(j);
-
-        alpha_i = old_alpha_i = alpha[i];
-        alpha_j = old_alpha_j = alpha[j];
-
-        if( y[i] != y[j] )
-        {
-            double denom = Q_i[i] + Q_j[j] + 2 * Q_i[j];
-            double delta = (-G[i] - G[j]) / MAX(fabs(denom), FLT_EPSILON);
-            double diff = alpha_i - alpha_j;
-            alpha_i += delta;
-            alpha_j += delta;
-
-            if( diff > 0 && alpha_j < 0 )
-            {
-                alpha_j = 0;
-                alpha_i = diff;
-            }
-            else if( diff <= 0 && alpha_i < 0 )
-            {
-                alpha_i = 0;
-                alpha_j = -diff;
-            }
-
-            if( diff > C_i - C_j && alpha_i > C_i )
-            {
-                alpha_i = C_i;
-                alpha_j = C_i - diff;
-            }
-            else if( diff <= C_i - C_j && alpha_j > C_j )
-            {
-                alpha_j = C_j;
-                alpha_i = C_j + diff;
-            }
-        }
-        else
-        {
-            double denom = Q_i[i] + Q_j[j] - 2 * Q_i[j];
-            double delta = (G[i] - G[j]) / MAX(fabs(denom), FLT_EPSILON);
-            double sum = alpha_i + alpha_j;
-            alpha_i -= delta;
-            alpha_j += delta;
-
-            if( sum > C_i && alpha_i > C_i )
-            {
-                alpha_i = C_i;
-                alpha_j = sum - C_i;
-            }
-            else if( sum <= C_i && alpha_j < 0)
-            {
-                alpha_j = 0;
-                alpha_i = sum;
-            }
-
-            if( sum > C_j && alpha_j > C_j )
-            {
-                alpha_j = C_j;
-                alpha_i = sum - C_j;
-            }
-            else if( sum <= C_j && alpha_i < 0 )
-            {
-                alpha_i = 0;
-                alpha_j = sum;
-            }
-        }
-        // update alpha
-        alpha[i] = alpha_i;
-        alpha[j] = alpha_j;
-        update_alpha_status(i);
-        update_alpha_status(j);
-
-        // update G
-        delta_alpha_i = alpha_i - old_alpha_i;
-        delta_alpha_j = alpha_j - old_alpha_j;
-
-        for( k = 0; k < alpha_count; k++ )
-            G[k] += Q_i[k] * delta_alpha_i + Q_j[k] * delta_alpha_j;
-    }
-
-    // calculate rho
-    (this->*calc_rho_func)( si.rho, si.r );
-
-    // calculate objective value
-    for( i = 0, si.obj = 0; i < alpha_count; i++ )
-        si.obj += alpha[i] * (G[i] + b[i]);
-
-    si.obj *= 0.5;
-
-    si.upper_bound_p = C[1];
-    si.upper_bound_n = C[0];
-
-    return true;
-}
-
-void CvSVMKernel_ocl::calc( int vcount, const int row_idx, Qfloat* results, Mat& src)
-{
-    //const Qfloat max_val = (Qfloat)(FLT_MAX*1e-3);
-    //int j;
-    (this->*calc_func_ocl)( vcount, row_idx, results, src);
-
-#if !defined(HAVE_CLAMDBLAS)
-    // nothing
-#else
-    const Qfloat max_val = (Qfloat)(FLT_MAX * 1e-3);
-    int j;
-    for( j = 0; j < vcount; j++ )
-        if( results[j] > max_val )
-            results[j] = max_val;
-#endif
-}
-
-bool CvSVMKernel_ocl::create( const CvSVMParams* _params, Calc_ocl _calc_func, Calc _calc_func1 )
-{
-    clear();
-    params = _params;
-    calc_func_ocl = _calc_func;
-    calc_func = _calc_func1;
-    if( !calc_func_ocl )
-        calc_func_ocl = params->kernel_type == CvSVM::RBF ? &CvSVMKernel_ocl::calc_rbf :
-                        params->kernel_type == CvSVM::POLY ? &CvSVMKernel_ocl::calc_poly :
-                        params->kernel_type == CvSVM::SIGMOID ? &CvSVMKernel_ocl::calc_sigmoid :
-                        &CvSVMKernel_ocl::calc_linear;
-    if( !calc_func)
-        calc_func = params->kernel_type == CvSVM::RBF ? &CvSVMKernel::calc_rbf :
-                    params->kernel_type == CvSVM::POLY ? &CvSVMKernel::calc_poly :
-                    params->kernel_type == CvSVM::SIGMOID ? &CvSVMKernel::calc_sigmoid :
-                    &CvSVMKernel::calc_linear;
-    return true;
-}
-CvSVMKernel_ocl::CvSVMKernel_ocl(const CvSVMParams* params, CvSVMKernel_ocl::Calc_ocl _calc_func, CvSVMKernel::Calc _calc_func1)
-{
-    CvSVMKernel::clear();
-    CvSVMKernel_ocl::create( params, _calc_func, _calc_func1 );
-}
-
-void CvSVMKernel_ocl::calc_non_rbf_base( int vcount, const int row_idx, Qfloat* results, Mat& src)
-{
-#ifdef HAVE_CLAMDBLAS
-
-    for(int i = 0; i < vcount; i++)
-    {
-        results[i] = (Qfloat) * src.ptr<float>(row_idx, i);
-    }
-#else
-    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-    {
-        for(int i = 0; i < vcount; i++)
-        {
-            results[i] = (Qfloat) * src.ptr<float>(row_idx, i);
-        }
-    }
-    else
-    {
-        for(int i = 0; i < vcount; i++)
-        {
-            results[i] = (Qfloat) * src.ptr<double>(row_idx, i);
-        }
-    }
-#endif
-}
-
-void CvSVMKernel_ocl::calc_rbf( int vcount, const int row_idx, Qfloat* results, Mat& src)
-{
-    if(!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
-        for(int m = 0; m < vcount; m++)
-            results[m] = (Qfloat) * src.ptr<float>(row_idx, m);
-    else
-        for(int m = 0; m < vcount; m++)
-            results[m] = (Qfloat) * src.ptr<double>(row_idx, m);
-}
-
-void CvSVMKernel_ocl::calc_linear( int vcount, const int row_idx, Qfloat* results, Mat& src )
-{
-    calc_non_rbf_base( vcount, row_idx, results, src);
-}
-
-void CvSVMKernel_ocl::calc_poly( int vcount, const int row_idx, Qfloat* results, Mat& src)
-{
-    calc_non_rbf_base( vcount, row_idx, results, src);
-
-#if !defined(HAVE_CLAMDBLAS)
-    // nothing
-#else
-    CvMat R = cvMat( 1, vcount, QFLOAT_TYPE, results );
-    if( vcount > 0 )
-        cvPow( &R, &R, params->degree );
-#endif
-}
-
-
-void CvSVMKernel_ocl::calc_sigmoid( int vcount, const int row_idx, Qfloat* results, Mat& src)
-{
-    calc_non_rbf_base( vcount, row_idx, results, src);
-    // TODO: speedup this
-#if !defined(HAVE_CLAMDBLAS)
-    // nothing
-#else
-    for(int j = 0; j < vcount; j++ )
-    {
-        Qfloat t = results[j];
-        double e = ::exp(-fabs(t));
-        if( t > 0 )
-            results[j] = (Qfloat)((1. - e) / (1. + e));
-        else
-            results[j] = (Qfloat)((e - 1.) / (e + 1.));
-    }
-#endif
-}
-
-CvSVM_OCL::CvSVM_OCL()
-{
-    CvSVM();
-}
-
-CvSVM_OCL::CvSVM_OCL( const Mat& _train_data, const Mat& _responses,
-                      const Mat& _var_idx, const Mat& _sample_idx, CvSVMParams _params )
-{
-    decision_func = 0;
-    class_labels = 0;
-    class_weights = 0;
-    storage = 0;
-    var_idx = 0;
-    kernel = 0;
-    solver = 0;
-    default_model_name = "my_svm";
-
-    train( _train_data, _responses, _var_idx, _sample_idx, _params );
-}
-
-void CvSVM_OCL::create_kernel()
-{
-    kernel = new CvSVMKernel_ocl(&params, 0, 0);
-}
-
-void CvSVM_OCL::create_solver( )
-{
-    solver = new CvSVMSolver_ocl(&params);
-}
-
-} }
diff --git a/modules/ocl/src/tvl1flow.cpp b/modules/ocl/src/tvl1flow.cpp
deleted file mode 100644
index 6e75ee238..000000000
--- a/modules/ocl/src/tvl1flow.cpp
+++ /dev/null
@@ -1,477 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//        Jin Ma, jin@multicorewareinc.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-#include "opencl_kernels.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-cv::ocl::OpticalFlowDual_TVL1_OCL::OpticalFlowDual_TVL1_OCL()
-{
-    tau            = 0.25;
-    lambda         = 0.15;
-    theta          = 0.3;
-    nscales        = 5;
-    warps          = 5;
-    epsilon        = 0.01;
-    iterations     = 300;
-    useInitialFlow = false;
-}
-
-void cv::ocl::OpticalFlowDual_TVL1_OCL::operator()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy)
-{
-    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
-    CV_Assert( I0.size() == I1.size() );
-    CV_Assert( I0.type() == I1.type() );
-    CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
-    CV_Assert( nscales > 0 );
-
-    // allocate memory for the pyramid structure
-    I0s.resize(nscales);
-    I1s.resize(nscales);
-    u1s.resize(nscales);
-    u2s.resize(nscales);
-    //I0s_step == I1s_step
-    I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0);
-    I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);
-
-
-    if (!useInitialFlow)
-    {
-        flowx.create(I0.size(), CV_32FC1);
-        flowy.create(I0.size(), CV_32FC1);
-    }
-    //u1s_step != u2s_step
-    u1s[0] = flowx;
-    u2s[0] = flowy;
-
-    I1x_buf.create(I0.size(), CV_32FC1);
-    I1y_buf.create(I0.size(), CV_32FC1);
-
-    I1w_buf.create(I0.size(), CV_32FC1);
-    I1wx_buf.create(I0.size(), CV_32FC1);
-    I1wy_buf.create(I0.size(), CV_32FC1);
-
-    grad_buf.create(I0.size(), CV_32FC1);
-    rho_c_buf.create(I0.size(), CV_32FC1);
-
-    p11_buf.create(I0.size(), CV_32FC1);
-    p12_buf.create(I0.size(), CV_32FC1);
-    p21_buf.create(I0.size(), CV_32FC1);
-    p22_buf.create(I0.size(), CV_32FC1);
-
-    diff_buf.create(I0.size(), CV_32FC1);
-
-    // create the scales
-    for (int s = 1; s < nscales; ++s)
-    {
-        ocl::pyrDown(I0s[s - 1], I0s[s]);
-        ocl::pyrDown(I1s[s - 1], I1s[s]);
-
-        if (I0s[s].cols < 16 || I0s[s].rows < 16)
-        {
-            nscales = s;
-            break;
-        }
-
-        if (useInitialFlow)
-        {
-            ocl::pyrDown(u1s[s - 1], u1s[s]);
-            ocl::pyrDown(u2s[s - 1], u2s[s]);
-
-            ocl::multiply(0.5, u1s[s], u1s[s]);
-            ocl::multiply(0.5, u2s[s], u2s[s]);
-        }
-    }
-
-    // pyramidal structure for computing the optical flow
-    for (int s = nscales - 1; s >= 0; --s)
-    {
-        // compute the optical flow at the current scale
-        procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]);
-
-        // if this was the last scale, finish now
-        if (s == 0)
-            break;
-
-        // otherwise, upsample the optical flow
-
-        // zoom the optical flow for the next finer scale
-        ocl::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
-        ocl::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());
-
-        // scale the optical flow with the appropriate zoom factor
-        multiply(2, u1s[s - 1], u1s[s - 1]);
-        multiply(2, u2s[s - 1], u2s[s - 1]);
-
-    }
-
-}
-
-namespace ocl_tvl1flow
-{
-    void centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy);
-
-    void warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x, oclMat &I1y,
-        oclMat &u1, oclMat &u2, oclMat &I1w, oclMat &I1wx, oclMat &I1wy,
-        oclMat &grad, oclMat &rho);
-
-    void estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad,
-        oclMat &rho_c, oclMat &p11, oclMat &p12,
-        oclMat &p21, oclMat &p22, oclMat &u1,
-        oclMat &u2, oclMat &error, float l_t, float theta, char calc_error);
-
-    void estimateDualVariables(oclMat &u1, oclMat &u2,
-        oclMat &p11, oclMat &p12, oclMat &p21, oclMat &p22, float taut);
-}
-
-void cv::ocl::OpticalFlowDual_TVL1_OCL::procOneScale(const oclMat &I0, const oclMat &I1, oclMat &u1, oclMat &u2)
-{
-    using namespace ocl_tvl1flow;
-
-    const double scaledEpsilon = epsilon * epsilon * I0.size().area();
-
-    CV_DbgAssert( I1.size() == I0.size() );
-    CV_DbgAssert( I1.type() == I0.type() );
-    CV_DbgAssert( u1.empty() || u1.size() == I0.size() );
-    CV_DbgAssert( u2.size() == u1.size() );
-
-    if (u1.empty())
-    {
-        u1.create(I0.size(), CV_32FC1);
-        u1.setTo(Scalar::all(0));
-
-        u2.create(I0.size(), CV_32FC1);
-        u2.setTo(Scalar::all(0));
-    }
-
-    oclMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
-    oclMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
-
-    centeredGradient(I1, I1x, I1y);
-
-    oclMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
-    oclMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
-    oclMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
-
-    oclMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
-    oclMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
-
-    oclMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
-    oclMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
-    oclMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
-    oclMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
-    p11.setTo(Scalar::all(0));
-    p12.setTo(Scalar::all(0));
-    p21.setTo(Scalar::all(0));
-    p22.setTo(Scalar::all(0));
-
-    oclMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
-
-    const float l_t = static_cast<float>(lambda * theta);
-    const float taut = static_cast<float>(tau / theta);
-
-    for (int warpings = 0; warpings < warps; ++warpings)
-    {
-        warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
-
-        double error = std::numeric_limits<double>::max();
-        double prev_error = 0;
-        for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
-        {
-            // some tweaks to make sum operation less frequently
-            char calc_error = (n & 0x1) && (prev_error < scaledEpsilon);
-            estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22,
-                      u1, u2, diff, l_t, static_cast<float>(theta), calc_error);
-            if(calc_error)
-            {
-                error = ocl::sum(diff)[0];
-                prev_error = error;
-            }
-            else
-            {
-                error = std::numeric_limits<double>::max();
-                prev_error -= scaledEpsilon;
-            }
-            estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
-
-        }
-    }
-
-
-}
-
-void cv::ocl::OpticalFlowDual_TVL1_OCL::collectGarbage()
-{
-    I0s.clear();
-    I1s.clear();
-    u1s.clear();
-    u2s.clear();
-
-    I1x_buf.release();
-    I1y_buf.release();
-
-    I1w_buf.release();
-    I1wx_buf.release();
-    I1wy_buf.release();
-
-    grad_buf.release();
-    rho_c_buf.release();
-
-    p11_buf.release();
-    p12_buf.release();
-    p21_buf.release();
-    p22_buf.release();
-
-    diff_buf.release();
-    norm_buf.release();
-}
-
-void ocl_tvl1flow::centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy)
-{
-    Context  *clCxt = src.clCxt;
-    size_t localThreads[3] = {32, 8, 1};
-    size_t globalThreads[3] = {src.cols, src.rows, 1};
-
-    int srcElementSize = src.elemSize();
-    int src_step = src.step/srcElementSize;
-
-    int dElememntSize = dx.elemSize();
-    int dx_step = dx.step/dElememntSize;
-
-    String kernelName = "centeredGradientKernel";
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&src.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&src.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&src.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&src_step));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&dx.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&dy.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&dx_step));
-    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThreads, localThreads, args, -1, -1);
-
-}
-
-void ocl_tvl1flow::estimateDualVariables(oclMat &u1, oclMat &u2, oclMat &p11, oclMat &p12, oclMat &p21, oclMat &p22, float taut)
-{
-    Context *clCxt = u1.clCxt;
-
-    size_t localThread[] = {32, 8, 1};
-    size_t globalThread[] =
-    {
-        u1.cols,
-        u1.rows,
-        1
-    };
-
-    int u1_element_size = u1.elemSize();
-    int u1_step = u1.step/u1_element_size;
-
-    int u2_element_size = u2.elemSize();
-    int u2_step = u2.step/u2_element_size;
-
-    int p11_element_size = p11.elemSize();
-    int p11_step = p11.step/p11_element_size;
-
-    int u1_offset_y = u1.offset/u1.step;
-    int u1_offset_x = u1.offset%u1.step;
-    u1_offset_x = u1_offset_x/u1.elemSize();
-
-    int u2_offset_y = u2.offset/u2.step;
-    int u2_offset_x = u2.offset%u2.step;
-    u2_offset_x = u2_offset_x/u2.elemSize();
-
-    String kernelName = "estimateDualVariablesKernel";
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&u1.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_step));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&u2.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p11.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&p11_step));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p12.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p21.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p22.data));
-    args.push_back( std::make_pair( sizeof(cl_float), (void*)&taut));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_offset_x));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_offset_y));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_offset_x));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_offset_y));
-
-    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
-}
-
-void ocl_tvl1flow::estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad,
-    oclMat &rho_c, oclMat &p11, oclMat &p12,
-    oclMat &p21, oclMat &p22, oclMat &u1,
-    oclMat &u2, oclMat &error, float l_t, float theta, char calc_error)
-{
-    Context* clCxt = I1wx.clCxt;
-
-    size_t localThread[] = {32, 8, 1};
-    size_t globalThread[] =
-    {
-        I1wx.cols,
-        I1wx.rows,
-        1
-    };
-
-    int I1wx_element_size = I1wx.elemSize();
-    int I1wx_step = I1wx.step/I1wx_element_size;
-
-    int u1_element_size = u1.elemSize();
-    int u1_step = u1.step/u1_element_size;
-
-    int u2_element_size = u2.elemSize();
-    int u2_step = u2.step/u2_element_size;
-
-    int u1_offset_y = u1.offset/u1.step;
-    int u1_offset_x = u1.offset%u1.step;
-    u1_offset_x = u1_offset_x/u1.elemSize();
-
-    int u2_offset_y = u2.offset/u2.step;
-    int u2_offset_x = u2.offset%u2.step;
-    u2_offset_x = u2_offset_x/u2.elemSize();
-
-    String kernelName = "estimateUKernel";
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1wx.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I1wx.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I1wx.rows));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I1wx_step));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1wy.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&grad.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&rho_c.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p11.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p12.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p21.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&p22.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&u1.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_step));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&u2.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&error.data));
-    args.push_back( std::make_pair( sizeof(cl_float), (void*)&l_t));
-    args.push_back( std::make_pair( sizeof(cl_float), (void*)&theta));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_offset_x));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_offset_y));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_offset_x));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_offset_y));
-    args.push_back( std::make_pair( sizeof(cl_char), (void*)&calc_error));
-
-    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
-}
-
-void ocl_tvl1flow::warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x, oclMat &I1y, oclMat &u1, oclMat &u2, oclMat &I1w, oclMat &I1wx, oclMat &I1wy, oclMat &grad, oclMat &rho)
-{
-    Context* clCxt = I0.clCxt;
-
-    int u1ElementSize = u1.elemSize();
-    int u1Step = u1.step/u1ElementSize;
-
-    int u2ElementSize = u2.elemSize();
-    int u2Step = u2.step/u2ElementSize;
-
-    int I0ElementSize = I0.elemSize();
-    int I0Step = I0.step/I0ElementSize;
-
-    int I1w_element_size = I1w.elemSize();
-    int I1w_step = I1w.step/I1w_element_size;
-
-    int u1_offset_y = u1.offset/u1.step;
-    int u1_offset_x = u1.offset%u1.step;
-    u1_offset_x = u1_offset_x/u1.elemSize();
-
-    int u2_offset_y = u2.offset/u2.step;
-    int u2_offset_x = u2.offset%u2.step;
-    u2_offset_x = u2_offset_x/u2.elemSize();
-
-    size_t localThread[] = {32, 8, 1};
-    size_t globalThread[] =
-    {
-        I0.cols,
-        I0.rows,
-        1
-    };
-
-    cl_mem I1_tex;
-    cl_mem I1x_tex;
-    cl_mem I1y_tex;
-    I1_tex = bindTexture(I1);
-    I1x_tex = bindTexture(I1x);
-    I1y_tex = bindTexture(I1y);
-
-    String kernelName = "warpBackwardKernel";
-    std::vector< std::pair<size_t, const void *> > args;
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I0.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I0Step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I0.cols));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I0.rows));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1_tex));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1x_tex));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1y_tex));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&u1.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1Step));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&u2.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1w.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1wx.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&I1wy.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&grad.data));
-    args.push_back( std::make_pair( sizeof(cl_mem), (void*)&rho.data));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&I1w_step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2Step));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_offset_x));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u1_offset_y));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_offset_x));
-    args.push_back( std::make_pair( sizeof(cl_int), (void*)&u2_offset_y));
-
-    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
-
-    releaseTexture(I1_tex);
-    releaseTexture(I1x_tex);
-    releaseTexture(I1y_tex);
-}
diff --git a/modules/ocl/test/main.cpp b/modules/ocl/test/main.cpp
deleted file mode 100644
index 0d5146143..000000000
--- a/modules/ocl/test/main.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#define DUMP_PROPERTY_XML(propertyName, propertyValue) \
-    do { \
-        std::stringstream ssName, ssValue;\
-        ssName << propertyName;\
-        ssValue << propertyValue; \
-        ::testing::Test::RecordProperty(ssName.str(), ssValue.str()); \
-    } while (false)
-
-#define DUMP_MESSAGE_STDOUT(msg) \
-    do { \
-        std::cout << msg << std::endl; \
-    } while (false)
-
-#include "opencv2/ocl/private/opencl_dumpinfo.hpp"
-
-int LOOP_TIMES = 1;
-
-void readLoopTimes(int argc, char ** argv)
-{
-    const char * const command_line_keys =
-            "{   test_loop_times             |1        |count of iterations per each test}"
-            "{h  help                        |false    |print help info}";
-
-    cv::CommandLineParser parser(argc, argv, command_line_keys);
-    if (parser.has("help"))
-    {
-        std::cout << "\nAvailable options besides google test option: \n";
-        parser.printMessage();
-    }
-
-    LOOP_TIMES = parser.get<int>("test_loop_times");
-    CV_Assert(LOOP_TIMES > 0);
-}
-
-CV_TEST_MAIN(".", dumpOpenCLDevice(),
-                  readLoopTimes(argc, argv))
diff --git a/modules/ocl/test/test_api.cpp b/modules/ocl/test/test_api.cpp
deleted file mode 100644
index 6ca40270d..000000000
--- a/modules/ocl/test/test_api.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-#include "opencv2/core/opencl/runtime/opencl_core.hpp" // for OpenCL types & functions
-#include "opencv2/core/ocl.hpp"
-
-TEST(TestAPI, openCLExecuteKernelInterop)
-{
-    cv::RNG rng;
-    Size sz(10000, 1);
-    cv::Mat cpuMat = cvtest::randomMat(rng, sz, CV_32FC4, -10, 10, false);
-
-    cv::ocl::oclMat gpuMat(cpuMat);
-    cv::ocl::oclMat gpuMatDst(sz, CV_32FC4);
-
-    const char* kernelStr =
-"__kernel void test_kernel(__global float4* src, __global float4* dst) {\n"
-"    int x = get_global_id(0);\n"
-"    dst[x] = src[x];\n"
-"}\n";
-
-    cv::ocl::ProgramSource program("test_interop", kernelStr);
-
-    using namespace std;
-    vector<pair<size_t , const void *> > args;
-    args.push_back( make_pair( sizeof(cl_mem), (void *) &gpuMat.data ));
-    args.push_back( make_pair( sizeof(cl_mem), (void *) &gpuMatDst.data ));
-
-    size_t globalThreads[3] = { sz.width, 1, 1 };
-    cv::ocl::openCLExecuteKernelInterop(
-        gpuMat.clCxt,
-        program,
-        "test_kernel",
-        globalThreads, NULL, args,
-        -1, -1,
-        "");
-
-    cv::Mat dst;
-    gpuMatDst.download(dst);
-
-    EXPECT_LE(checkNorm(cpuMat, dst), 1e-3);
-}
-
-TEST(OCL_TestTAPI, performance)
-{
-    cv::RNG rng;
-    cv::Mat src(1280,768,CV_8UC4), dst;
-    rng.fill(src, RNG::UNIFORM, 0, 255);
-
-    cv::UMat usrc, udst;
-    src.copyTo(usrc);
-
-    cv::ocl::oclMat osrc(src);
-    cv::ocl::oclMat odst;
-
-    int cvtcode = cv::COLOR_BGR2GRAY;
-    int i, niters = 10;
-    double t;
-
-    cv::ocl::cvtColor(osrc, odst, cvtcode);
-    cv::ocl::finish();
-    t = (double)cv::getTickCount();
-    for(i = 0; i < niters; i++)
-    {
-        cv::ocl::cvtColor(osrc, odst, cvtcode);
-    }
-    cv::ocl::finish();
-    t = (double)cv::getTickCount() - t;
-    printf("ocl exec time = %gms per iter\n", t*1000./niters/cv::getTickFrequency());
-
-    cv::cvtColor(usrc, udst, cvtcode);
-    cv::ocl::finish2();
-    t = (double)cv::getTickCount();
-    for(i = 0; i < niters; i++)
-    {
-        cv::cvtColor(usrc, udst, cvtcode);
-    }
-    cv::ocl::finish2();
-    t = (double)cv::getTickCount() - t;
-    printf("t-api exec time = %gms per iter\n", t*1000./niters/cv::getTickFrequency());
-
-    cv::cvtColor(src, dst, cvtcode);
-    t = (double)cv::getTickCount();
-    for(i = 0; i < niters; i++)
-    {
-        cv::cvtColor(src, dst, cvtcode);
-    }
-    t = (double)cv::getTickCount() - t;
-    printf("cpu exec time = %gms per iter\n", t*1000./niters/cv::getTickFrequency());
-}
-
-// This test must be DISABLED by default!
-// (We can't restore original context for other tests)
-TEST(TestAPI, DISABLED_InitializationFromHandles)
-{
-#define MAX_PLATFORMS 16
-    cl_platform_id platforms[MAX_PLATFORMS] = { NULL };
-    cl_uint numPlatforms = 0;
-    cl_int status = ::clGetPlatformIDs(MAX_PLATFORMS, &platforms[0], &numPlatforms);
-    ASSERT_EQ(CL_SUCCESS, status) << "clGetPlatformIDs";
-    ASSERT_NE(0, (int)numPlatforms);
-
-    int selectedPlatform = 0;
-    cl_platform_id platform = platforms[selectedPlatform];
-
-    ASSERT_NE((void*)NULL, platform);
-
-    cl_device_id device = NULL;
-    status = ::clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
-    ASSERT_EQ(CL_SUCCESS, status) << "clGetDeviceIDs";
-    ASSERT_NE((void*)NULL, device);
-
-    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platform), 0 };
-    cl_context context = ::clCreateContext(cps, 1, &device, NULL, NULL, &status);
-    ASSERT_EQ(CL_SUCCESS, status) << "clCreateContext";
-    ASSERT_NE((void*)NULL, context);
-
-    ASSERT_NO_THROW(cv::ocl::initializeContext(&platform, &context, &device));
-
-    status = ::clReleaseContext(context);
-    ASSERT_EQ(CL_SUCCESS, status) << "clReleaseContext";
-
-#ifdef CL_VERSION_1_2
-#if 1
-    {
-        cv::ocl::Context* ctx = cv::ocl::Context::getContext();
-        ASSERT_NE((void*)NULL, ctx);
-        if (ctx->supportsFeature(cv::ocl::FEATURE_CL_VER_1_2)) // device supports OpenCL 1.2+
-        {
-            status = ::clReleaseDevice(device);
-            ASSERT_EQ(CL_SUCCESS, status) << "clReleaseDevice";
-        }
-    }
-#else // code below doesn't work on Linux (SEGFAULTs on 1.1- devices are not handled via exceptions)
-    try
-    {
-        status = ::clReleaseDevice(device); // NOTE This works only with !DEVICES! that supports OpenCL 1.2
-        (void)status; // no check
-    }
-    catch (...)
-    {
-        // nothing, there is no problem
-    }
-#endif
-#endif
-
-    // print the name of current device
-    cv::ocl::Context* ctx = cv::ocl::Context::getContext();
-    ASSERT_NE((void*)NULL, ctx);
-    const cv::ocl::DeviceInfo& deviceInfo = ctx->getDeviceInfo();
-    std::cout << "Device name: " << deviceInfo.deviceName << std::endl;
-    std::cout << "Platform name: " << deviceInfo.platform->platformName << std::endl;
-
-    ASSERT_EQ(context, *(cl_context*)ctx->getOpenCLContextPtr());
-    ASSERT_EQ(device, *(cl_device_id*)ctx->getOpenCLDeviceIDPtr());
-
-    // do some calculations and check results
-    cv::RNG rng;
-    Size sz(100, 100);
-    cv::Mat srcMat = cvtest::randomMat(rng, sz, CV_32FC4, -10, 10, false);
-    cv::Mat dstMat;
-
-    cv::ocl::oclMat srcGpuMat(srcMat);
-    cv::ocl::oclMat dstGpuMat;
-
-    cv::Scalar v = cv::Scalar::all(1);
-    cv::add(srcMat, v, dstMat);
-    cv::ocl::add(srcGpuMat, v, dstGpuMat);
-
-    cv::Mat dstGpuMatMap;
-    dstGpuMat.download(dstGpuMatMap);
-
-    EXPECT_LE(checkNorm(dstMat, dstGpuMatMap), 1e-3);
-}
diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp
deleted file mode 100644
index bf37afdf5..000000000
--- a/modules/ocl/test/test_arithm.cpp
+++ /dev/null
@@ -1,1621 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Shengen Yan, yanshengen@gmail.com
-//    Jiang Liyuan,jlyuan001.good@163.com
-//    Rock Li, Rock.Li@amd.com
-//    Zailong Wu, bullet@yeah.net
-//    Yao Wang, bitwangyaoyao@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-static bool relativeError(double actual, double expected, double eps)
-{
-    return std::abs(actual - expected) / actual < eps;
-}
-
-//////////////////////////////// LUT /////////////////////////////////////////////////
-
-PARAM_TEST_CASE(Lut, MatDepth, MatDepth, bool, bool)
-{
-    int lut_depth;
-    int cn;
-    bool use_roi, same_cn;
-
-    // src mat
-    cv::Mat src;
-    cv::Mat lut;
-    cv::Mat dst;
-
-    // src mat with roi
-    cv::Mat src_roi;
-    cv::Mat lut_roi;
-    cv::Mat dst_roi;
-
-    // ocl dst mat for testing
-    cv::ocl::oclMat gsrc_whole;
-    cv::ocl::oclMat glut_whole;
-    cv::ocl::oclMat gdst_whole;
-
-    // ocl mat with roi
-    cv::ocl::oclMat gsrc_roi;
-    cv::ocl::oclMat glut_roi;
-    cv::ocl::oclMat gdst_roi;
-
-    virtual void SetUp()
-    {
-        lut_depth = GET_PARAM(0);
-        cn = GET_PARAM(1);
-        same_cn = GET_PARAM(2);
-        use_roi = GET_PARAM(3);
-    }
-
-    void random_roi()
-    {
-        const int src_type = CV_MAKE_TYPE(CV_8U, cn);
-        const int lut_type = CV_MAKE_TYPE(lut_depth, same_cn ? cn : 1);
-        const int dst_type = CV_MAKE_TYPE(lut_depth, cn);
-
-        Size roiSize = randomSize(1, MAX_VALUE);
-        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, src_type, 0, 256);
-
-        Size lutRoiSize = Size(256, 1);
-        Border lutBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(lut, lut_roi, lutRoiSize, lutBorder, lut_type, 5, 16);
-
-        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, roiSize, dstBorder, dst_type, 5, 16);
-
-        generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
-        generateOclMat(glut_whole, glut_roi, lut, lutRoiSize, lutBorder);
-        generateOclMat(gdst_whole, gdst_roi, dst, roiSize, dstBorder);
-    }
-
-    void Near(double threshold = 0.)
-    {
-        Mat whole, roi;
-        gdst_whole.download(whole);
-        gdst_roi.download(roi);
-
-        EXPECT_MAT_NEAR(dst, whole, threshold);
-        EXPECT_MAT_NEAR(dst_roi, roi, threshold);
-    }
-};
-
-OCL_TEST_P(Lut, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::LUT(src_roi, lut_roi, dst_roi);
-        cv::ocl::LUT(gsrc_roi, glut_roi, gdst_roi);
-
-        Near();
-    }
-}
-
-///////////////////////// ArithmTestBase ///////////////////////////
-
-PARAM_TEST_CASE(ArithmTestBase, MatDepth, Channels, bool)
-{
-    int depth;
-    int cn;
-    bool use_roi;
-    cv::Scalar val;
-
-    // src mat
-    cv::Mat src1;
-    cv::Mat src2;
-    cv::Mat mask;
-    cv::Mat dst1;
-    cv::Mat dst2;
-
-    // src mat with roi
-    cv::Mat src1_roi;
-    cv::Mat src2_roi;
-    cv::Mat mask_roi;
-    cv::Mat dst1_roi;
-    cv::Mat dst2_roi;
-
-    // ocl dst mat for testing
-    cv::ocl::oclMat gsrc1_whole;
-    cv::ocl::oclMat gsrc2_whole;
-    cv::ocl::oclMat gdst1_whole;
-    cv::ocl::oclMat gdst2_whole;
-    cv::ocl::oclMat gmask_whole;
-
-    // ocl mat with roi
-    cv::ocl::oclMat gsrc1_roi;
-    cv::ocl::oclMat gsrc2_roi;
-    cv::ocl::oclMat gdst1_roi;
-    cv::ocl::oclMat gdst2_roi;
-    cv::ocl::oclMat gmask_roi;
-
-    virtual void SetUp()
-    {
-        depth = GET_PARAM(0);
-        cn = GET_PARAM(1);
-        use_roi = GET_PARAM(2);
-    }
-
-    virtual void random_roi()
-    {
-        const int type = CV_MAKE_TYPE(depth, cn);
-
-        Size roiSize = randomSize(1, MAX_VALUE);
-        Border src1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src1, src1_roi, roiSize, src1Border, type, 2, 11);
-
-        Border src2Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src2, src2_roi, roiSize, src2Border, type, -1540, 1740);
-
-        Border dst1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(dst1, dst1_roi, roiSize, dst1Border, type, 5, 16);
-
-        Border dst2Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(dst2, dst2_roi, roiSize, dst2Border, type, 5, 16);
-
-        Border maskBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(mask, mask_roi, roiSize, maskBorder, CV_8UC1, 0, 2);
-        cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
-
-
-        generateOclMat(gsrc1_whole, gsrc1_roi, src1, roiSize, src1Border);
-        generateOclMat(gsrc2_whole, gsrc2_roi, src2, roiSize, src2Border);
-        generateOclMat(gdst1_whole, gdst1_roi, dst1, roiSize, dst1Border);
-        generateOclMat(gdst2_whole, gdst2_roi, dst2, roiSize, dst2Border);
-        generateOclMat(gmask_whole, gmask_roi, mask, roiSize, maskBorder);
-
-        val = cv::Scalar(rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0),
-                         rng.uniform(-100.0, 100.0), rng.uniform(-100.0, 100.0));
-    }
-
-    void Near(double threshold = 0.)
-    {
-        Mat whole, roi;
-        gdst1_whole.download(whole);
-        gdst1_roi.download(roi);
-
-        EXPECT_MAT_NEAR(dst1, whole, threshold);
-        EXPECT_MAT_NEAR(dst1_roi, roi, threshold);
-    }
-
-    void Near1(double threshold = 0.)
-    {
-        Mat whole, roi;
-        gdst2_whole.download(whole);
-        gdst2_roi.download(roi);
-
-        EXPECT_MAT_NEAR(dst2, whole, threshold);
-        EXPECT_MAT_NEAR(dst2_roi, roi, threshold);
-    }
-};
-
-//////////////////////////////// Exp /////////////////////////////////////////////////
-
-typedef ArithmTestBase Exp;
-
-OCL_TEST_P(Exp, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::exp(src1_roi, dst1_roi);
-        cv::ocl::exp(gsrc1_roi, gdst1_roi);
-
-        Near(2);
-    }
-}
-
-//////////////////////////////// Log /////////////////////////////////////////////////
-
-typedef ArithmTestBase Log;
-
-OCL_TEST_P(Log, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::log(src1_roi, dst1_roi);
-        cv::ocl::log(gsrc1_roi, gdst1_roi);
-        Near(1);
-    }
-}
-
-//////////////////////////////// Sqrt ////////////////////////////////////////////////
-
-typedef ArithmTestBase Sqrt;
-
-OCL_TEST_P(Sqrt, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::sqrt(src1_roi, dst1_roi);
-        cv::ocl::sqrt(gsrc1_roi, gdst1_roi);
-        Near(1);
-    }
-}
-
-//////////////////////////////// Add /////////////////////////////////////////////////
-
-typedef ArithmTestBase Add;
-
-OCL_TEST_P(Add, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::add(src1_roi, src2_roi, dst1_roi);
-        cv::ocl::add(gsrc1_roi, gsrc2_roi, gdst1_roi);
-        Near(0);
-    }
-}
-
-OCL_TEST_P(Add, Mat_Mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::add(src1_roi, src2_roi, dst1_roi, mask_roi);
-        cv::ocl::add(gsrc1_roi, gsrc2_roi, gdst1_roi, gmask_roi);
-        Near(0);
-    }
-}
-
-OCL_TEST_P(Add, Scalar)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::add(src1_roi, val, dst1_roi);
-        cv::ocl::add(gsrc1_roi, val, gdst1_roi);
-        Near(1e-5);
-    }
-}
-
-OCL_TEST_P(Add, Scalar_Mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::add(src1_roi, val, dst1_roi, mask_roi);
-        cv::ocl::add(gsrc1_roi, val, gdst1_roi, gmask_roi);
-        Near(1e-5);
-    }
-}
-
-//////////////////////////////// Sub /////////////////////////////////////////////////
-
-typedef ArithmTestBase Sub;
-
-OCL_TEST_P(Sub, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::subtract(src1_roi, src2_roi, dst1_roi);
-        cv::ocl::subtract(gsrc1_roi, gsrc2_roi, gdst1_roi);
-
-        Near(0);
-    }
-}
-
-OCL_TEST_P(Sub, Mat_Mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::subtract(src1_roi, src2_roi, dst1_roi, mask_roi);
-        cv::ocl::subtract(gsrc1_roi, gsrc2_roi, gdst1_roi, gmask_roi);
-        Near(0);
-    }
-}
-
-OCL_TEST_P(Sub, Scalar)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::subtract(src1_roi, val, dst1_roi);
-        cv::ocl::subtract(gsrc1_roi, val, gdst1_roi);
-
-        Near(1e-5);
-    }
-}
-
-OCL_TEST_P(Sub, Scalar_Mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::subtract(src1_roi, val, dst1_roi, mask_roi);
-        cv::ocl::subtract(gsrc1_roi, val, gdst1_roi, gmask_roi);
-        Near(1e-5);
-    }
-}
-
-//////////////////////////////// Mul /////////////////////////////////////////////////
-
-typedef ArithmTestBase Mul;
-
-OCL_TEST_P(Mul, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::multiply(src1_roi, src2_roi, dst1_roi);
-        cv::ocl::multiply(gsrc1_roi, gsrc2_roi, gdst1_roi);
-        Near(0);
-    }
-}
-
-OCL_TEST_P(Mul, Scalar)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::multiply(Scalar::all(val[0]), src1_roi, dst1_roi);
-        cv::ocl::multiply(val[0], gsrc1_roi, gdst1_roi);
-
-        Near(gdst1_roi.depth() >= CV_32F ? 1e-3 : 1);
-    }
-}
-
-OCL_TEST_P(Mul, Mat_Scalar)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::multiply(src1_roi, src2_roi, dst1_roi, val[0]);
-        cv::ocl::multiply(gsrc1_roi, gsrc2_roi, gdst1_roi, val[0]);
-
-        Near(gdst1_roi.depth() >= CV_32F ? 1e-3 : 1);
-    }
-}
-
-//////////////////////////////// Div /////////////////////////////////////////////////
-
-typedef ArithmTestBase Div;
-
-OCL_TEST_P(Div, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::divide(src1_roi, src2_roi, dst1_roi);
-        cv::ocl::divide(gsrc1_roi, gsrc2_roi, gdst1_roi);
-        Near(1);
-    }
-}
-
-OCL_TEST_P(Div, Scalar)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::divide(val[0], src1_roi, dst1_roi);
-        cv::ocl::divide(val[0], gsrc1_roi, gdst1_roi);
-
-        Near(gdst1_roi.depth() >= CV_32F ? 1e-3 : 1);
-    }
-}
-
-OCL_TEST_P(Div, Mat_Scalar)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::divide(src1_roi, src2_roi, dst1_roi, val[0]);
-        cv::ocl::divide(gsrc1_roi, gsrc2_roi, gdst1_roi, val[0]);
-
-        Near(gdst1_roi.depth() >= CV_32F ? 4e-3 : 1);
-    }
-}
-
-//////////////////////////////// Absdiff /////////////////////////////////////////////////
-
-typedef ArithmTestBase Min;
-
-OCL_TEST_P(Min, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        dst1_roi = cv::min(src1_roi, src2_roi);
-        cv::ocl::min(gsrc1_roi, gsrc2_roi, gdst1_roi);
-        Near(0);
-    }
-}
-
-typedef ArithmTestBase Max;
-
-OCL_TEST_P(Max, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        dst1_roi = cv::min(src1_roi, src2_roi);
-        cv::ocl::min(gsrc1_roi, gsrc2_roi, gdst1_roi);
-        Near(0);
-    }
-}
-
-//////////////////////////////// Abs /////////////////////////////////////////////////////
-
-typedef ArithmTestBase Abs;
-
-OCL_TEST_P(Abs, Abs)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        dst1_roi = cv::abs(src1_roi);
-        cv::ocl::abs(gsrc1_roi, gdst1_roi);
-        Near(0);
-    }
-}
-
-//////////////////////////////// Absdiff /////////////////////////////////////////////////
-
-typedef ArithmTestBase Absdiff;
-
-OCL_TEST_P(Absdiff, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::absdiff(src1_roi, src2_roi, dst1_roi);
-        cv::ocl::absdiff(gsrc1_roi, gsrc2_roi, gdst1_roi);
-        Near(0);
-    }
-}
-
-OCL_TEST_P(Absdiff, Scalar)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::absdiff(src1_roi, val, dst1_roi);
-        cv::ocl::absdiff(gsrc1_roi, val, gdst1_roi);
-        Near(1e-5);
-    }
-}
-
-//////////////////////////////// CartToPolar /////////////////////////////////////////////////
-
-typedef ArithmTestBase CartToPolar;
-
-OCL_TEST_P(CartToPolar, angleInDegree)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::cartToPolar(src1_roi, src2_roi, dst1_roi, dst2_roi, true);
-        cv::ocl::cartToPolar(gsrc1_roi, gsrc2_roi, gdst1_roi, gdst2_roi, true);
-        Near(.5);
-        Near1(.5);
-    }
-}
-
-OCL_TEST_P(CartToPolar, angleInRadians)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::cartToPolar(src1_roi, src2_roi, dst1_roi, dst2_roi);
-        cv::ocl::cartToPolar(gsrc1_roi, gsrc2_roi, gdst1_roi, gdst2_roi);
-        Near(.5);
-        Near1(.5);
-    }
-}
-
-//////////////////////////////// PolarToCart /////////////////////////////////////////////////
-
-typedef ArithmTestBase PolarToCart;
-
-OCL_TEST_P(PolarToCart, angleInDegree)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::polarToCart(src1_roi, src2_roi, dst1_roi, dst2_roi, true);
-        cv::ocl::polarToCart(gsrc1_roi, gsrc2_roi, gdst1_roi, gdst2_roi, true);
-
-        Near(.5);
-        Near1(.5);
-    }
-}
-
-OCL_TEST_P(PolarToCart, angleInRadians)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::polarToCart(src1_roi, src2_roi, dst1_roi, dst2_roi);
-        cv::ocl::polarToCart(gsrc1_roi, gsrc2_roi, gdst1_roi, gdst2_roi);
-
-        Near(.5);
-        Near1(.5);
-    }
-}
-
-//////////////////////////////// Magnitude /////////////////////////////////////////////////
-
-typedef ArithmTestBase Magnitude;
-
-OCL_TEST_P(Magnitude, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::magnitude(src1_roi, src2_roi, dst1_roi);
-        cv::ocl::magnitude(gsrc1_roi, gsrc2_roi, gdst1_roi);
-        Near(depth == CV_64F ? 1e-5 : 1e-2);
-    }
-}
-
-//////////////////////////////// Transpose /////////////////////////////////////////////////
-
-typedef ArithmTestBase Transpose;
-
-OCL_TEST_P(Transpose, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::transpose(src1_roi, dst1_roi);
-        cv::ocl::transpose(gsrc1_roi, gdst1_roi);
-
-        Near(1e-5);
-    }
-}
-
-OCL_TEST_P(Transpose, SquareInplace)
-{
-    const int type = CV_MAKE_TYPE(depth, cn);
-
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        Size roiSize = randomSize(1, MAX_VALUE);
-        roiSize.height = roiSize.width; // make it square
-
-        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src1, src1_roi, roiSize, srcBorder, type, 5, 16);
-
-        generateOclMat(gsrc1_whole, gsrc1_roi, src1, roiSize, srcBorder);
-
-        cv::transpose(src1_roi, src1_roi);
-        cv::ocl::transpose(gsrc1_roi, gsrc1_roi);
-
-        EXPECT_MAT_NEAR(src1, Mat(gsrc1_whole), 0.0);
-        EXPECT_MAT_NEAR(src1_roi, Mat(gsrc1_roi), 0.0);
-    }
-}
-
-//////////////////////////////// Flip /////////////////////////////////////////////////
-
-typedef ArithmTestBase Flip;
-
-OCL_TEST_P(Flip, X)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::flip(src1_roi, dst1_roi, 0);
-        cv::ocl::flip(gsrc1_roi, gdst1_roi, 0);
-        Near(1e-5);
-    }
-}
-
-OCL_TEST_P(Flip, Y)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::flip(src1_roi, dst1_roi, 1);
-        cv::ocl::flip(gsrc1_roi, gdst1_roi, 1);
-        Near(1e-5);
-    }
-}
-
-OCL_TEST_P(Flip, BOTH)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::flip(src1_roi, dst1_roi, -1);
-        cv::ocl::flip(gsrc1_roi, gdst1_roi, -1);
-        Near(1e-5);
-    }
-}
-
-//////////////////////////////// MinMax /////////////////////////////////////////////////
-
-typedef ArithmTestBase MinMax;
-
-OCL_TEST_P(MinMax, MAT)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        double minVal, maxVal;
-
-        if (src1.depth() != CV_8S)
-            cv::minMaxIdx(src1_roi, &minVal, &maxVal, NULL, NULL);
-        else
-        {
-            minVal = std::numeric_limits<double>::max();
-            maxVal = -std::numeric_limits<double>::max();
-            for (int i = 0; i < src1_roi.rows; ++i)
-                for (int j = 0; j < src1_roi.cols; ++j)
-                {
-                    signed char val = src1_roi.at<signed char>(i, j);
-                    if (val < minVal) minVal = val;
-                    if (val > maxVal) maxVal = val;
-                }
-        }
-
-        double minVal_, maxVal_;
-        cv::ocl::minMax(gsrc1_roi, &minVal_, &maxVal_);
-
-        EXPECT_DOUBLE_EQ(minVal_, minVal);
-        EXPECT_DOUBLE_EQ(maxVal_, maxVal);
-    }
-}
-
-OCL_TEST_P(MinMax, MASK)
-{
-    enum { MAX_IDX = 0, MIN_IDX };
-    static const double minMaxGolds[2][7] =
-    {
-        { std::numeric_limits<uchar>::min(), std::numeric_limits<char>::min(), std::numeric_limits<ushort>::min(),
-          std::numeric_limits<short>::min(), std::numeric_limits<int>::min(), -std::numeric_limits<float>::max(), -std::numeric_limits<double>::max() },
-        { std::numeric_limits<uchar>::max(), std::numeric_limits<char>::max(), std::numeric_limits<ushort>::max(),
-          std::numeric_limits<short>::max(), std::numeric_limits<int>::max(), std::numeric_limits<float>::max(), std::numeric_limits<double>::max() },
-    };
-
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        double minVal, maxVal;
-        cv::Point minLoc, maxLoc;
-
-        if (src1.depth() != CV_8S)
-            cv::minMaxLoc(src1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi);
-        else
-        {
-            minVal = std::numeric_limits<double>::max();
-            maxVal = -std::numeric_limits<double>::max();
-            for (int i = 0; i < src1_roi.rows; ++i)
-                for (int j = 0; j < src1_roi.cols; ++j)
-                {
-                    signed char val = src1_roi.at<signed char>(i, j);
-                    unsigned char m = mask_roi.at<unsigned char>(i, j);
-                    if (val < minVal && m) minVal = val;
-                    if (val > maxVal && m) maxVal = val;
-                }
-        }
-
-        double minVal_, maxVal_;
-        cv::ocl::minMax(gsrc1_roi, &minVal_, &maxVal_, gmask_roi);
-
-        if (cv::countNonZero(mask_roi) == 0)
-        {
-            EXPECT_DOUBLE_EQ(minMaxGolds[MIN_IDX][depth], minVal_);
-            EXPECT_DOUBLE_EQ(minMaxGolds[MAX_IDX][depth], maxVal_);
-        }
-        else
-        {
-            EXPECT_DOUBLE_EQ(minVal, minVal_);
-            EXPECT_DOUBLE_EQ(maxVal, maxVal_);
-        }
-    }
-}
-
-//////////////////////////////// MinMaxLoc /////////////////////////////////////////////////
-
-typedef ArithmTestBase MinMaxLoc;
-
-OCL_TEST_P(MinMaxLoc, MAT)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        double minVal, maxVal;
-        cv::Point minLoc, maxLoc;
-        int depth = src1.depth();
-
-        if (depth != CV_8S)
-            cv::minMaxLoc(src1_roi, &minVal, &maxVal, &minLoc, &maxLoc);
-        else
-        {
-            minVal = std::numeric_limits<double>::max();
-            maxVal = -std::numeric_limits<double>::max();
-            for (int i = 0; i < src1_roi.rows; ++i)
-                for (int j = 0; j < src1_roi.cols; ++j)
-                {
-                    signed char val = src1_roi.at<signed char>(i, j);
-                    if (val < minVal)
-                    {
-                        minVal = val;
-                        minLoc.x = j;
-                        minLoc.y = i;
-                    }
-                    if (val > maxVal)
-                    {
-                        maxVal = val;
-                        maxLoc.x = j;
-                        maxLoc.y = i;
-                    }
-                }
-        }
-
-        double minVal_, maxVal_;
-        cv::Point minLoc_, maxLoc_;
-        cv::ocl::minMaxLoc(gsrc1_roi, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat());
-
-        double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.;
-        if (depth == 0)
-        {
-            minlocVal = src1_roi.at<unsigned char>(minLoc);
-            minlocVal_ = src1_roi.at<unsigned char>(minLoc_);
-            maxlocVal = src1_roi.at<unsigned char>(maxLoc);
-            maxlocVal_ = src1_roi.at<unsigned char>(maxLoc_);
-            error0 = ::abs(src1_roi.at<unsigned char>(minLoc_) - src1_roi.at<unsigned char>(minLoc));
-            error1 = ::abs(src1_roi.at<unsigned char>(maxLoc_) - src1_roi.at<unsigned char>(maxLoc));
-        }
-        if (depth == 1)
-        {
-            minlocVal = src1_roi.at<signed char>(minLoc);
-            minlocVal_ = src1_roi.at<signed char>(minLoc_);
-            maxlocVal = src1_roi.at<signed char>(maxLoc);
-            maxlocVal_ = src1_roi.at<signed char>(maxLoc_);
-            error0 = ::abs(src1_roi.at<signed char>(minLoc_) - src1_roi.at<signed char>(minLoc));
-            error1 = ::abs(src1_roi.at<signed char>(maxLoc_) - src1_roi.at<signed char>(maxLoc));
-        }
-        if (depth == 2)
-        {
-            minlocVal = src1_roi.at<unsigned short>(minLoc);
-            minlocVal_ = src1_roi.at<unsigned short>(minLoc_);
-            maxlocVal = src1_roi.at<unsigned short>(maxLoc);
-            maxlocVal_ = src1_roi.at<unsigned short>(maxLoc_);
-            error0 = ::abs(src1_roi.at<unsigned short>(minLoc_) - src1_roi.at<unsigned short>(minLoc));
-            error1 = ::abs(src1_roi.at<unsigned short>(maxLoc_) - src1_roi.at<unsigned short>(maxLoc));
-        }
-        if (depth == 3)
-        {
-            minlocVal = src1_roi.at<signed short>(minLoc);
-            minlocVal_ = src1_roi.at<signed short>(minLoc_);
-            maxlocVal = src1_roi.at<signed short>(maxLoc);
-            maxlocVal_ = src1_roi.at<signed short>(maxLoc_);
-            error0 = ::abs(src1_roi.at<signed short>(minLoc_) - src1_roi.at<signed short>(minLoc));
-            error1 = ::abs(src1_roi.at<signed short>(maxLoc_) - src1_roi.at<signed short>(maxLoc));
-        }
-        if (depth == 4)
-        {
-            minlocVal = src1_roi.at<int>(minLoc);
-            minlocVal_ = src1_roi.at<int>(minLoc_);
-            maxlocVal = src1_roi.at<int>(maxLoc);
-            maxlocVal_ = src1_roi.at<int>(maxLoc_);
-            error0 = ::abs(src1_roi.at<int>(minLoc_) - src1_roi.at<int>(minLoc));
-            error1 = ::abs(src1_roi.at<int>(maxLoc_) - src1_roi.at<int>(maxLoc));
-        }
-        if (depth == 5)
-        {
-            minlocVal = src1_roi.at<float>(minLoc);
-            minlocVal_ = src1_roi.at<float>(minLoc_);
-            maxlocVal = src1_roi.at<float>(maxLoc);
-            maxlocVal_ = src1_roi.at<float>(maxLoc_);
-            error0 = ::abs(src1_roi.at<float>(minLoc_) - src1_roi.at<float>(minLoc));
-            error1 = ::abs(src1_roi.at<float>(maxLoc_) - src1_roi.at<float>(maxLoc));
-        }
-        if (depth == 6)
-        {
-            minlocVal = src1_roi.at<double>(minLoc);
-            minlocVal_ = src1_roi.at<double>(minLoc_);
-            maxlocVal = src1_roi.at<double>(maxLoc);
-            maxlocVal_ = src1_roi.at<double>(maxLoc_);
-            error0 = ::abs(src1_roi.at<double>(minLoc_) - src1_roi.at<double>(minLoc));
-            error1 = ::abs(src1_roi.at<double>(maxLoc_) - src1_roi.at<double>(maxLoc));
-        }
-
-        EXPECT_DOUBLE_EQ(minVal_, minVal);
-        EXPECT_DOUBLE_EQ(maxVal_, maxVal);
-        EXPECT_DOUBLE_EQ(minlocVal_, minlocVal);
-        EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal);
-
-        EXPECT_DOUBLE_EQ(error0, 0.0);
-        EXPECT_DOUBLE_EQ(error1, 0.0);
-    }
-}
-
-OCL_TEST_P(MinMaxLoc, MASK)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-        double minVal, maxVal;
-        cv::Point minLoc, maxLoc;
-        int depth = src1.depth();
-        if (depth != CV_8S)
-            cv::minMaxLoc(src1_roi, &minVal, &maxVal, &minLoc, &maxLoc, mask_roi);
-        else
-        {
-            minVal = std::numeric_limits<double>::max();
-            maxVal = -std::numeric_limits<double>::max();
-            for (int i = 0; i < src1_roi.rows; ++i)
-                for (int j = 0; j < src1_roi.cols; ++j)
-                {
-                    signed char val = src1_roi.at<signed char>(i, j);
-                    unsigned char m = mask_roi.at<unsigned char>(i , j);
-                    if (val < minVal && m)
-                    {
-                        minVal = val;
-                        minLoc.x = j;
-                        minLoc.y = i;
-                    }
-                    if (val > maxVal && m)
-                    {
-                        maxVal = val;
-                        maxLoc.x = j;
-                        maxLoc.y = i;
-                    }
-                }
-        }
-
-        double minVal_, maxVal_;
-        cv::Point minLoc_, maxLoc_;
-        cv::ocl::minMaxLoc(gsrc1_roi, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask_roi);
-
-        double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.;
-        if (minLoc_.x == -1 || minLoc_.y == -1 || maxLoc_.x == -1 || maxLoc_.y == -1) continue;
-        if (depth == 0)
-        {
-            minlocVal = src1_roi.at<unsigned char>(minLoc);
-            minlocVal_ = src1_roi.at<unsigned char>(minLoc_);
-            maxlocVal = src1_roi.at<unsigned char>(maxLoc);
-            maxlocVal_ = src1_roi.at<unsigned char>(maxLoc_);
-            error0 = ::abs(src1_roi.at<unsigned char>(minLoc_) - src1_roi.at<unsigned char>(minLoc));
-            error1 = ::abs(src1_roi.at<unsigned char>(maxLoc_) - src1_roi.at<unsigned char>(maxLoc));
-        }
-        if (depth == 1)
-        {
-            minlocVal = src1_roi.at<signed char>(minLoc);
-            minlocVal_ = src1_roi.at<signed char>(minLoc_);
-            maxlocVal = src1_roi.at<signed char>(maxLoc);
-            maxlocVal_ = src1_roi.at<signed char>(maxLoc_);
-            error0 = ::abs(src1_roi.at<signed char>(minLoc_) - src1_roi.at<signed char>(minLoc));
-            error1 = ::abs(src1_roi.at<signed char>(maxLoc_) - src1_roi.at<signed char>(maxLoc));
-        }
-        if (depth == 2)
-        {
-            minlocVal = src1_roi.at<unsigned short>(minLoc);
-            minlocVal_ = src1_roi.at<unsigned short>(minLoc_);
-            maxlocVal = src1_roi.at<unsigned short>(maxLoc);
-            maxlocVal_ = src1_roi.at<unsigned short>(maxLoc_);
-            error0 = ::abs(src1_roi.at<unsigned short>(minLoc_) - src1_roi.at<unsigned short>(minLoc));
-            error1 = ::abs(src1_roi.at<unsigned short>(maxLoc_) - src1_roi.at<unsigned short>(maxLoc));
-        }
-        if (depth == 3)
-        {
-            minlocVal = src1_roi.at<signed short>(minLoc);
-            minlocVal_ = src1_roi.at<signed short>(minLoc_);
-            maxlocVal = src1_roi.at<signed short>(maxLoc);
-            maxlocVal_ = src1_roi.at<signed short>(maxLoc_);
-            error0 = ::abs(src1_roi.at<signed short>(minLoc_) - src1_roi.at<signed short>(minLoc));
-            error1 = ::abs(src1_roi.at<signed short>(maxLoc_) - src1_roi.at<signed short>(maxLoc));
-        }
-        if (depth == 4)
-        {
-            minlocVal = src1_roi.at<int>(minLoc);
-            minlocVal_ = src1_roi.at<int>(minLoc_);
-            maxlocVal = src1_roi.at<int>(maxLoc);
-            maxlocVal_ = src1_roi.at<int>(maxLoc_);
-            error0 = ::abs(src1_roi.at<int>(minLoc_) - src1_roi.at<int>(minLoc));
-            error1 = ::abs(src1_roi.at<int>(maxLoc_) - src1_roi.at<int>(maxLoc));
-        }
-        if (depth == 5)
-        {
-            minlocVal = src1_roi.at<float>(minLoc);
-            minlocVal_ = src1_roi.at<float>(minLoc_);
-            maxlocVal = src1_roi.at<float>(maxLoc);
-            maxlocVal_ = src1_roi.at<float>(maxLoc_);
-            error0 = ::abs(src1_roi.at<float>(minLoc_) - src1_roi.at<float>(minLoc));
-            error1 = ::abs(src1_roi.at<float>(maxLoc_) - src1_roi.at<float>(maxLoc));
-        }
-        if (depth == 6)
-        {
-            minlocVal = src1_roi.at<double>(minLoc);
-            minlocVal_ = src1_roi.at<double>(minLoc_);
-            maxlocVal = src1_roi.at<double>(maxLoc);
-            maxlocVal_ = src1_roi.at<double>(maxLoc_);
-            error0 = ::abs(src1_roi.at<double>(minLoc_) - src1_roi.at<double>(minLoc));
-            error1 = ::abs(src1_roi.at<double>(maxLoc_) - src1_roi.at<double>(maxLoc));
-        }
-
-        EXPECT_DOUBLE_EQ(minVal_, minVal);
-        EXPECT_DOUBLE_EQ(maxVal_, maxVal);
-        EXPECT_DOUBLE_EQ(minlocVal_, minlocVal);
-        EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal);
-
-        EXPECT_DOUBLE_EQ(error0, 0.0);
-        EXPECT_DOUBLE_EQ(error1, 0.0);
-    }
-}
-
-//////////////////////////////// Sum /////////////////////////////////////////////////
-
-typedef ArithmTestBase Sum;
-
-OCL_TEST_P(Sum, MAT)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        Scalar cpures = cv::sum(src1_roi);
-        Scalar gpures = cv::ocl::sum(gsrc1_roi);
-
-        // check results
-        EXPECT_NEAR(cpures[0], gpures[0], 0.1);
-        EXPECT_NEAR(cpures[1], gpures[1], 0.1);
-        EXPECT_NEAR(cpures[2], gpures[2], 0.1);
-        EXPECT_NEAR(cpures[3], gpures[3], 0.1);
-    }
-}
-
-typedef ArithmTestBase SqrSum;
-
-template <typename T, typename WT>
-static Scalar sqrSum(const Mat & src)
-{
-    Scalar sum = Scalar::all(0);
-    int cn = src.channels();
-    WT data[4] = { 0, 0, 0, 0 };
-
-    int cols = src.cols * cn;
-    for (int y = 0; y < src.rows; ++y)
-    {
-        const T * const sdata = src.ptr<T>(y);
-        for (int x = 0; x < cols; )
-            for (int i = 0; i < cn; ++i, ++x)
-            {
-                WT t = static_cast<WT>(sdata[x]);
-                data[i] += t * t;
-            }
-    }
-
-    for (int i = 0; i < cn; ++i)
-        sum[i] = static_cast<double>(data[i]);
-
-    return sum;
-}
-
-typedef Scalar (*sumFunc)(const Mat &);
-
-OCL_TEST_P(SqrSum, MAT)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        static sumFunc funcs[] = { sqrSum<uchar, int>,
-                                 sqrSum<char, int>,
-                                 sqrSum<ushort, int>,
-                                 sqrSum<short, int>,
-                                 sqrSum<int, int>,
-                                 sqrSum<float, double>,
-                                 sqrSum<double, double>,
-                                 0 };
-
-        sumFunc func = funcs[src1_roi.depth()];
-        CV_Assert(func != 0);
-
-        Scalar cpures = func(src1_roi);
-        Scalar gpures = cv::ocl::sqrSum(gsrc1_roi);
-
-        // check results
-        EXPECT_NEAR(cpures[0], gpures[0], 1.0);
-        EXPECT_NEAR(cpures[1], gpures[1], 1.0);
-        EXPECT_NEAR(cpures[2], gpures[2], 1.0);
-        EXPECT_NEAR(cpures[3], gpures[3], 1.0);
-    }
-}
-
-typedef ArithmTestBase AbsSum;
-
-template <typename T, typename WT>
-static Scalar absSum(const Mat & src)
-{
-    Scalar sum = Scalar::all(0);
-    int cn = src.channels();
-    WT data[4] = { 0, 0, 0, 0 };
-
-    int cols = src.cols * cn;
-    for (int y = 0; y < src.rows; ++y)
-    {
-        const T * const sdata = src.ptr<T>(y);
-        for (int x = 0; x < cols; )
-            for (int i = 0; i < cn; ++i, ++x)
-            {
-                WT t = static_cast<WT>(sdata[x]);
-                data[i] += t >= 0 ? t : -t;
-            }
-    }
-
-    for (int i = 0; i < cn; ++i)
-        sum[i] = static_cast<double>(data[i]);
-
-    return sum;
-}
-
-OCL_TEST_P(AbsSum, MAT)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        static sumFunc funcs[] = { absSum<uchar, int>,
-                                 absSum<char, int>,
-                                 absSum<ushort, int>,
-                                 absSum<short, int>,
-                                 absSum<int, int>,
-                                 absSum<float, double>,
-                                 absSum<double, double>,
-                                 0 };
-
-        sumFunc func = funcs[src1_roi.depth()];
-        CV_Assert(func != 0);
-
-        Scalar cpures = func(src1_roi);
-        Scalar gpures = cv::ocl::absSum(gsrc1_roi);
-
-        // check results
-        EXPECT_NEAR(cpures[0], gpures[0], 0.1);
-        EXPECT_NEAR(cpures[1], gpures[1], 0.1);
-        EXPECT_NEAR(cpures[2], gpures[2], 0.1);
-        EXPECT_NEAR(cpures[3], gpures[3], 0.1);
-    }
-}
-
-//////////////////////////////// CountNonZero /////////////////////////////////////////////////
-
-typedef ArithmTestBase CountNonZero;
-
-OCL_TEST_P(CountNonZero, MAT)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-        int cpures = cv::countNonZero(src1_roi);
-        int gpures = cv::ocl::countNonZero(gsrc1_roi);
-
-        EXPECT_DOUBLE_EQ((double)cpures, (double)gpures);
-    }
-}
-
-//////////////////////////////// Phase /////////////////////////////////////////////////
-
-typedef ArithmTestBase Phase;
-
-OCL_TEST_P(Phase, angleInDegrees)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-        cv::phase(src1_roi, src2_roi, dst1_roi, true);
-        cv::ocl::phase(gsrc1_roi, gsrc2_roi, gdst1_roi, true);
-
-        Near(1e-2);
-    }
-}
-
-OCL_TEST_P(Phase, angleInRadians)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-        cv::phase(src1_roi, src2_roi, dst1_roi);
-        cv::ocl::phase(gsrc1_roi, gsrc2_roi, gdst1_roi);
-
-        Near(1e-2);
-    }
-}
-
-//////////////////////////////// Bitwise_and /////////////////////////////////////////////////
-
-typedef ArithmTestBase Bitwise_and;
-
-OCL_TEST_P(Bitwise_and, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_and(src1_roi, src2_roi, dst1_roi);
-        cv::ocl::bitwise_and(gsrc1_roi, gsrc2_roi, gdst1_roi);
-        Near(0);
-    }
-}
-
-OCL_TEST_P(Bitwise_and, Mat_Mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_and(src1_roi, src2_roi, dst1_roi, mask_roi);
-        cv::ocl::bitwise_and(gsrc1_roi, gsrc2_roi, gdst1_roi, gmask_roi);
-        Near(0);
-    }
-}
-
-OCL_TEST_P(Bitwise_and, Scalar)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_and(src1_roi, val, dst1_roi);
-        cv::ocl::bitwise_and(gsrc1_roi, val, gdst1_roi);
-        Near(1e-5);
-    }
-}
-
-OCL_TEST_P(Bitwise_and, Scalar_Mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_and(src1_roi, val, dst1_roi, mask_roi);
-        cv::ocl::bitwise_and(gsrc1_roi, val, gdst1_roi, gmask_roi);
-        Near(1e-5);
-    }
-}
-
-//////////////////////////////// Bitwise_or /////////////////////////////////////////////////
-
-typedef ArithmTestBase Bitwise_or;
-
-OCL_TEST_P(Bitwise_or, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_or(src1_roi, src2_roi, dst1_roi);
-        cv::ocl::bitwise_or(gsrc1_roi, gsrc2_roi, gdst1_roi);
-        Near(0);
-    }
-}
-
-OCL_TEST_P(Bitwise_or, Mat_Mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_or(src1_roi, src2_roi, dst1_roi, mask_roi);
-        cv::ocl::bitwise_or(gsrc1_roi, gsrc2_roi, gdst1_roi, gmask_roi);
-        Near(0);
-    }
-}
-
-OCL_TEST_P(Bitwise_or, Scalar)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_or(src1_roi, val, dst1_roi);
-        cv::ocl::bitwise_or(gsrc1_roi, val, gdst1_roi);
-        Near(1e-5);
-    }
-}
-
-OCL_TEST_P(Bitwise_or, Scalar_Mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_or(src1_roi, val, dst1_roi, mask_roi);
-        cv::ocl::bitwise_or(gsrc1_roi, val, gdst1_roi, gmask_roi);
-        Near(1e-5);
-    }
-}
-
-//////////////////////////////// Bitwise_xor /////////////////////////////////////////////////
-
-typedef ArithmTestBase Bitwise_xor;
-
-OCL_TEST_P(Bitwise_xor, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_xor(src1_roi, src2_roi, dst1_roi);
-        cv::ocl::bitwise_xor(gsrc1_roi, gsrc2_roi, gdst1_roi);
-        Near(0);
-    }
-}
-
-OCL_TEST_P(Bitwise_xor, Mat_Mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_xor(src1_roi, src2_roi, dst1_roi, mask_roi);
-        cv::ocl::bitwise_xor(gsrc1_roi, gsrc2_roi, gdst1_roi, gmask_roi);
-        Near(0);
-    }
-}
-
-OCL_TEST_P(Bitwise_xor, Scalar)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_xor(src1_roi, val, dst1_roi);
-        cv::ocl::bitwise_xor(gsrc1_roi, val, gdst1_roi);
-        Near(1e-5);
-    }
-}
-
-OCL_TEST_P(Bitwise_xor, Scalar_Mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_xor(src1_roi, val, dst1_roi, mask_roi);
-        cv::ocl::bitwise_xor(gsrc1_roi, val, gdst1_roi, gmask_roi);
-        Near(1e-5);
-    }
-}
-
-//////////////////////////////// Bitwise_not /////////////////////////////////////////////////
-
-typedef ArithmTestBase Bitwise_not;
-
-OCL_TEST_P(Bitwise_not, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::bitwise_not(src1_roi, dst1_roi);
-        cv::ocl::bitwise_not(gsrc1_roi, gdst1_roi);
-        Near(0);
-    }
-}
-
-//////////////////////////////// Compare /////////////////////////////////////////////////
-
-typedef ArithmTestBase Compare;
-
-OCL_TEST_P(Compare, Mat)
-{
-    int cmp_codes[] = { CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE };
-    int cmp_num = sizeof(cmp_codes) / sizeof(int);
-
-    for (int i = 0; i < cmp_num; ++i)
-        for (int j = 0; j < LOOP_TIMES; j++)
-        {
-            random_roi();
-
-            cv::compare(src1_roi, src2_roi, dst1_roi, cmp_codes[i]);
-            cv::ocl::compare(gsrc1_roi, gsrc2_roi, gdst1_roi, cmp_codes[i]);
-
-            Near(0);
-        }
-}
-
-//////////////////////////////// Pow /////////////////////////////////////////////////
-
-typedef ArithmTestBase Pow;
-
-OCL_TEST_P(Pow, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-        double p = 4.5;
-        cv::pow(src1_roi, p, dst1_roi);
-        cv::ocl::pow(gsrc1_roi, p, gdst1_roi);
-        Near(1);
-    }
-}
-
-//////////////////////////////// AddWeighted /////////////////////////////////////////////////
-
-typedef ArithmTestBase AddWeighted;
-
-OCL_TEST_P(AddWeighted, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        const double alpha = 2.0, beta = 1.0, gama = 3.0;
-
-        cv::addWeighted(src1_roi, alpha, src2_roi, beta, gama, dst1_roi);
-        cv::ocl::addWeighted(gsrc1_roi, alpha, gsrc2_roi, beta, gama, gdst1_roi);
-
-        Near(3e-4);
-    }
-}
-
-//////////////////////////////// setIdentity /////////////////////////////////////////////////
-
-typedef ArithmTestBase SetIdentity;
-
-OCL_TEST_P(SetIdentity, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::setIdentity(dst1_roi, val);
-        cv::ocl::setIdentity(gdst1_roi, val);
-
-        Near(0);
-    }
-}
-
-//////////////////////////////// meanStdDev /////////////////////////////////////////////////
-
-typedef ArithmTestBase MeanStdDev;
-
-OCL_TEST_P(MeanStdDev, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        Scalar cpu_mean, cpu_stddev;
-        Scalar gpu_mean, gpu_stddev;
-
-        cv::meanStdDev(src1_roi, cpu_mean, cpu_stddev);
-        cv::ocl::meanStdDev(gsrc1_roi, gpu_mean, gpu_stddev);
-
-        for (int i = 0; i < 4; ++i)
-        {
-            EXPECT_NEAR(cpu_mean[i], gpu_mean[i], 0.1);
-            EXPECT_NEAR(cpu_stddev[i], gpu_stddev[i], 0.1);
-        }
-    }
-}
-
-//////////////////////////////// Norm /////////////////////////////////////////////////
-
-typedef ArithmTestBase Norm;
-
-OCL_TEST_P(Norm, NORM_INF)
-{
-    for (int relative = 0; relative < 2; ++relative)
-        for (int j = 0; j < LOOP_TIMES; j++)
-        {
-            random_roi();
-
-            int type = NORM_INF;
-            if (relative == 1)
-                type |= NORM_RELATIVE;
-
-            const double cpuRes = cv::norm(src1_roi, src2_roi, type);
-            const double gpuRes = cv::ocl::norm(gsrc1_roi, gsrc2_roi, type);
-
-            EXPECT_NEAR(cpuRes, gpuRes, 0.1);
-        }
-}
-
-OCL_TEST_P(Norm, NORM_L1)
-{
-    for (int relative = 0; relative < 2; ++relative)
-        for (int j = 0; j < LOOP_TIMES; j++)
-        {
-            random_roi();
-
-            int type = NORM_L1;
-            if (relative == 1)
-                type |= NORM_RELATIVE;
-
-            const double cpuRes = cv::norm(src1_roi, src2_roi, type);
-            const double gpuRes = cv::ocl::norm(gsrc1_roi, gsrc2_roi, type);
-
-            EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
-        }
-}
-
-OCL_TEST_P(Norm, NORM_L2)
-{
-    for (int relative = 0; relative < 2; ++relative)
-        for (int j = 0; j < LOOP_TIMES; j++)
-        {
-            random_roi();
-
-            int type = NORM_L2;
-            if (relative == 1)
-                type |= NORM_RELATIVE;
-
-            const double cpuRes = cv::norm(src1_roi, src2_roi, type);
-            const double gpuRes = cv::ocl::norm(gsrc1_roi, gsrc2_roi, type);
-
-            EXPECT_PRED3(relativeError, cpuRes, gpuRes, 1e-6);
-        }
-}
-
-//// Repeat
-
-struct RepeatTestCase :
-        public ArithmTestBase
-{
-    int nx, ny;
-
-    virtual void random_roi()
-    {
-        const int type = CV_MAKE_TYPE(depth, cn);
-
-        nx = randomInt(1, 4);
-        ny = randomInt(1, 4);
-
-        Size srcRoiSize = randomSize(1, MAX_VALUE);
-        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src1, src1_roi, srcRoiSize, srcBorder, type, 2, 11);
-
-        Size dstRoiSize(srcRoiSize.width * nx, srcRoiSize.height * ny);
-        Border dst1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(dst1, dst1_roi, dstRoiSize, dst1Border, type, 5, 16);
-
-        generateOclMat(gsrc1_whole, gsrc1_roi, src1, srcRoiSize, srcBorder);
-        generateOclMat(gdst1_whole, gdst1_roi, dst1, dstRoiSize, dst1Border);
-    }
-};
-
-typedef RepeatTestCase Repeat;
-
-OCL_TEST_P(Repeat, Mat)
-{
-    for (int i = 0; i < LOOP_TIMES; ++i)
-    {
-        random_roi();
-
-        cv::repeat(src1_roi, ny, nx, dst1_roi);
-        cv::ocl::repeat(gsrc1_roi, ny, nx, gdst1_roi);
-
-        Near();
-    }
-}
-
-//////////////////////////////////////// Instantiation /////////////////////////////////////////
-
-INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool(), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Exp, Combine(testing::Values(CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Log, Combine(testing::Values(CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Sqrt, Combine(testing::Values(CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Div, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Min, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Max, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Abs, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, CartToPolar, Combine(Values(CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, PolarToCart, Combine(Values(CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Magnitude, Combine(Values(CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Transpose, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, MinMax, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(Channels(1)), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, MinMaxLoc, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(Channels(1)), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Sum, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, SqrSum, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, AbsSum, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, CountNonZero, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(Channels(1)), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(Channels(1)), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Pow, Combine(Values(CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, AddWeighted, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, SetIdentity, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, MeanStdDev, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Norm, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-INSTANTIATE_TEST_CASE_P(Arithm, Repeat, Combine(Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), Values(1, 2, 3, 4), Bool()));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_bgfg.cpp b/modules/ocl/test/test_bgfg.cpp
deleted file mode 100644
index 8b4c865c3..000000000
--- a/modules/ocl/test/test_bgfg.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2013, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jin Ma, jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-#if defined(HAVE_XINE)         || \
-    defined(HAVE_GSTREAMER)    || \
-    defined(HAVE_QUICKTIME)    || \
-    defined(HAVE_AVFOUNDATION) || \
-    defined(HAVE_FFMPEG)       || \
-    defined(WIN32)
-
-#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
-#else
-#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
-#endif
-
-#if BUILD_WITH_VIDEO_INPUT_SUPPORT
-
-//////////////////////////////////////////////////////
-// MOG
-
-namespace
-{
-    IMPLEMENT_PARAM_CLASS(UseGray, bool)
-    IMPLEMENT_PARAM_CLASS(LearningRate, double)
-}
-
-PARAM_TEST_CASE(mog, UseGray, LearningRate, bool)
-{
-    bool useGray;
-    double learningRate;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        useGray = GET_PARAM(0);
-        learningRate = GET_PARAM(1);
-        useRoi = GET_PARAM(2);
-    }
-};
-
-OCL_TEST_P(mog, Update)
-{
-    std::string inputFile = string(cvtest::TS::ptr()->get_data_path()) + "gpu/video/768x576.avi";
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    cv::ocl::MOG mog;
-    cv::ocl::oclMat foreground = createMat_ocl(rng, frame.size(), CV_8UC1, useRoi);
-
-    Ptr<cv::BackgroundSubtractorMOG> mog_gold = createBackgroundSubtractorMOG();
-    cv::Mat foreground_gold;
-
-    for (int i = 0; i < 10; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        if (useGray)
-        {
-            cv::Mat temp;
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-            cv::swap(temp, frame);
-        }
-
-        mog(loadMat_ocl(rng, frame, useRoi), foreground, (float)learningRate);
-
-        mog_gold->apply(frame, foreground_gold, learningRate);
-
-        EXPECT_MAT_NEAR(foreground_gold, foreground, 0.0);
-    }
-}
-INSTANTIATE_TEST_CASE_P(OCL_Video, mog, testing::Combine(
-    testing::Values(UseGray(false), UseGray(true)),
-    testing::Values(LearningRate(0.0), LearningRate(0.01)),
-    Values(true, false)));
-
-//////////////////////////////////////////////////////
-// MOG2
-
-namespace
-{
-    IMPLEMENT_PARAM_CLASS(DetectShadow, bool)
-}
-
-PARAM_TEST_CASE(mog2, UseGray, DetectShadow, bool)
-{
-    bool useGray;
-    bool detectShadow;
-    bool useRoi;
-    virtual void SetUp()
-    {
-        useGray = GET_PARAM(0);
-        detectShadow = GET_PARAM(1);
-        useRoi = GET_PARAM(2);
-    }
-};
-
-OCL_TEST_P(mog2, Update)
-{
-    std::string inputFile = string(cvtest::TS::ptr()->get_data_path()) + "gpu/video/768x576.avi";
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    cv::ocl::MOG2 mog2;
-    mog2.bShadowDetection = detectShadow;
-    cv::ocl::oclMat foreground = createMat_ocl(rng, frame.size(), CV_8UC1, useRoi);
-
-    cv::Ptr<cv::BackgroundSubtractorMOG2> mog2_gold = createBackgroundSubtractorMOG2();
-    mog2_gold->setDetectShadows(detectShadow);
-    cv::Mat foreground_gold;
-
-    for (int i = 0; i < 10; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        if (useGray)
-        {
-            cv::Mat temp;
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-            cv::swap(temp, frame);
-        }
-
-        mog2(loadMat_ocl(rng, frame, useRoi), foreground);
-
-        mog2_gold->apply(frame, foreground_gold);
-
-        if (detectShadow)
-            EXPECT_MAT_SIMILAR(foreground_gold, foreground, 15e-3)
-        else
-            EXPECT_MAT_NEAR(foreground_gold, foreground, 0)
-    }
-}
-
-OCL_TEST_P(mog2, getBackgroundImage)
-{
-    if (useGray)
-        return;
-
-    std::string inputFile = string(cvtest::TS::ptr()->get_data_path()) + "gpu/video/768x576.avi";
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-
-    cv::ocl::MOG2 mog2;
-    mog2.bShadowDetection = detectShadow;
-    cv::ocl::oclMat foreground;
-
-    cv::Ptr<cv::BackgroundSubtractorMOG2> mog2_gold = createBackgroundSubtractorMOG2();
-    mog2_gold->setDetectShadows(detectShadow);
-    cv::Mat foreground_gold;
-
-    for (int i = 0; i < 10; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        mog2(loadMat_ocl(rng, frame, useRoi), foreground);
-
-        mog2_gold->apply(frame, foreground_gold);
-    }
-
-    cv::ocl::oclMat background = createMat_ocl(rng, frame.size(), frame.type(), useRoi);
-    mog2.getBackgroundImage(background);
-
-    cv::Mat background_gold;
-    mog2_gold->getBackgroundImage(background_gold);
-
-    EXPECT_MAT_NEAR(background_gold, background, 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_Video, mog2, testing::Combine(
-    testing::Values(UseGray(true), UseGray(false)),
-    testing::Values(DetectShadow(true), DetectShadow(false)),
-    Values(true, false)));
-
-#endif
-
-#endif
diff --git a/modules/ocl/test/test_blend.cpp b/modules/ocl/test/test_blend.cpp
deleted file mode 100644
index 1576891a4..000000000
--- a/modules/ocl/test/test_blend.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Nathan, liujun@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "test_precomp.hpp"
-#include <iomanip>
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace testing;
-using namespace std;
-
-template <typename T>
-static void blendLinearGold(const Mat &img1, const Mat &img2,
-                            const Mat &weights1, const Mat &weights2,
-                            Mat &result_gold)
-{
-    CV_Assert(img1.size() == img2.size() && img1.type() == img2.type());
-    CV_Assert(weights1.size() == weights2.size() && weights1.size() == img1.size() &&
-              weights1.type() == CV_32FC1 && weights2.type() == CV_32FC1);
-
-    result_gold.create(img1.size(), img1.type());
-
-    int cn = img1.channels();
-    int step1 = img1.cols * img1.channels();
-
-    for (int y = 0; y < img1.rows; ++y)
-    {
-        const float * const weights1_row = weights1.ptr<float>(y);
-        const float * const weights2_row = weights2.ptr<float>(y);
-        const T * const img1_row = img1.ptr<T>(y);
-        const T * const img2_row = img2.ptr<T>(y);
-        T * const result_gold_row = result_gold.ptr<T>(y);
-
-        for (int x = 0; x < step1; ++x)
-        {
-            int x1 = x / cn;
-            float w1 = weights1_row[x1], w2 = weights2_row[x1];
-            result_gold_row[x] = saturate_cast<T>(((float)img1_row[x] * w1
-                                                 + (float)img2_row[x] * w2) / (w1 + w2 + 1e-5f));
-        }
-    }
-}
-
-PARAM_TEST_CASE(Blend, MatDepth, int, bool)
-{
-    int depth, channels;
-    bool useRoi;
-
-    Mat src1, src2, weights1, weights2, dst;
-    Mat src1_roi, src2_roi, weights1_roi, weights2_roi, dst_roi;
-    oclMat gsrc1, gsrc2, gweights1, gweights2, gdst, gst;
-    oclMat gsrc1_roi, gsrc2_roi, gweights1_roi, gweights2_roi, gdst_roi;
-
-    virtual void SetUp()
-    {
-        depth = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        useRoi = GET_PARAM(2);
-    }
-
-    void random_roi()
-    {
-        const int type = CV_MAKE_TYPE(depth, channels);
-
-        const double upValue = 256;
-        const double sumMinValue = 0.01; // we don't want to divide by "zero"
-
-        Size roiSize = randomSize(1, 20);
-        Border src1Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src1, src1_roi, roiSize, src1Border, type, -upValue, upValue);
-
-        Border src2Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src2, src2_roi, roiSize, src2Border, type, -upValue, upValue);
-
-        Border weights1Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(weights1, weights1_roi, roiSize, weights1Border, CV_32FC1, -upValue, upValue);
-
-        Border weights2Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(weights2, weights2_roi, roiSize, weights2Border, CV_32FC1, sumMinValue, upValue); // fill it as a (w1 + w12)
-
-        weights2_roi = weights2_roi - weights1_roi;
-        // check that weights2_roi is still a part of weights2 (not a new matrix)
-        CV_Assert(checkNorm(weights2_roi,
-            weights2(Rect(weights2Border.lef, weights2Border.top, roiSize.width, roiSize.height))) < 1e-6);
-
-        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 16);
-
-        generateOclMat(gsrc1, gsrc1_roi, src1, roiSize, src1Border);
-        generateOclMat(gsrc2, gsrc2_roi, src2, roiSize, src2Border);
-        generateOclMat(gweights1, gweights1_roi, weights1, roiSize, weights1Border);
-        generateOclMat(gweights2, gweights2_roi, weights2, roiSize, weights2Border);
-        generateOclMat(gdst, gdst_roi, dst, roiSize, dstBorder);
-    }
-
-    void Near(double eps = 0.0)
-    {
-        Mat whole, roi;
-        gdst.download(whole);
-        gdst_roi.download(roi);
-
-        EXPECT_MAT_NEAR(dst, whole, eps);
-        EXPECT_MAT_NEAR(dst_roi, roi, eps);
-    }
-};
-
-typedef void (*blendLinearFunc)(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold);
-
-OCL_TEST_P(Blend, Accuracy)
-{
-    for (int i = 0; i < LOOP_TIMES; ++i)
-    {
-        random_roi();
-
-        cv::ocl::blendLinear(gsrc1_roi, gsrc2_roi, gweights1_roi, gweights2_roi, gdst_roi);
-
-        static blendLinearFunc funcs[] = {
-            blendLinearGold<uchar>,
-            blendLinearGold<schar>,
-            blendLinearGold<ushort>,
-            blendLinearGold<short>,
-            blendLinearGold<int>,
-            blendLinearGold<float>,
-        };
-
-        blendLinearFunc func = funcs[depth];
-        func(src1_roi, src2_roi, weights1_roi, weights2_roi, dst_roi);
-
-        Near(depth <= CV_32S ? 1.0 : 0.2);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Blend,
-                        Combine(testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F),
-                                testing::Range(1, 5), Bool()));
diff --git a/modules/ocl/test/test_brute_force_matcher.cpp b/modules/ocl/test/test_brute_force_matcher.cpp
deleted file mode 100644
index 04ca9e297..000000000
--- a/modules/ocl/test/test_brute_force_matcher.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Nathan, liujun@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-#ifdef HAVE_OPENCL
-namespace
-{
-    /////////////////////////////////////////////////////////////////////////////////////////////////
-    // BruteForceMatcher
-    CV_ENUM(DistType, BruteForceMatcher_OCL_base::L1Dist,
-                      BruteForceMatcher_OCL_base::L2Dist,
-                      BruteForceMatcher_OCL_base::HammingDist)
-    IMPLEMENT_PARAM_CLASS(DescriptorSize, int)
-    PARAM_TEST_CASE(BruteForceMatcher, DistType, DescriptorSize)
-    {
-        cv::ocl::BruteForceMatcher_OCL_base::DistType distType;
-        int normCode;
-        int dim;
-
-        int queryDescCount;
-        int countFactor;
-
-        cv::Mat query, train;
-
-        virtual void SetUp()
-        {
-            distType = (cv::ocl::BruteForceMatcher_OCL_base::DistType)(int)GET_PARAM(0);
-            dim = GET_PARAM(1);
-
-            queryDescCount = 300; // must be even number because we split train data in some cases in two
-            countFactor = 4; // do not change it
-
-            cv::Mat queryBuf, trainBuf;
-
-            // Generate query descriptors randomly.
-            // Descriptor vector elements are integer values.
-            queryBuf.create(queryDescCount, dim, CV_32SC1);
-            rng.fill(queryBuf, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(3));
-            queryBuf.convertTo(queryBuf, CV_32FC1);
-
-            // Generate train decriptors as follows:
-            // copy each query descriptor to train set countFactor times
-            // and perturb some one element of the copied descriptors in
-            // in ascending order. General boundaries of the perturbation
-            // are (0.f, 1.f).
-            trainBuf.create(queryDescCount * countFactor, dim, CV_32FC1);
-            float step = 1.f / countFactor;
-            for (int qIdx = 0; qIdx < queryDescCount; qIdx++)
-            {
-                cv::Mat queryDescriptor = queryBuf.row(qIdx);
-                for (int c = 0; c < countFactor; c++)
-                {
-                    int tIdx = qIdx * countFactor + c;
-                    cv::Mat trainDescriptor = trainBuf.row(tIdx);
-                    queryDescriptor.copyTo(trainDescriptor);
-                    int elem = rng(dim);
-                    float diff = rng.uniform(step * c, step * (c + 1));
-                    trainDescriptor.at<float>(0, elem) += diff;
-                }
-            }
-
-            queryBuf.convertTo(query, CV_32F);
-            trainBuf.convertTo(train, CV_32F);
-        }
-    };
-
-#ifdef ANDROID
-    OCL_TEST_P(BruteForceMatcher, DISABLED_Match_Single)
-#else
-    OCL_TEST_P(BruteForceMatcher, Match_Single)
-#endif
-    {
-        cv::ocl::BruteForceMatcher_OCL_base matcher(distType);
-
-        std::vector<cv::DMatch> matches;
-        matcher.match(cv::ocl::oclMat(query),  cv::ocl::oclMat(train),  matches);
-
-        ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
-
-        int badCount = 0;
-        for (size_t i = 0; i < matches.size(); i++)
-        {
-            cv::DMatch match = matches[i];
-            if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0))
-                badCount++;
-        }
-
-        ASSERT_EQ(0, badCount);
-    }
-
-#ifdef ANDROID
-    OCL_TEST_P(BruteForceMatcher, DISABLED_KnnMatch_2_Single)
-#else
-    OCL_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
-#endif
-    {
-        const int knn = 2;
-
-        cv::ocl::BruteForceMatcher_OCL_base matcher(distType);
-
-        std::vector< std::vector<cv::DMatch> > matches;
-        matcher.knnMatch(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches, knn);
-
-        ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
-
-        int badCount = 0;
-        for (size_t i = 0; i < matches.size(); i++)
-        {
-            if ((int)matches[i].size() != knn)
-                badCount++;
-            else
-            {
-                int localBadCount = 0;
-                for (int k = 0; k < knn; k++)
-                {
-                    cv::DMatch match = matches[i][k];
-                    if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor + k) || (match.imgIdx != 0))
-                        localBadCount++;
-                }
-                badCount += localBadCount > 0 ? 1 : 0;
-            }
-        }
-
-        ASSERT_EQ(0, badCount);
-    }
-
-#ifdef ANDROID
-    OCL_TEST_P(BruteForceMatcher, DISABLED_RadiusMatch_Single)
-#else
-    OCL_TEST_P(BruteForceMatcher, RadiusMatch_Single)
-#endif
-    {
-        float radius = 1.f / countFactor;
-
-        cv::ocl::BruteForceMatcher_OCL_base matcher(distType);
-
-        std::vector< std::vector<cv::DMatch> > matches;
-        matcher.radiusMatch(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches, radius);
-
-        ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());
-
-        int badCount = 0;
-        for (size_t i = 0; i < matches.size(); i++)
-        {
-            if ((int)matches[i].size() != 1)
-            {
-                badCount++;
-            }
-            else
-            {
-                cv::DMatch match = matches[i][0];
-                if ((match.queryIdx != (int)i) || (match.trainIdx != (int)i * countFactor) || (match.imgIdx != 0))
-                    badCount++;
-            }
-        }
-
-        ASSERT_EQ(0, badCount);
-    }
-
-    INSTANTIATE_TEST_CASE_P(OCL_Features2D, BruteForceMatcher,
-        testing::Combine(
-        testing::Values(
-            DistType(cv::ocl::BruteForceMatcher_OCL_base::L1Dist),
-            DistType(cv::ocl::BruteForceMatcher_OCL_base::L2Dist)/*,
-            DistType(cv::ocl::BruteForceMatcher_OCL_base::HammingDist)*/
-        ),
-        testing::Values(
-            DescriptorSize(57),
-            DescriptorSize(64),
-            DescriptorSize(83),
-            DescriptorSize(128),
-            DescriptorSize(179),
-            DescriptorSize(256),
-            DescriptorSize(304))
-        )
-    );
-} // namespace
-#endif
diff --git a/modules/ocl/test/test_calib3d.cpp b/modules/ocl/test/test_calib3d.cpp
deleted file mode 100644
index 9fd0b2329..000000000
--- a/modules/ocl/test/test_calib3d.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//     Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-#include <iomanip>
-
-using namespace cv;
-
-#ifdef HAVE_OPENCL
-
-PARAM_TEST_CASE(StereoMatchBM, int, int)
-{
-    int n_disp;
-    int winSize;
-
-    virtual void SetUp()
-    {
-        n_disp  = GET_PARAM(0);
-        winSize = GET_PARAM(1);
-    }
-};
-
-OCL_TEST_P(StereoMatchBM, Regression)
-{
-
-    Mat left_image  = readImage("gpu/stereobm/aloe-L.png", IMREAD_GRAYSCALE);
-    Mat right_image = readImage("gpu/stereobm/aloe-R.png", IMREAD_GRAYSCALE);
-    Mat disp_gold   = readImage("gpu/stereobm/aloe-disp.png", IMREAD_GRAYSCALE);
-    ocl::oclMat d_left, d_right;
-    ocl::oclMat d_disp(left_image.size(), CV_8U);
-    Mat  disp;
-
-    ASSERT_FALSE(left_image.empty());
-    ASSERT_FALSE(right_image.empty());
-    ASSERT_FALSE(disp_gold.empty());
-    d_left.upload(left_image);
-    d_right.upload(right_image);
-
-    ocl::StereoBM_OCL bm(0, n_disp, winSize);
-
-
-    bm(d_left, d_right, d_disp);
-    d_disp.download(disp);
-
-    EXPECT_MAT_SIMILAR(disp_gold, disp, 1e-3);
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBM, testing::Combine(testing::Values(128),
-                                       testing::Values(19)));
-
-PARAM_TEST_CASE(StereoMatchBP, int, int, int, float, float, float, float)
-{
-    int ndisp_;
-    int iters_;
-    int levels_;
-    float max_data_term_;
-    float data_weight_;
-    float max_disc_term_;
-    float disc_single_jump_;
-    virtual void SetUp()
-    {
-        ndisp_          = GET_PARAM(0);
-        iters_          = GET_PARAM(1);
-        levels_         = GET_PARAM(2);
-        max_data_term_  = GET_PARAM(3);
-        data_weight_    = GET_PARAM(4);
-        max_disc_term_     = GET_PARAM(5);
-        disc_single_jump_  = GET_PARAM(6);
-    }
-};
-OCL_TEST_P(StereoMatchBP, Regression)
-{
-    Mat left_image  = readImage("gpu/stereobp/aloe-L.png");
-    Mat right_image = readImage("gpu/stereobp/aloe-R.png");
-    Mat disp_gold   = readImage("gpu/stereobp/aloe-disp.png", IMREAD_GRAYSCALE);
-    ocl::oclMat d_left, d_right;
-    ocl::oclMat d_disp;
-    Mat  disp;
-    ASSERT_FALSE(left_image.empty());
-    ASSERT_FALSE(right_image.empty());
-    ASSERT_FALSE(disp_gold.empty());
-    d_left.upload(left_image);
-    d_right.upload(right_image);
-    ocl::StereoBeliefPropagation bp(ndisp_, iters_, levels_, max_data_term_, data_weight_,
-        max_disc_term_, disc_single_jump_, CV_16S);
-    bp(d_left, d_right, d_disp);
-    d_disp.download(disp);
-    disp.convertTo(disp, disp_gold.depth());
-    EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
-}
-INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBP, testing::Combine(testing::Values(64),
-    testing::Values(8),testing::Values(2),testing::Values(25.0f),
-    testing::Values(0.1f),testing::Values(15.0f),testing::Values(1.0f)));
-
-//////////////////////////////////////////////////////////////////////////
-//  ConstSpaceBeliefPropagation
-PARAM_TEST_CASE(StereoMatchConstSpaceBP, int, int, int, int, float, float, float, float, int, int)
-{
-    int ndisp_;
-    int iters_;
-    int levels_;
-    int nr_plane_;
-    float max_data_term_;
-    float data_weight_;
-    float max_disc_term_;
-    float disc_single_jump_;
-    int min_disp_th_;
-    int msg_type_;
-
-    virtual void SetUp()
-    {
-        ndisp_          = GET_PARAM(0);
-        iters_          = GET_PARAM(1);
-        levels_         = GET_PARAM(2);
-        nr_plane_ = GET_PARAM(3);
-        max_data_term_  = GET_PARAM(4);
-        data_weight_    = GET_PARAM(5);
-        max_disc_term_     = GET_PARAM(6);
-        disc_single_jump_  = GET_PARAM(7);
-        min_disp_th_ = GET_PARAM(8);
-        msg_type_  = GET_PARAM(9);
-    }
-};
-OCL_TEST_P(StereoMatchConstSpaceBP, Regression)
-{
-    Mat left_image  = readImage("gpu/csstereobp/aloe-L.png");
-    Mat right_image = readImage("gpu/csstereobp/aloe-R.png");
-    Mat disp_gold   = readImage("gpu/csstereobp/aloe-disp.png", IMREAD_GRAYSCALE);
-
-    ocl::oclMat d_left, d_right;
-    ocl::oclMat d_disp;
-
-    Mat  disp;
-    ASSERT_FALSE(left_image.empty());
-    ASSERT_FALSE(right_image.empty());
-    ASSERT_FALSE(disp_gold.empty());
-
-    d_left.upload(left_image);
-    d_right.upload(right_image);
-
-    ocl::StereoConstantSpaceBP bp(ndisp_, iters_, levels_, nr_plane_, max_data_term_, data_weight_,
-        max_disc_term_, disc_single_jump_, 0, CV_32F);
-    bp(d_left, d_right, d_disp);
-    d_disp.download(disp);
-    disp.convertTo(disp, disp_gold.depth());
-
-    EXPECT_MAT_SIMILAR(disp_gold, disp, 1e-4);
-    //EXPECT_MAT_NEAR(disp_gold, disp, 1.0, "");
-}
-INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchConstSpaceBP, testing::Combine(testing::Values(128),
-    testing::Values(16),testing::Values(4), testing::Values(4), testing::Values(30.0f),
-    testing::Values(1.0f),testing::Values(160.0f),
-    testing::Values(10.0f), testing::Values(0), testing::Values(CV_32F)));
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_color.cpp b/modules/ocl/test/test_color.cpp
deleted file mode 100644
index c4641d42c..000000000
--- a/modules/ocl/test/test_color.cpp
+++ /dev/null
@@ -1,316 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-using namespace cv;
-
-#ifdef HAVE_OPENCL
-
-using namespace testing;
-using namespace cv;
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// cvtColor
-
-PARAM_TEST_CASE(CvtColor, MatDepth, bool)
-{
-    int depth;
-    bool use_roi;
-
-    // src mat
-    Mat src;
-    Mat dst;
-
-    // src mat with roi
-    Mat src_roi;
-    Mat dst_roi;
-
-    // ocl dst mat for testing
-    ocl::oclMat gsrc_whole;
-    ocl::oclMat gdst_whole;
-
-    // ocl mat with roi
-    ocl::oclMat gsrc_roi;
-    ocl::oclMat gdst_roi;
-
-    virtual void SetUp()
-    {
-        depth = GET_PARAM(0);
-        use_roi = GET_PARAM(1);
-    }
-
-    virtual void random_roi(int channelsIn, int channelsOut)
-    {
-        const int srcType = CV_MAKE_TYPE(depth, channelsIn);
-        const int dstType = CV_MAKE_TYPE(depth, channelsOut);
-
-        Size roiSize = randomSize(1, MAX_VALUE);
-        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, srcType, 2, 100);
-
-        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, roiSize, dstBorder, dstType, 5, 16);
-
-        generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
-        generateOclMat(gdst_whole, gdst_roi, dst, roiSize, dstBorder);
-    }
-
-    void Near(double threshold)
-    {
-        Mat whole, roi;
-        gdst_whole.download(whole);
-        gdst_roi.download(roi);
-
-        EXPECT_MAT_NEAR(dst, whole, threshold);
-        EXPECT_MAT_NEAR(dst_roi, roi, threshold);
-    }
-
-    void doTest(int channelsIn, int channelsOut, int code, double threshold = 1e-3)
-    {
-        for (int j = 0; j < LOOP_TIMES; j++)
-        {
-            random_roi(channelsIn, channelsOut);
-
-            cvtColor(src_roi, dst_roi, code, channelsOut);
-            ocl::cvtColor(gsrc_roi, gdst_roi, code, channelsOut);
-
-            Near(threshold);
-        }
-    }
-};
-
-#define CVTCODE(name) COLOR_ ## name
-
-// RGB[A] <-> BGR[A]
-
-OCL_TEST_P(CvtColor, BGR2BGRA) { doTest(3, 4, CVTCODE(BGR2BGRA)); }
-OCL_TEST_P(CvtColor, RGB2RGBA) { doTest(3, 4, CVTCODE(RGB2RGBA)); }
-OCL_TEST_P(CvtColor, BGRA2BGR) { doTest(4, 3, CVTCODE(BGRA2BGR)); }
-OCL_TEST_P(CvtColor, RGBA2RGB) { doTest(4, 3, CVTCODE(RGBA2RGB)); }
-OCL_TEST_P(CvtColor, BGR2RGBA) { doTest(3, 4, CVTCODE(BGR2RGBA)); }
-OCL_TEST_P(CvtColor, RGB2BGRA) { doTest(3, 4, CVTCODE(RGB2BGRA)); }
-OCL_TEST_P(CvtColor, RGBA2BGR) { doTest(4, 3, CVTCODE(RGBA2BGR)); }
-OCL_TEST_P(CvtColor, BGRA2RGB) { doTest(4, 3, CVTCODE(BGRA2RGB)); }
-OCL_TEST_P(CvtColor, BGR2RGB) { doTest(3, 3, CVTCODE(BGR2RGB)); }
-OCL_TEST_P(CvtColor, RGB2BGR) { doTest(3, 3, CVTCODE(RGB2BGR)); }
-OCL_TEST_P(CvtColor, BGRA2RGBA) { doTest(4, 4, CVTCODE(BGRA2RGBA)); }
-OCL_TEST_P(CvtColor, RGBA2BGRA) { doTest(4, 4, CVTCODE(RGBA2BGRA)); }
-
-// RGB <-> Gray
-
-OCL_TEST_P(CvtColor, RGB2GRAY) { doTest(3, 1, CVTCODE(RGB2GRAY)); }
-OCL_TEST_P(CvtColor, GRAY2RGB) { doTest(1, 3, CVTCODE(GRAY2RGB)); }
-OCL_TEST_P(CvtColor, BGR2GRAY) { doTest(3, 1, CVTCODE(BGR2GRAY)); }
-OCL_TEST_P(CvtColor, GRAY2BGR) { doTest(1, 3, CVTCODE(GRAY2BGR)); }
-OCL_TEST_P(CvtColor, RGBA2GRAY) { doTest(4, 1, CVTCODE(RGBA2GRAY)); }
-OCL_TEST_P(CvtColor, GRAY2RGBA) { doTest(1, 4, CVTCODE(GRAY2RGBA)); }
-OCL_TEST_P(CvtColor, BGRA2GRAY) { doTest(4, 1, CVTCODE(BGRA2GRAY)); }
-OCL_TEST_P(CvtColor, GRAY2BGRA) { doTest(1, 4, CVTCODE(GRAY2BGRA)); }
-
-// RGB <-> YUV
-
-OCL_TEST_P(CvtColor, RGB2YUV) { doTest(3, 3, CVTCODE(RGB2YUV)); }
-OCL_TEST_P(CvtColor, BGR2YUV) { doTest(3, 3, CVTCODE(BGR2YUV)); }
-OCL_TEST_P(CvtColor, RGBA2YUV) { doTest(4, 3, CVTCODE(RGB2YUV)); }
-OCL_TEST_P(CvtColor, BGRA2YUV) { doTest(4, 3, CVTCODE(BGR2YUV)); }
-OCL_TEST_P(CvtColor, YUV2RGB) { doTest(3, 3, CVTCODE(YUV2RGB)); }
-OCL_TEST_P(CvtColor, YUV2BGR) { doTest(3, 3, CVTCODE(YUV2BGR)); }
-OCL_TEST_P(CvtColor, YUV2RGBA) { doTest(3, 4, CVTCODE(YUV2RGB)); }
-OCL_TEST_P(CvtColor, YUV2BGRA) { doTest(3, 4, CVTCODE(YUV2BGR)); }
-
-// RGB <-> YCrCb
-
-OCL_TEST_P(CvtColor, RGB2YCrCb) { doTest(3, 3, CVTCODE(RGB2YCrCb)); }
-OCL_TEST_P(CvtColor, BGR2YCrCb) { doTest(3, 3, CVTCODE(BGR2YCrCb)); }
-OCL_TEST_P(CvtColor, RGBA2YCrCb) { doTest(4, 3, CVTCODE(RGB2YCrCb)); }
-OCL_TEST_P(CvtColor, BGRA2YCrCb) { doTest(4, 3, CVTCODE(BGR2YCrCb)); }
-OCL_TEST_P(CvtColor, YCrCb2RGB) { doTest(3, 3, CVTCODE(YCrCb2RGB)); }
-OCL_TEST_P(CvtColor, YCrCb2BGR) { doTest(3, 3, CVTCODE(YCrCb2BGR)); }
-OCL_TEST_P(CvtColor, YCrCb2RGBA) { doTest(3, 4, CVTCODE(YCrCb2RGB)); }
-OCL_TEST_P(CvtColor, YCrCb2BGRA) { doTest(3, 4, CVTCODE(YCrCb2BGR)); }
-
-// RGB <-> XYZ
-
-OCL_TEST_P(CvtColor, RGB2XYZ) { doTest(3, 3, CVTCODE(RGB2XYZ)); }
-OCL_TEST_P(CvtColor, BGR2XYZ) { doTest(3, 3, CVTCODE(BGR2XYZ)); }
-OCL_TEST_P(CvtColor, RGBA2XYZ) { doTest(4, 3, CVTCODE(RGB2XYZ)); }
-OCL_TEST_P(CvtColor, BGRA2XYZ) { doTest(4, 3, CVTCODE(BGR2XYZ)); }
-
-OCL_TEST_P(CvtColor, XYZ2RGB) { doTest(3, 3, CVTCODE(XYZ2RGB)); }
-OCL_TEST_P(CvtColor, XYZ2BGR) { doTest(3, 3, CVTCODE(XYZ2BGR)); }
-OCL_TEST_P(CvtColor, XYZ2RGBA) { doTest(3, 4, CVTCODE(XYZ2RGB)); }
-OCL_TEST_P(CvtColor, XYZ2BGRA) { doTest(3, 4, CVTCODE(XYZ2BGR)); }
-
-// RGB <-> HSV
-
-typedef CvtColor CvtColor8u32f;
-
-OCL_TEST_P(CvtColor8u32f, RGB2HSV) { doTest(3, 3, CVTCODE(RGB2HSV)); }
-OCL_TEST_P(CvtColor8u32f, BGR2HSV) { doTest(3, 3, CVTCODE(BGR2HSV)); }
-OCL_TEST_P(CvtColor8u32f, RGBA2HSV) { doTest(4, 3, CVTCODE(RGB2HSV)); }
-OCL_TEST_P(CvtColor8u32f, BGRA2HSV) { doTest(4, 3, CVTCODE(BGR2HSV)); }
-
-OCL_TEST_P(CvtColor8u32f, RGB2HSV_FULL) { doTest(3, 3, CVTCODE(RGB2HSV_FULL)); }
-OCL_TEST_P(CvtColor8u32f, BGR2HSV_FULL) { doTest(3, 3, CVTCODE(BGR2HSV_FULL)); }
-OCL_TEST_P(CvtColor8u32f, RGBA2HSV_FULL) { doTest(4, 3, CVTCODE(RGB2HSV_FULL)); }
-OCL_TEST_P(CvtColor8u32f, BGRA2HSV_FULL) { doTest(4, 3, CVTCODE(BGR2HSV_FULL)); }
-
-OCL_TEST_P(CvtColor8u32f, HSV2RGB) { doTest(3, 3, CVTCODE(HSV2RGB), depth == CV_8U ? 1 : 4e-1); }
-OCL_TEST_P(CvtColor8u32f, HSV2BGR) { doTest(3, 3, CVTCODE(HSV2BGR), depth == CV_8U ? 1 : 4e-1); }
-OCL_TEST_P(CvtColor8u32f, HSV2RGBA) { doTest(3, 4, CVTCODE(HSV2RGB), depth == CV_8U ? 1 : 4e-1); }
-OCL_TEST_P(CvtColor8u32f, HSV2BGRA) { doTest(3, 4, CVTCODE(HSV2BGR), depth == CV_8U ? 1 : 4e-1); }
-
-OCL_TEST_P(CvtColor8u32f, HSV2RGB_FULL) { doTest(3, 3, CVTCODE(HSV2RGB_FULL), depth == CV_8U ? 1 : 4e-1); }
-OCL_TEST_P(CvtColor8u32f, HSV2BGR_FULL) { doTest(3, 3, CVTCODE(HSV2BGR_FULL), depth == CV_8U ? 1 : 4e-1); }
-OCL_TEST_P(CvtColor8u32f, HSV2RGBA_FULL) { doTest(3, 4, CVTCODE(HSV2BGR_FULL), depth == CV_8U ? 1 : 4e-1); }
-OCL_TEST_P(CvtColor8u32f, HSV2BGRA_FULL) { doTest(3, 4, CVTCODE(HSV2BGR_FULL), depth == CV_8U ? 1 : 4e-1); }
-
-// RGB <-> HLS
-
-OCL_TEST_P(CvtColor8u32f, RGB2HLS) { doTest(3, 3, CVTCODE(RGB2HLS), depth == CV_8U ? 1 : 1e-3); }
-OCL_TEST_P(CvtColor8u32f, BGR2HLS) { doTest(3, 3, CVTCODE(BGR2HLS), depth == CV_8U ? 1 : 1e-3); }
-OCL_TEST_P(CvtColor8u32f, RGBA2HLS) { doTest(4, 3, CVTCODE(RGB2HLS), depth == CV_8U ? 1 : 1e-3); }
-OCL_TEST_P(CvtColor8u32f, BGRA2HLS) { doTest(4, 3, CVTCODE(BGR2HLS), depth == CV_8U ? 1 : 1e-3); }
-
-OCL_TEST_P(CvtColor8u32f, RGB2HLS_FULL) { doTest(3, 3, CVTCODE(RGB2HLS_FULL), depth == CV_8U ? 1 : 1e-3); }
-OCL_TEST_P(CvtColor8u32f, BGR2HLS_FULL) { doTest(3, 3, CVTCODE(BGR2HLS_FULL), depth == CV_8U ? 1 : 1e-3); }
-OCL_TEST_P(CvtColor8u32f, RGBA2HLS_FULL) { doTest(4, 3, CVTCODE(RGB2HLS_FULL), depth == CV_8U ? 1 : 1e-3); }
-OCL_TEST_P(CvtColor8u32f, BGRA2HLS_FULL) { doTest(4, 3, CVTCODE(BGR2HLS_FULL), depth == CV_8U ? 1 : 1e-3); }
-
-OCL_TEST_P(CvtColor8u32f, HLS2RGB) { doTest(3, 3, CVTCODE(HLS2RGB), 1); }
-OCL_TEST_P(CvtColor8u32f, HLS2BGR) { doTest(3, 3, CVTCODE(HLS2BGR), 1); }
-OCL_TEST_P(CvtColor8u32f, HLS2RGBA) { doTest(3, 4, CVTCODE(HLS2RGB), 1); }
-OCL_TEST_P(CvtColor8u32f, HLS2BGRA) { doTest(3, 4, CVTCODE(HLS2BGR), 1); }
-
-OCL_TEST_P(CvtColor8u32f, HLS2RGB_FULL) { doTest(3, 3, CVTCODE(HLS2RGB_FULL), 1); }
-OCL_TEST_P(CvtColor8u32f, HLS2BGR_FULL) { doTest(3, 3, CVTCODE(HLS2BGR_FULL), 1); }
-OCL_TEST_P(CvtColor8u32f, HLS2RGBA_FULL) { doTest(3, 4, CVTCODE(HLS2RGB_FULL), 1); }
-OCL_TEST_P(CvtColor8u32f, HLS2BGRA_FULL) { doTest(3, 4, CVTCODE(HLS2BGR_FULL), 1); }
-
-// RGB5x5 <-> RGB
-
-typedef CvtColor CvtColor8u;
-
-OCL_TEST_P(CvtColor8u, BGR5652BGR) { doTest(2, 3, CVTCODE(BGR5652BGR)); }
-OCL_TEST_P(CvtColor8u, BGR5652RGB) { doTest(2, 3, CVTCODE(BGR5652RGB)); }
-OCL_TEST_P(CvtColor8u, BGR5652BGRA) { doTest(2, 4, CVTCODE(BGR5652BGRA)); }
-OCL_TEST_P(CvtColor8u, BGR5652RGBA) { doTest(2, 4, CVTCODE(BGR5652RGBA)); }
-
-OCL_TEST_P(CvtColor8u, BGR5552BGR) { doTest(2, 3, CVTCODE(BGR5552BGR)); }
-OCL_TEST_P(CvtColor8u, BGR5552RGB) { doTest(2, 3, CVTCODE(BGR5552RGB)); }
-OCL_TEST_P(CvtColor8u, BGR5552BGRA) { doTest(2, 4, CVTCODE(BGR5552BGRA)); }
-OCL_TEST_P(CvtColor8u, BGR5552RGBA) { doTest(2, 4, CVTCODE(BGR5552RGBA)); }
-
-OCL_TEST_P(CvtColor8u, BGR2BGR565) { doTest(3, 2, CVTCODE(BGR2BGR565)); }
-OCL_TEST_P(CvtColor8u, RGB2BGR565) { doTest(3, 2, CVTCODE(RGB2BGR565)); }
-OCL_TEST_P(CvtColor8u, BGRA2BGR565) { doTest(4, 2, CVTCODE(BGRA2BGR565)); }
-OCL_TEST_P(CvtColor8u, RGBA2BGR565) { doTest(4, 2, CVTCODE(RGBA2BGR565)); }
-
-OCL_TEST_P(CvtColor8u, BGR2BGR555) { doTest(3, 2, CVTCODE(BGR2BGR555)); }
-OCL_TEST_P(CvtColor8u, RGB2BGR555) { doTest(3, 2, CVTCODE(RGB2BGR555)); }
-OCL_TEST_P(CvtColor8u, BGRA2BGR555) { doTest(4, 2, CVTCODE(BGRA2BGR555)); }
-OCL_TEST_P(CvtColor8u, RGBA2BGR555) { doTest(4, 2, CVTCODE(RGBA2BGR555)); }
-
-// RGB5x5 <-> Gray
-
-OCL_TEST_P(CvtColor8u, BGR5652GRAY) { doTest(2, 1, CVTCODE(BGR5652GRAY)); }
-OCL_TEST_P(CvtColor8u, BGR5552GRAY) { doTest(2, 1, CVTCODE(BGR5552GRAY)); }
-
-OCL_TEST_P(CvtColor8u, GRAY2BGR565) { doTest(1, 2, CVTCODE(GRAY2BGR565)); }
-OCL_TEST_P(CvtColor8u, GRAY2BGR555) { doTest(1, 2, CVTCODE(GRAY2BGR555)); }
-
-// RGBA <-> mRGBA
-
-OCL_TEST_P(CvtColor8u, RGBA2mRGBA) { doTest(4, 4, CVTCODE(RGBA2mRGBA)); }
-OCL_TEST_P(CvtColor8u, mRGBA2RGBA) { doTest(4, 4, CVTCODE(mRGBA2RGBA)); }
-
-// YUV -> RGBA_NV12
-
-struct CvtColor_YUV420 :
-        public CvtColor
-{
-    void random_roi(int channelsIn, int channelsOut)
-    {
-        const int srcType = CV_MAKE_TYPE(depth, channelsIn);
-        const int dstType = CV_MAKE_TYPE(depth, channelsOut);
-
-        Size roiSize = randomSize(1, MAX_VALUE);
-        roiSize.width *= 2;
-        roiSize.height *= 3;
-        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, srcType, 2, 100);
-
-        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, roiSize, dstBorder, dstType, 5, 16);
-
-        generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
-        generateOclMat(gdst_whole, gdst_roi, dst, roiSize, dstBorder);
-    }
-};
-
-OCL_TEST_P(CvtColor_YUV420, YUV2RGBA_NV12) { doTest(1, 4, COLOR_YUV2RGBA_NV12); }
-OCL_TEST_P(CvtColor_YUV420, YUV2BGRA_NV12) { doTest(1, 4, COLOR_YUV2BGRA_NV12); }
-OCL_TEST_P(CvtColor_YUV420, YUV2RGB_NV12) { doTest(1, 3, COLOR_YUV2RGB_NV12); }
-OCL_TEST_P(CvtColor_YUV420, YUV2BGR_NV12) { doTest(1, 3, COLOR_YUV2BGR_NV12); }
-
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, CvtColor8u,
-                            testing::Combine(testing::Values(MatDepth(CV_8U)), Bool()));
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, CvtColor8u32f,
-                            testing::Combine(testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)), Bool()));
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, CvtColor,
-                            testing::Combine(
-                                testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)),
-                                Bool()));
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, CvtColor_YUV420,
-                            testing::Combine(
-                                testing::Values(MatDepth(CV_8U)),
-                                Bool()));
-
-#endif
diff --git a/modules/ocl/test/test_fft.cpp b/modules/ocl/test/test_fft.cpp
deleted file mode 100644
index 1c2a1da47..000000000
--- a/modules/ocl/test/test_fft.cpp
+++ /dev/null
@@ -1,244 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-using namespace std;
-
-////////////////////////////////////////////////////////////////////////////
-// Dft
-
-PARAM_TEST_CASE(Dft, cv::Size, int, bool)
-{
-    cv::Size dft_size;
-    int	 dft_flags;
-    bool doubleFP;
-
-    virtual void SetUp()
-    {
-        dft_size  = GET_PARAM(0);
-        dft_flags = GET_PARAM(1);
-        doubleFP = GET_PARAM(2);
-    }
-};
-
-OCL_TEST_P(Dft, C2C)
-{
-    cv::Mat a = randomMat(dft_size, doubleFP ? CV_64FC2 : CV_32FC2, 0.0, 100.0);
-    cv::Mat b_gold;
-
-    cv::ocl::oclMat d_b;
-
-    cv::dft(a, b_gold, dft_flags);
-    cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), dft_flags);
-
-    EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), a.size().area() * 1e-4);
-}
-
-OCL_TEST_P(Dft, R2C)
-{
-    cv::Mat a = randomMat(dft_size, doubleFP ? CV_64FC1 : CV_32FC1, 0.0, 100.0);
-    cv::Mat b_gold, b_gold_roi;
-
-    cv::ocl::oclMat d_b, d_c;
-    cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), dft_flags);
-    cv::dft(a, b_gold, cv::DFT_COMPLEX_OUTPUT | dft_flags);
-
-    b_gold_roi = b_gold(cv::Rect(0, 0, d_b.cols, d_b.rows));
-    EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4);
-
-    cv::Mat c_gold;
-    cv::dft(b_gold, c_gold, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
-    EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4);
-}
-
-OCL_TEST_P(Dft, R2CthenC2R)
-{
-    cv::Mat a = randomMat(dft_size, doubleFP ? CV_64FC1 : CV_32FC1, 0.0, 10.0);
-
-    cv::ocl::oclMat d_b, d_c;
-    cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), 0);
-    cv::ocl::dft(d_b, d_c, a.size(), cv::DFT_SCALE | cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT);
-    EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4);
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Dft, testing::Combine(
-                            testing::Values(cv::Size(2, 3), cv::Size(5, 4), cv::Size(25, 20), cv::Size(512, 1), cv::Size(1024, 768)),
-                            testing::Values(0, (int)cv::DFT_ROWS, (int)cv::DFT_SCALE), testing::Bool()));
-
-////////////////////////////////////////////////////////////////////////////
-// MulSpectrums
-
-PARAM_TEST_CASE(MulSpectrums, cv::Size, DftFlags, bool)
-{
-    cv::Size size;
-    int flag;
-    bool ccorr;
-    cv::Mat a, b;
-
-    virtual void SetUp()
-    {
-        size  = GET_PARAM(0);
-        flag  = GET_PARAM(1);
-        ccorr = GET_PARAM(2);
-
-        a = randomMat(size, CV_32FC2, -100, 100, false);
-        b = randomMat(size, CV_32FC2, -100, 100, false);
-    }
-};
-
-OCL_TEST_P(MulSpectrums, Simple)
-{
-    cv::ocl::oclMat c;
-    cv::ocl::mulSpectrums(cv::ocl::oclMat(a), cv::ocl::oclMat(b), c, flag, 1.0, ccorr);
-
-    cv::Mat c_gold;
-    cv::mulSpectrums(a, b, c_gold, flag, ccorr);
-
-    EXPECT_MAT_NEAR(c_gold, c, 1e-2);
-}
-
-OCL_TEST_P(MulSpectrums, Scaled)
-{
-    float scale = 1.f / size.area();
-
-    cv::ocl::oclMat c;
-    cv::ocl::mulSpectrums(cv::ocl::oclMat(a), cv::ocl::oclMat(b), c, flag, scale, ccorr);
-
-    cv::Mat c_gold;
-    cv::mulSpectrums(a, b, c_gold, flag, ccorr);
-    c_gold.convertTo(c_gold, c_gold.type(), scale);
-
-    EXPECT_MAT_NEAR(c_gold, c, 1e-2);
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(
-    DIFFERENT_SIZES,
-    testing::Values(DftFlags(0)),
-    testing::Values(false, true)));
-
-
-////////////////////////////////////////////////////////
-// Convolve
-
-void static convolveDFT(const cv::Mat& A, const cv::Mat& B, cv::Mat& C, bool ccorr = false)
-{
-    // reallocate the output array if needed
-    C.create(std::abs(A.rows - B.rows) + 1, std::abs(A.cols - B.cols) + 1, A.type());
-    cv::Size dftSize;
-
-    // compute the size of DFT transform
-    dftSize.width = cv::getOptimalDFTSize(A.cols + B.cols - 1);
-    dftSize.height = cv::getOptimalDFTSize(A.rows + B.rows - 1);
-
-    // allocate temporary buffers and initialize them with 0s
-    cv::Mat tempA(dftSize, A.type(), cv::Scalar::all(0));
-    cv::Mat tempB(dftSize, B.type(), cv::Scalar::all(0));
-
-    // copy A and B to the top-left corners of tempA and tempB, respectively
-    cv::Mat roiA(tempA, cv::Rect(0, 0, A.cols, A.rows));
-    A.copyTo(roiA);
-    cv::Mat roiB(tempB, cv::Rect(0, 0, B.cols, B.rows));
-    B.copyTo(roiB);
-
-    // now transform the padded A & B in-place;
-    // use "nonzeroRows" hint for faster processing
-    cv::dft(tempA, tempA, 0, A.rows);
-    cv::dft(tempB, tempB, 0, B.rows);
-
-    // multiply the spectrums;
-    // the function handles packed spectrum representations well
-    cv::mulSpectrums(tempA, tempB, tempA, 0, ccorr);
-
-    // transform the product back from the frequency domain.
-    // Even though all the result rows will be non-zero,
-    // you need only the first C.rows of them, and thus you
-    // pass nonzeroRows == C.rows
-    cv::dft(tempA, tempA, cv::DFT_INVERSE + cv::DFT_SCALE, C.rows);
-
-    // now copy the result back to C.
-    tempA(cv::Rect(0, 0, C.cols, C.rows)).copyTo(C);
-}
-
-IMPLEMENT_PARAM_CLASS(KSize, int);
-IMPLEMENT_PARAM_CLASS(Ccorr, bool);
-
-PARAM_TEST_CASE(Convolve_DFT, cv::Size, KSize, Ccorr)
-{
-    cv::Size size;
-    int ksize;
-    bool ccorr;
-
-    cv::Mat src;
-    cv::Mat kernel;
-
-    cv::Mat dst_gold;
-
-    virtual void SetUp()
-    {
-        size  = GET_PARAM(0);
-        ksize = GET_PARAM(1);
-        ccorr = GET_PARAM(2);
-    }
-};
-
-OCL_TEST_P(Convolve_DFT, Accuracy)
-{
-    cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0);
-    cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0);
-
-    cv::ocl::oclMat dst;
-    cv::ocl::convolve(cv::ocl::oclMat(src), cv::ocl::oclMat(kernel), dst, ccorr);
-
-    cv::Mat dst_gold;
-    convolveDFT(src, kernel, dst_gold, ccorr);
-
-    EXPECT_MAT_NEAR(dst, dst_gold, 1e-1);
-}
-#define DIFFERENT_CONVOLVE_SIZES testing::Values(cv::Size(251, 257), cv::Size(113, 113), cv::Size(200, 480), cv::Size(1300, 1300))
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Convolve_DFT, testing::Combine(
-    DIFFERENT_CONVOLVE_SIZES,
-    testing::Values(KSize(19), KSize(23), KSize(45)),
-    testing::Values(Ccorr(true)/*, Ccorr(false)*/))); // TODO false ccorr cannot pass for some instances
diff --git a/modules/ocl/test/test_filters.cpp b/modules/ocl/test/test_filters.cpp
deleted file mode 100644
index b2caeaf6f..000000000
--- a/modules/ocl/test/test_filters.cpp
+++ /dev/null
@@ -1,476 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Zero Lin, Zero.Lin@amd.com
-//    Zhang Ying, zhangying913@gmail.com
-//    Yao Wang, bitwangyaoyao@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace testing;
-using namespace std;
-using namespace cv;
-
-PARAM_TEST_CASE(FilterTestBase, MatType,
-                int, // kernel size
-                Size, // dx, dy
-                int, // border type
-                double, // optional parameter
-                bool) // roi or not
-{
-    bool isFP;
-
-    int type, borderType, ksize;
-    Size size;
-    double param;
-    bool useRoi;
-
-    Mat src, dst_whole, src_roi, dst_roi;
-    ocl::oclMat gsrc_whole, gsrc_roi, gdst_whole, gdst_roi;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        ksize = GET_PARAM(1);
-        size = GET_PARAM(2);
-        borderType = GET_PARAM(3);
-        param = GET_PARAM(4);
-        useRoi = GET_PARAM(5);
-
-        isFP = (CV_MAT_DEPTH(type) == CV_32F || CV_MAT_DEPTH(type) == CV_64F);
-    }
-
-    void random_roi(int minSize = 1)
-    {
-        if (minSize == 0)
-            minSize = ksize;
-        Size roiSize = randomSize(minSize, MAX_VALUE);
-        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, type, isFP ? 0 : 5, isFP ? 1 : 256);
-
-        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(dst_whole, dst_roi, roiSize, dstBorder, type, isFP ? 0.20 : 60, isFP ? 0.25 : 70);
-
-        generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
-        generateOclMat(gdst_whole, gdst_roi, dst_whole, roiSize, dstBorder);
-    }
-
-    void Near()
-    {
-        if (isFP)
-            Near(1e-6, true);
-        else
-            Near(1, false);
-    }
-
-    void Near(double threshold, bool relative)
-    {
-        Mat roi, whole;
-        gdst_whole.download(whole);
-        gdst_roi.download(roi);
-
-        if (relative)
-        {
-            EXPECT_MAT_NEAR_RELATIVE(dst_whole, whole, threshold);
-            EXPECT_MAT_NEAR_RELATIVE(dst_roi, roi, threshold);
-        }
-        else
-        {
-            EXPECT_MAT_NEAR(dst_whole, whole, threshold);
-            EXPECT_MAT_NEAR(dst_roi, roi, threshold);
-        }
-    }
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// blur
-
-typedef FilterTestBase Blur;
-
-#ifdef ANDROID
-OCL_TEST_P(Blur, DISABLED_Mat)
-#else
-OCL_TEST_P(Blur, Mat)
-#endif
-{
-    Size kernelSize(ksize, ksize);
-
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi(0); // TODO NOTE: min value for size is kernel size (temporary bypass border issues in CPU implementation)
-
-        blur(src_roi, dst_roi, kernelSize, Point(-1, -1), borderType);
-        ocl::blur(gsrc_roi, gdst_roi, kernelSize, Point(-1, -1), borderType); // TODO anchor
-
-        Near();
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Laplacian
-
-typedef FilterTestBase LaplacianTest;
-
-OCL_TEST_P(LaplacianTest, Accuracy)
-{
-    double scale = param;
-
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        Laplacian(src_roi, dst_roi, -1, ksize, scale, 0, borderType);
-        ocl::Laplacian(gsrc_roi, gdst_roi, -1, ksize, scale, 0, borderType);
-
-        Near();
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// erode & dilate
-
-typedef FilterTestBase Erode;
-
-OCL_TEST_P(Erode, Mat)
-{
-    // erode or dilate kernel
-    Size kernelSize(ksize, ksize);
-    Mat kernel;
-    int iterations = (int)param;
-
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        kernel = randomMat(kernelSize, CV_8UC1, 0, 3);
-
-        cv::erode(src_roi, dst_roi, kernel, Point(-1, -1), iterations);//, borderType);
-        ocl::erode(gsrc_roi, gdst_roi, kernel, Point(-1, -1), iterations);//, borderType);
-
-        Near();
-    }
-}
-
-typedef FilterTestBase Dilate;
-
-OCL_TEST_P(Dilate, Mat)
-{
-    // erode or dilate kernel
-    Mat kernel;
-    int iterations = (int)param;
-
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        kernel = randomMat(Size(3, 3), CV_8UC1, 0, 3);
-
-        random_roi();
-
-        cv::dilate(src_roi, dst_roi, kernel, Point(-1, -1), iterations);
-        ocl::dilate(gsrc_roi, gdst_roi, kernel, Point(-1, -1), iterations); // TODO iterations, borderType
-
-        Near();
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Sobel
-
-typedef FilterTestBase SobelTest;
-
-OCL_TEST_P(SobelTest, Mat)
-{
-    int dx = size.width, dy = size.height;
-    double scale = param;
-
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        Sobel(src_roi, dst_roi, -1, dx, dy, ksize, scale, /* delta */0, borderType);
-        ocl::Sobel(gsrc_roi, gdst_roi, -1, dx, dy, ksize, scale, /* delta */0, borderType);
-
-        Near();
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Scharr
-
-typedef FilterTestBase ScharrTest;
-
-OCL_TEST_P(ScharrTest, Mat)
-{
-    int dx = size.width, dy = size.height;
-    double scale = param;
-
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        Scharr(src_roi, dst_roi, -1, dx, dy, scale, /* delta */ 0, borderType);
-        ocl::Scharr(gsrc_roi, gdst_roi, -1, dx, dy, scale, /* delta */ 0, borderType);
-
-        Near();
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// GaussianBlur
-
-typedef FilterTestBase GaussianBlurTest;
-
-OCL_TEST_P(GaussianBlurTest, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        double sigma1 = rng.uniform(0.1, 1.0);
-        double sigma2 = rng.uniform(0.1, 1.0);
-
-        GaussianBlur(src_roi, dst_roi, Size(ksize, ksize), sigma1, sigma2, borderType);
-        ocl::GaussianBlur(gsrc_roi, gdst_roi, Size(ksize, ksize), sigma1, sigma2, borderType);
-
-        Near(CV_MAT_DEPTH(type) == CV_8U ? 3 : 5e-5, false);
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
-
-typedef FilterTestBase Filter2D;
-
-OCL_TEST_P(Filter2D, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        Point anchor(-1, -1);
-        if (size.width >= 0)
-            anchor.x = size.width % ksize;
-        if (size.height >= 0)
-            anchor.y = size.height % ksize;
-
-        const Size kernelSize(ksize, ksize);
-        Mat kernel = randomMat(kernelSize, CV_32FC1, 0, 1.0);
-        kernel *= 1.0 / (double)(ksize * ksize);
-
-        cv::filter2D(src_roi, dst_roi, -1, kernel, anchor, 0.0, borderType);
-        ocl::filter2D(gsrc_roi, gdst_roi, -1, kernel, anchor, 0.0, borderType);
-
-        Near();
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Bilateral
-
-typedef FilterTestBase Bilateral;
-
-OCL_TEST_P(Bilateral, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        double sigmacolor = rng.uniform(20, 100);
-        double sigmaspace = rng.uniform(10, 40);
-
-        cv::bilateralFilter(src_roi, dst_roi, ksize, sigmacolor, sigmaspace, borderType);
-        ocl::bilateralFilter(gsrc_roi, gdst_roi, ksize, sigmacolor, sigmaspace, borderType);
-
-        Near();
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// AdaptiveBilateral
-
-typedef FilterTestBase AdaptiveBilateral;
-
-OCL_TEST_P(AdaptiveBilateral, Mat)
-{
-    const Size kernelSize(ksize, ksize);
-
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        adaptiveBilateralFilter(src_roi, dst_roi, kernelSize, 5, 1, Point(-1, -1), borderType); // TODO anchor
-        ocl::adaptiveBilateralFilter(gsrc_roi, gdst_roi, kernelSize, 5, 1, Point(-1, -1), borderType);
-
-        Near();
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////
-// MedianFilter
-
-typedef FilterTestBase MedianFilter;
-
-OCL_TEST_P(MedianFilter, Mat)
-{
-    for (int i = 0; i < LOOP_TIMES; ++i)
-    {
-        random_roi();
-
-        medianBlur(src_roi, dst_roi, ksize);
-        ocl::medianFilter(gsrc_roi, gdst_roi, ksize);
-
-        Near();
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#define FILTER_BORDER_SET_NO_ISOLATED \
-    Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_WRAP, (int)BORDER_REFLECT_101/*, \
-            (int)BORDER_CONSTANT|BORDER_ISOLATED, (int)BORDER_REPLICATE|BORDER_ISOLATED, \
-            (int)BORDER_REFLECT|BORDER_ISOLATED, (int)BORDER_WRAP|BORDER_ISOLATED, \
-            (int)BORDER_REFLECT_101|BORDER_ISOLATED*/) // WRAP and ISOLATED are not supported by cv:: version
-
-#define FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED \
-    Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, /*(int)BORDER_WRAP,*/ (int)BORDER_REFLECT_101/*, \
-            (int)BORDER_CONSTANT|BORDER_ISOLATED, (int)BORDER_REPLICATE|BORDER_ISOLATED, \
-            (int)BORDER_REFLECT|BORDER_ISOLATED, (int)BORDER_WRAP|BORDER_ISOLATED, \
-            (int)BORDER_REFLECT_101|BORDER_ISOLATED*/) // WRAP and ISOLATED are not supported by cv:: version
-
-#define FILTER_DATATYPES Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4, \
-                                CV_32FC1, CV_32FC3, CV_32FC4, \
-                                CV_64FC1, CV_64FC3, CV_64FC4)
-
-INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
-                            FILTER_DATATYPES,
-                            Values(3, 5, 7),
-                            Values(Size(0, 0)), // not used
-                            FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
-                            Values(0.0), // not used
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Filter, LaplacianTest, Combine(
-                            FILTER_DATATYPES,
-                            Values(1, 3),
-                            Values(Size(0, 0)), // not used
-                            FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
-                            Values(1.0, 0.2, 3.0), // scalar
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(3, 5, 7),
-                            Values(Size(0, 0)), // not used
-                            Values(0), // not used
-                            Values(1.0, 2.0, 3.0),
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(3, 5, 7),
-                            Values(Size(0, 0)), // not used
-                            Values(0), // not used
-                            Values(1.0, 2.0, 3.0),
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Filter, SobelTest, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(3, 5),
-                            Values(Size(1, 0), Size(1, 1), Size(2, 0), Size(2, 1)), // dx, dy
-                            FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
-                            Values(0.0), // not used
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Filter, ScharrTest, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(1),
-                            Values(Size(0, 1), Size(1, 0)), // dx, dy
-                            FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
-                            Values(1.0, 0.2), // scalar
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(3, 5),
-                            Values(Size(0, 0)), // not used
-                            FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
-                            Values(0.0), // not used
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Filter, Filter2D, testing::Combine(
-                            FILTER_DATATYPES,
-                            Values(3, 15), // TODO 25: CPU implementation has some issues
-                            Values(Size(-1, -1), Size(0, 0), Size(2, 1)), // anchor
-                            FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
-                            Values(0.0), // not used
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Filter, Bilateral, Combine(
-                            Values(CV_8UC1, CV_8UC3),
-                            Values(5, 9),
-                            Values(Size(0, 0)), // not used
-                            FILTER_BORDER_SET_NO_ISOLATED,
-                            Values(0.0), // not used
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Filter, AdaptiveBilateral, Combine(
-                            Values(CV_8UC1, CV_8UC3),
-                            Values(5, 9),
-                            Values(Size(0, 0)), // not used
-                            FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
-                            Values(0.0), // not used
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Filter, MedianFilter, Combine(
-                            Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
-                            Values(3, 5),
-                            Values(Size(0, 0)), // not used
-                            Values(0), // not used
-                            Values(0.0), // not used
-                            Bool()));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_gemm.cpp b/modules/ocl/test/test_gemm.cpp
deleted file mode 100644
index c2a44842c..000000000
--- a/modules/ocl/test/test_gemm.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-using namespace std;
-
-////////////////////////////////////////////////////////////////////////////
-// GEMM
-
-PARAM_TEST_CASE(Gemm, int, cv::Size, int)
-{
-    int      type;
-    cv::Size mat_size;
-    int		 flags;
-
-    virtual void SetUp()
-    {
-        type     = GET_PARAM(0);
-        mat_size = GET_PARAM(1);
-        flags    = GET_PARAM(2);
-    }
-};
-
-OCL_TEST_P(Gemm, Accuracy)
-{
-    cv::Mat a = randomMat(mat_size, type, 0.0, 10.0);
-    cv::Mat b = randomMat(mat_size, type, 0.0, 10.0);
-    cv::Mat c = randomMat(mat_size, type, 0.0, 10.0);
-
-    cv::Mat dst;
-    cv::ocl::oclMat ocl_dst;
-
-    cv::gemm(a, b, 1.0, c, 1.0, dst, flags);
-    cv::ocl::gemm(cv::ocl::oclMat(a), cv::ocl::oclMat(b), 1.0, cv::ocl::oclMat(c), 1.0, ocl_dst, flags);
-
-    EXPECT_MAT_NEAR(dst, ocl_dst, mat_size.area() * 1e-4);
-}
-
-INSTANTIATE_TEST_CASE_P(ocl_gemm, Gemm, testing::Combine(
-                            testing::Values(CV_32FC1, CV_32FC2/*, CV_64FC1, CV_64FC2*/),
-                            testing::Values(cv::Size(20, 20), cv::Size(300, 300)),
-                            testing::Values(0, (int)cv::GEMM_1_T, (int)cv::GEMM_2_T, (int)(cv::GEMM_1_T + cv::GEMM_2_T))));
diff --git a/modules/ocl/test/test_hough.cpp b/modules/ocl/test/test_hough.cpp
deleted file mode 100644
index f5d257801..000000000
--- a/modules/ocl/test/test_hough.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// HoughCircles
-
-PARAM_TEST_CASE(HoughCircles, cv::Size)
-{
-    static void drawCircles(cv::Mat& dst, const std::vector<cv::Vec3f>& circles, bool fill)
-    {
-        dst.setTo(cv::Scalar::all(0));
-
-        for (size_t i = 0; i < circles.size(); ++i)
-            cv::circle(dst, cv::Point2f(circles[i][0], circles[i][1]), (int)circles[i][2], cv::Scalar::all(255), fill ? -1 : 1);
-    }
-};
-
-OCL_TEST_P(HoughCircles, Accuracy)
-{
-    const cv::Size size = GET_PARAM(0);
-
-    const float dp = 2.0f;
-    const float minDist = 10.0f;
-    const int minRadius = 10;
-    const int maxRadius = 20;
-    const int cannyThreshold = 100;
-    const int votesThreshold = 15;
-
-    std::vector<cv::Vec3f> circles_gold(4);
-    circles_gold[0] = cv::Vec3i(20, 20, minRadius);
-    circles_gold[1] = cv::Vec3i(90, 87, minRadius + 3);
-    circles_gold[2] = cv::Vec3i(30, 70, minRadius + 8);
-    circles_gold[3] = cv::Vec3i(80, 10, maxRadius);
-
-    cv::Mat src(size, CV_8UC1);
-    drawCircles(src, circles_gold, true);
-    cv::ocl::oclMat d_src(src);
-
-    cv::ocl::oclMat d_circles;
-    cv::ocl::HoughCircles(d_src, d_circles, cv::HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
-    ASSERT_TRUE(d_circles.rows > 0);
-
-    cv::Mat circles;
-    d_circles.download(circles);
-
-    for (int i = 0; i < circles.cols; ++i)
-    {
-        cv::Vec3f cur = circles.at<cv::Vec3f>(i);
-
-        bool found = false;
-
-        for (size_t j = 0; j < circles_gold.size(); ++j)
-        {
-            cv::Vec3f gold = circles_gold[j];
-
-            if (std::fabs(cur[0] - gold[0]) < minDist && std::fabs(cur[1] - gold[1]) < minDist && std::fabs(cur[2] - gold[2]) < minDist)
-            {
-                found = true;
-                break;
-            }
-        }
-
-        ASSERT_TRUE(found);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Hough, HoughCircles, DIFFERENT_SIZES);
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp
deleted file mode 100644
index 9b25d9f9c..000000000
--- a/modules/ocl/test/test_imgproc.cpp
+++ /dev/null
@@ -1,622 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Shengen Yan, yanshengen@gmail.com
-//    Jiang Liyuan, lyuan001.good@163.com
-//    Rock Li, Rock.Li@amd.com
-//    Wu Zailong, bullet@yeah.net
-//    Xu Pang, pangxu010@163.com
-//    Sen Liu, swjtuls1987@126.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace testing;
-using namespace std;
-using namespace cv;
-
-///////////////////////////////////////////////////////////////////////////////
-
-PARAM_TEST_CASE(ImgprocTestBase, MatType,
-                int, // blockSize
-                int, // border type
-                bool) // roi or not
-{
-    int type, borderType, blockSize;
-    bool useRoi;
-
-    Mat src, dst_whole, src_roi, dst_roi;
-    ocl::oclMat gsrc_whole, gsrc_roi, gdst_whole, gdst_roi;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        blockSize = GET_PARAM(1);
-        borderType = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-    }
-
-    virtual void random_roi()
-    {
-        Size roiSize = randomSize(1, MAX_VALUE);
-        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, type, 5, 256);
-
-        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(dst_whole, dst_roi, roiSize, dstBorder, type, 5, 16);
-
-        generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
-        generateOclMat(gdst_whole, gdst_roi, dst_whole, roiSize, dstBorder);
-    }
-
-    void Near(double threshold = 0.0, bool relative = false)
-    {
-        Mat roi, whole;
-        gdst_whole.download(whole);
-        gdst_roi.download(roi);
-
-        if (relative)
-        {
-            EXPECT_MAT_NEAR_RELATIVE(dst_whole, whole, threshold);
-            EXPECT_MAT_NEAR_RELATIVE(dst_roi, roi, threshold);
-        }
-        else
-        {
-            EXPECT_MAT_NEAR(dst_whole, whole, threshold);
-            EXPECT_MAT_NEAR(dst_roi, roi, threshold);
-        }
-    }
-};
-
-////////////////////////////////copyMakeBorder////////////////////////////////////////////
-
-PARAM_TEST_CASE(CopyMakeBorder, MatDepth, // depth
-                Channels, // channels
-                bool, // isolated or not
-                Border, // border type
-                bool) // roi or not
-{
-    int type, borderType;
-    bool useRoi;
-
-    Border border;
-    Scalar val;
-
-    Mat src, dst_whole, src_roi, dst_roi;
-    ocl::oclMat gsrc_whole, gsrc_roi, gdst_whole, gdst_roi;
-
-    virtual void SetUp()
-    {
-        type = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
-        borderType = GET_PARAM(3);
-
-        if (GET_PARAM(2))
-            borderType |= BORDER_ISOLATED;
-
-        useRoi = GET_PARAM(4);
-    }
-
-    void random_roi()
-    {
-        border = randomBorder(0, MAX_VALUE << 2);
-        val = randomScalar(-MAX_VALUE, MAX_VALUE);
-
-        Size roiSize = randomSize(1, MAX_VALUE);
-        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
-
-        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        dstBorder.top += border.top;
-        dstBorder.lef += border.lef;
-        dstBorder.rig += border.rig;
-        dstBorder.bot += border.bot;
-
-        randomSubMat(dst_whole, dst_roi, roiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
-
-        generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
-        generateOclMat(gdst_whole, gdst_roi, dst_whole, roiSize, dstBorder);
-    }
-
-    void Near(double threshold = 0.0)
-    {
-        Mat whole, roi;
-        gdst_whole.download(whole);
-        gdst_roi.download(roi);
-
-        EXPECT_MAT_NEAR(dst_whole, whole, threshold);
-        EXPECT_MAT_NEAR(dst_roi, roi, threshold);
-    }
-};
-
-OCL_TEST_P(CopyMakeBorder, Mat)
-{
-    for (int i = 0; i < LOOP_TIMES; ++i)
-    {
-        random_roi();
-
-        cv::copyMakeBorder(src_roi, dst_roi, border.top, border.bot, border.lef, border.rig, borderType, val);
-        ocl::copyMakeBorder(gsrc_roi, gdst_roi, border.top, border.bot, border.lef, border.rig, borderType, val);
-
-        Near();
-    }
-}
-
-////////////////////////////////equalizeHist//////////////////////////////////////////////
-
-typedef ImgprocTestBase EqualizeHist;
-
-OCL_TEST_P(EqualizeHist, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        equalizeHist(src_roi, dst_roi);
-        ocl::equalizeHist(gsrc_roi, gdst_roi);
-
-        Near(1.1);
-    }
-}
-
-////////////////////////////////cornerMinEigenVal//////////////////////////////////////////
-
-struct CornerTestBase :
-        public ImgprocTestBase
-{
-    virtual void random_roi()
-    {
-        Mat image = readImageType("gpu/stereobm/aloe-L.png", type);
-        ASSERT_FALSE(image.empty());
-
-        bool isFP = CV_MAT_DEPTH(type) >= CV_32F;
-        float val = 255.0f;
-        if (isFP)
-        {
-            image.convertTo(image, -1, 1.0 / 255);
-            val /= 255.0f;
-        }
-
-        Size roiSize = image.size();
-        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-
-        Size wholeSize = Size(roiSize.width + srcBorder.lef + srcBorder.rig, roiSize.height + srcBorder.top + srcBorder.bot);
-        src = randomMat(wholeSize, type, -val, val, false);
-        src_roi = src(Rect(srcBorder.lef, srcBorder.top, roiSize.width, roiSize.height));
-        image.copyTo(src_roi);
-
-        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(dst_whole, dst_roi, roiSize, dstBorder, CV_32FC1, 5, 16);
-
-        generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
-        generateOclMat(gdst_whole, gdst_roi, dst_whole, roiSize, dstBorder);
-    }
-};
-
-typedef CornerTestBase CornerMinEigenVal;
-
-OCL_TEST_P(CornerMinEigenVal, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        int apertureSize = 3;
-
-        cornerMinEigenVal(src_roi, dst_roi, blockSize, apertureSize, borderType);
-        ocl::cornerMinEigenVal(gsrc_roi, gdst_roi, blockSize, apertureSize, borderType);
-
-        Near(1e-5, true);
-    }
-}
-
-////////////////////////////////cornerHarris//////////////////////////////////////////
-struct CornerHarris :
-    public ImgprocTestBase
-{
-    void Near(double threshold = 0.0)
-    {
-        Mat whole, roi;
-        gdst_whole.download(whole);
-        gdst_roi.download(roi);
-
-        absdiff(whole, dst_whole, whole);
-        absdiff(roi, dst_roi, roi);
-
-        divide(whole, dst_whole, whole);
-        divide(roi, dst_roi, roi);
-
-        absdiff(dst_whole, dst_whole, dst_whole);
-        absdiff(dst_roi, dst_roi, dst_roi);
-
-        EXPECT_MAT_NEAR(dst_whole, whole, threshold);
-        EXPECT_MAT_NEAR(dst_roi, roi, threshold);
-    }
-};
-
-OCL_TEST_P(CornerHarris, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        int apertureSize = 3;
-        double k = randomDouble(0.01, 0.9);
-
-        cornerHarris(src_roi, dst_roi, blockSize, apertureSize, k, borderType);
-        ocl::cornerHarris(gsrc_roi, gdst_roi, blockSize, apertureSize, k, borderType);
-
-        Near(1e-5);
-    }
-}
-
-//////////////////////////////////integral/////////////////////////////////////////////////
-
-struct Integral :
-        public ImgprocTestBase
-{
-    int sdepth;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        blockSize = GET_PARAM(1);
-        sdepth = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-    }
-};
-OCL_TEST_P(Integral, Mat1)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        ocl::integral(gsrc_roi, gdst_roi, sdepth);
-        integral(src_roi, dst_roi, sdepth);
-
-        Near();
-    }
-}
-
-OCL_TEST_P(Integral, Mat2)
-{
-    Mat dst1;
-    ocl::oclMat gdst1;
-
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        integral(src_roi, dst_roi, dst1, sdepth);
-        ocl::integral(gsrc_roi, gdst_roi, gdst1, sdepth);
-
-        Near();
-        if(gdst1.clCxt->supportsFeature(ocl::FEATURE_CL_DOUBLE))
-            EXPECT_MAT_NEAR(dst1, Mat(gdst1), 0.);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-//// threshold
-
-struct Threshold :
-        public ImgprocTestBase
-{
-    int thresholdType;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        blockSize = GET_PARAM(1);
-        thresholdType = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-    }
-};
-
-OCL_TEST_P(Threshold, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        double maxVal = randomDouble(20.0, 127.0);
-        double thresh = randomDouble(0.0, maxVal);
-
-        threshold(src_roi, dst_roi, thresh, maxVal, thresholdType);
-        ocl::threshold(gsrc_roi, gdst_roi, thresh, maxVal, thresholdType);
-
-        Near(1);
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////
-// calcHist
-
-static void calcHistGold(const Mat &src, Mat &hist)
-{
-    hist = Mat(1, 256, CV_32SC1, Scalar::all(0));
-
-    int * const hist_row = hist.ptr<int>();
-    for (int y = 0; y < src.rows; ++y)
-    {
-        const uchar * const src_row = src.ptr(y);
-
-        for (int x = 0; x < src.cols; ++x)
-            ++hist_row[src_row[x]];
-    }
-}
-
-typedef ImgprocTestBase CalcHist;
-
-OCL_TEST_P(CalcHist, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        calcHistGold(src_roi, dst_roi);
-        ocl::calcHist(gsrc_roi, gdst_roi);
-
-        Near();
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-//// CLAHE
-
-PARAM_TEST_CASE(CLAHETest, Size, double, bool)
-{
-    Size gridSize;
-    double clipLimit;
-    bool useRoi;
-
-    Mat src, dst_whole, src_roi, dst_roi;
-    ocl::oclMat gsrc_whole, gsrc_roi, gdst_whole, gdst_roi;
-
-    virtual void SetUp()
-    {
-        gridSize = GET_PARAM(0);
-        clipLimit = GET_PARAM(1);
-        useRoi = GET_PARAM(2);
-    }
-
-    void random_roi()
-    {
-        Size roiSize = randomSize(std::max(gridSize.height, gridSize.width), MAX_VALUE);
-        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, CV_8UC1, 5, 256);
-
-        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(dst_whole, dst_roi, roiSize, dstBorder, CV_8UC1, 5, 16);
-
-        generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
-        generateOclMat(gdst_whole, gdst_roi, dst_whole, roiSize, dstBorder);
-    }
-
-    void Near(double threshold = 0.0)
-    {
-        Mat whole, roi;
-        gdst_whole.download(whole);
-        gdst_roi.download(roi);
-
-        EXPECT_MAT_NEAR(dst_whole, whole, threshold);
-        EXPECT_MAT_NEAR(dst_roi, roi, threshold);
-    }
-};
-
-OCL_TEST_P(CLAHETest, Accuracy)
-{
-    for (int i = 0; i < LOOP_TIMES; ++i)
-    {
-        random_roi();
-
-        Ptr<CLAHE> clahe = ocl::createCLAHE(clipLimit, gridSize);
-        clahe->apply(gsrc_roi, gdst_roi);
-
-        Ptr<CLAHE> clahe_gold = createCLAHE(clipLimit, gridSize);
-        clahe_gold->apply(src_roi, dst_roi);
-
-        Near(1.0);
-    }
-}
-
-/////////////////////////////Convolve//////////////////////////////////
-
-static void convolve_gold(const Mat & src, const Mat & kernel, Mat & dst)
-{
-    for (int i = 0; i < src.rows; i++)
-    {
-        float * const dstptr = dst.ptr<float>(i);
-
-        for (int j = 0; j < src.cols; j++)
-        {
-            float temp = 0;
-
-            for (int m = 0; m < kernel.rows; m++)
-            {
-                const float * const kptr = kernel.ptr<float>(m);
-                for (int n = 0; n < kernel.cols; n++)
-                {
-                    int r = clipInt(i - kernel.rows / 2 + m, 0, src.rows - 1);
-                    int c = clipInt(j - kernel.cols / 2 + n, 0, src.cols - 1);
-
-                    temp += src.ptr<float>(r)[c] * kptr[n];
-                }
-            }
-
-            dstptr[j] = temp;
-        }
-    }
-}
-
-typedef ImgprocTestBase Convolve;
-
-OCL_TEST_P(Convolve, Mat)
-{
-    Mat kernel, kernel_roi;
-    ocl::oclMat gkernel, gkernel_roi;
-    const Size roiSize(7, 7);
-
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        Border kernelBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(kernel, kernel_roi, roiSize, kernelBorder, type, 5, 16);
-        generateOclMat(gkernel, gkernel_roi, kernel, roiSize, kernelBorder);
-
-        convolve_gold(src_roi, kernel_roi, dst_roi);
-        ocl::convolve(gsrc_roi, gkernel_roi, gdst_roi);
-
-        Near(1);
-    }
-}
-
-////////////////////////////////// ColumnSum //////////////////////////////////////
-
-static void columnSum_gold(const Mat & src, Mat & dst)
-{
-    float * prevdptr = dst.ptr<float>(0);
-    const float * sptr = src.ptr<float>(0);
-
-    for (int x = 0; x < src.cols; ++x)
-        prevdptr[x] = sptr[x];
-
-    for (int y = 1; y < src.rows; ++y)
-    {
-        sptr = src.ptr<float>(y);
-        float * const dptr = dst.ptr<float>(y);
-
-        for (int x = 0; x < src.cols; ++x)
-            dptr[x] = prevdptr[x] + sptr[x];
-
-        prevdptr = dptr;
-    }
-}
-
-typedef ImgprocTestBase ColumnSum;
-
-OCL_TEST_P(ColumnSum, Accuracy)
-{
-    for (int i = 0; i < LOOP_TIMES; ++i)
-    {
-        random_roi();
-
-        columnSum_gold(src_roi, dst_roi);
-        ocl::columnSum(gsrc_roi, gdst_roi);
-
-        Near(1e-5);
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////
-
-INSTANTIATE_TEST_CASE_P(Imgproc, EqualizeHist, Combine(
-                            Values((MatType)CV_8UC1),
-                            Values(0), // not used
-                            Values(0), // not used
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, CornerMinEigenVal, Combine(
-                            Values((MatType)CV_8UC1, (MatType)CV_32FC1),
-                            Values(3, 5),
-                            Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_REFLECT101),
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, CornerHarris, Combine(
-                            Values((MatType)CV_8UC1, CV_32FC1),
-                            Values(3, 5),
-                            Values( (int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_REFLECT_101),
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, Integral, Combine(
-                            Values((MatType)CV_8UC1), // TODO does not work with CV_32F, CV_64F
-                            Values(0), // not used
-                            Values((MatType)CV_32SC1, (MatType)CV_32FC1),
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, Threshold, Combine(
-                            Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4,
-                                   CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4,
-                                   CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4),
-                            Values(0),
-                            Values(ThreshOp(THRESH_BINARY),
-                                   ThreshOp(THRESH_BINARY_INV), ThreshOp(THRESH_TRUNC),
-                                   ThreshOp(THRESH_TOZERO), ThreshOp(THRESH_TOZERO_INV)),
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, CalcHist, Combine(
-                            Values((MatType)CV_8UC1),
-                            Values(0), // not used
-                            Values(0), // not used
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, CLAHETest, Combine(
-                            Values(Size(4, 4), Size(32, 8), Size(8, 64)),
-                            Values(0.0, 10.0, 62.0, 300.0),
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, Convolve, Combine(
-                            Values((MatType)CV_32FC1),
-                            Values(0), // not used
-                            Values(0), // not used
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, ColumnSum, Combine(
-                            Values(MatType(CV_32FC1)),
-                            Values(0), // not used
-                            Values(0), // not used
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(ImgprocTestBase, CopyMakeBorder, Combine(
-                            testing::Values((MatDepth)CV_8U, (MatDepth)CV_16S, (MatDepth)CV_32S, (MatDepth)CV_32F),
-                            testing::Values(Channels(1), Channels(3), (Channels)4),
-                            Bool(), // border isolated or not
-                            Values((Border)BORDER_REPLICATE, (Border)BORDER_REFLECT,
-                                   (Border)BORDER_WRAP, (Border)BORDER_REFLECT_101),
-                            Bool()));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_kalman.cpp b/modules/ocl/test/test_kalman.cpp
deleted file mode 100644
index 045cd9815..000000000
--- a/modules/ocl/test/test_kalman.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jin Ma, jin@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-//////////////////////////////////////////////////////////////////////////
-
-PARAM_TEST_CASE(Kalman, int, int)
-{
-    int size_;
-    int iteration;
-    virtual void SetUp()
-    {
-        size_ = GET_PARAM(0);
-        iteration = GET_PARAM(1);
-    }
-};
-
-OCL_TEST_P(Kalman, Accuracy)
-{
-    const int Dim = size_;
-    const int Steps = iteration;
-    const double max_init = 1;
-    const double max_noise = 0.1;
-
-    Mat sample_mat(Dim, 1, CV_32F), temp_mat;
-    oclMat Sample(Dim, 1, CV_32F);
-    oclMat Temp(Dim, 1, CV_32F);
-    Mat Temp_cpu(Dim, 1, CV_32F);
-
-    Size size(Sample.cols, Sample.rows);
-
-    sample_mat =  randomMat(size, Sample.type(), -max_init, max_init, false);
-    Sample.upload(sample_mat);
-
-    //ocl start
-    cv::ocl::KalmanFilter kalman_filter_ocl;
-    kalman_filter_ocl.init(Dim, Dim);
-
-    cv::ocl::setIdentity(kalman_filter_ocl.errorCovPre, 1);
-    cv::ocl::setIdentity(kalman_filter_ocl.measurementMatrix, 1);
-    cv::ocl::setIdentity(kalman_filter_ocl.errorCovPost, 1);
-
-    kalman_filter_ocl.measurementNoiseCov.setTo(Scalar::all(0));
-    kalman_filter_ocl.statePre.setTo(Scalar::all(0));
-    kalman_filter_ocl.statePost.setTo(Scalar::all(0));
-
-    kalman_filter_ocl.correct(Sample);
-    //ocl end
-
-    //cpu start
-    cv::KalmanFilter kalman_filter_cpu;
-
-    kalman_filter_cpu.init(Dim, Dim);
-
-    cv::setIdentity(kalman_filter_cpu.errorCovPre, 1);
-    cv::setIdentity(kalman_filter_cpu.measurementMatrix, 1);
-    cv::setIdentity(kalman_filter_cpu.errorCovPost, 1);
-
-    kalman_filter_cpu.measurementNoiseCov.setTo(Scalar::all(0));
-    kalman_filter_cpu.statePre.setTo(Scalar::all(0));
-    kalman_filter_cpu.statePost.setTo(Scalar::all(0));
-
-    kalman_filter_cpu.correct(sample_mat);
-    //cpu end
-    //test begin
-    for(int i = 0; i<Steps; i++)
-    {
-        kalman_filter_ocl.predict();
-        kalman_filter_cpu.predict();
-
-        cv::gemm(kalman_filter_cpu.transitionMatrix, sample_mat, 1, cv::Mat(), 0, Temp_cpu);
-
-        Size size1(Temp.cols, Temp.rows);
-        Mat temp = randomMat(size1, Temp.type(), 0, 0xffff, false);
-
-
-        cv::multiply(2, temp, temp);
-
-        cv::subtract(temp, 1, temp);
-
-        cv::multiply(max_noise, temp, temp);
-
-        cv::add(temp, Temp_cpu, Temp_cpu);
-
-        Temp.upload(Temp_cpu);
-        Temp.copyTo(Sample);
-        Temp_cpu.copyTo(sample_mat);
-
-        kalman_filter_ocl.correct(Temp);
-        kalman_filter_cpu.correct(Temp_cpu);
-    }
-    //test end
-    EXPECT_MAT_NEAR(kalman_filter_cpu.statePost, kalman_filter_ocl.statePost, 0);
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_Video, Kalman, Combine(Values(3, 7), Values(30)));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_kmeans.cpp b/modules/ocl/test/test_kmeans.cpp
deleted file mode 100644
index eb3627451..000000000
--- a/modules/ocl/test/test_kmeans.cpp
+++ /dev/null
@@ -1,235 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Erping Pang,   pang_er_ping@163.com
-//    Xiaopeng Fu,   fuxiaopeng2222@163.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-using namespace cv;
-
-#define OCL_KMEANS_USE_INITIAL_LABELS 1
-#define OCL_KMEANS_PP_CENTERS         2
-
-PARAM_TEST_CASE(Kmeans, int, int, int)
-{
-    int type;
-    int K;
-    int flags;
-    Mat src ;
-    ocl::oclMat d_src, d_dists;
-
-    Mat labels, centers;
-    ocl::oclMat d_labels, d_centers;
-    virtual void SetUp()
-    {
-        K = GET_PARAM(0);
-        type = GET_PARAM(1);
-        flags = GET_PARAM(2);
-
-        // MWIDTH=256, MHEIGHT=256. defined in utility.hpp
-        Size size = Size(MWIDTH, MHEIGHT);
-        src.create(size, type);
-        int row_idx = 0;
-        const int max_neighbour = MHEIGHT / K - 1;
-        CV_Assert(K <= MWIDTH);
-        for(int i = 0; i < K; i++ )
-        {
-            Mat center_row_header = src.row(row_idx);
-            center_row_header.setTo(0);
-            int nchannel = center_row_header.channels();
-            for(int j = 0; j < nchannel; j++)
-                center_row_header.at<float>(0, i*nchannel+j) = 50000.0;
-
-            for(int j = 0; (j < max_neighbour) ||
-                           (i == K-1 && j < max_neighbour + MHEIGHT%K); j ++)
-            {
-                Mat cur_row_header = src.row(row_idx + 1 + j);
-                center_row_header.copyTo(cur_row_header);
-                Mat tmpmat = randomMat(cur_row_header.size(), cur_row_header.type(), -200, 200, false);
-                cur_row_header += tmpmat;
-            }
-            row_idx += 1 + max_neighbour;
-        }
-    }
-};
-OCL_TEST_P(Kmeans, Mat){
-    if(flags & KMEANS_USE_INITIAL_LABELS)
-    {
-        // inital a given labels
-        labels.create(src.rows, 1, CV_32S);
-        int *label = labels.ptr<int>();
-        for(int i = 0; i < src.rows; i++)
-            label[i] = rng.uniform(0, K);
-        d_labels.upload(labels);
-    }
-    d_src.upload(src);
-
-    for(int j = 0; j < LOOP_TIMES; j++)
-    {
-        kmeans(src, K, labels,
-            TermCriteria( TermCriteria::EPS + TermCriteria::MAX_ITER, 100, 0),
-            1, flags, centers);
-        ocl::kmeans(d_src, K, d_labels,
-            TermCriteria( TermCriteria::EPS + TermCriteria::MAX_ITER, 100, 0),
-            1, flags, d_centers);
-        Mat dd_labels(d_labels);
-        Mat dd_centers(d_centers);
-        if(flags & KMEANS_USE_INITIAL_LABELS)
-        {
-            EXPECT_MAT_NEAR(labels, dd_labels, 0);
-            EXPECT_MAT_NEAR(centers, dd_centers, 1e-3);
-        }
-        else
-        {
-            int row_idx = 0;
-            for(int i = 0; i < K; i++)
-            {
-                // verify lables with ground truth resutls
-                int label = labels.at<int>(row_idx);
-                int header_label = dd_labels.at<int>(row_idx);
-                for(int j = 0; (j < MHEIGHT/K)||(i == K-1 && j < MHEIGHT/K+MHEIGHT%K); j++)
-                {
-                    ASSERT_NEAR(labels.at<int>(row_idx+j), label, 0);
-                    ASSERT_NEAR(dd_labels.at<int>(row_idx+j), header_label, 0);
-                }
-
-                // verify centers
-                float *center = centers.ptr<float>(label);
-                float *header_center = dd_centers.ptr<float>(header_label);
-                for(int t = 0; t < centers.cols; t++)
-                    ASSERT_NEAR(center[t], header_center[t], 1e-3);
-
-                row_idx += MHEIGHT/K;
-            }
-        }
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_ML, Kmeans, Combine(
-    Values(3, 5, 8),
-    Values(CV_32FC1, CV_32FC2, CV_32FC4),
-    Values(OCL_KMEANS_USE_INITIAL_LABELS/*, OCL_KMEANS_PP_CENTERS*/)));
-
-
-/////////////////////////////// DistanceToCenters //////////////////////////////////////////
-
-CV_ENUM(DistType, NORM_L1, NORM_L2SQR)
-
-PARAM_TEST_CASE(distanceToCenters, DistType, bool)
-{
-    int distType;
-    bool useRoi;
-
-    Mat src, centers, src_roi, centers_roi;
-    ocl::oclMat ocl_src, ocl_centers, ocl_src_roi, ocl_centers_roi;
-
-    virtual void SetUp()
-    {
-        distType = GET_PARAM(0);
-        useRoi = GET_PARAM(1);
-    }
-
-    void random_roi()
-    {
-        Size roiSizeSrc = randomSize(1, MAX_VALUE);
-        Size roiSizeCenters = randomSize(1, MAX_VALUE);
-        roiSizeSrc.width = roiSizeCenters.width;
-
-        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSizeSrc, srcBorder, CV_32FC1, -MAX_VALUE, MAX_VALUE);
-
-        Border centersBorder = randomBorder(0, useRoi ? 500 : 0);
-        randomSubMat(centers, centers_roi, roiSizeCenters, centersBorder, CV_32FC1, -MAX_VALUE, MAX_VALUE);
-
-        for (int i = 0; i < centers.rows; i++)
-            centers.at<float>(i, randomInt(0, centers.cols)) = (float)randomDouble(SHRT_MAX, INT_MAX);
-
-        generateOclMat(ocl_src, ocl_src_roi, src, roiSizeSrc, srcBorder);
-        generateOclMat(ocl_centers, ocl_centers_roi, centers, roiSizeCenters, centersBorder);
-    }
-};
-
-OCL_TEST_P(distanceToCenters, Accuracy)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        Mat labels, dists;
-        ocl::distanceToCenters(ocl_src_roi, ocl_centers_roi, dists, labels, distType);
-
-        EXPECT_EQ(dists.size(), labels.size());
-
-        Mat batch_dists;
-        cv::batchDistance(src_roi, centers_roi, batch_dists, CV_32FC1, noArray(), distType);
-
-        std::vector<float> gold_dists_v;
-        gold_dists_v.reserve(batch_dists.rows);
-
-        for (int i = 0; i < batch_dists.rows; i++)
-        {
-            Mat r = batch_dists.row(i);
-            double mVal;
-            Point mLoc;
-            minMaxLoc(r, &mVal, NULL, &mLoc, NULL);
-
-            int ocl_label = labels.at<int>(i, 0);
-            EXPECT_EQ(mLoc.x, ocl_label);
-
-            gold_dists_v.push_back(static_cast<float>(mVal));
-        }
-
-        double relative_error = cv::norm(Mat(gold_dists_v), dists, NORM_INF | NORM_RELATIVE);
-        ASSERT_LE(relative_error, 1e-5);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P (OCL_ML, distanceToCenters, Combine(DistType::all(), Bool()));
-
-#endif
diff --git a/modules/ocl/test/test_match_template.cpp b/modules/ocl/test/test_match_template.cpp
deleted file mode 100644
index edbc36a3f..000000000
--- a/modules/ocl/test/test_match_template.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@multicorewareinc.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate
-#define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF_NORMED))
-
-IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size);
-
-#define MTEMP_SIZES testing::Values(cv::Size(128, 256), cv::Size(1024, 768))
-
-PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMethod)
-{
-    cv::Size size;
-    cv::Size templ_size;
-    int cn;
-    int method;
-
-    virtual void SetUp()
-    {
-        size = GET_PARAM(0);
-        templ_size = GET_PARAM(1);
-        cn = GET_PARAM(2);
-        method = GET_PARAM(3);
-    }
-};
-
-OCL_TEST_P(MatchTemplate8U, Accuracy)
-{
-    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn), 0, 255);
-    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn), 0, 255);
-
-    cv::ocl::oclMat dst, ocl_image(image), ocl_templ(templ);
-    cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
-
-    cv::Mat dst_gold;
-    cv::matchTemplate(image, templ, dst_gold, method);
-
-    cv::Mat mat_dst;
-    dst.download(mat_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1);
-}
-
-PARAM_TEST_CASE(MatchTemplate32F, cv::Size, TemplateSize, Channels, TemplateMethod)
-{
-    cv::Size size;
-    cv::Size templ_size;
-    int cn;
-    int method;
-
-    virtual void SetUp()
-    {
-        size = GET_PARAM(0);
-        templ_size = GET_PARAM(1);
-        cn = GET_PARAM(2);
-        method = GET_PARAM(3);
-    }
-};
-
-OCL_TEST_P(MatchTemplate32F, Accuracy)
-{
-    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn), 0, 255);
-    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn), 0, 255);
-
-    cv::ocl::oclMat dst, ocl_image(image), ocl_templ(templ);
-    cv::ocl::matchTemplate(ocl_image, ocl_templ, dst, method);
-
-    cv::Mat dst_gold;
-    cv::matchTemplate(image, templ, dst_gold, method);
-
-    cv::Mat mat_dst;
-    dst.download(mat_dst);
-
-    EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1);
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate8U,
-                        testing::Combine(
-                            MTEMP_SIZES,
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
-                            testing::Values(Channels(1), Channels(3), Channels(4)),
-                            ALL_TEMPLATE_METHODS
-                        )
-                       );
-
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate32F, testing::Combine(
-                            MTEMP_SIZES,
-                            testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
-                            testing::Values(Channels(1), Channels(3), Channels(4)),
-                            testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
-#endif
diff --git a/modules/ocl/test/test_matrix_operation.cpp b/modules/ocl/test/test_matrix_operation.cpp
deleted file mode 100644
index c7ceef453..000000000
--- a/modules/ocl/test/test_matrix_operation.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace testing;
-using namespace std;
-
-////////////////////////////////converto/////////////////////////////////////////////////
-
-PARAM_TEST_CASE(MatrixTestBase, MatDepth, MatDepth, int, bool)
-{
-    int src_depth, cn, dstType;
-    bool use_roi;
-
-    Mat src, dst, src_roi, dst_roi;
-    ocl::oclMat gdst, gsrc, gdst_roi, gsrc_roi;
-
-    virtual void SetUp()
-    {
-        src_depth = GET_PARAM(0);
-        cn = GET_PARAM(2);
-        dstType = CV_MAKE_TYPE(GET_PARAM(1), cn);
-
-        use_roi = GET_PARAM(3);
-    }
-
-    virtual void random_roi()
-    {
-        Size roiSize = randomSize(1, MAX_VALUE);
-        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, CV_MAKE_TYPE(src_depth, cn), -MAX_VALUE, MAX_VALUE);
-
-        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, roiSize, dstBorder, dstType, 5, 16);
-
-        generateOclMat(gsrc, gsrc_roi, src, roiSize, srcBorder);
-        generateOclMat(gdst, gdst_roi, dst, roiSize, dstBorder);
-    }
-};
-
-typedef MatrixTestBase ConvertTo;
-
-OCL_TEST_P(ConvertTo, Accuracy)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        src_roi.convertTo(dst_roi, dstType);
-        gsrc_roi.convertTo(gdst_roi, dstType);
-
-        EXPECT_MAT_NEAR(dst, Mat(gdst), src_depth == CV_64F ? 1.0 : 0.0);
-        EXPECT_MAT_NEAR(dst_roi, Mat(gdst_roi), src_depth == CV_64F ? 1.0 : 0.0);
-    }
-}
-
-///////////////////////////////////////////copyto/////////////////////////////////////////////////////////////
-
-struct CopyTo :
-        public MatrixTestBase
-{
-    Mat mask, mask_roi;
-    ocl::oclMat gmask, gmask_roi;
-
-    virtual void random_roi()
-    {
-        int type = CV_MAKE_TYPE(src_depth, cn);
-        Size roiSize = randomSize(1, MAX_VALUE);
-        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
-
-        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 16);
-
-        Border maskBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(mask, mask_roi, roiSize, maskBorder, CV_8UC1, 5, 16);
-
-        generateOclMat(gsrc, gsrc_roi, src, roiSize, srcBorder);
-        generateOclMat(gdst, gdst_roi, dst, roiSize, dstBorder);
-        generateOclMat(gmask, gmask_roi, mask, roiSize, maskBorder);
-    }
-};
-
-OCL_TEST_P(CopyTo, Without_mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        src_roi.copyTo(dst_roi);
-        gsrc_roi.copyTo(gdst_roi);
-
-        EXPECT_MAT_NEAR(dst, Mat(gdst), 0.0);
-        EXPECT_MAT_NEAR(dst_roi, Mat(gdst_roi), 0.0);
-    }
-}
-
-OCL_TEST_P(CopyTo, With_mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        src_roi.copyTo(dst_roi, mask_roi);
-        gsrc_roi.copyTo(gdst_roi, gmask_roi);
-
-        EXPECT_MAT_NEAR(dst, Mat(gdst), 0.0);
-        EXPECT_MAT_NEAR(dst_roi, Mat(gdst_roi), 0.0);
-    }
-}
-
-/////////////////////////////////////////// setTo /////////////////////////////////////////////////////////////
-
-typedef CopyTo SetTo;
-
-OCL_TEST_P(SetTo, Without_mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-        Scalar scalar = randomScalar(-MAX_VALUE, MAX_VALUE);
-
-        src_roi.setTo(scalar);
-        gsrc_roi.setTo(scalar);
-
-        EXPECT_MAT_NEAR(dst, Mat(gdst), 0.0);
-        EXPECT_MAT_NEAR(dst_roi, Mat(gdst_roi), 0.0);;
-    }
-}
-
-OCL_TEST_P(SetTo, With_mask)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-        Scalar scalar = randomScalar(-MAX_VALUE, MAX_VALUE);
-
-        src_roi.setTo(scalar, mask_roi);
-        gsrc_roi.setTo(scalar, gmask_roi);
-
-        EXPECT_MAT_NEAR(src, Mat(gsrc), 1.);
-        EXPECT_MAT_NEAR(src_roi, Mat(gsrc_roi), 1.);
-    }
-}
-
-// convertC3C4
-
-PARAM_TEST_CASE(convertC3C4, MatDepth, bool)
-{
-    int depth;
-    bool use_roi;
-
-    Mat src, src_roi;
-    ocl::oclMat gsrc, gsrc_roi;
-
-    virtual void SetUp()
-    {
-        depth = GET_PARAM(0);
-        use_roi = GET_PARAM(1);
-    }
-
-    void random_roi()
-    {
-        int type = CV_MAKE_TYPE(depth, 3);
-        Size roiSize = randomSize(1, MAX_VALUE);
-        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
-        generateOclMat(gsrc, gsrc_roi, src, roiSize, srcBorder);
-    }
-};
-
-OCL_TEST_P(convertC3C4, Accuracy)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        gsrc_roi = src_roi;
-
-        EXPECT_MAT_NEAR(src_roi, Mat(gsrc_roi), 0.0);
-        EXPECT_MAT_NEAR(src, Mat(gsrc), 0.0);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine(
-                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
-                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
-                            testing::Range(1, 5), Bool()));
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine(
-                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
-                            Values(MatDepth(0)), // not used
-                            testing::Range(1, 5), Bool()));
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, SetTo, Combine(
-                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
-                            Values((MatDepth)0), // not used
-                            testing::Range(1, 5), Bool()));
-
-INSTANTIATE_TEST_CASE_P(MatrixOperation, convertC3C4, Combine(
-                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F),
-                            Bool()));
-#endif
diff --git a/modules/ocl/test/test_mean_shift.cpp b/modules/ocl/test/test_mean_shift.cpp
deleted file mode 100644
index 6ee3e35a7..000000000
--- a/modules/ocl/test/test_mean_shift.cpp
+++ /dev/null
@@ -1,408 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Shengen Yan, yanshengen@gmail.com
-//    Jiang Liyuan, lyuan001.good@163.com
-//    Rock Li, Rock.Li@amd.com
-//    Wu Zailong, bullet@yeah.net
-//    Xu Pang, pangxu010@163.com
-//    Sen Liu, swjtuls1987@126.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace testing;
-using namespace std;
-using namespace cv;
-
-typedef struct
-{
-    short x;
-    short y;
-} COOR;
-
-COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep, Size size, int sp, int sr, int maxIter, float eps, int *tab)
-{
-
-    int isr2 = sr * sr;
-    int c0, c1, c2, c3;
-    int iter;
-    uchar *ptr = NULL;
-    uchar *pstart = NULL;
-    int revx = 0, revy = 0;
-    c0 = sptr[0];
-    c1 = sptr[1];
-    c2 = sptr[2];
-    c3 = sptr[3];
-    // iterate meanshift procedure
-    for(iter = 0; iter < maxIter; iter++ )
-    {
-        int count = 0;
-        int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
-
-        //mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
-        int minx = x0 - sp;
-        int miny = y0 - sp;
-        int maxx = x0 + sp;
-        int maxy = y0 + sp;
-
-        //deal with the image boundary
-        if(minx < 0) minx = 0;
-        if(miny < 0) miny = 0;
-        if(maxx >= size.width) maxx = size.width - 1;
-        if(maxy >= size.height) maxy = size.height - 1;
-        if(iter == 0)
-        {
-            pstart = sptr;
-        }
-        else
-        {
-            pstart = pstart + revy * sstep + (revx << 2); //point to the new position
-        }
-        ptr = pstart;
-        ptr = ptr + (miny - y0) * sstep + ((minx - x0) << 2); //point to the start in the row
-
-        for( int y = miny; y <= maxy; y++, ptr += sstep - ((maxx - minx + 1) << 2))
-        {
-            int rowCount = 0;
-            int x = minx;
-#if CV_ENABLE_UNROLLED
-            for( ; x + 4 <= maxx; x += 4, ptr += 16)
-            {
-                int t0, t1, t2;
-                t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
-                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x;
-                    rowCount++;
-                }
-                t0 = ptr[4], t1 = ptr[5], t2 = ptr[6];
-                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 1;
-                    rowCount++;
-                }
-                t0 = ptr[8], t1 = ptr[9], t2 = ptr[10];
-                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 2;
-                    rowCount++;
-                }
-                t0 = ptr[12], t1 = ptr[13], t2 = ptr[14];
-                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x + 3;
-                    rowCount++;
-                }
-            }
-#endif
-            for(; x <= maxx; x++, ptr += 4)
-            {
-                int t0 = ptr[0], t1 = ptr[1], t2 = ptr[2];
-                if(tab[t0 - c0 + 255] + tab[t1 - c1 + 255] + tab[t2 - c2 + 255] <= isr2)
-                {
-                    s0 += t0;
-                    s1 += t1;
-                    s2 += t2;
-                    sx += x;
-                    rowCount++;
-                }
-            }
-            if(rowCount == 0)
-                continue;
-            count += rowCount;
-            sy += y * rowCount;
-        }
-
-        if( count == 0 )
-            break;
-
-        int x1 = sx / count;
-        int y1 = sy / count;
-        s0 = s0 / count;
-        s1 = s1 / count;
-        s2 = s2 / count;
-
-        bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1 - x0) + abs(y1 - y0) +
-                        tab[s0 - c0 + 255] + tab[s1 - c1 + 255] + tab[s2 - c2 + 255] <= eps);
-
-        //revise the pointer corresponding to the new (y0,x0)
-        revx = x1 - x0;
-        revy = y1 - y0;
-
-        x0 = x1;
-        y0 = y1;
-        c0 = s0;
-        c1 = s1;
-        c2 = s2;
-
-        if( stopFlag )
-            break;
-    } //for iter
-
-    dptr[0] = (uchar)c0;
-    dptr[1] = (uchar)c1;
-    dptr[2] = (uchar)c2;
-    dptr[3] = (uchar)c3;
-
-    COOR coor;
-    coor.x = (short)x0;
-    coor.y = (short)y0;
-    return coor;
-}
-
-void meanShiftFiltering_(const Mat &src_roi, Mat &dst_roi, int sp, int sr, TermCriteria crit)
-{
-    if( src_roi.empty() )
-        CV_Error( CV_StsBadArg, "The input image is empty" );
-
-    if( src_roi.depth() != CV_8U || src_roi.channels() != 4 )
-        CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
-
-    CV_Assert( (src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) );
-    CV_Assert( !(dst_roi.step & 0x3) );
-
-    if( !(crit.type & TermCriteria::MAX_ITER) )
-        crit.maxCount = 5;
-    int maxIter = std::min(std::max(crit.maxCount, 1), 100);
-    float eps;
-    if( !(crit.type & TermCriteria::EPS) )
-        eps = 1.f;
-    eps = (float)std::max(crit.epsilon, 0.0);
-
-    int tab[512];
-    for(int i = 0; i < 512; i++)
-        tab[i] = (i - 255) * (i - 255);
-    uchar *sptr = src_roi.data;
-    uchar *dptr = dst_roi.data;
-    int sstep = (int)src_roi.step;
-    int dstep = (int)dst_roi.step;
-    Size size = src_roi.size();
-
-    for(int i = 0; i < size.height; i++, sptr += sstep - (size.width << 2),
-            dptr += dstep - (size.width << 2))
-    {
-        for(int j = 0; j < size.width; j++, sptr += 4, dptr += 4)
-        {
-            do_meanShift(j, i, sptr, dptr, sstep, size, sp, sr, maxIter, eps, tab);
-        }
-    }
-}
-
-void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi, int sp, int sr, TermCriteria crit)
-{
-    if( src_roi.empty() )
-        CV_Error( CV_StsBadArg, "The input image is empty" );
-    if( src_roi.depth() != CV_8U || src_roi.channels() != 4 )
-        CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
-    CV_Assert( (src_roi.cols == dst_roi.cols) && (src_roi.rows == dst_roi.rows) &&
-               (src_roi.cols == dstCoor_roi.cols) && (src_roi.rows == dstCoor_roi.rows));
-    CV_Assert( !(dstCoor_roi.step & 0x3) );
-
-    if( !(crit.type & TermCriteria::MAX_ITER) )
-        crit.maxCount = 5;
-    int maxIter = std::min(std::max(crit.maxCount, 1), 100);
-    float eps;
-    if( !(crit.type & TermCriteria::EPS) )
-        eps = 1.f;
-    eps = (float)std::max(crit.epsilon, 0.0);
-
-    int tab[512];
-    for(int i = 0; i < 512; i++)
-        tab[i] = (i - 255) * (i - 255);
-    uchar *sptr = src_roi.data;
-    uchar *dptr = dst_roi.data;
-    short *dCoorptr = (short *)dstCoor_roi.data;
-    int sstep = (int)src_roi.step;
-    int dstep = (int)dst_roi.step;
-    int dCoorstep = (int)dstCoor_roi.step >> 1;
-    Size size = src_roi.size();
-
-    for(int i = 0; i < size.height; i++, sptr += sstep - (size.width << 2),
-            dptr += dstep - (size.width << 2), dCoorptr += dCoorstep - (size.width << 1))
-    {
-        for(int j = 0; j < size.width; j++, sptr += 4, dptr += 4, dCoorptr += 2)
-        {
-            *((COOR *)dCoorptr) = do_meanShift(j, i, sptr, dptr, sstep, size, sp, sr, maxIter, eps, tab);
-        }
-    }
-
-}
-
-//////////////////////////////// meanShift //////////////////////////////////////////
-
-PARAM_TEST_CASE(meanShiftTestBase, MatType, MatType, int, int, TermCriteria, bool)
-{
-    int type, typeCoor;
-    int sp, sr;
-    TermCriteria crit;
-    bool useRoi;
-
-    // src mat
-    Mat src, src_roi;
-    Mat dst, dst_roi;
-    Mat dstCoor, dstCoor_roi;
-
-    // ocl dst mat
-    ocl::oclMat gsrc, gsrc_roi;
-    ocl::oclMat gdst, gdst_roi;
-    ocl::oclMat gdstCoor, gdstCoor_roi;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        typeCoor = GET_PARAM(1);
-        sp = GET_PARAM(2);
-        sr = GET_PARAM(3);
-        crit = GET_PARAM(4);
-        useRoi = GET_PARAM(5);
-    }
-
-    void random_roi()
-    {
-        Size roiSize = randomSize(1, MAX_VALUE);
-        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, type, 5, 256);
-        generateOclMat(gsrc, gsrc_roi, src, roiSize, srcBorder);
-
-        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 256);
-        generateOclMat(gdst, gdst_roi, dst, roiSize, dstBorder);
-
-        randomSubMat(dstCoor, dstCoor_roi, roiSize, dstBorder, typeCoor, 5, 256);
-        generateOclMat(gdstCoor, gdstCoor_roi, dstCoor, roiSize, dstBorder);
-    }
-
-    void Near(double threshold = 0.0)
-    {
-        Mat whole, roi;
-        gdst.download(whole);
-        gdst_roi.download(roi);
-
-        EXPECT_MAT_NEAR(dst, whole, threshold);
-        EXPECT_MAT_NEAR(dst_roi, roi, threshold);
-    }
-
-    void Near1(double threshold = 0.0)
-    {
-        Mat whole, roi;
-        gdstCoor.download(whole);
-        gdstCoor_roi.download(roi);
-
-        EXPECT_MAT_NEAR(dstCoor, whole, threshold);
-        EXPECT_MAT_NEAR(dstCoor_roi, roi, threshold);
-    }
-};
-
-/////////////////////////meanShiftFiltering/////////////////////////////
-
-typedef meanShiftTestBase meanShiftFiltering;
-
-OCL_TEST_P(meanShiftFiltering, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        meanShiftFiltering_(src_roi, dst_roi, sp, sr, crit);
-        ocl::meanShiftFiltering(gsrc_roi, gdst_roi, sp, sr, crit);
-
-        Near();
-    }
-}
-
-///////////////////////////meanShiftProc//////////////////////////////////
-
-typedef meanShiftTestBase meanShiftProc;
-
-OCL_TEST_P(meanShiftProc, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        meanShiftProc_(src_roi, dst_roi, dstCoor_roi, sp, sr, crit);
-        ocl::meanShiftProc(gsrc_roi, gdst_roi, gdstCoor_roi, sp, sr, crit);
-
-        Near();
-        Near1();
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////
-
-INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftFiltering, Combine(
-                            Values((MatType)CV_8UC4),
-                            Values((MatType)CV_16SC2),
-                            Values(5),
-                            Values(6),
-                            Values(TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, 5, 1)),
-                            Bool()
-                        ));
-
-INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftProc, Combine(
-                            Values((MatType)CV_8UC4),
-                            Values((MatType)CV_16SC2),
-                            Values(5),
-                            Values(6),
-                            Values(TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, 5, 1)),
-                            Bool()
-                        ));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_ml.cpp b/modules/ocl/test/test_ml.cpp
deleted file mode 100644
index 00f9fa941..000000000
--- a/modules/ocl/test/test_ml.cpp
+++ /dev/null
@@ -1,309 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jin Ma,        jin@multicorewareinc.com
-//    Xiaopeng Fu,   fuxiaopeng2222@163.com
-//    Erping Pang,   pang_er_ping@163.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-
-///////K-NEAREST NEIGHBOR//////////////////////////
-
-static void genTrainData(cv::RNG& rng, Mat& trainData, int trainDataRow, int trainDataCol,
-                         Mat& trainLabel = Mat().setTo(Scalar::all(0)), int nClasses = 0)
-{
-    cv::Size size(trainDataCol, trainDataRow);
-    trainData = randomMat(rng, size, CV_32FC1, 1.0, 1000.0, false);
-    if(nClasses != 0)
-    {
-        cv::Size size1(trainDataRow, 1);
-        trainLabel = randomMat(rng, size1, CV_8UC1, 0, nClasses - 1, false);
-        trainLabel.convertTo(trainLabel, CV_32FC1);
-    }
-}
-
-PARAM_TEST_CASE(KNN, int, Size, int, bool)
-{
-    int k;
-    int trainDataCol;
-    int testDataRow;
-    int nClass;
-    bool regression;
-    virtual void SetUp()
-    {
-        k = GET_PARAM(0);
-        nClass = GET_PARAM(2);
-        trainDataCol = GET_PARAM(1).width;
-        testDataRow = GET_PARAM(1).height;
-        regression = GET_PARAM(3);
-    }
-};
-
-OCL_TEST_P(KNN, Accuracy)
-{
-    Mat trainData, trainLabels;
-    const int trainDataRow = 500;
-    genTrainData(rng, trainData, trainDataRow, trainDataCol, trainLabels, nClass);
-
-    Mat testData, testLabels;
-    genTrainData(rng, testData, testDataRow, trainDataCol);
-
-    KNearestNeighbour knn_ocl;
-    CvKNearest knn_cpu;
-    Mat best_label_cpu;
-    oclMat best_label_ocl;
-
-    /*ocl k-Nearest_Neighbor start*/
-    oclMat trainData_ocl;
-    trainData_ocl.upload(trainData);
-    Mat simpleIdx;
-    knn_ocl.train(trainData, trainLabels, simpleIdx, regression);
-
-    oclMat testdata;
-    testdata.upload(testData);
-    knn_ocl.find_nearest(testdata, k, best_label_ocl);
-    /*ocl k-Nearest_Neighbor end*/
-
-    /*cpu k-Nearest_Neighbor start*/
-    knn_cpu.train(trainData, trainLabels, simpleIdx, regression);
-    knn_cpu.find_nearest(testData, k, &best_label_cpu);
-    /*cpu k-Nearest_Neighbor end*/
-    if(regression)
-    {
-        EXPECT_MAT_SIMILAR(Mat(best_label_ocl), best_label_cpu, 1e-5);
-    }
-    else
-    {
-        EXPECT_MAT_NEAR(Mat(best_label_ocl), best_label_cpu, 0.0);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_ML, KNN, Combine(Values(6, 5), Values(Size(200, 400), Size(300, 600)),
-    Values(4, 3), Values(false, true)));
-
-////////////////////////////////SVM/////////////////////////////////////////////////
-
-PARAM_TEST_CASE(SVM_OCL, int, int, int)
-{
-    cv::Size size;
-    int kernel_type;
-    int svm_type;
-    Mat src, labels, samples, labels_predict;
-    int K;
-
-    virtual void SetUp()
-    {
-
-        kernel_type = GET_PARAM(0);
-        svm_type = GET_PARAM(1);
-        K = GET_PARAM(2);
-        cv::Size size = cv::Size(MWIDTH, MHEIGHT);
-        src.create(size, CV_32FC1);
-        labels.create(1, size.height, CV_32SC1);
-        int row_idx = 0;
-        const int max_number = size.height / K - 1;
-        CV_Assert(K <= size.height);
-        for(int i = 0; i < K; i++ )
-        {
-            Mat center_row_header = src.row(row_idx);
-            center_row_header.setTo(0);
-            int nchannel = center_row_header.channels();
-            for(int j = 0; j < nchannel; j++)
-            {
-                center_row_header.at<float>(0, i * nchannel + j) = 500.0;
-            }
-            labels.at<int>(0, row_idx) = i;
-            for(int j = 0; (j < max_number) ||
-                    (i == K - 1 && j < max_number + size.height % K); j ++)
-            {
-                Mat cur_row_header = src.row(row_idx + 1 + j);
-                center_row_header.copyTo(cur_row_header);
-                Mat tmpmat = randomMat(cur_row_header.size(), cur_row_header.type(), 1, 100, false);
-                cur_row_header += tmpmat;
-                labels.at<int>(0, row_idx + 1 + j) = i;
-            }
-            row_idx += 1 + max_number;
-        }
-        labels.convertTo(labels, CV_32FC1);
-        cv::Size test_size = cv::Size(MWIDTH, 100);
-        samples.create(test_size, CV_32FC1);
-        labels_predict.create(1, test_size.height, CV_32SC1);
-        const int max_number_test = test_size.height / K - 1;
-        row_idx = 0;
-        for(int i = 0; i < K; i++ )
-        {
-            Mat center_row_header = samples.row(row_idx);
-            center_row_header.setTo(0);
-            int nchannel = center_row_header.channels();
-            for(int j = 0; j < nchannel; j++)
-            {
-                center_row_header.at<float>(0, i * nchannel + j) = 500.0;
-            }
-            labels_predict.at<int>(0, row_idx) = i;
-            for(int j = 0; (j < max_number_test) ||
-                    (i == K - 1 && j < max_number_test + test_size.height % K); j ++)
-            {
-                Mat cur_row_header = samples.row(row_idx + 1 + j);
-                center_row_header.copyTo(cur_row_header);
-                Mat tmpmat = randomMat(cur_row_header.size(), cur_row_header.type(), 1, 100, false);
-                cur_row_header += tmpmat;
-                labels_predict.at<int>(0, row_idx + 1 + j) = i;
-            }
-            row_idx += 1 + max_number_test;
-        }
-        labels_predict.convertTo(labels_predict, CV_32FC1);
-    }
-};
-
-OCL_TEST_P(SVM_OCL, Accuracy)
-{
-    CvSVMParams params;
-    params.degree = 0.4;
-    params.gamma = 1;
-    params.coef0 = 1;
-    params.C = 1;
-    params.nu = 0.5;
-    params.p = 1;
-    params.svm_type = svm_type;
-    params.kernel_type = kernel_type;
-
-    params.term_crit = cvTermCriteria(CV_TERMCRIT_ITER, 1000, 0.001);
-
-    CvSVM SVM;
-    SVM.train(src, labels, Mat(), Mat(), params);
-
-    cv::ocl::CvSVM_OCL SVM_OCL;
-    SVM_OCL.train(src, labels, Mat(), Mat(), params);
-
-    int c = SVM.get_support_vector_count();
-    int c1 = SVM_OCL.get_support_vector_count();
-
-    Mat sv(c, MHEIGHT, CV_32FC1);
-    Mat sv_ocl(c1, MHEIGHT, CV_32FC1);
-    for(int i = 0; i < c; i++)
-    {
-        const float* v = SVM.get_support_vector(i);
-
-        for(int j = 0; j < MHEIGHT; j++)
-        {
-            sv.at<float>(i, j) = v[j];
-        }
-    }
-    for(int i = 0; i < c1; i++)
-    {
-        const float* v_ocl = SVM_OCL.get_support_vector(i);
-
-        for(int j = 0; j < MHEIGHT; j++)
-        {
-            sv_ocl.at<float>(i, j) = v_ocl[j];
-        }
-    }
-    cv::BFMatcher matcher(cv::NORM_L2);
-    std::vector<cv::DMatch> matches;
-    matcher.match(sv, sv_ocl, matches);
-    int count = 0;
-
-    for(std::vector<cv::DMatch>::iterator itr = matches.begin(); itr != matches.end(); itr++)
-    {
-        if((*itr).distance < 0.1)
-        {
-            count ++;
-        }
-    }
-    if(c != 0)
-    {
-        float matchedRatio = (float)count / c;
-        EXPECT_GT(matchedRatio, 0.95);
-    }
-    if(c != 0)
-    {
-        CvMat *result = cvCreateMat(1, samples.rows, CV_32FC1);
-        CvMat test_samples = samples;
-
-        CvMat *result_ocl = cvCreateMat(1, samples.rows, CV_32FC1);
-
-        SVM.predict(&test_samples, result);
-
-        SVM_OCL.predict(&test_samples, result_ocl);
-
-        int true_resp = 0, true_resp_ocl = 0;
-        for (int i = 0; i < samples.rows; i++)
-        {
-            if (result->data.fl[i] == labels_predict.at<float>(0, i))
-            {
-                true_resp++;
-            }
-        }
-        float matchedRatio = (float)true_resp / samples.rows;
-
-        for (int i = 0; i < samples.rows; i++)
-        {
-            if (result_ocl->data.fl[i] == labels_predict.at<float>(0, i))
-            {
-                true_resp_ocl++;
-            }
-        }
-        float matchedRatio_ocl = (float)true_resp_ocl / samples.rows;
-
-        if(matchedRatio != 0 && true_resp_ocl < true_resp)
-        {
-            EXPECT_NEAR(matchedRatio_ocl, matchedRatio, 0.03);
-        }
-    }
-}
-
-// TODO FIXIT: CvSVM::EPS_SVR case is crashed inside CPU implementation
-// Anonymous enums are not supported well so cast them to 'int'
-
-INSTANTIATE_TEST_CASE_P(OCL_ML, SVM_OCL, testing::Combine(
-                            Values((int)CvSVM::LINEAR, (int)CvSVM::POLY, (int)CvSVM::RBF, (int)CvSVM::SIGMOID),
-                            Values((int)CvSVM::C_SVC, (int)CvSVM::NU_SVC, (int)CvSVM::ONE_CLASS, (int)CvSVM::NU_SVR),
-                            Values(2, 3, 4)
-                        ));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_moments.cpp b/modules/ocl/test/test_moments.cpp
deleted file mode 100644
index e978bb28f..000000000
--- a/modules/ocl/test/test_moments.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "test_precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-PARAM_TEST_CASE(MomentsTest, MatType, bool, bool)
-{
-    int type;
-    cv::Mat mat;
-    bool test_contours;
-    bool binaryImage;
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        test_contours = GET_PARAM(1);
-        cv::Size size(10 * MWIDTH, 10 * MHEIGHT);
-        mat = randomMat(size, type, 0, 256, false);
-        binaryImage = GET_PARAM(2);
-    }
-
-    void Compare(Moments& cpu_moments, Moments& gpu_moments)
-    {
-        Mat gpu_dst, cpu_dst;
-        HuMoments(cpu_moments, cpu_dst);
-        HuMoments(gpu_moments, gpu_dst);
-        EXPECT_MAT_NEAR(gpu_dst, cpu_dst, 1e-3);
-    }
-};
-
-OCL_TEST_P(MomentsTest, Mat)
-{
-    oclMat src_d(mat);
-    for(int j = 0; j < LOOP_TIMES; j++)
-    {
-        if(test_contours)
-        {
-            Mat src = readImage( "cv/shared/pic3.png", IMREAD_GRAYSCALE );
-            ASSERT_FALSE(src.empty());
-            Mat canny_output;
-            vector<vector<Point> > contours;
-            vector<Vec4i> hierarchy;
-            Canny( src, canny_output, 100, 200, 3 );
-            findContours( canny_output, contours, hierarchy, RETR_TREE, CHAIN_APPROX_SIMPLE, Point(0, 0) );
-            for( size_t i = 0; i < contours.size(); i++ )
-            {
-                Moments m = moments( contours[i], false );
-                Moments dm = ocl::ocl_moments( contours[i]);
-                Compare(m, dm);
-            }
-        }
-        cv::Moments CvMom = cv::moments(mat, binaryImage);
-        cv::Moments oclMom = cv::ocl::ocl_moments(src_d, binaryImage);
-
-        Compare(CvMom, oclMom);
-    }
-}
-INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MomentsTest, Combine(
-    Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1, CV_64FC1), Values(false, true), Values(false, true)));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_objdetect.cpp b/modules/ocl/test/test_objdetect.cpp
deleted file mode 100644
index 119caa9ab..000000000
--- a/modules/ocl/test/test_objdetect.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//		Yao Wang, bitwangyaoyao@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-#include "opencv2/objdetect.hpp"
-
-using namespace cv;
-using namespace testing;
-
-///////////////////// HOG /////////////////////////////
-PARAM_TEST_CASE(HOG, Size, int)
-{
-    Size winSize;
-    int type;
-    Mat img_rgb;
-    virtual void SetUp()
-    {
-        winSize = GET_PARAM(0);
-        type = GET_PARAM(1);
-        img_rgb = readImage("gpu/hog/road.png");
-        ASSERT_FALSE(img_rgb.empty());
-    }
-};
-
-OCL_TEST_P(HOG, GetDescriptors)
-{
-    // Convert image
-    Mat img;
-    switch (type)
-    {
-    case CV_8UC1:
-        cvtColor(img_rgb, img, COLOR_BGR2GRAY);
-        break;
-    case CV_8UC4:
-    default:
-        cvtColor(img_rgb, img, COLOR_BGR2BGRA);
-        break;
-    }
-    ocl::oclMat d_img(img);
-
-    // HOGs
-    ocl::HOGDescriptor ocl_hog;
-    ocl_hog.gamma_correction = true;
-    HOGDescriptor hog;
-    hog.gammaCorrection = true;
-
-    // Compute descriptor
-    ocl::oclMat d_descriptors;
-    ocl_hog.getDescriptors(d_img, ocl_hog.win_size, d_descriptors, ocl_hog.DESCR_FORMAT_COL_BY_COL);
-    Mat down_descriptors;
-    d_descriptors.download(down_descriptors);
-    down_descriptors = down_descriptors.reshape(0, down_descriptors.cols * down_descriptors.rows);
-
-    hog.setSVMDetector(hog.getDefaultPeopleDetector());
-    std::vector<float> descriptors;
-    switch (type)
-    {
-    case CV_8UC1:
-        hog.compute(img, descriptors, ocl_hog.win_size);
-        break;
-    case CV_8UC4:
-    default:
-        hog.compute(img_rgb, descriptors, ocl_hog.win_size);
-        break;
-    }
-    Mat cpu_descriptors(descriptors);
-
-    EXPECT_MAT_SIMILAR(down_descriptors, cpu_descriptors, 1e-2);
-}
-
-OCL_TEST_P(HOG, Detect)
-{
-    // Convert image
-    Mat img;
-    switch (type)
-    {
-    case CV_8UC1:
-        cvtColor(img_rgb, img, COLOR_BGR2GRAY);
-        break;
-    case CV_8UC4:
-    default:
-        cvtColor(img_rgb, img, COLOR_BGR2BGRA);
-        break;
-    }
-    ocl::oclMat d_img(img);
-
-    // HOGs
-    if ((winSize != Size(48, 96)) && (winSize != Size(64, 128)))
-        winSize = Size(64, 128);
-    ocl::HOGDescriptor ocl_hog(winSize);
-    ocl_hog.gamma_correction = true;
-
-    HOGDescriptor hog;
-    hog.winSize = winSize;
-    hog.gammaCorrection = true;
-
-    if (winSize.width == 48 && winSize.height == 96)
-    {
-        // daimler's base
-        ocl_hog.setSVMDetector(hog.getDaimlerPeopleDetector());
-        hog.setSVMDetector(hog.getDaimlerPeopleDetector());
-    }
-    else if (winSize.width == 64 && winSize.height == 128)
-    {
-        ocl_hog.setSVMDetector(hog.getDefaultPeopleDetector());
-        hog.setSVMDetector(hog.getDefaultPeopleDetector());
-    }
-    else
-    {
-        ocl_hog.setSVMDetector(hog.getDefaultPeopleDetector());
-        hog.setSVMDetector(hog.getDefaultPeopleDetector());
-    }
-
-    // OpenCL detection
-    std::vector<Rect> d_found;
-    ocl_hog.detectMultiScale(d_img, d_found, 0, Size(8, 8), Size(0, 0), 1.05, 6);
-
-    // CPU detection
-    std::vector<Rect> found;
-    switch (type)
-    {
-    case CV_8UC1:
-        hog.detectMultiScale(img, found, 0, Size(8, 8), Size(0, 0), 1.05, 6);
-        break;
-    case CV_8UC4:
-    default:
-        hog.detectMultiScale(img_rgb, found, 0, Size(8, 8), Size(0, 0), 1.05, 6);
-        break;
-    }
-
-    EXPECT_LT(checkRectSimilarity(img.size(), found, d_found), 1.0);
-}
-
-
-INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
-                            testing::Values(Size(64, 128), Size(48, 96)),
-                            testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
-
-
-///////////////////////////// Haar //////////////////////////////
-IMPLEMENT_PARAM_CLASS(CascadeName, std::string);
-CascadeName cascade_frontalface_alt(std::string("haarcascade_frontalface_alt.xml"));
-CascadeName cascade_frontalface_alt2(std::string("haarcascade_frontalface_alt2.xml"));
-
-PARAM_TEST_CASE(Haar, int, CascadeName)
-{
-    ocl::OclCascadeClassifier cascade, nestedCascade;
-    CascadeClassifier cpucascade, cpunestedCascade;
-
-    int flags;
-    std::string cascadeName;
-    std::vector<Rect> faces, oclfaces;
-    Mat img;
-    ocl::oclMat d_img;
-
-    virtual void SetUp()
-    {
-        flags = GET_PARAM(0);
-        cascadeName = (std::string(cvtest::TS::ptr()->get_data_path()) + "cv/cascadeandhog/cascades/").append(GET_PARAM(1));
-        ASSERT_TRUE(cascade.load( cascadeName ));
-        ASSERT_TRUE(cpucascade.load(cascadeName));
-        img = readImage("cv/shared/lena.png", IMREAD_GRAYSCALE);
-        ASSERT_FALSE(img.empty());
-        equalizeHist(img, img);
-        d_img.upload(img);
-    }
-};
-
-OCL_TEST_P(Haar, FaceDetect)
-{
-    cascade.detectMultiScale(d_img, oclfaces,  1.1, 3,
-                                flags,
-                                Size(30, 30), Size(0, 0));
-
-    cpucascade.detectMultiScale(img, faces,  1.1, 3,
-                                flags,
-                                Size(30, 30), Size(0, 0));
-
-    EXPECT_LT(checkRectSimilarity(img.size(), faces, oclfaces), 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, Haar,
-    Combine(Values((int)CASCADE_SCALE_IMAGE, 0),
-            Values(cascade_frontalface_alt, cascade_frontalface_alt2)));
diff --git a/modules/ocl/test/test_optflow.cpp b/modules/ocl/test/test_optflow.cpp
deleted file mode 100644
index 7296a6b7e..000000000
--- a/modules/ocl/test/test_optflow.cpp
+++ /dev/null
@@ -1,341 +0,0 @@
-///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-#include <iomanip>
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-//////////////////////////////////////////////////////
-// GoodFeaturesToTrack
-namespace
-{
-    IMPLEMENT_PARAM_CLASS(MinDistance, double)
-}
-PARAM_TEST_CASE(GoodFeaturesToTrack, MinDistance)
-{
-    double minDistance;
-
-    virtual void SetUp()
-    {
-        minDistance = GET_PARAM(0);
-    }
-};
-
-OCL_TEST_P(GoodFeaturesToTrack, Accuracy)
-{
-    cv::Mat frame = readImage("gpu/opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame.empty());
-
-    int maxCorners = 1000;
-    double qualityLevel = 0.01;
-
-    cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
-
-    cv::ocl::oclMat d_pts;
-    detector(oclMat(frame), d_pts);
-
-    ASSERT_FALSE(d_pts.empty());
-
-    std::vector<cv::Point2f> pts(d_pts.cols);
-
-    detector.downloadPoints(d_pts, pts);
-
-    std::vector<cv::Point2f> pts_gold;
-    cv::goodFeaturesToTrack(frame, pts_gold, maxCorners, qualityLevel, minDistance);
-
-    ASSERT_EQ(pts_gold.size(), pts.size());
-
-    size_t mistmatch = 0;
-    for (size_t i = 0; i < pts.size(); ++i)
-    {
-        cv::Point2i a = pts_gold[i];
-        cv::Point2i b = pts[i];
-
-        bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
-
-        if (!eq)
-            ++mistmatch;
-    }
-
-    double bad_ratio = static_cast<double>(mistmatch) / pts.size();
-
-    ASSERT_LE(bad_ratio, 0.01);
-}
-
-OCL_TEST_P(GoodFeaturesToTrack, EmptyCorners)
-{
-    int maxCorners = 1000;
-    double qualityLevel = 0.01;
-
-    cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
-
-    cv::ocl::oclMat src(100, 100, CV_8UC1, cv::Scalar::all(0));
-    cv::ocl::oclMat corners(1, maxCorners, CV_32FC2);
-
-    detector(src, corners);
-
-    ASSERT_TRUE(corners.empty());
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_Video, GoodFeaturesToTrack,
-    testing::Values(MinDistance(0.0), MinDistance(3.0)));
-
-//////////////////////////////////////////////////////////////////////////
-PARAM_TEST_CASE(TVL1, bool)
-{
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        useRoi = GET_PARAM(0);
-    }
-
-};
-
-OCL_TEST_P(TVL1, DISABLED_Accuracy) // TODO implementations of TV1 in video module are different in 2.4 and master branches
-{
-    cv::Mat frame0 = readImage("gpu/opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage("gpu/opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    cv::ocl::OpticalFlowDual_TVL1_OCL d_alg;
-    cv::Mat flowx = randomMat(frame0.size(), CV_32FC1, 0, 0, useRoi);
-    cv::Mat flowy = randomMat(frame0.size(), CV_32FC1, 0, 0, useRoi);
-    cv::ocl::oclMat d_flowx(flowx), d_flowy(flowy);
-    d_alg(oclMat(frame0), oclMat(frame1), d_flowx, d_flowy);
-
-    cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
-    cv::Mat flow;
-    alg->calc(frame0, frame1, flow);
-    cv::Mat gold[2];
-    cv::split(flow, gold);
-
-    EXPECT_MAT_SIMILAR(gold[0], d_flowx, 3e-3);
-    EXPECT_MAT_SIMILAR(gold[1], d_flowy, 3e-3);
-}
-INSTANTIATE_TEST_CASE_P(OCL_Video, TVL1, Values(false, true));
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// PyrLKOpticalFlow
-
-PARAM_TEST_CASE(Sparse, bool, bool)
-{
-    bool useGray;
-    bool UseSmart;
-
-    virtual void SetUp()
-    {
-        UseSmart = GET_PARAM(0);
-        useGray = GET_PARAM(1);
-    }
-};
-
-OCL_TEST_P(Sparse, Mat)
-{
-    cv::Mat frame0 = readImage("gpu/opticalflow/rubberwhale1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage("gpu/opticalflow/rubberwhale2.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
-    ASSERT_FALSE(frame1.empty());
-
-    cv::Mat gray_frame;
-    if (useGray)
-        gray_frame = frame0;
-    else
-        cv::cvtColor(frame0, gray_frame, cv::COLOR_BGR2GRAY);
-
-    std::vector<cv::Point2f> pts;
-    cv::goodFeaturesToTrack(gray_frame, pts, 1000, 0.01, 0.0);
-
-    cv::ocl::oclMat d_pts;
-    cv::Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void *)&pts[0]);
-    d_pts.upload(pts_mat);
-
-    cv::ocl::PyrLKOpticalFlow pyrLK;
-
-    cv::ocl::oclMat oclFrame0;
-    cv::ocl::oclMat oclFrame1;
-    cv::ocl::oclMat d_nextPts;
-    cv::ocl::oclMat d_status;
-    cv::ocl::oclMat d_err;
-
-    oclFrame0 = frame0;
-    oclFrame1 = frame1;
-
-    pyrLK.sparse(oclFrame0, oclFrame1, d_pts, d_nextPts, d_status, &d_err);
-
-    std::vector<cv::Point2f> nextPts(d_nextPts.cols);
-    cv::Mat nextPts_mat(1, d_nextPts.cols, CV_32FC2, (void *)&nextPts[0]);
-    d_nextPts.download(nextPts_mat);
-
-    std::vector<unsigned char> status(d_status.cols);
-    cv::Mat status_mat(1, d_status.cols, CV_8UC1, (void *)&status[0]);
-    d_status.download(status_mat);
-
-    std::vector<float> err(d_err.cols);
-    cv::Mat err_mat(1, d_err.cols, CV_32FC1, (void*)&err[0]);
-    d_err.download(err_mat);
-
-    std::vector<cv::Point2f> nextPts_gold;
-    std::vector<unsigned char> status_gold;
-    std::vector<float> err_gold;
-    cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts_gold, status_gold, err_gold);
-
-    ASSERT_EQ(nextPts_gold.size(), nextPts.size());
-    ASSERT_EQ(status_gold.size(), status.size());
-
-    size_t mistmatch = 0;
-    for (size_t i = 0; i < nextPts.size(); ++i)
-    {
-        if (status[i] != status_gold[i])
-        {
-            ++mistmatch;
-            continue;
-        }
-
-        if (status[i])
-        {
-            cv::Point2i a = nextPts[i];
-            cv::Point2i b = nextPts_gold[i];
-
-            bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
-            float errdiff = 0.0f;
-
-            if (!eq || errdiff > 1e-1)
-                ++mistmatch;
-        }
-    }
-
-    double bad_ratio = static_cast<double>(mistmatch) / (nextPts.size());
-
-    ASSERT_LE(bad_ratio, 0.02f);
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_Video, Sparse, Combine(Bool(), Bool()));
-
-//////////////////////////////////////////////////////
-// FarnebackOpticalFlow
-
-namespace
-{
-    IMPLEMENT_PARAM_CLASS(PyrScale, double)
-        IMPLEMENT_PARAM_CLASS(PolyN, int)
-        CV_FLAGS(FarnebackOptFlowFlags, 0, OPTFLOW_FARNEBACK_GAUSSIAN)
-        IMPLEMENT_PARAM_CLASS(UseInitFlow, bool)
-}
-
-PARAM_TEST_CASE(Farneback, PyrScale, PolyN, FarnebackOptFlowFlags, UseInitFlow)
-{
-    double pyrScale;
-    int polyN;
-    int flags;
-    bool useInitFlow;
-
-    virtual void SetUp()
-    {
-        pyrScale = GET_PARAM(0);
-        polyN = GET_PARAM(1);
-        flags = GET_PARAM(2);
-        useInitFlow = GET_PARAM(3);
-    }
-};
-
-OCL_TEST_P(Farneback, Accuracy)
-{
-    cv::Mat frame0 = readImage("gpu/opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage("gpu/opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    double polySigma = polyN <= 5 ? 1.1 : 1.5;
-
-    cv::ocl::FarnebackOpticalFlow farn;
-    farn.pyrScale = pyrScale;
-    farn.polyN = polyN;
-    farn.polySigma = polySigma;
-    farn.flags = flags;
-
-    cv::ocl::oclMat d_flowx, d_flowy;
-    farn(oclMat(frame0), oclMat(frame1), d_flowx, d_flowy);
-
-    cv::Mat flow;
-    if (useInitFlow)
-    {
-        cv::Mat flowxy[] = {cv::Mat(d_flowx), cv::Mat(d_flowy)};
-        cv::merge(flowxy, 2, flow);
-
-        farn.flags |= cv::OPTFLOW_USE_INITIAL_FLOW;
-        farn(oclMat(frame0), oclMat(frame1), d_flowx, d_flowy);
-    }
-
-    cv::calcOpticalFlowFarneback(
-        frame0, frame1, flow, farn.pyrScale, farn.numLevels, farn.winSize,
-        farn.numIters, farn.polyN, farn.polySigma, farn.flags);
-
-    std::vector<cv::Mat> flowxy;
-    cv::split(flow, flowxy);
-
-    EXPECT_MAT_SIMILAR(flowxy[0], d_flowx, 0.1);
-    EXPECT_MAT_SIMILAR(flowxy[1], d_flowy, 0.1);
-}
-
-INSTANTIATE_TEST_CASE_P(OCL_Video, Farneback, testing::Combine(
-    testing::Values(PyrScale(0.3), PyrScale(0.5), PyrScale(0.8)),
-    testing::Values(PolyN(5), PolyN(7)),
-    testing::Values(FarnebackOptFlowFlags(0), FarnebackOptFlowFlags(cv::OPTFLOW_FARNEBACK_GAUSSIAN)),
-    testing::Values(UseInitFlow(false), UseInitFlow(true))));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_sort.cpp b/modules/ocl/test/test_sort.cpp
deleted file mode 100644
index b25914968..000000000
--- a/modules/ocl/test/test_sort.cpp
+++ /dev/null
@@ -1,244 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include <map>
-#include <functional>
-#include "test_precomp.hpp"
-
-using namespace std;
-using namespace cvtest;
-using namespace testing;
-using namespace cv;
-
-
-namespace
-{
-IMPLEMENT_PARAM_CLASS(IsGreaterThan, bool)
-IMPLEMENT_PARAM_CLASS(InputSize, int)
-IMPLEMENT_PARAM_CLASS(SortMethod, int)
-
-
-template<class T>
-struct KV_CVTYPE{ static int toType() {return 0;} };
-
-template<> struct KV_CVTYPE<int>  { static int toType() {return CV_32SC1;} };
-template<> struct KV_CVTYPE<float>{ static int toType() {return CV_32FC1;} };
-template<> struct KV_CVTYPE<Vec2i>{ static int toType() {return CV_32SC2;} };
-template<> struct KV_CVTYPE<Vec2f>{ static int toType() {return CV_32FC2;} };
-
-template<class key_type, class val_type>
-bool kvgreater(pair<key_type, val_type> p1, pair<key_type, val_type> p2)
-{
-    return p1.first > p2.first;
-}
-
-template<class key_type, class val_type>
-bool kvless(pair<key_type, val_type> p1, pair<key_type, val_type> p2)
-{
-    return p1.first < p2.first;
-}
-
-template<class key_type, class val_type>
-void toKVPair(
-    MatConstIterator_<key_type> kit,
-    MatConstIterator_<val_type> vit,
-    int vecSize,
-    vector<pair<key_type, val_type> >& kvres
-    )
-{
-    kvres.clear();
-    for(int i = 0; i < vecSize; i ++)
-    {
-        kvres.push_back(make_pair(*kit, *vit));
-        ++kit;
-        ++vit;
-    }
-}
-
-template<class key_type, class val_type>
-void kvquicksort(Mat& keys, Mat& vals, bool isGreater = false)
-{
-    vector<pair<key_type, val_type> > kvres;
-    toKVPair(keys.begin<key_type>(), vals.begin<val_type>(), keys.cols, kvres);
-
-    if(isGreater)
-    {
-        std::sort(kvres.begin(), kvres.end(), kvgreater<key_type, val_type>);
-    }
-    else
-    {
-        std::sort(kvres.begin(), kvres.end(), kvless<key_type, val_type>);
-    }
-    key_type * kptr = keys.ptr<key_type>();
-    val_type * vptr = vals.ptr<val_type>();
-    for(int i = 0; i < keys.cols; i ++)
-    {
-        kptr[i] = kvres[i].first;
-        vptr[i] = kvres[i].second;
-    }
-}
-
-class SortByKey_STL
-{
-public:
-    static void sort(cv::Mat&, cv::Mat&, bool is_gt);
-private:
-    typedef void (*quick_sorter)(cv::Mat&, cv::Mat&, bool);
-    SortByKey_STL();
-    quick_sorter quick_sorters[CV_64FC4][CV_64FC4];
-    static SortByKey_STL instance;
-};
-
-SortByKey_STL SortByKey_STL::instance = SortByKey_STL();
-
-SortByKey_STL::SortByKey_STL()
-{
-    memset(instance.quick_sorters, 0, sizeof(quick_sorters));
-#define NEW_SORTER(KT, VT) \
-    instance.quick_sorters[KV_CVTYPE<KT>::toType()][KV_CVTYPE<VT>::toType()] = kvquicksort<KT, VT>;
-
-    NEW_SORTER(int, int);
-    NEW_SORTER(int, Vec2i);
-    NEW_SORTER(int, float);
-    NEW_SORTER(int, Vec2f);
-
-    NEW_SORTER(float, int);
-    NEW_SORTER(float, Vec2i);
-    NEW_SORTER(float, float);
-    NEW_SORTER(float, Vec2f);
-#undef NEW_SORTER
-}
-
-void SortByKey_STL::sort(cv::Mat& keys, cv::Mat& vals, bool is_gt)
-{
-    instance.quick_sorters[keys.type()][vals.type()](keys, vals, is_gt);
-}
-
-bool checkUnstableSorterResult(const Mat& gkeys_, const Mat& gvals_,
-                               const Mat& /*dkeys_*/, const Mat& dvals_)
-{
-    int cn_val = gvals_.channels();
-    int count  = gkeys_.cols;
-
-    //for convenience we convert depth to float and channels to 1
-    Mat gkeys, gvals, dkeys, dvals;
-    gkeys_.reshape(1).convertTo(gkeys, CV_32F);
-    gvals_.reshape(1).convertTo(gvals, CV_32F);
-    //dkeys_.reshape(1).convertTo(dkeys, CV_32F);
-    dvals_.reshape(1).convertTo(dvals, CV_32F);
-    float * gkptr = gkeys.ptr<float>();
-    float * gvptr = gvals.ptr<float>();
-    //float * dkptr = dkeys.ptr<float>();
-    float * dvptr = dvals.ptr<float>();
-
-    for(int i = 0; i < count - 1; ++i)
-    {
-        int iden_count = 0;
-        // firstly calculate the number of identical keys
-        while(gkptr[i + iden_count] == gkptr[i + 1 + iden_count])
-        {
-            ++ iden_count;
-        }
-
-        // sort dv and gv
-        int num_of_val = (iden_count + 1) * cn_val;
-        std::sort(gvptr + i * cn_val, gvptr + i * cn_val + num_of_val);
-        std::sort(dvptr + i * cn_val, dvptr + i * cn_val + num_of_val);
-
-        // then check if [i, i + iden_count) is the same
-        for(int j = 0; j < num_of_val; ++j)
-        {
-            if(gvptr[i + j] != dvptr[i + j])
-            {
-                return false;
-            }
-        }
-        i += iden_count;
-    }
-    return true;
-}
-}
-
-#define INPUT_SIZES  Values(InputSize(0x10), InputSize(0x100), InputSize(0x10000)) //2^4, 2^8, 2^16
-#define KEY_TYPES    Values(MatType(CV_32SC1), MatType(CV_32FC1))
-#define VAL_TYPES    Values(MatType(CV_32SC1), MatType(CV_32SC2), MatType(CV_32FC1), MatType(CV_32FC2))
-#define SORT_METHODS Values(SortMethod(cv::ocl::SORT_BITONIC),SortMethod(cv::ocl::SORT_MERGE),SortMethod(cv::ocl::SORT_RADIX)/*,SortMethod(cv::ocl::SORT_SELECTION)*/)
-#define F_OR_T       Values(IsGreaterThan(false), IsGreaterThan(true))
-
-PARAM_TEST_CASE(SortByKey, InputSize, MatType, MatType, SortMethod, IsGreaterThan)
-{
-    InputSize input_size;
-    MatType key_type, val_type;
-    SortMethod method;
-    IsGreaterThan is_gt;
-
-    Mat mat_key, mat_val;
-    virtual void SetUp()
-    {
-        input_size = GET_PARAM(0);
-        key_type   = GET_PARAM(1);
-        val_type   = GET_PARAM(2);
-        method     = GET_PARAM(3);
-        is_gt      = GET_PARAM(4);
-
-        using namespace cv;
-        // fill key and val
-        mat_key = randomMat(Size(input_size, 1), key_type, INT_MIN, INT_MAX);
-        mat_val = randomMat(Size(input_size, 1), val_type, INT_MIN, INT_MAX);
-    }
-};
-
-OCL_TEST_P(SortByKey, Accuracy)
-{
-    using namespace cv;
-    ocl::oclMat oclmat_key(mat_key);
-    ocl::oclMat oclmat_val(mat_val);
-
-    ocl::sortByKey(oclmat_key, oclmat_val, method, is_gt);
-    SortByKey_STL::sort(mat_key, mat_val, is_gt);
-
-    EXPECT_MAT_NEAR(mat_key, oclmat_key, 0.0);
-    EXPECT_TRUE(checkUnstableSorterResult(mat_key, mat_val, oclmat_key, oclmat_val));
-}
-INSTANTIATE_TEST_CASE_P(OCL_SORT, SortByKey, Combine(INPUT_SIZES, KEY_TYPES, VAL_TYPES, SORT_METHODS, F_OR_T));
diff --git a/modules/ocl/test/test_split_merge.cpp b/modules/ocl/test/test_split_merge.cpp
deleted file mode 100644
index b21fedd77..000000000
--- a/modules/ocl/test/test_split_merge.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace cvtest;
-using namespace testing;
-using namespace std;
-
-#define MAX_CHANNELS 4
-
-PARAM_TEST_CASE(MergeTestBase, MatDepth, Channels, bool)
-{
-    int type;
-    int channels;
-    bool use_roi;
-
-    //src mat
-    cv::Mat mat[MAX_CHANNELS];
-    //dst mat
-    cv::Mat dst;
-
-    // set up roi
-    int roicols, roirows;
-    int srcx[MAX_CHANNELS];
-    int srcy[MAX_CHANNELS];
-    int dstx, dsty;
-
-    //src mat with roi
-    cv::Mat mat_roi[MAX_CHANNELS];
-
-    //dst mat with roi
-    cv::Mat dst_roi;
-
-    //ocl dst mat for testing
-    cv::ocl::oclMat gdst_whole;
-
-    //ocl mat with roi
-    cv::ocl::oclMat gmat[MAX_CHANNELS];
-    cv::ocl::oclMat gdst;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        use_roi = GET_PARAM(2);
-
-        cv::Size size(MWIDTH, MHEIGHT);
-
-        for (int i = 0; i < channels; ++i)
-            mat[i] = randomMat(size, CV_MAKETYPE(type, 1), 5, 16, false);
-        dst = randomMat(size, CV_MAKETYPE(type, channels), 5, 16, false);
-    }
-
-    void random_roi()
-    {
-        if (use_roi)
-        {
-            //randomize ROI
-            roicols = rng.uniform(1, mat[0].cols);
-            roirows = rng.uniform(1, mat[0].rows);
-
-            for (int i = 0; i < channels; ++i)
-            {
-                srcx[i] = rng.uniform(0, mat[i].cols - roicols);
-                srcy[i] = rng.uniform(0, mat[i].rows - roirows);
-            }
-
-            dstx = rng.uniform(0, dst.cols  - roicols);
-            dsty = rng.uniform(0, dst.rows  - roirows);
-        }
-        else
-        {
-            roicols = mat[0].cols;
-            roirows = mat[0].rows;
-            for (int i = 0; i < channels; ++i)
-                srcx[i] = srcy[i] = 0;
-
-            dstx = dsty = 0;
-        }
-
-        for (int i = 0; i < channels; ++i)
-            mat_roi[i] = mat[i](Rect(srcx[i], srcy[i], roicols, roirows));
-
-        dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
-
-        gdst_whole = dst;
-        gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
-
-        for (int i = 0; i < channels; ++i)
-            gmat[i] = mat_roi[i];
-    }
-};
-
-struct Merge : MergeTestBase {};
-
-OCL_TEST_P(Merge, Accuracy)
-{
-    for(int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::merge(mat_roi, channels, dst_roi);
-        cv::ocl::merge(gmat, channels, gdst);
-
-        EXPECT_MAT_NEAR(dst, Mat(gdst_whole), 0.0);
-    }
-}
-
-PARAM_TEST_CASE(SplitTestBase, MatType, int, bool)
-{
-    int type;
-    int channels;
-    bool use_roi;
-
-    cv::Mat src, src_roi;
-    cv::Mat dst[MAX_CHANNELS], dst_roi[MAX_CHANNELS];
-
-    cv::ocl::oclMat gsrc_whole, gsrc_roi;
-    cv::ocl::oclMat gdst_whole[MAX_CHANNELS], gdst_roi[MAX_CHANNELS];
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        channels = GET_PARAM(1);
-        use_roi = GET_PARAM(2);
-    }
-
-    void random_roi()
-    {
-        Size roiSize = randomSize(1, MAX_VALUE);
-        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, CV_MAKETYPE(type, channels), 0, 256);
-        generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
-
-        for (int i = 0; i < channels; ++i)
-        {
-            Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
-            randomSubMat(dst[i], dst_roi[i], roiSize, dstBorder, CV_MAKETYPE(type, 1), 5, 16);
-            generateOclMat(gdst_whole[i], gdst_roi[i], dst[i], roiSize, dstBorder);
-        }
-    }
-};
-
-struct Split : SplitTestBase {};
-
-#ifdef ANDROID
-// NOTE: The test fail on Android is the top of the iceberg only
-// The real fail reason is memory access vialation somewhere else
-OCL_TEST_P(Split, DISABLED_Accuracy)
-#else
-OCL_TEST_P(Split, Accuracy)
-#endif
-{
-    for(int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::split(src_roi, dst_roi);
-        cv::ocl::split(gsrc_roi, gdst_roi);
-
-        for (int i = 0; i < channels; ++i)
-        {
-            EXPECT_MAT_NEAR(dst[i], gdst_whole[i], 0.0);
-            EXPECT_MAT_NEAR(dst_roi[i], gdst_roi[i], 0.0);
-        }
-    }
-}
-
-
-INSTANTIATE_TEST_CASE_P(SplitMerge, Merge, Combine(
-                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F), Values(1, 2, 3, 4), Bool()));
-
-
-INSTANTIATE_TEST_CASE_P(SplitMerge, Split , Combine(
-                            Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F), Values(1, 2, 3, 4), Bool()));
-
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/test_warp.cpp b/modules/ocl/test/test_warp.cpp
deleted file mode 100644
index 85f33754e..000000000
--- a/modules/ocl/test/test_warp.cpp
+++ /dev/null
@@ -1,494 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//    Niko Li, newlife20080214@gmail.com
-//    Jia Haipeng, jiahaipeng95@gmail.com
-//    Shengen Yan, yanshengen@gmail.com
-//    Jiang Liyuan, lyuan001.good@163.com
-//    Rock Li, Rock.Li@amd.com
-//    Wu Zailong, bullet@yeah.net
-//    Xu Pang, pangxu010@163.com
-//    Sen Liu, swjtuls1987@126.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-
-#ifdef HAVE_OPENCL
-
-using namespace cv;
-using namespace testing;
-using namespace std;
-
-static MatType noType = -1;
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// warpAffine  & warpPerspective
-
-PARAM_TEST_CASE(WarpTestBase, MatType, Interpolation, bool, bool)
-{
-    int type, interpolation;
-    Size dsize;
-    bool useRoi, mapInverse;
-
-    Mat src, dst_whole, src_roi, dst_roi;
-    ocl::oclMat gsrc_whole, gsrc_roi, gdst_whole, gdst_roi;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        interpolation = GET_PARAM(1);
-        mapInverse = GET_PARAM(2);
-        useRoi = GET_PARAM(3);
-
-        if (mapInverse)
-            interpolation |= WARP_INVERSE_MAP;
-    }
-
-    void random_roi()
-    {
-        dsize = randomSize(1, MAX_VALUE);
-
-        Size roiSize = randomSize(1, MAX_VALUE);
-        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
-
-        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(dst_whole, dst_roi, dsize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
-
-        generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
-        generateOclMat(gdst_whole, gdst_roi, dst_whole, dsize, dstBorder);
-    }
-
-    void Near(double threshold = 0.0)
-    {
-        Mat whole, roi;
-        gdst_whole.download(whole);
-        gdst_roi.download(roi);
-
-        EXPECT_MAT_NEAR(dst_whole, whole, threshold);
-        EXPECT_MAT_NEAR(dst_roi, roi, threshold);
-    }
-};
-
-/////warpAffine
-
-typedef WarpTestBase WarpAffine;
-
-OCL_TEST_P(WarpAffine, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        Mat M = getRotationMatrix2D(Point2f(src_roi.cols / 2.0f, src_roi.rows / 2.0f),
-            rng.uniform(-180.f, 180.f), rng.uniform(0.4f, 2.0f));
-
-        warpAffine(src_roi, dst_roi, M, dsize, interpolation);
-        ocl::warpAffine(gsrc_roi, gdst_roi, M, dsize, interpolation);
-
-        Near(1.0);
-    }
-}
-
-// warpPerspective
-
-typedef WarpTestBase WarpPerspective;
-
-OCL_TEST_P(WarpPerspective, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        float cols = static_cast<float>(src_roi.cols), rows = static_cast<float>(src_roi.rows);
-        float cols2 = cols / 2.0f, rows2 = rows / 2.0f;
-        Point2f sp[] = { Point2f(0.0f, 0.0f), Point2f(cols, 0.0f), Point2f(0.0f, rows), Point2f(cols, rows) };
-        Point2f dp[] = { Point2f(rng.uniform(0.0f, cols2), rng.uniform(0.0f, rows2)),
-            Point2f(rng.uniform(cols2, cols), rng.uniform(0.0f, rows2)),
-            Point2f(rng.uniform(0.0f, cols2), rng.uniform(rows2, rows)),
-            Point2f(rng.uniform(cols2, cols), rng.uniform(rows2, rows)) };
-        Mat M = getPerspectiveTransform(sp, dp);
-
-        warpPerspective(src_roi, dst_roi, M, dsize, interpolation);
-        ocl::warpPerspective(gsrc_roi, gdst_roi, M, dsize, interpolation);
-
-        Near(1.0);
-    }
-}
-
-// buildWarpPerspectiveMaps
-
-PARAM_TEST_CASE(BuildWarpPerspectiveMaps, bool, bool)
-{
-    bool useRoi, mapInverse;
-    Size dsize;
-
-    Mat xmap_whole, ymap_whole, xmap_roi, ymap_roi;
-    ocl::oclMat gxmap_whole, gymap_whole, gxmap_roi, gymap_roi;
-
-    void SetUp()
-    {
-        mapInverse = GET_PARAM(0);
-        useRoi = GET_PARAM(1);
-    }
-
-    void random_roi()
-    {
-        dsize = randomSize(1, MAX_VALUE);
-
-        Border xmapBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(xmap_whole, xmap_roi, dsize, xmapBorder, CV_32FC1, -MAX_VALUE, MAX_VALUE);
-
-        Border ymapBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(ymap_whole, ymap_roi, dsize, ymapBorder, CV_32FC1, -MAX_VALUE, MAX_VALUE);
-
-        generateOclMat(gxmap_whole, gxmap_roi, xmap_whole, dsize, xmapBorder);
-        generateOclMat(gymap_whole, gymap_roi, ymap_whole, dsize, ymapBorder);
-    }
-
-    void Near(double threshold = 0.0)
-    {
-        Mat whole, roi;
-        gxmap_whole.download(whole);
-        gxmap_roi.download(roi);
-
-        EXPECT_MAT_NEAR(xmap_whole, whole, threshold);
-        EXPECT_MAT_NEAR(xmap_roi, roi, threshold);
-    }
-
-    void Near1(double threshold = 0.0)
-    {
-        Mat whole, roi;
-        gymap_whole.download(whole);
-        gymap_roi.download(roi);
-
-        EXPECT_MAT_NEAR(ymap_whole, whole, threshold);
-        EXPECT_MAT_NEAR(ymap_roi, roi, threshold);
-    }
-};
-
-static void buildWarpPerspectiveMaps(const Mat &M, bool inverse, Size dsize, Mat &xmap, Mat &ymap)
-{
-    CV_Assert(M.rows == 3 && M.cols == 3);
-    CV_Assert(dsize.area() > 0);
-
-    xmap.create(dsize, CV_32FC1);
-    ymap.create(dsize, CV_32FC1);
-
-    float coeffs[3 * 3];
-    Mat coeffsMat(3, 3, CV_32F, (void *)coeffs);
-
-    if (inverse)
-        M.convertTo(coeffsMat, coeffsMat.type());
-    else
-    {
-        cv::Mat iM;
-        invert(M, iM);
-        iM.convertTo(coeffsMat, coeffsMat.type());
-    }
-
-    for (int y = 0; y < dsize.height; ++y)
-    {
-        float * const xmap_ptr = xmap.ptr<float>(y);
-        float * const ymap_ptr = ymap.ptr<float>(y);
-
-        for (int x = 0; x < dsize.width; ++x)
-        {
-            float coeff = 1.0f / (x * coeffs[6] + y * coeffs[7] + coeffs[8]);
-            xmap_ptr[x] = (x * coeffs[0] + y * coeffs[1] + coeffs[2]) * coeff;
-            ymap_ptr[x] = (x * coeffs[3] + y * coeffs[4] + coeffs[5]) * coeff;
-        }
-    }
-}
-
-OCL_TEST_P(BuildWarpPerspectiveMaps, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        float cols = static_cast<float>(MAX_VALUE), rows = static_cast<float>(MAX_VALUE);
-        float cols2 = cols / 2.0f, rows2 = rows / 2.0f;
-        Point2f sp[] = { Point2f(0.0f, 0.0f), Point2f(cols, 0.0f), Point2f(0.0f, rows), Point2f(cols, rows) };
-        Point2f dp[] = { Point2f(rng.uniform(0.0f, cols2), rng.uniform(0.0f, rows2)),
-            Point2f(rng.uniform(cols2, cols), rng.uniform(0.0f, rows2)),
-            Point2f(rng.uniform(0.0f, cols2), rng.uniform(rows2, rows)),
-            Point2f(rng.uniform(cols2, cols), rng.uniform(rows2, rows)) };
-        Mat M = getPerspectiveTransform(sp, dp);
-
-        buildWarpPerspectiveMaps(M, mapInverse, dsize, xmap_roi, ymap_roi);
-        ocl::buildWarpPerspectiveMaps(M, mapInverse, dsize, gxmap_roi, gymap_roi);
-
-        Near(5e-3);
-        Near1(5e-3);
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// remap
-
-PARAM_TEST_CASE(Remap, MatDepth, Channels, pair<MatType, MatType>, Border, bool)
-{
-    int srcType, map1Type, map2Type;
-    int borderType;
-    bool useRoi;
-
-    Scalar val;
-
-    Mat src, src_roi;
-    Mat dst, dst_roi;
-    Mat map1, map1_roi;
-    Mat map2, map2_roi;
-
-    // ocl mat with roi
-    ocl::oclMat gsrc, gsrc_roi;
-    ocl::oclMat gdst, gdst_roi;
-    ocl::oclMat gmap1, gmap1_roi;
-    ocl::oclMat gmap2, gmap2_roi;
-
-    virtual void SetUp()
-    {
-        srcType = CV_MAKE_TYPE(GET_PARAM(0), GET_PARAM(1));
-        map1Type = GET_PARAM(2).first;
-        map2Type = GET_PARAM(2).second;
-        borderType = GET_PARAM(3);
-        useRoi = GET_PARAM(4);
-    }
-
-    void random_roi()
-    {
-        val = randomScalar(-MAX_VALUE, MAX_VALUE);
-        Size srcROISize = randomSize(1, MAX_VALUE);
-        Size dstROISize = randomSize(1, MAX_VALUE);
-
-        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, srcROISize, srcBorder, srcType, 5, 256);
-
-        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(dst, dst_roi, dstROISize, dstBorder, srcType, -MAX_VALUE, MAX_VALUE);
-
-        int mapMaxValue = MAX_VALUE << 2;
-        Border map1Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(map1, map1_roi, dstROISize, map1Border, map1Type, -mapMaxValue, mapMaxValue);
-
-        Border map2Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        if (map2Type != noType)
-        {
-            int mapMinValue = -mapMaxValue;
-            if (map2Type == CV_16UC1 || map2Type == CV_16SC1)
-                mapMinValue = 0, mapMaxValue = INTER_TAB_SIZE2;
-            randomSubMat(map2, map2_roi, dstROISize, map2Border, map2Type, mapMinValue, mapMaxValue);
-        }
-
-        generateOclMat(gsrc, gsrc_roi, src, srcROISize, srcBorder);
-        generateOclMat(gdst, gdst_roi, dst, dstROISize, dstBorder);
-        generateOclMat(gmap1, gmap1_roi, map1, dstROISize, map1Border);
-        if (noType != map2Type)
-            generateOclMat(gmap2, gmap2_roi, map2, dstROISize, map2Border);
-    }
-
-    void Near(double threshold = 0.0)
-    {
-        Mat whole, roi;
-        gdst.download(whole);
-        gdst_roi.download(roi);
-
-        EXPECT_MAT_NEAR(dst, whole, threshold);
-        EXPECT_MAT_NEAR(dst_roi, roi, threshold);
-    }
-};
-
-typedef Remap Remap_INTER_NEAREST;
-
-OCL_TEST_P(Remap_INTER_NEAREST, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        remap(src_roi, dst_roi, map1_roi, map2_roi, INTER_NEAREST, borderType, val);
-        ocl::remap(gsrc_roi, gdst_roi, gmap1_roi, gmap2_roi, INTER_NEAREST, borderType, val);
-
-        Near(1.0);
-    }
-}
-
-typedef Remap Remap_INTER_LINEAR;
-
-OCL_TEST_P(Remap_INTER_LINEAR, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::remap(src_roi, dst_roi, map1_roi, map2_roi, INTER_LINEAR, borderType, val);
-        ocl::remap(gsrc_roi, gdst_roi, gmap1_roi, gmap2_roi, INTER_LINEAR, borderType, val);
-
-        Near(2.0);
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// resize
-
-PARAM_TEST_CASE(Resize, MatType, double, double, Interpolation, bool)
-{
-    int type, interpolation;
-    double fx, fy;
-    bool useRoi;
-
-    Mat src, dst_whole, src_roi, dst_roi;
-    ocl::oclMat gsrc_whole, gsrc_roi, gdst_whole, gdst_roi;
-
-    virtual void SetUp()
-    {
-        type = GET_PARAM(0);
-        fx = GET_PARAM(1);
-        fy = GET_PARAM(2);
-        interpolation = GET_PARAM(3);
-        useRoi = GET_PARAM(4);
-    }
-
-    void random_roi()
-    {
-        CV_Assert(fx > 0 && fy > 0);
-
-        Size srcRoiSize = randomSize(1, MAX_VALUE), dstRoiSize;
-        dstRoiSize.width = cvRound(srcRoiSize.width * fx);
-        dstRoiSize.height = cvRound(srcRoiSize.height * fy);
-
-        if (dstRoiSize.area() == 0)
-            return random_roi();
-
-        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(src, src_roi, srcRoiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
-
-        Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
-        randomSubMat(dst_whole, dst_roi, dstRoiSize, dstBorder, type, -MAX_VALUE, MAX_VALUE);
-
-        generateOclMat(gsrc_whole, gsrc_roi, src, srcRoiSize, srcBorder);
-        generateOclMat(gdst_whole, gdst_roi, dst_whole, dstRoiSize, dstBorder);
-    }
-
-    void Near(double threshold = 0.0)
-    {
-        Mat whole, roi;
-        gdst_whole.download(whole);
-        gdst_roi.download(roi);
-
-        EXPECT_MAT_NEAR(dst_whole, whole, threshold);
-        EXPECT_MAT_NEAR(dst_roi, roi, threshold);
-    }
-};
-
-OCL_TEST_P(Resize, Mat)
-{
-    for (int j = 0; j < LOOP_TIMES; j++)
-    {
-        random_roi();
-
-        cv::resize(src_roi, dst_roi, Size(), fx, fy, interpolation);
-        ocl::resize(gsrc_roi, gdst_roi, Size(), fx, fy, interpolation);
-
-        Near(1.0);
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////////////////
-
-INSTANTIATE_TEST_CASE_P(ImgprocWarp, WarpAffine, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR, (Interpolation)INTER_CUBIC),
-                            Bool(),
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(ImgprocWarp, WarpPerspective, Combine(
-                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR, (Interpolation)INTER_CUBIC),
-                            Bool(),
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(ImgprocWarp, BuildWarpPerspectiveMaps, Combine(Bool(), Bool()));
-
-INSTANTIATE_TEST_CASE_P(ImgprocWarp, Remap_INTER_LINEAR, Combine(
-                            Values(CV_8U, CV_16U, CV_16S, CV_32F, CV_64F),
-                            Values(1, 2, 3, 4),
-                            Values(pair<MatType, MatType>((MatType)CV_32FC1, (MatType)CV_32FC1),
-                                   pair<MatType, MatType>((MatType)CV_16SC2, (MatType)CV_16UC1),
-                                   pair<MatType, MatType>((MatType)CV_32FC2, noType)),
-                            Values((Border)BORDER_CONSTANT,
-                                   (Border)BORDER_REPLICATE,
-                                   (Border)BORDER_WRAP,
-                                   (Border)BORDER_REFLECT,
-                                   (Border)BORDER_REFLECT_101),
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(ImgprocWarp, Remap_INTER_NEAREST, Combine(
-                            Values(CV_8U, CV_16U, CV_16S, CV_32F, CV_64F),
-                            Values(1, 2, 3, 4),
-                            Values(pair<MatType, MatType>((MatType)CV_32FC1, (MatType)CV_32FC1),
-                                   pair<MatType, MatType>((MatType)CV_32FC2, noType),
-                                   pair<MatType, MatType>((MatType)CV_16SC2, (MatType)CV_16UC1),
-                                   pair<MatType, MatType>((MatType)CV_16SC2, noType)),
-                            Values((Border)BORDER_CONSTANT,
-                                   (Border)BORDER_REPLICATE,
-                                   (Border)BORDER_WRAP,
-                                   (Border)BORDER_REFLECT,
-                                   (Border)BORDER_REFLECT_101),
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(ImgprocWarpResize, Resize, Combine(
-                            Values((MatType)CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(0.7, 0.4, 2.0),
-                            Values(0.3, 0.6, 2.0),
-                            Values((Interpolation)INTER_NEAREST, (Interpolation)INTER_LINEAR),
-                            Bool()));
-
-INSTANTIATE_TEST_CASE_P(ImgprocWarpResizeArea, Resize, Combine(
-                            Values((MatType)CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
-                            Values(0.7, 0.4, 0.5),
-                            Values(0.3, 0.6, 0.5),
-                            Values((Interpolation)INTER_AREA),
-                            Bool()));
-
-#endif // HAVE_OPENCL
diff --git a/modules/ocl/test/utility.cpp b/modules/ocl/test/utility.cpp
deleted file mode 100644
index 7d43b2adc..000000000
--- a/modules/ocl/test/utility.cpp
+++ /dev/null
@@ -1,328 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "test_precomp.hpp"
-#define VARNAME(A) #A
-using namespace std;
-using namespace cv;
-using namespace cvtest;
-
-namespace cvtest {
-//std::string generateVarList(int first,...)
-//{
-//	vector<std::string> varname;
-//
-//	va_list argp;
-//	string s;
-//	stringstream ss;
-//	va_start(argp,first);
-//	int i=first;
-//	while(i!=-1)
-//	{
-//		ss<<i<<",";
-//		i=va_arg(argp,int);
-//	};
-//	s=ss.str();
-//	va_end(argp);
-//	return s;
-//};
-
-//std::string generateVarList(int& p1,int& p2)
-//{
-//	stringstream ss;
-//	ss<<VARNAME(p1)<<":"<<src1x<<","<<VARNAME(p2)<<":"<<src1y;
-//	return ss.str();
-//};
-
-cv::ocl::oclMat createMat_ocl(cv::RNG& rng, Size size, int type, bool useRoi)
-{
-    Size size0 = size;
-
-    if (useRoi)
-    {
-        size0.width += rng.uniform(5, 15);
-        size0.height += rng.uniform(5, 15);
-    }
-
-    cv::ocl::oclMat d_m(size0, type);
-
-    if (size0 != size)
-        d_m = d_m(Rect((size0.width - size.width) / 2, (size0.height - size.height) / 2, size.width, size.height));
-
-    return d_m;
-}
-
-cv::ocl::oclMat loadMat_ocl(cv::RNG& rng, const Mat& m, bool useRoi)
-{
-    CV_Assert(m.type() == CV_8UC1 || m.type() == CV_8UC3);
-    cv::ocl::oclMat d_m;
-    d_m = createMat_ocl(rng, m.size(), m.type(), useRoi);
-
-    Size ls;
-    Point pt;
-
-    d_m.locateROI(ls, pt);
-
-    Rect roi(pt.x, pt.y, d_m.size().width, d_m.size().height);
-
-    cv::ocl::oclMat m_ocl(m);
-
-    cv::ocl::oclMat d_m_roi(d_m, roi);
-
-    m_ocl.copyTo(d_m);
-    return d_m;
-}
-
-vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end)
-{
-    vector<MatType> v;
-
-    v.reserve((depth_end - depth_start + 1) * (cn_end - cn_start + 1));
-
-    for (int depth = depth_start; depth <= depth_end; ++depth)
-    {
-        for (int cn = cn_start; cn <= cn_end; ++cn)
-        {
-            v.push_back(CV_MAKETYPE(depth, cn));
-        }
-    }
-
-    return v;
-}
-
-const vector<MatType> &all_types()
-{
-    static vector<MatType> v = types(CV_8U, CV_64F, 1, 4);
-
-    return v;
-}
-
-Mat readImage(const string &fileName, int flags)
-{
-    return imread(string(cvtest::TS::ptr()->get_data_path()) + fileName, flags);
-}
-
-Mat readImageType(const string &fname, int type)
-{
-    Mat src = readImage(fname, CV_MAT_CN(type) == 1 ? IMREAD_GRAYSCALE : IMREAD_COLOR);
-    if (CV_MAT_CN(type) == 4)
-    {
-        Mat temp;
-        cvtColor(src, temp, cv::COLOR_BGR2BGRA);
-        swap(src, temp);
-    }
-    src.convertTo(src, CV_MAT_DEPTH(type));
-    return src;
-}
-
-double checkNorm(const Mat &m)
-{
-    return norm(m, NORM_INF);
-}
-
-double checkNorm(const Mat &m1, const Mat &m2)
-{
-    return norm(m1, m2, NORM_INF);
-}
-
-double checkSimilarity(const Mat &m1, const Mat &m2)
-{
-    Mat diff;
-    matchTemplate(m1, m2, diff, TM_CCORR_NORMED);
-    return std::abs(diff.at<float>(0, 0) - 1.f);
-}
-
-/*
-void cv::ocl::PrintTo(const DeviceInfo& info, ostream* os)
-{
-    (*os) << info.name();
-}
-*/
-
-void PrintTo(const Inverse &inverse, std::ostream *os)
-{
-    if (inverse)
-        (*os) << "inverse";
-    else
-        (*os) << "direct";
-}
-
-double checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vector<Rect>& ob2)
-{
-    double final_test_result = 0.0;
-    size_t sz1 = ob1.size();
-    size_t sz2 = ob2.size();
-
-    if(sz1 != sz2)
-    {
-        return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
-    }
-    else
-    {
-        if(sz1==0 && sz2==0)
-            return 0;
-        cv::Mat cpu_result(sz, CV_8UC1);
-        cpu_result.setTo(0);
-
-        for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
-        {
-            cv::Mat cpu_result_roi(cpu_result, *r);
-            cpu_result_roi.setTo(1);
-            cpu_result.copyTo(cpu_result);
-        }
-        int cpu_area = cv::countNonZero(cpu_result > 0);
-
-        cv::Mat gpu_result(sz, CV_8UC1);
-        gpu_result.setTo(0);
-        for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
-        {
-            cv::Mat gpu_result_roi(gpu_result, *r2);
-            gpu_result_roi.setTo(1);
-            gpu_result.copyTo(gpu_result);
-        }
-
-        cv::Mat result_;
-        multiply(cpu_result, gpu_result, result_);
-        int result = cv::countNonZero(result_ > 0);
-        if(cpu_area!=0 && result!=0)
-            final_test_result = 1.0 - (double)result/(double)cpu_area;
-        else if(cpu_area==0 && result!=0)
-            final_test_result = -1;
-    }
-    return final_test_result;
-}
-
-void showDiff(const Mat& src, const Mat& gold, const Mat& actual, double eps, bool alwaysShow)
-{
-    Mat diff, diff_thresh;
-    absdiff(gold, actual, diff);
-    diff.convertTo(diff, CV_32F);
-    threshold(diff, diff_thresh, eps, 255.0, cv::THRESH_BINARY);
-
-    if (alwaysShow || cv::countNonZero(diff_thresh.reshape(1)) > 0)
-    {
-#if 0
-        std::cout << "Src: " << std::endl << src << std::endl;
-        std::cout << "Reference: " << std::endl << gold << std::endl;
-        std::cout << "OpenCL: " << std::endl << actual << std::endl;
-#endif
-
-        namedWindow("src", WINDOW_NORMAL);
-        namedWindow("gold", WINDOW_NORMAL);
-        namedWindow("actual", WINDOW_NORMAL);
-        namedWindow("diff", WINDOW_NORMAL);
-
-        imshow("src", src);
-        imshow("gold", gold);
-        imshow("actual", actual);
-        imshow("diff", diff);
-
-        waitKey();
-    }
-}
-
-namespace
-{
-    bool keyPointsEquals(const cv::KeyPoint& p1, const cv::KeyPoint& p2)
-    {
-        const double maxPtDif = 1.0;
-        const double maxSizeDif = 1.0;
-        const double maxAngleDif = 2.0;
-        const double maxResponseDif = 0.1;
-
-        double dist = cv::norm(p1.pt - p2.pt);
-
-        if (dist < maxPtDif &&
-            fabs(p1.size - p2.size) < maxSizeDif &&
-            abs(p1.angle - p2.angle) < maxAngleDif &&
-            abs(p1.response - p2.response) < maxResponseDif &&
-            p1.octave == p2.octave &&
-            p1.class_id == p2.class_id)
-        {
-            return true;
-        }
-
-        return false;
-    }
-
-    struct KeyPointLess : std::binary_function<cv::KeyPoint, cv::KeyPoint, bool>
-    {
-        bool operator()(const cv::KeyPoint& kp1, const cv::KeyPoint& kp2) const
-        {
-            return kp1.pt.y < kp2.pt.y || (kp1.pt.y == kp2.pt.y && kp1.pt.x < kp2.pt.x);
-        }
-    };
-}
-
-testing::AssertionResult assertKeyPointsEquals(const char* gold_expr, const char* actual_expr, std::vector<cv::KeyPoint>& gold, std::vector<cv::KeyPoint>& actual)
-{
-    if (gold.size() != actual.size())
-    {
-        return testing::AssertionFailure() << "KeyPoints size mistmach\n"
-                                           << "\"" << gold_expr << "\" : " << gold.size() << "\n"
-                                           << "\"" << actual_expr << "\" : " << actual.size();
-    }
-
-    std::sort(actual.begin(), actual.end(), KeyPointLess());
-    std::sort(gold.begin(), gold.end(), KeyPointLess());
-
-    for (size_t i = 0; i < gold.size(); ++i)
-    {
-        const cv::KeyPoint& p1 = gold[i];
-        const cv::KeyPoint& p2 = actual[i];
-
-        if (!keyPointsEquals(p1, p2))
-        {
-            return testing::AssertionFailure() << "KeyPoints differ at " << i << "\n"
-                                               << "\"" << gold_expr << "\" vs \"" << actual_expr << "\" : \n"
-                                               << "pt : " << testing::PrintToString(p1.pt) << " vs " << testing::PrintToString(p2.pt) << "\n"
-                                               << "size : " << p1.size << " vs " << p2.size << "\n"
-                                               << "angle : " << p1.angle << " vs " << p2.angle << "\n"
-                                               << "response : " << p1.response << " vs " << p2.response << "\n"
-                                               << "octave : " << p1.octave << " vs " << p2.octave << "\n"
-                                               << "class_id : " << p1.class_id << " vs " << p2.class_id;
-        }
-    }
-
-    return ::testing::AssertionSuccess();
-}
-
-} // namespace cvtest
diff --git a/modules/ocl/test/utility.hpp b/modules/ocl/test/utility.hpp
deleted file mode 100644
index ab1a52b7f..000000000
--- a/modules/ocl/test/utility.hpp
+++ /dev/null
@@ -1,326 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_TEST_UTILITY_HPP__
-#define __OPENCV_TEST_UTILITY_HPP__
-#include "opencv2/core.hpp"
-
-
-extern int LOOP_TIMES;
-
-#define MWIDTH 256
-#define MHEIGHT 256
-
-#define MIN_VALUE 171
-#define MAX_VALUE 357
-
-namespace cvtest {
-
-testing::AssertionResult assertKeyPointsEquals(const char* gold_expr, const char* actual_expr, std::vector<cv::KeyPoint>& gold, std::vector<cv::KeyPoint>& actual);
-#define ASSERT_KEYPOINTS_EQ(gold, actual) EXPECT_PRED_FORMAT2(assertKeyPointsEquals, gold, actual)
-
-void showDiff(const Mat& src, const Mat& gold, const Mat& actual, double eps, bool alwaysShow = false);
-
-cv::ocl::oclMat createMat_ocl(cv::RNG& rng, Size size, int type, bool useRoi);
-cv::ocl::oclMat loadMat_ocl(cv::RNG& rng, const Mat& m, bool useRoi);
-
-// This function test if gpu_rst matches cpu_rst.
-// If the two vectors are not equal, it will return the difference in vector size
-// Else it will return (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels)
-// The smaller, the better matched
-double checkRectSimilarity(cv::Size sz, std::vector<cv::Rect>& ob1, std::vector<cv::Rect>& ob2);
-
-
-//! read image from testdata folder.
-cv::Mat readImage(const std::string &fileName, int flags = cv::IMREAD_COLOR);
-cv::Mat readImageType(const std::string &fname, int type);
-
-double checkNorm(const cv::Mat &m);
-double checkNorm(const cv::Mat &m1, const cv::Mat &m2);
-double checkSimilarity(const cv::Mat &m1, const cv::Mat &m2);
-
-inline double checkNormRelative(const Mat &m1, const Mat &m2)
-{
-    return cv::norm(m1, m2, cv::NORM_INF) /
-            std::max((double)std::numeric_limits<float>::epsilon(),
-                     (double)std::max(cv::norm(m1, cv::NORM_INF), norm(m2, cv::NORM_INF)));
-}
-
-#define EXPECT_MAT_NORM(mat, eps) \
-{ \
-    EXPECT_LE(checkNorm(cv::Mat(mat)), eps) \
-}
-
-#define EXPECT_MAT_NEAR(mat1, mat2, eps) \
-{ \
-   ASSERT_EQ(mat1.type(), mat2.type()); \
-   ASSERT_EQ(mat1.size(), mat2.size()); \
-   EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps) \
-       << cv::format("Size: %d x %d", mat1.cols, mat1.rows) << std::endl; \
-}
-
-#define EXPECT_MAT_NEAR_RELATIVE(mat1, mat2, eps) \
-{ \
-   ASSERT_EQ(mat1.type(), mat2.type()); \
-   ASSERT_EQ(mat1.size(), mat2.size()); \
-   EXPECT_LE(checkNormRelative(cv::Mat(mat1), cv::Mat(mat2)), eps) \
-       << cv::format("Size: %d x %d", mat1.cols, mat1.rows) << std::endl; \
-}
-
-#define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \
-{ \
-    ASSERT_EQ(mat1.type(), mat2.type()); \
-    ASSERT_EQ(mat1.size(), mat2.size()); \
-    EXPECT_LE(checkSimilarity(cv::Mat(mat1), cv::Mat(mat2)), eps); \
-}
-
-
-using perf::MatDepth;
-using perf::MatType;
-
-//! return vector with types from specified range.
-std::vector<MatType> types(int depth_start, int depth_end, int cn_start, int cn_end);
-
-//! return vector with all types (depth: CV_8U-CV_64F, channels: 1-4).
-const std::vector<MatType> &all_types();
-
-class Inverse
-{
-public:
-    inline Inverse(bool val = false) : val_(val) {}
-
-    inline operator bool() const
-    {
-        return val_;
-    }
-
-private:
-    bool val_;
-};
-
-void PrintTo(const Inverse &useRoi, std::ostream *os);
-
-#define OCL_RNG_SEED 123456
-
-template <typename T>
-struct TSTestWithParam : public ::testing::TestWithParam<T>
-{
-    cv::RNG rng;
-
-    TSTestWithParam()
-    {
-        rng = cv::RNG(OCL_RNG_SEED);
-    }
-
-    int randomInt(int minVal, int maxVal)
-    {
-        return rng.uniform(minVal, maxVal);
-    }
-
-    double randomDouble(double minVal, double maxVal)
-    {
-        return rng.uniform(minVal, maxVal);
-    }
-
-    double randomDoubleLog(double minVal, double maxVal)
-    {
-        double logMin = log((double)minVal + 1);
-        double logMax = log((double)maxVal + 1);
-        double pow = rng.uniform(logMin, logMax);
-        double v = exp(pow) - 1;
-        CV_Assert(v >= minVal && (v < maxVal || (v == minVal && v == maxVal)));
-        return v;
-    }
-
-    Size randomSize(int minVal, int maxVal)
-    {
-#if 1
-        return cv::Size((int)randomDoubleLog(minVal, maxVal), (int)randomDoubleLog(minVal, maxVal));
-#else
-        return cv::Size(randomInt(minVal, maxVal), randomInt(minVal, maxVal));
-#endif
-    }
-
-    Size randomSize(int minValX, int maxValX, int minValY, int maxValY)
-    {
-#if 1
-        return cv::Size(randomDoubleLog(minValX, maxValX), randomDoubleLog(minValY, maxValY));
-#else
-        return cv::Size(randomInt(minVal, maxVal), randomInt(minVal, maxVal));
-#endif
-    }
-
-    Scalar randomScalar(double minVal, double maxVal)
-    {
-        return Scalar(randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal), randomDouble(minVal, maxVal));
-    }
-
-    Mat randomMat(Size size, int type, double minVal, double maxVal, bool useRoi = false)
-    {
-        RNG dataRng(rng.next());
-        return cvtest::randomMat(dataRng, size, type, minVal, maxVal, useRoi);
-    }
-
-    struct Border
-    {
-        int top, bot, lef, rig;
-    };
-
-    Border randomBorder(int minValue = 0, int maxValue = MAX_VALUE)
-    {
-        Border border = {
-                (int)randomDoubleLog(minValue, maxValue),
-                (int)randomDoubleLog(minValue, maxValue),
-                (int)randomDoubleLog(minValue, maxValue),
-                (int)randomDoubleLog(minValue, maxValue)
-        };
-        return border;
-    }
-
-    void randomSubMat(Mat& whole, Mat& subMat, const Size& roiSize, const Border& border, int type, double minVal, double maxVal)
-    {
-        Size wholeSize = Size(roiSize.width + border.lef + border.rig, roiSize.height + border.top + border.bot);
-        whole = randomMat(wholeSize, type, minVal, maxVal, false);
-        subMat = whole(Rect(border.lef, border.top, roiSize.width, roiSize.height));
-    }
-
-    void generateOclMat(cv::ocl::oclMat& whole, cv::ocl::oclMat& subMat, const Mat& wholeMat, const Size& roiSize, const Border& border)
-    {
-        whole = wholeMat;
-        subMat = whole(Rect(border.lef, border.top, roiSize.width, roiSize.height));
-    }
-};
-
-#define PARAM_TEST_CASE(name, ...) struct name : public TSTestWithParam< std::tr1::tuple< __VA_ARGS__ > >
-
-#define GET_PARAM(k) std::tr1::get< k >(GetParam())
-
-#define ALL_TYPES testing::ValuesIn(all_types())
-#define TYPES(depth_start, depth_end, cn_start, cn_end) testing::ValuesIn(types(depth_start, depth_end, cn_start, cn_end))
-
-#define DIFFERENT_SIZES testing::Values(cv::Size(128, 128), cv::Size(113, 113), cv::Size(1300, 1300))
-
-#define IMAGE_CHANNELS testing::Values(Channels(1), Channels(3), Channels(4))
-#ifndef IMPLEMENT_PARAM_CLASS
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-    class name \
-    { \
-    public: \
-        name ( type arg = type ()) : val_(arg) {} \
-        operator type () const {return val_;} \
-    private: \
-        type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-        *os << #name <<  "(" << testing::PrintToString(static_cast< type >(param)) << ")"; \
-    }
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-#endif // IMPLEMENT_PARAM_CLASS
-
-} // namespace cvtest
-
-enum {FLIP_BOTH = 0, FLIP_X = 1, FLIP_Y = -1};
-CV_ENUM(FlipCode, FLIP_BOTH, FLIP_X, FLIP_Y)
-
-CV_ENUM(CmpCode, CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE)
-CV_ENUM(NormCode, NORM_INF, NORM_L1, NORM_L2, NORM_TYPE_MASK, NORM_RELATIVE, NORM_MINMAX)
-CV_ENUM(ReduceOp, REDUCE_SUM, REDUCE_AVG, REDUCE_MAX, REDUCE_MIN)
-CV_ENUM(MorphOp, MORPH_OPEN, MORPH_CLOSE, MORPH_GRADIENT, MORPH_TOPHAT, MORPH_BLACKHAT)
-CV_ENUM(ThreshOp, THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV)
-CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA)
-CV_ENUM(Border, BORDER_REFLECT101, BORDER_REPLICATE, BORDER_CONSTANT, BORDER_REFLECT, BORDER_WRAP)
-CV_ENUM(TemplateMethod, TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED)
-
-CV_FLAGS(GemmFlags, GEMM_1_T, GEMM_2_T, GEMM_3_T);
-CV_FLAGS(WarpFlags, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, WARP_INVERSE_MAP)
-CV_FLAGS(DftFlags, DFT_INVERSE, DFT_SCALE, DFT_ROWS, DFT_COMPLEX_OUTPUT, DFT_REAL_OUTPUT)
-
-# define OCL_TEST_P(test_case_name, test_name) \
-    class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : \
-        public test_case_name { \
-    public: \
-        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() { } \
-        virtual void TestBody(); \
-        void OCLTestBody(); \
-    private: \
-        static int AddToRegistry() \
-        { \
-            ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
-              GetTestCasePatternHolder<test_case_name>(\
-                  #test_case_name, __FILE__, __LINE__)->AddTestPattern(\
-                      #test_case_name, \
-                      #test_name, \
-                      new ::testing::internal::TestMetaFactory< \
-                          GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \
-            return 0; \
-        } \
-    \
-        static int gtest_registering_dummy_; \
-        GTEST_DISALLOW_COPY_AND_ASSIGN_(\
-            GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
-    }; \
-    \
-    int GTEST_TEST_CLASS_NAME_(test_case_name, \
-                             test_name)::gtest_registering_dummy_ = \
-      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
-    \
-    void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() \
-    { \
-        try \
-        { \
-            OCLTestBody(); \
-        } \
-        catch (const cv::Exception & ex) \
-        { \
-            if (ex.code == cv::Error::OpenCLDoubleNotSupported)\
-                std::cout << "Test skipped (selected device does not support double)" << std::endl; \
-            else if (ex.code == cv::Error::OpenCLNoAMDBlasFft) \
-                std::cout << "Test skipped (AMD Blas / Fft libraries are not available)" << std::endl; \
-            else \
-                throw; \
-        } \
-    } \
-    \
-    void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::OCLTestBody()
-
-#endif // __OPENCV_TEST_UTILITY_HPP__
diff --git a/modules/photo/perf/opencl/perf_denoising.cpp b/modules/photo/perf/opencl/perf_denoising.cpp
new file mode 100644
index 000000000..0bdf08363
--- /dev/null
+++ b/modules/photo/perf/opencl/perf_denoising.cpp
@@ -0,0 +1,97 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+OCL_PERF_TEST(Photo, DenoisingGrayscale)
+{
+    Mat _original = imread(getDataPath("cv/denoising/lena_noised_gaussian_sigma=10.png"), IMREAD_GRAYSCALE);
+    ASSERT_FALSE(_original.empty()) << "Could not load input image";
+
+    UMat result(_original.size(), _original.type()), original;
+    _original.copyTo(original);
+
+    declare.in(original).out(result).iterations(10);
+
+    OCL_TEST_CYCLE()
+            cv::fastNlMeansDenoising(original, result, 10);
+
+    SANITY_CHECK(result);
+}
+
+OCL_PERF_TEST(Photo, DenoisingColored)
+{
+    Mat _original = imread(getDataPath("cv/denoising/lena_noised_gaussian_sigma=10.png"));
+    ASSERT_FALSE(_original.empty()) << "Could not load input image";
+
+    UMat result(_original.size(), _original.type()), original;
+    _original.copyTo(original);
+
+    declare.in(original).out(result).iterations(10);
+
+    OCL_TEST_CYCLE()
+            cv::fastNlMeansDenoisingColored(original, result, 10, 10);
+
+    SANITY_CHECK(result);
+}
+
+OCL_PERF_TEST(Photo, DenoisingGrayscaleMulti)
+{
+    const int imgs_count = 3;
+
+    vector<UMat> original(imgs_count);
+    Mat tmp;
+    for (int i = 0; i < imgs_count; i++)
+    {
+        string original_path = format("cv/denoising/lena_noised_gaussian_sigma=20_multi_%d.png", i);
+        tmp = imread(getDataPath(original_path), IMREAD_GRAYSCALE);
+        ASSERT_FALSE(tmp.empty()) << "Could not load input image " << original_path;
+        tmp.copyTo(original[i]);
+        declare.in(original[i]);
+    }
+    UMat result(tmp.size(), tmp.type());
+    declare.out(result).iterations(10);
+
+    OCL_TEST_CYCLE()
+            cv::fastNlMeansDenoisingMulti(original, result, imgs_count / 2, imgs_count, 15);
+
+    SANITY_CHECK(result);
+}
+
+OCL_PERF_TEST(Photo, DenoisingColoredMulti)
+{
+    const int imgs_count = 3;
+
+    vector<UMat> original(imgs_count);
+    Mat tmp;
+    for (int i = 0; i < imgs_count; i++)
+    {
+        string original_path = format("cv/denoising/lena_noised_gaussian_sigma=20_multi_%d.png", i);
+        tmp = imread(getDataPath(original_path), IMREAD_COLOR);
+        ASSERT_FALSE(tmp.empty()) << "Could not load input image " << original_path;
+
+        tmp.copyTo(original[i]);
+        declare.in(original[i]);
+    }
+    UMat result(tmp.size(), tmp.type());
+    declare.out(result).iterations(10);
+
+    OCL_TEST_CYCLE()
+            cv::fastNlMeansDenoisingColoredMulti(original, result, imgs_count / 2, imgs_count, 10, 15);
+
+    SANITY_CHECK(result);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/photo/src/cuda/nlm.cu b/modules/photo/src/cuda/nlm.cu
index 371dab592..2c3623961 100644
--- a/modules/photo/src/cuda/nlm.cu
+++ b/modules/photo/src/cuda/nlm.cu
@@ -264,7 +264,7 @@ namespace cv { namespace cuda { namespace device
         __device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
         __device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
 
-        template <class T> struct FastNonLocalMenas
+        template <class T> struct FastNonLocalMeans
         {
             enum
             {
@@ -288,7 +288,7 @@ namespace cv { namespace cuda { namespace device
             int block_window;
             float minus_h2_inv;
 
-            FastNonLocalMenas(int search_window_, int block_window_, float h) : search_radius(search_window_/2), block_radius(block_window_/2),
+            FastNonLocalMeans(int search_window_, int block_window_, float h) : search_radius(search_window_/2), block_radius(block_window_/2),
                 search_window(search_window_), block_window(block_window_), minus_h2_inv(-1.f/(h * h * VecTraits<T>::cn)) {}
 
             PtrStep<T> src;
@@ -392,7 +392,7 @@ namespace cv { namespace cuda { namespace device
                 }
             }
 
-            __device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums, T& dst) const
+            __device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, T& dst) const
             {
                 typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_type;
 
@@ -469,18 +469,18 @@ namespace cv { namespace cuda { namespace device
 
                         __syncthreads();
 
-                        convolve_window(i, j, dist_sums, col_sums, up_col_sums, dst(i, j));
+                        convolve_window(i, j, dist_sums, dst(i, j));
                     }
             }
 
         };
 
         template<typename T>
-        __global__ void fast_nlm_kernel(const FastNonLocalMenas<T> fnlm, PtrStepSz<T> dst) { fnlm(dst); }
+        __global__ void fast_nlm_kernel(const FastNonLocalMeans<T> fnlm, PtrStepSz<T> dst) { fnlm(dst); }
 
         void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
         {
-            typedef FastNonLocalMenas<uchar> FNLM;
+            typedef FastNonLocalMeans<uchar> FNLM;
             dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
 
             buffer_cols = search_window * search_window * grid.y;
@@ -491,7 +491,7 @@ namespace cv { namespace cuda { namespace device
         void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,
                           int search_window, int block_window, float h, cudaStream_t stream)
         {
-            typedef FastNonLocalMenas<T> FNLM;
+            typedef FastNonLocalMeans<T> FNLM;
             FNLM fnlm(search_window, block_window, h);
 
             fnlm.src = (PtrStepSz<T>)src;
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 166d7f892..a5532041c 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -133,7 +133,7 @@ static void fastNlMeansDenoisingMultiCheckPreconditions(
     {
         CV_Error(Error::StsBadArg,
             "imgToDenoiseIndex and temporalWindowSize "
-            "should be choosen corresponding srcImgs size!");
+            "should be chosen corresponding srcImgs size!");
     }
 
     for (int i = 1; i < src_imgs_size; i++) {
diff --git a/modules/photo/src/hdr_common.cpp b/modules/photo/src/hdr_common.cpp
index 6b67a58e0..9a2d720e4 100644
--- a/modules/photo/src/hdr_common.cpp
+++ b/modules/photo/src/hdr_common.cpp
@@ -102,4 +102,4 @@ Mat linearResponse(int channels)
     return response;
 }
 
-};
+}
diff --git a/modules/photo/src/hdr_common.hpp b/modules/photo/src/hdr_common.hpp
index b00227f90..26fb8e419 100644
--- a/modules/photo/src/hdr_common.hpp
+++ b/modules/photo/src/hdr_common.hpp
@@ -57,6 +57,6 @@ void mapLuminance(Mat src, Mat dst, Mat lum, Mat new_lum, float saturation);
 Mat RobertsonWeights();
 
 Mat linearResponse(int channels);
-};
+}
 
 #endif
diff --git a/modules/photo/src/seamless_cloning.hpp b/modules/photo/src/seamless_cloning.hpp
index 73d719c1b..143d55089 100644
--- a/modules/photo/src/seamless_cloning.hpp
+++ b/modules/photo/src/seamless_cloning.hpp
@@ -79,61 +79,34 @@ class Cloning
 
 void Cloning::getGradientx( const Mat &img, Mat &gx)
 {
-    int w = img.size().width;
-    int h = img.size().height;
-    int channel = img.channels();
-    for(int i=0;i<h;i++)
-        for(int j=0;j<w;j++)
-            for(int c=0;c<channel;++c)
-            {
-                gx.at<float>(i,j*channel+c) =
-                    (float)img.at<uchar>(i,(j+1)*channel+c) - (float)img.at<uchar>(i,j*channel+c);
-            }
-
+    Mat kernel = Mat::zeros(1, 3, CV_8S);
+    kernel.at<char>(0,2) = 1;
+    kernel.at<char>(0,1) = -1;
+    filter2D(img, gx, CV_32F, kernel);
 }
 
 void Cloning::getGradienty( const Mat &img, Mat &gy)
 {
-    int w = img.size().width;
-    int h = img.size().height;
-    int channel = img.channels();
-    for(int i=0;i<h;i++)
-        for(int j=0;j<w;j++)
-            for(int c=0;c<channel;++c)
-            {
-                gy.at<float>(i,j*channel+c) =
-                    (float)img.at<uchar>((i+1),j*channel+c) - (float)img.at<uchar>(i,j*channel+c);
-
-            }
+    Mat kernel = Mat::zeros(3, 1, CV_8S);
+    kernel.at<char>(2,0) = 1;
+    kernel.at<char>(1,0) = -1;
+    filter2D(img, gy, CV_32F, kernel);
 }
 
 void Cloning::lapx( const Mat &img, Mat &gxx)
 {
-    int w = img.size().width;
-    int h = img.size().height;
-    int channel = img.channels();
-    for(int i=0;i<h;i++)
-        for(int j=0;j<w-1;j++)
-            for(int c=0;c<channel;++c)
-            {
-                gxx.at<float>(i,(j+1)*channel+c) =
-                    (float)img.at<float>(i,(j+1)*channel+c) - (float)img.at<float>(i,j*channel+c);
-            }
+    Mat kernel = Mat::zeros(1, 3, CV_8S);
+    kernel.at<char>(0,0) = -1;
+    kernel.at<char>(0,1) = 1;
+    filter2D(img, gxx, CV_32F, kernel);
 }
 
 void Cloning::lapy( const Mat &img, Mat &gyy)
 {
-    int w = img.size().width;
-    int h = img.size().height;
-    int channel = img.channels();
-    for(int i=0;i<h-1;i++)
-        for(int j=0;j<w;j++)
-            for(int c=0;c<channel;++c)
-            {
-                gyy.at<float>(i+1,j*channel+c) =
-                    (float)img.at<float>((i+1),j*channel+c) - (float)img.at<float>(i,j*channel+c);
-
-            }
+    Mat kernel = Mat::zeros(3, 1, CV_8S);
+    kernel.at<char>(0,0) = -1;
+    kernel.at<char>(1,0) = 1;
+    filter2D(img, gyy, CV_32F, kernel);
 }
 
 void Cloning::dst(double *mod_diff, double *sineTransform,int h,int w)
diff --git a/modules/photo/test/test_denoising.cuda.cpp b/modules/photo/test/test_denoising.cuda.cpp
index 2051368a0..dce20b9f5 100644
--- a/modules/photo/test/test_denoising.cuda.cpp
+++ b/modules/photo/test/test_denoising.cuda.cpp
@@ -61,6 +61,7 @@ TEST(CUDA_BruteForceNonLocalMeans, Regression)
 
     cv::Mat bgr  = readImage("../gpu/denoising/lena_noised_gaussian_sigma=20_multi_0.png", cv::IMREAD_COLOR);
     ASSERT_FALSE(bgr.empty());
+    cv::resize(bgr, bgr, cv::Size(256, 256));
 
     cv::Mat gray;
     cv::cvtColor(bgr, gray, cv::COLOR_BGR2GRAY);
@@ -77,6 +78,8 @@ TEST(CUDA_BruteForceNonLocalMeans, Regression)
     cv::Mat bgr_gold  = readImage("../gpu/denoising/nlm_denoised_lena_bgr.png", cv::IMREAD_COLOR);
     cv::Mat gray_gold  = readImage("../gpu/denoising/nlm_denoised_lena_gray.png", cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(bgr_gold.empty() || gray_gold.empty());
+    cv::resize(bgr_gold, bgr_gold, cv::Size(256, 256));
+    cv::resize(gray_gold, gray_gold, cv::Size(256, 256));
 
     EXPECT_MAT_NEAR(bgr_gold, dbgr, 1e-4);
     EXPECT_MAT_NEAR(gray_gold, dgray, 1e-4);
diff --git a/modules/python/CMakeLists.txt b/modules/python/CMakeLists.txt
index e352ec419..f9352d24a 100644
--- a/modules/python/CMakeLists.txt
+++ b/modules/python/CMakeLists.txt
@@ -109,14 +109,14 @@ endif()
 if(WIN32)
   set(PYTHON_INSTALL_ARCHIVE "")
 else()
-  set(PYTHON_INSTALL_ARCHIVE ARCHIVE DESTINATION ${PYTHON_PACKAGES_PATH} COMPONENT main)
+  set(PYTHON_INSTALL_ARCHIVE ARCHIVE DESTINATION ${PYTHON_PACKAGES_PATH} COMPONENT python)
 endif()
 
 if(NOT INSTALL_CREATE_DISTRIB)
   install(TARGETS ${the_module}
           ${PYTHON_INSTALL_CONFIGURATIONS}
-          RUNTIME DESTINATION ${PYTHON_PACKAGES_PATH} COMPONENT main
-          LIBRARY DESTINATION ${PYTHON_PACKAGES_PATH} COMPONENT main
+          RUNTIME DESTINATION ${PYTHON_PACKAGES_PATH} COMPONENT python
+          LIBRARY DESTINATION ${PYTHON_PACKAGES_PATH} COMPONENT python
           ${PYTHON_INSTALL_ARCHIVE}
           )
 else()
@@ -127,7 +127,7 @@ else()
   endif()
   install(TARGETS ${the_module}
           CONFIGURATIONS Release
-          RUNTIME DESTINATION python/${__ver}/${OpenCV_ARCH} COMPONENT main
-          LIBRARY DESTINATION python/${__ver}/${OpenCV_ARCH} COMPONENT main
+          RUNTIME DESTINATION python/${__ver}/${OpenCV_ARCH} COMPONENT python
+          LIBRARY DESTINATION python/${__ver}/${OpenCV_ARCH} COMPONENT python
           )
 endif()
diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp
index 734f121a3..dfb1d6898 100644
--- a/modules/python/src2/cv2.cpp
+++ b/modules/python/src2/cv2.cpp
@@ -1,3 +1,8 @@
+#if defined(_MSC_VER) && (_MSC_VER >= 1800)
+// eliminating duplicated round() declaration
+#define HAVE_ROUND
+#endif
+
 #include <Python.h>
 
 #define MODULESTR "cv2"
@@ -184,7 +189,6 @@ public:
     UMatData* allocate(PyObject* o, int dims, const int* sizes, int type, size_t* step) const
     {
         UMatData* u = new UMatData(this);
-        u->refcount = 1;
         u->data = u->origdata = (uchar*)PyArray_DATA((PyArrayObject*) o);
         npy_intp* _strides = PyArray_STRIDES((PyArrayObject*) o);
         for( int i = 0; i < dims - 1; i++ )
@@ -195,13 +199,13 @@ public:
         return u;
     }
 
-    UMatData* allocate(int dims0, const int* sizes, int type, void* data, size_t* step, int flags) const
+    UMatData* allocate(int dims0, const int* sizes, int type, void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
     {
         if( data != 0 )
         {
             CV_Error(Error::StsAssert, "The data should normally be NULL!");
             // probably this is safe to do in such extreme case
-            return stdAllocator->allocate(dims0, sizes, type, data, step, flags);
+            return stdAllocator->allocate(dims0, sizes, type, data, step, flags, usageFlags);
         }
         PyEnsureGIL gil;
 
@@ -224,9 +228,9 @@ public:
         return allocate(o, dims0, sizes, type, step);
     }
 
-    bool allocate(UMatData* u, int accessFlags) const
+    bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
     {
-        return stdAllocator->allocate(u, accessFlags);
+        return stdAllocator->allocate(u, accessFlags, usageFlags);
     }
 
     void deallocate(UMatData* u) const
@@ -411,6 +415,7 @@ static bool pyopencv_to(PyObject* o, Mat& m, const ArgInfo info)
 
     m = Mat(ndims, size, type, PyArray_DATA(oarr), step);
     m.u = g_numpyAllocator.allocate(o, ndims, size, type, step);
+    m.addref();
 
     if( !needcopy )
     {
diff --git a/modules/shape/src/scd_def.hpp b/modules/shape/src/scd_def.hpp
index 1a180fd84..d5bb5e490 100644
--- a/modules/shape/src/scd_def.hpp
+++ b/modules/shape/src/scd_def.hpp
@@ -120,7 +120,6 @@ public:
 
 private:
     float minMatchCost;
-    float betaAdditional;
 protected:
     void buildCostMatrix(const cv::Mat& descriptors1, const cv::Mat& descriptors2,
                                      cv::Mat& costMatrix, cv::Ptr<cv::HistogramCostExtractor>& comparer) const;
diff --git a/modules/softcascade/src/cuda_invoker.hpp b/modules/softcascade/src/cuda_invoker.hpp
index 00b7fe4b9..81229dfb4 100644
--- a/modules/softcascade/src/cuda_invoker.hpp
+++ b/modules/softcascade/src/cuda_invoker.hpp
@@ -110,7 +110,7 @@ struct Detection
 
     Detection(){}
     __device_inline__ Detection(int _x, int _y, uchar _w, uchar _h, float c)
-    : x(static_cast<ushort>(_x)), y(static_cast<ushort>(_y)), w(_w), h(_h), confidence(c), kind(0) {};
+    : x(static_cast<ushort>(_x)), y(static_cast<ushort>(_y)), w(_w), h(_h), confidence(c), kind(0) {}
 };
 
 struct GK107PolicyX4
diff --git a/modules/softcascade/src/integral_channel_builder.cpp b/modules/softcascade/src/integral_channel_builder.cpp
index 08f155e4c..540005b76 100644
--- a/modules/softcascade/src/integral_channel_builder.cpp
+++ b/modules/softcascade/src/integral_channel_builder.cpp
@@ -118,7 +118,7 @@ public:
 using cv::softcascade::ChannelFeatureBuilder;
 using cv::softcascade::ChannelFeature;
 
-CV_INIT_ALGORITHM(HOG6MagLuv,  "ChannelFeatureBuilder.HOG6MagLuv", );
+CV_INIT_ALGORITHM(HOG6MagLuv,  "ChannelFeatureBuilder.HOG6MagLuv", )
 
 ChannelFeatureBuilder::~ChannelFeatureBuilder() {}
 
diff --git a/modules/softcascade/src/octave.cpp b/modules/softcascade/src/octave.cpp
index 5c5aa2eec..96b8c6192 100644
--- a/modules/softcascade/src/octave.cpp
+++ b/modules/softcascade/src/octave.cpp
@@ -445,7 +445,7 @@ void BoostedSoftCascadeOctave::write( CvFileStorage* fs, cv::String _name) const
 
 }
 
-CV_INIT_ALGORITHM(BoostedSoftCascadeOctave, "Octave.BoostedSoftCascadeOctave", );
+CV_INIT_ALGORITHM(BoostedSoftCascadeOctave, "Octave.BoostedSoftCascadeOctave", )
 
 Octave::~Octave(){}
 
diff --git a/modules/softcascade/src/softcascade_init.cpp b/modules/softcascade/src/softcascade_init.cpp
index 6f3c8b6a5..59a33f8c0 100644
--- a/modules/softcascade/src/softcascade_init.cpp
+++ b/modules/softcascade/src/softcascade_init.cpp
@@ -49,12 +49,12 @@ CV_INIT_ALGORITHM(Detector, "SoftCascade.Detector",
                   obj.info()->addParam(obj, "minScale",    obj.minScale);
                   obj.info()->addParam(obj, "maxScale",    obj.maxScale);
                   obj.info()->addParam(obj, "scales",      obj.scales);
-                  obj.info()->addParam(obj, "rejCriteria", obj.rejCriteria));
+                  obj.info()->addParam(obj, "rejCriteria", obj.rejCriteria))
 
 CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade",
                   obj.info()->addParam(obj, "minScale", obj.minScale);
                   obj.info()->addParam(obj, "maxScale", obj.maxScale);
-                  obj.info()->addParam(obj, "scales",   obj.scales));
+                  obj.info()->addParam(obj, "scales",   obj.scales))
 
 bool initModule_softcascade(void)
 {
diff --git a/modules/stitching/doc/exposure_compensation.rst b/modules/stitching/doc/exposure_compensation.rst
index eff98c01c..ec0d5db32 100644
--- a/modules/stitching/doc/exposure_compensation.rst
+++ b/modules/stitching/doc/exposure_compensation.rst
@@ -62,8 +62,8 @@ Stub exposure compensator which does nothing. ::
     {
     public:
         void feed(const std::vector<Point> &/*corners*/, const std::vector<Mat> &/*images*/,
-                  const std::vector<std::pair<Mat,uchar> > &/*masks*/) {};
-        void apply(int /*index*/, Point /*corner*/, Mat &/*image*/, const Mat &/*mask*/) {};
+                  const std::vector<std::pair<Mat,uchar> > &/*masks*/) { }
+        void apply(int /*index*/, Point /*corner*/, Mat &/*image*/, const Mat &/*mask*/) { }
     };
 
 .. seealso:: :ocv:class:`detail::ExposureCompensator`
diff --git a/modules/stitching/doc/warpers.rst b/modules/stitching/doc/warpers.rst
index 1025ffa0c..278020f07 100644
--- a/modules/stitching/doc/warpers.rst
+++ b/modules/stitching/doc/warpers.rst
@@ -14,17 +14,17 @@ Rotation-only model image warper interface. ::
     public:
         virtual ~RotationWarper() {}
 
-        virtual Point2f warpPoint(const Point2f &pt, const Mat &K, const Mat &R) = 0;
+        virtual Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R) = 0;
 
-        virtual Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap) = 0;
+        virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) = 0;
 
-        virtual Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                           Mat &dst) = 0;
+        virtual Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                           OutputArray dst) = 0;
 
-        virtual void warpBackward(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                                  Size dst_size, Mat &dst) = 0;
+        virtual void warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                                  Size dst_size, OutputArray dst) = 0;
 
-        virtual Rect warpRoi(Size src_size, const Mat &K, const Mat &R) = 0;
+        virtual Rect warpRoi(Size src_size, InputArray K, InputArray R) = 0;
     };
 
 detail::RotationWarper::warpPoint
@@ -32,7 +32,7 @@ detail::RotationWarper::warpPoint
 
 Projects the image point.
 
-.. ocv:function:: Point2f detail::RotationWarper::warpPoint(const Point2f &pt, const Mat &K, const Mat &R)
+.. ocv:function:: Point2f detail::RotationWarper::warpPoint(const Point2f &pt, InputArray K, InputArray R)
 
     :param pt: Source point
 
@@ -47,7 +47,7 @@ detail::RotationWarper::buildMaps
 
 Builds the projection maps according to the given camera data.
 
-.. ocv:function:: Rect detail::RotationWarper::buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap)
+.. ocv:function:: Rect detail::RotationWarper::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
 
     :param src_size: Source image size
 
@@ -66,7 +66,7 @@ detail::RotationWarper::warp
 
 Projects the image.
 
-.. ocv:function:: Point detail::RotationWarper::warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode, Mat &dst)
+.. ocv:function:: Point detail::RotationWarper::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst)
 
     :param src: Source image
 
@@ -87,7 +87,7 @@ detail::RotationWarper::warpBackward
 
 Projects the image backward.
 
-.. ocv:function:: void detail::RotationWarper::warpBackward(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode, Size dst_size, Mat &dst)
+.. ocv:function:: void detail::RotationWarper::warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, Size dst_size, OutputArray dst)
 
     :param src: Projected image
 
@@ -106,7 +106,7 @@ Projects the image backward.
 detail::RotationWarper::warpRoi
 -------------------------------
 
-.. ocv:function:: Rect detail::RotationWarper::warpRoi(Size src_size, const Mat &K, const Mat &R)
+.. ocv:function:: Rect detail::RotationWarper::warpRoi(Size src_size, InputArray K, InputArray R)
 
     :param src_size: Source image bounding box
 
@@ -124,9 +124,9 @@ Base class for warping logic implementation. ::
 
     struct CV_EXPORTS ProjectorBase
     {
-        void setCameraParams(const Mat &K = Mat::eye(3, 3, CV_32F),
-                            const Mat &R = Mat::eye(3, 3, CV_32F),
-                            const Mat &T = Mat::zeros(3, 1, CV_32F));
+        void setCameraParams(InputArray K = Mat::eye(3, 3, CV_32F),
+                            InputArray R = Mat::eye(3, 3, CV_32F),
+                            InputArray T = Mat::zeros(3, 1, CV_32F));
 
         float scale;
         float k[9];
@@ -146,17 +146,17 @@ Base class for rotation-based warper using a `detail::ProjectorBase`_ derived cl
     class CV_EXPORTS RotationWarperBase : public RotationWarper
     {
     public:
-        Point2f warpPoint(const Point2f &pt, const Mat &K, const Mat &R);
+        Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R);
 
-        Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap);
+        Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap);
 
-        Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                Mat &dst);
+        Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                OutputArray dst);
 
-        void warpBackward(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                        Size dst_size, Mat &dst);
+        void warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                        Size dst_size, OutputArray dst);
 
-        Rect warpRoi(Size src_size, const Mat &K, const Mat &R);
+        Rect warpRoi(Size src_size, InputArray K, InputArray R);
 
     protected:
 
@@ -183,14 +183,14 @@ Warper that maps an image onto the z = 1 plane. ::
 
         void setScale(float scale) { projector_.scale = scale; }
 
-        Point2f warpPoint(const Point2f &pt, const Mat &K, const Mat &R, const Mat &T);
+        Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R, InputArray T);
 
-        Rect buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat &T, Mat &xmap, Mat &ymap);
+        Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap);
 
-        Point warp(const Mat &src, const Mat &K, const Mat &R, const Mat &T, int interp_mode, int border_mode,
-                   Mat &dst);
+        Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+                   OutputArray dst);
 
-        Rect warpRoi(Size src_size, const Mat &K, const Mat &R, const Mat &T);
+        Rect warpRoi(Size src_size, InputArray K, InputArray R, InputArray T);
 
     protected:
         void detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br);
diff --git a/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp b/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp
index f39785f57..84a8ce4fe 100644
--- a/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/exposure_compensate.hpp
@@ -68,8 +68,8 @@ class CV_EXPORTS NoExposureCompensator : public ExposureCompensator
 {
 public:
     void feed(const std::vector<Point> &/*corners*/, const std::vector<Mat> &/*images*/,
-              const std::vector<std::pair<Mat,uchar> > &/*masks*/) {};
-    void apply(int /*index*/, Point /*corner*/, Mat &/*image*/, const Mat &/*mask*/) {};
+              const std::vector<std::pair<Mat,uchar> > &/*masks*/) { }
+    void apply(int /*index*/, Point /*corner*/, Mat &/*image*/, const Mat &/*mask*/) { }
 };
 
 
diff --git a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
index 8906d88a3..093f07cc1 100644
--- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
@@ -56,17 +56,17 @@ class CV_EXPORTS RotationWarper
 public:
     virtual ~RotationWarper() {}
 
-    virtual Point2f warpPoint(const Point2f &pt, const Mat &K, const Mat &R) = 0;
+    virtual Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R) = 0;
 
-    virtual Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap) = 0;
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) = 0;
 
-    virtual Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                       Mat &dst) = 0;
+    virtual Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                       OutputArray dst) = 0;
 
-    virtual void warpBackward(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                              Size dst_size, Mat &dst) = 0;
+    virtual void warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                              Size dst_size, OutputArray dst) = 0;
 
-    virtual Rect warpRoi(Size src_size, const Mat &K, const Mat &R) = 0;
+    virtual Rect warpRoi(Size src_size, InputArray K, InputArray R) = 0;
 
     virtual float getScale() const { return 1.f; }
     virtual void setScale(float) {}
@@ -75,9 +75,9 @@ public:
 
 struct CV_EXPORTS ProjectorBase
 {
-    void setCameraParams(const Mat &K = Mat::eye(3, 3, CV_32F),
-                         const Mat &R = Mat::eye(3, 3, CV_32F),
-                         const Mat &T = Mat::zeros(3, 1, CV_32F));
+    void setCameraParams(InputArray K = Mat::eye(3, 3, CV_32F),
+                         InputArray R = Mat::eye(3, 3, CV_32F),
+                         InputArray T = Mat::zeros(3, 1, CV_32F));
 
     float scale;
     float k[9];
@@ -92,17 +92,17 @@ template <class P>
 class CV_EXPORTS RotationWarperBase : public RotationWarper
 {
 public:
-    Point2f warpPoint(const Point2f &pt, const Mat &K, const Mat &R);
+    Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R);
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap);
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap);
 
-    Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               Mat &dst);
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               OutputArray dst);
 
-    void warpBackward(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                      Size dst_size, Mat &dst);
+    void warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                      Size dst_size, OutputArray dst);
 
-    Rect warpRoi(Size src_size, const Mat &K, const Mat &R);
+    Rect warpRoi(Size src_size, InputArray K, InputArray R);
 
     float getScale() const { return projector_.scale; }
     void setScale(float val) { projector_.scale = val; }
@@ -132,14 +132,14 @@ class CV_EXPORTS PlaneWarper : public RotationWarperBase<PlaneProjector>
 public:
     PlaneWarper(float scale = 1.f) { projector_.scale = scale; }
 
-    Point2f warpPoint(const Point2f &pt, const Mat &K, const Mat &R, const Mat &T);
+    Point2f warpPoint(const Point2f &pt, InputArray K, InputArray R, InputArray T);
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat &T, Mat &xmap, Mat &ymap);
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap);
 
-    Point warp(const Mat &src, const Mat &K, const Mat &R, const Mat &T, int interp_mode, int border_mode,
-               Mat &dst);
+    virtual Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+               OutputArray dst);
 
-    Rect warpRoi(Size src_size, const Mat &K, const Mat &R, const Mat &T);
+    Rect warpRoi(Size src_size, InputArray K, InputArray R, InputArray T);
 
 protected:
     void detectResultRoi(Size src_size, Point &dst_tl, Point &dst_br);
@@ -333,7 +333,7 @@ class CV_EXPORTS PlaneWarperGpu : public PlaneWarper
 public:
     PlaneWarperGpu(float scale = 1.f) : PlaneWarper(scale) {}
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap)
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
     {
         Rect result = buildMaps(src_size, K, R, d_xmap_, d_ymap_);
         d_xmap_.download(xmap);
@@ -341,7 +341,7 @@ public:
         return result;
     }
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat &T, Mat &xmap, Mat &ymap)
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap)
     {
         Rect result = buildMaps(src_size, K, R, T, d_xmap_, d_ymap_);
         d_xmap_.download(xmap);
@@ -349,8 +349,8 @@ public:
         return result;
     }
 
-    Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               Mat &dst)
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               OutputArray dst)
     {
         d_src_.upload(src);
         Point result = warp(d_src_, K, R, interp_mode, border_mode, d_dst_);
@@ -358,8 +358,8 @@ public:
         return result;
     }
 
-    Point warp(const Mat &src, const Mat &K, const Mat &R, const Mat &T, int interp_mode, int border_mode,
-               Mat &dst)
+    Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+               OutputArray dst)
     {
         d_src_.upload(src);
         Point result = warp(d_src_, K, R, T, interp_mode, border_mode, d_dst_);
@@ -367,15 +367,15 @@ public:
         return result;
     }
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, cuda::GpuMat &xmap, cuda::GpuMat &ymap);
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap);
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat &T, cuda::GpuMat &xmap, cuda::GpuMat &ymap);
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, cuda::GpuMat & xmap, cuda::GpuMat & ymap);
 
-    Point warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               cuda::GpuMat &dst);
+    Point warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               cuda::GpuMat & dst);
 
-    Point warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, const Mat &T, int interp_mode, int border_mode,
-               cuda::GpuMat &dst);
+    Point warp(const cuda::GpuMat & src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+               cuda::GpuMat & dst);
 
 private:
     cuda::GpuMat d_xmap_, d_ymap_, d_src_, d_dst_;
@@ -387,7 +387,7 @@ class CV_EXPORTS SphericalWarperGpu : public SphericalWarper
 public:
     SphericalWarperGpu(float scale) : SphericalWarper(scale) {}
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap)
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
     {
         Rect result = buildMaps(src_size, K, R, d_xmap_, d_ymap_);
         d_xmap_.download(xmap);
@@ -395,8 +395,8 @@ public:
         return result;
     }
 
-    Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               Mat &dst)
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               OutputArray dst)
     {
         d_src_.upload(src);
         Point result = warp(d_src_, K, R, interp_mode, border_mode, d_dst_);
@@ -404,10 +404,10 @@ public:
         return result;
     }
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, cuda::GpuMat &xmap, cuda::GpuMat &ymap);
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap);
 
-    Point warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               cuda::GpuMat &dst);
+    Point warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               cuda::GpuMat & dst);
 
 private:
     cuda::GpuMat d_xmap_, d_ymap_, d_src_, d_dst_;
@@ -419,7 +419,7 @@ class CV_EXPORTS CylindricalWarperGpu : public CylindricalWarper
 public:
     CylindricalWarperGpu(float scale) : CylindricalWarper(scale) {}
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap)
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
     {
         Rect result = buildMaps(src_size, K, R, d_xmap_, d_ymap_);
         d_xmap_.download(xmap);
@@ -427,8 +427,8 @@ public:
         return result;
     }
 
-    Point warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               Mat &dst)
+    Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               OutputArray dst)
     {
         d_src_.upload(src);
         Point result = warp(d_src_, K, R, interp_mode, border_mode, d_dst_);
@@ -436,10 +436,10 @@ public:
         return result;
     }
 
-    Rect buildMaps(Size src_size, const Mat &K, const Mat &R, cuda::GpuMat &xmap, cuda::GpuMat &ymap);
+    Rect buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap);
 
-    Point warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-               cuda::GpuMat &dst);
+    Point warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+               cuda::GpuMat & dst);
 
 private:
     cuda::GpuMat d_xmap_, d_ymap_, d_src_, d_dst_;
@@ -503,6 +503,45 @@ protected:
     }
 };
 
+/////////////////////////////////////// OpenCL Accelerated Warpers /////////////////////////////////////
+
+class CV_EXPORTS PlaneWarperOcl : public PlaneWarper
+{
+public:
+    PlaneWarperOcl(float scale = 1.f) : PlaneWarper(scale) { }
+
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
+    {
+        return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32FC1), xmap, ymap);
+    }
+
+    virtual Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst)
+    {
+        return warp(src, K, R, Mat::zeros(3, 1, CV_32FC1), interp_mode, border_mode, dst);
+    }
+
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap);
+    virtual Point warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode, OutputArray dst);
+};
+
+class CV_EXPORTS SphericalWarperOcl :  public SphericalWarper
+{
+public:
+    SphericalWarperOcl(float scale) : SphericalWarper(scale) { }
+
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap);
+    virtual Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst);
+};
+
+class CV_EXPORTS CylindricalWarperOcl :  public CylindricalWarper
+{
+public:
+    CylindricalWarperOcl(float scale) : CylindricalWarper(scale) { }
+
+    virtual Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap);
+    virtual Point warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst);
+};
+
 } // namespace detail
 } // namespace cv
 
diff --git a/modules/stitching/include/opencv2/stitching/detail/warpers_inl.hpp b/modules/stitching/include/opencv2/stitching/detail/warpers_inl.hpp
index 144e9e32d..f6eae4fa7 100644
--- a/modules/stitching/include/opencv2/stitching/detail/warpers_inl.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/warpers_inl.hpp
@@ -51,7 +51,7 @@ namespace cv {
 namespace detail {
 
 template <class P>
-Point2f RotationWarperBase<P>::warpPoint(const Point2f &pt, const Mat &K, const Mat &R)
+Point2f RotationWarperBase<P>::warpPoint(const Point2f &pt, InputArray K, InputArray R)
 {
     projector_.setCameraParams(K, R);
     Point2f uv;
@@ -61,15 +61,17 @@ Point2f RotationWarperBase<P>::warpPoint(const Point2f &pt, const Mat &K, const
 
 
 template <class P>
-Rect RotationWarperBase<P>::buildMaps(Size src_size, const Mat &K, const Mat &R, Mat &xmap, Mat &ymap)
+Rect RotationWarperBase<P>::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray _xmap, OutputArray _ymap)
 {
     projector_.setCameraParams(K, R);
 
     Point dst_tl, dst_br;
     detectResultRoi(src_size, dst_tl, dst_br);
 
-    xmap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
-    ymap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+    _xmap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+    _ymap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+
+    Mat xmap = _xmap.getMat(), ymap = _ymap.getMat();
 
     float x, y;
     for (int v = dst_tl.y; v <= dst_br.y; ++v)
@@ -87,8 +89,8 @@ Rect RotationWarperBase<P>::buildMaps(Size src_size, const Mat &K, const Mat &R,
 
 
 template <class P>
-Point RotationWarperBase<P>::warp(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                                  Mat &dst)
+Point RotationWarperBase<P>::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                                  OutputArray dst)
 {
     Mat xmap, ymap;
     Rect dst_roi = buildMaps(src.size(), K, R, xmap, ymap);
@@ -101,14 +103,16 @@ Point RotationWarperBase<P>::warp(const Mat &src, const Mat &K, const Mat &R, in
 
 
 template <class P>
-void RotationWarperBase<P>::warpBackward(const Mat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                                         Size dst_size, Mat &dst)
+void RotationWarperBase<P>::warpBackward(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                                         Size dst_size, OutputArray dst)
 {
     projector_.setCameraParams(K, R);
 
     Point src_tl, src_br;
     detectResultRoi(dst_size, src_tl, src_br);
-    CV_Assert(src_br.x - src_tl.x + 1 == src.cols && src_br.y - src_tl.y + 1 == src.rows);
+
+    Size size = src.size();
+    CV_Assert(src_br.x - src_tl.x + 1 == size.width && src_br.y - src_tl.y + 1 == size.height);
 
     Mat xmap(dst_size, CV_32F);
     Mat ymap(dst_size, CV_32F);
@@ -130,7 +134,7 @@ void RotationWarperBase<P>::warpBackward(const Mat &src, const Mat &K, const Mat
 
 
 template <class P>
-Rect RotationWarperBase<P>::warpRoi(Size src_size, const Mat &K, const Mat &R)
+Rect RotationWarperBase<P>::warpRoi(Size src_size, InputArray K, InputArray R)
 {
     projector_.setCameraParams(K, R);
 
diff --git a/modules/stitching/include/opencv2/stitching/warpers.hpp b/modules/stitching/include/opencv2/stitching/warpers.hpp
index da5fe2618..cdcb35c20 100644
--- a/modules/stitching/include/opencv2/stitching/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/warpers.hpp
@@ -167,6 +167,24 @@ public:
 };
 #endif
 
+class PlaneWarperOcl: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const { return makePtr<detail::PlaneWarperOcl>(scale); }
+};
+
+class SphericalWarperOcl: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const { return makePtr<detail::SphericalWarperOcl>(scale); }
+};
+
+class CylindricalWarperOcl: public WarperCreator
+{
+public:
+    Ptr<detail::RotationWarper> create(float scale) const { return makePtr<detail::CylindricalWarperOcl>(scale); }
+};
+
 } // namespace cv
 
 #endif // __OPENCV_STITCHING_WARPER_CREATORS_HPP__
diff --git a/modules/stitching/perf/opencl/perf_warpers.cpp b/modules/stitching/perf/opencl/perf_warpers.cpp
new file mode 100644
index 000000000..21fe22da7
--- /dev/null
+++ b/modules/stitching/perf/opencl/perf_warpers.cpp
@@ -0,0 +1,171 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/stitching/warpers.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////////////////// Stitching Warpers ///////////////////////////
+
+enum
+{
+    SphericalWarperType = 0,
+    CylindricalWarperType = 1,
+    PlaneWarperType = 2
+};
+
+class WarperBase
+{
+public:
+    explicit WarperBase(int type, Size srcSize)
+    {
+        Ptr<WarperCreator> creator;
+        if (cv::ocl::useOpenCL())
+        {
+            if (type == SphericalWarperType)
+                creator = makePtr<SphericalWarperOcl>();
+            else if (type == CylindricalWarperType)
+                creator = makePtr<CylindricalWarperOcl>();
+            else if (type == PlaneWarperType)
+                creator = makePtr<PlaneWarperOcl>();
+        }
+        else
+        {
+            if (type == SphericalWarperType)
+                creator = makePtr<SphericalWarper>();
+            else if (type == CylindricalWarperType)
+                creator = makePtr<CylindricalWarper>();
+            else if (type == PlaneWarperType)
+                creator = makePtr<PlaneWarper>();
+        }
+        CV_Assert(!creator.empty());
+
+        K = Mat::eye(3, 3, CV_32FC1);
+        K.at<float>(0,0) = (float)srcSize.width;
+        K.at<float>(0,2) = (float)srcSize.width/2;
+        K.at<float>(1,1) = (float)srcSize.height;
+        K.at<float>(1,2) = (float)srcSize.height/2;
+        K.at<float>(2,2) = 1.0f;
+        R = Mat::eye(3, 3, CV_32FC1);
+        float scale = (float)srcSize.width;
+
+        warper = creator->create(scale);
+    }
+
+    Rect buildMaps(Size src_size, OutputArray xmap, OutputArray ymap) const
+    {
+        return warper->buildMaps(src_size, K, R, xmap, ymap);
+    }
+
+    Point warp(InputArray src, int interp_mode, int border_mode, OutputArray dst) const
+    {
+        return warper->warp(src, K, R, interp_mode, border_mode, dst);
+    }
+
+private:
+    Ptr<detail::RotationWarper> warper;
+    Mat K, R;
+};
+
+CV_ENUM(WarperType, SphericalWarperType, CylindricalWarperType, PlaneWarperType)
+
+typedef tuple<Size, WarperType> StitchingWarpersParams;
+typedef TestBaseWithParam<StitchingWarpersParams> StitchingWarpersFixture;
+
+static void prepareWarperSrc(InputOutputArray src, Size srcSize)
+{
+    src.create(srcSize, CV_8UC1);
+    src.setTo(Scalar::all(64));
+    ellipse(src, Point(srcSize.width/2, srcSize.height/2), Size(srcSize.width/2, srcSize.height/2),
+            360, 0, 360, Scalar::all(255), 2);
+    ellipse(src, Point(srcSize.width/2, srcSize.height/2), Size(srcSize.width/3, srcSize.height/3),
+            360, 0, 360, Scalar::all(128), 2);
+    rectangle(src, Point(10, 10), Point(srcSize.width - 10, srcSize.height - 10), Scalar::all(128), 2);
+}
+
+OCL_PERF_TEST_P(StitchingWarpersFixture, StitchingWarpers_BuildMaps,
+                ::testing::Combine(OCL_TEST_SIZES, WarperType::all()))
+{
+    const StitchingWarpersParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const WarperBase warper(get<1>(params), srcSize);
+
+    UMat xmap, ymap;
+
+    OCL_TEST_CYCLE() warper.buildMaps(srcSize, xmap, ymap);
+
+    SANITY_CHECK(xmap, 1e-3);
+    SANITY_CHECK(ymap, 1e-3);
+}
+
+OCL_PERF_TEST_P(StitchingWarpersFixture, StitchingWarpers_Warp,
+                ::testing::Combine(OCL_TEST_SIZES, WarperType::all()))
+{
+    const StitchingWarpersParams params = GetParam();
+    const Size srcSize = get<0>(params);
+    const WarperBase warper(get<1>(params), srcSize);
+
+    UMat src, dst;
+    prepareWarperSrc(src, srcSize);
+    declare.in(src, WARMUP_READ);
+
+    OCL_TEST_CYCLE() warper.warp(src, INTER_LINEAR, BORDER_REPLICATE, dst);
+
+#if 0
+    namedWindow("src", WINDOW_NORMAL);
+    namedWindow("dst", WINDOW_NORMAL);
+    imshow("src", src);
+    imshow("dst", dst);
+    std::cout << dst.size() << " " << dst.size().area() << std::endl;
+    cv::waitKey();
+#endif
+
+    SANITY_CHECK(dst, 1e-5);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/stitching/src/blenders.cpp b/modules/stitching/src/blenders.cpp
index 3e9cfb7c7..446bfc131 100644
--- a/modules/stitching/src/blenders.cpp
+++ b/modules/stitching/src/blenders.cpp
@@ -512,6 +512,7 @@ void createLaplacePyrGpu(const Mat &img, int num_levels, std::vector<Mat> &pyr)
     (void)img;
     (void)num_levels;
     (void)pyr;
+    CV_Error(Error::StsNotImplemented, "CUDA optimization is unavailable");
 #endif
 }
 
@@ -549,6 +550,7 @@ void restoreImageFromLaplacePyrGpu(std::vector<Mat> &pyr)
     gpu_pyr[0].download(pyr[0]);
 #else
     (void)pyr;
+    CV_Error(Error::StsNotImplemented, "CUDA optimization is unavailable");
 #endif
 }
 
diff --git a/modules/ocl/src/opencl/build_warps.cl b/modules/stitching/src/opencl/warpers.cl
similarity index 52%
rename from modules/ocl/src/opencl/build_warps.cl
rename to modules/stitching/src/opencl/warpers.cl
index bd5e002b5..032ddf3ce 100644
--- a/modules/ocl/src/opencl/build_warps.cl
+++ b/modules/stitching/src/opencl/warpers.cl
@@ -43,24 +43,18 @@
 //
 //M*/
 
-__kernel void buildWarpPlaneMaps(__global float * xmap, __global float * ymap,
-                                 __constant float * KRT,
-                                 int tl_u, int tl_v,
-                                 int cols, int rows,
-                                 int xmap_step, int ymap_step,
-                                 int xmap_offset, int ymap_offset,
-                                 float scale)
+__kernel void buildWarpPlaneMaps(__global uchar * xmapptr, int xmap_step, int xmap_offset,
+                                 __global uchar * ymapptr, int ymap_step, int ymap_offset, int rows, int cols,
+                                 __constant float * ck_rinv, __constant float * ct,
+                                 int tl_u, int tl_v, float scale)
 {
     int du = get_global_id(0);
     int dv = get_global_id(1);
 
-    __constant float * ck_rinv = KRT;
-    __constant float * ct      = KRT + 9;
-
     if (du < cols && dv < rows)
     {
-        int xmap_index = mad24(dv, xmap_step, xmap_offset + du);
-        int ymap_index = mad24(dv, ymap_step, ymap_offset + du);
+        __global float * xmap = (__global float *)(xmapptr + mad24(dv, xmap_step, xmap_offset + du * (int)sizeof(float)));
+        __global float * ymap = (__global float *)(ymapptr + mad24(dv, ymap_step, ymap_offset + du * (int)sizeof(float)));
 
         float u = tl_u + du;
         float v = tl_v + dv;
@@ -77,26 +71,22 @@ __kernel void buildWarpPlaneMaps(__global float * xmap, __global float * ymap,
         x /= z;
         y /= z;
 
-        xmap[xmap_index] = x;
-        ymap[ymap_index] = y;
+        xmap[0] = x;
+        ymap[0] = y;
     }
 }
 
-__kernel void buildWarpCylindricalMaps(__global float * xmap, __global float * ymap,
-                                       __constant float * ck_rinv,
-                                       int tl_u, int tl_v,
-                                       int cols, int rows,
-                                       int xmap_step, int ymap_step,
-                                       int xmap_offset, int ymap_offset,
-                                       float scale)
+__kernel void buildWarpCylindricalMaps(__global uchar * xmapptr, int xmap_step, int xmap_offset,
+                                       __global uchar * ymapptr, int ymap_step, int ymap_offset, int rows, int cols,
+                                       __constant float * ck_rinv, int tl_u, int tl_v, float scale)
 {
     int du = get_global_id(0);
     int dv = get_global_id(1);
 
     if (du < cols && dv < rows)
     {
-        int xmap_index = mad24(dv, xmap_step, xmap_offset + du);
-        int ymap_index = mad24(dv, ymap_step, ymap_offset + du);
+        __global float * xmap = (__global float *)(xmapptr + mad24(dv, xmap_step, xmap_offset + du * (int)sizeof(float)));
+        __global float * ymap = (__global float *)(ymapptr + mad24(dv, ymap_step, ymap_offset + du * (int)sizeof(float)));
 
         float u = tl_u + du;
         float v = tl_v + dv;
@@ -112,29 +102,25 @@ __kernel void buildWarpCylindricalMaps(__global float * xmap, __global float * y
         y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
         z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
 
-        if (z > 0) { x /= z; y /= z; }
+        if (z > 0) x /= z, y /= z;
         else x = y = -1;
 
-        xmap[xmap_index] = x;
-        ymap[ymap_index] = y;
+        xmap[0] = x;
+        ymap[0] = y;
     }
 }
 
-__kernel void buildWarpSphericalMaps(__global float * xmap, __global float * ymap,
-                                     __constant float * ck_rinv,
-                                     int tl_u, int tl_v,
-                                     int cols, int rows,
-                                     int xmap_step, int ymap_step,
-                                     int xmap_offset, int ymap_offset,
-                                     float scale)
+__kernel void buildWarpSphericalMaps(__global uchar * xmapptr, int xmap_step, int xmap_offset,
+                                     __global uchar * ymapptr, int ymap_step, int ymap_offset, int rows, int cols,
+                                     __constant float * ck_rinv, int tl_u, int tl_v, float scale)
 {
     int du = get_global_id(0);
     int dv = get_global_id(1);
 
     if (du < cols && dv < rows)
     {
-        int xmap_index = mad24(dv, xmap_step, xmap_offset + du);
-        int ymap_index = mad24(dv, ymap_step, ymap_offset + du);
+        __global float * xmap = (__global float *)(xmapptr + mad24(dv, xmap_step, xmap_offset + du * (int)sizeof(float)));
+        __global float * ymap = (__global float *)(ymapptr + mad24(dv, ymap_step, ymap_offset + du * (int)sizeof(float)));
 
         float u = tl_u + du;
         float v = tl_v + dv;
@@ -145,7 +131,7 @@ __kernel void buildWarpSphericalMaps(__global float * xmap, __global float * yma
 
         float sinv = sin(v);
         float x_ = sinv * sin(u);
-        float y_ = - cos(v);
+        float y_ = -cos(v);
         float z_ = sinv * cos(u);
 
         float z;
@@ -153,55 +139,10 @@ __kernel void buildWarpSphericalMaps(__global float * xmap, __global float * yma
         y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
         z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
 
-        if (z > 0) { x /= z; y /= z; }
+        if (z > 0) x /= z, y /= z;
         else x = y = -1;
 
-        xmap[xmap_index] = x;
-        ymap[ymap_index] = y;
-    }
-}
-
-__kernel void buildWarpAffineMaps(__global float * xmap, __global float * ymap,
-                                  __constant float * c_warpMat,
-                                  int cols, int rows,
-                                  int xmap_step, int ymap_step,
-                                  int xmap_offset, int ymap_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int xmap_index = mad24(y, xmap_step, x + xmap_offset);
-        int ymap_index = mad24(y, ymap_step, x + ymap_offset);
-
-        float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
-        float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
-
-        xmap[xmap_index] = xcoo;
-        ymap[ymap_index] = ycoo;
-    }
-}
-
-__kernel void buildWarpPerspectiveMaps(__global float * xmap, __global float * ymap,
-                                       __constant float * c_warpMat,
-                                       int cols, int rows,
-                                       int xmap_step, int ymap_step,
-                                       int xmap_offset, int ymap_offset)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int xmap_index = mad24(y, xmap_step, x + xmap_offset);
-        int ymap_index = mad24(y, ymap_step, x + ymap_offset);
-
-        float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
-        float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
-        float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
-
-        xmap[xmap_index] = xcoo;
-        ymap[ymap_index] = ycoo;
+        xmap[0] = x;
+        ymap[0] = y;
     }
 }
diff --git a/modules/stitching/src/precomp.hpp b/modules/stitching/src/precomp.hpp
index 04445176e..499202fa0 100644
--- a/modules/stitching/src/precomp.hpp
+++ b/modules/stitching/src/precomp.hpp
@@ -53,6 +53,7 @@
 #include <sstream>
 #include <cmath>
 #include "opencv2/core.hpp"
+#include "opencv2/core/ocl.hpp"
 #include "opencv2/core/utility.hpp"
 #include "opencv2/stitching.hpp"
 #include "opencv2/stitching/detail/autocalib.hpp"
diff --git a/modules/stitching/src/warpers.cpp b/modules/stitching/src/warpers.cpp
index 3f71f2040..eb15d44c0 100644
--- a/modules/stitching/src/warpers.cpp
+++ b/modules/stitching/src/warpers.cpp
@@ -45,8 +45,10 @@
 namespace cv {
 namespace detail {
 
-void ProjectorBase::setCameraParams(const Mat &K, const Mat &R, const Mat &T)
+void ProjectorBase::setCameraParams(InputArray _K, InputArray _R, InputArray _T)
 {
+    Mat K = _K.getMat(), R = _R.getMat(), T = _T.getMat();
+
     CV_Assert(K.size() == Size(3, 3) && K.type() == CV_32F);
     CV_Assert(R.size() == Size(3, 3) && R.type() == CV_32F);
     CV_Assert((T.size() == Size(1, 3) || T.size() == Size(3, 1)) && T.type() == CV_32F);
@@ -76,7 +78,7 @@ void ProjectorBase::setCameraParams(const Mat &K, const Mat &R, const Mat &T)
 }
 
 
-Point2f PlaneWarper::warpPoint(const Point2f &pt, const Mat &K, const Mat &R, const Mat &T)
+Point2f PlaneWarper::warpPoint(const Point2f &pt, InputArray K, InputArray R, InputArray T)
 {
     projector_.setCameraParams(K, R, T);
     Point2f uv;
@@ -85,15 +87,17 @@ Point2f PlaneWarper::warpPoint(const Point2f &pt, const Mat &K, const Mat &R, co
 }
 
 
-Rect PlaneWarper::buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat &T, Mat &xmap, Mat &ymap)
+Rect PlaneWarper::buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray _xmap, OutputArray _ymap)
 {
     projector_.setCameraParams(K, R, T);
 
     Point dst_tl, dst_br;
     detectResultRoi(src_size, dst_tl, dst_br);
 
-    xmap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
-    ymap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+    _xmap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+    _ymap.create(dst_br.y - dst_tl.y + 1, dst_br.x - dst_tl.x + 1, CV_32F);
+
+    Mat xmap = _xmap.getMat(), ymap = _ymap.getMat();
 
     float x, y;
     for (int v = dst_tl.y; v <= dst_br.y; ++v)
@@ -110,8 +114,8 @@ Rect PlaneWarper::buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat
 }
 
 
-Point PlaneWarper::warp(const Mat &src, const Mat &K, const Mat &R, const Mat &T, int interp_mode, int border_mode,
-                        Mat &dst)
+Point PlaneWarper::warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+                        OutputArray dst)
 {
     Mat xmap, ymap;
     Rect dst_roi = buildMaps(src.size(), K, R, T, xmap, ymap);
@@ -123,7 +127,7 @@ Point PlaneWarper::warp(const Mat &src, const Mat &K, const Mat &R, const Mat &T
 }
 
 
-Rect PlaneWarper::warpRoi(Size src_size, const Mat &K, const Mat &R, const Mat &T)
+Rect PlaneWarper::warpRoi(Size src_size, InputArray K, InputArray R, InputArray T)
 {
     projector_.setCameraParams(K, R, T);
 
@@ -211,12 +215,12 @@ void SphericalWarper::detectResultRoi(Size src_size, Point &dst_tl, Point &dst_b
 
 
 #ifdef HAVE_OPENCV_CUDAWARPING
-Rect PlaneWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, cuda::GpuMat &xmap, cuda::GpuMat &ymap)
+Rect PlaneWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
 {
     return buildMaps(src_size, K, R, Mat::zeros(3, 1, CV_32F), xmap, ymap);
 }
 
-Rect PlaneWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, const Mat &T, cuda::GpuMat &xmap, cuda::GpuMat &ymap)
+Rect PlaneWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
 {
     projector_.setCameraParams(K, R, T);
 
@@ -229,15 +233,15 @@ Rect PlaneWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, const
     return Rect(dst_tl, dst_br);
 }
 
-Point PlaneWarperGpu::warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                           cuda::GpuMat &dst)
+Point PlaneWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                           cuda::GpuMat & dst)
 {
     return warp(src, K, R, Mat::zeros(3, 1, CV_32F), interp_mode, border_mode, dst);
 }
 
 
-Point PlaneWarperGpu::warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, const Mat &T, int interp_mode, int border_mode,
-                           cuda::GpuMat &dst)
+Point PlaneWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode,
+                           cuda::GpuMat & dst)
 {
     Rect dst_roi = buildMaps(src.size(), K, R, T, d_xmap_, d_ymap_);
     dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
@@ -246,7 +250,7 @@ Point PlaneWarperGpu::warp(const cuda::GpuMat &src, const Mat &K, const Mat &R,
 }
 
 
-Rect SphericalWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, cuda::GpuMat &xmap, cuda::GpuMat &ymap)
+Rect SphericalWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
 {
     projector_.setCameraParams(K, R);
 
@@ -260,8 +264,8 @@ Rect SphericalWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, cu
 }
 
 
-Point SphericalWarperGpu::warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                               cuda::GpuMat &dst)
+Point SphericalWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                               cuda::GpuMat & dst)
 {
     Rect dst_roi = buildMaps(src.size(), K, R, d_xmap_, d_ymap_);
     dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
@@ -270,7 +274,7 @@ Point SphericalWarperGpu::warp(const cuda::GpuMat &src, const Mat &K, const Mat
 }
 
 
-Rect CylindricalWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R, cuda::GpuMat &xmap, cuda::GpuMat &ymap)
+Rect CylindricalWarperGpu::buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap)
 {
     projector_.setCameraParams(K, R);
 
@@ -284,8 +288,8 @@ Rect CylindricalWarperGpu::buildMaps(Size src_size, const Mat &K, const Mat &R,
 }
 
 
-Point CylindricalWarperGpu::warp(const cuda::GpuMat &src, const Mat &K, const Mat &R, int interp_mode, int border_mode,
-                                 cuda::GpuMat &dst)
+Point CylindricalWarperGpu::warp(const cuda::GpuMat & src, InputArray K, InputArray R, int interp_mode, int border_mode,
+                                 cuda::GpuMat & dst)
 {
     Rect dst_roi = buildMaps(src.size(), K, R, d_xmap_, d_ymap_);
     dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
diff --git a/modules/stitching/src/warpers_ocl.cpp b/modules/stitching/src/warpers_ocl.cpp
new file mode 100644
index 000000000..ef8f31677
--- /dev/null
+++ b/modules/stitching/src/warpers_ocl.cpp
@@ -0,0 +1,187 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
+namespace cv {
+namespace detail {
+
+/////////////////////////////////////////// PlaneWarperOcl ////////////////////////////////////////////
+
+Rect PlaneWarperOcl::buildMaps(Size src_size, InputArray K, InputArray R, InputArray T, OutputArray xmap, OutputArray ymap)
+{
+    projector_.setCameraParams(K, R, T);
+
+    Point dst_tl, dst_br;
+    detectResultRoi(src_size, dst_tl, dst_br);
+
+    if (ocl::useOpenCL())
+    {
+        ocl::Kernel k("buildWarpPlaneMaps", ocl::stitching::warpers_oclsrc);
+        if (!k.empty())
+        {
+            Size dsize(dst_br.x - dst_tl.x + 1, dst_br.y - dst_tl.y + 1);
+            xmap.create(dsize, CV_32FC1);
+            ymap.create(dsize, CV_32FC1);
+
+            Mat k_rinv(1, 9, CV_32FC1, projector_.k_rinv), t(1, 3, CV_32FC1, projector_.t);
+            UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(),
+                    uk_rinv = k_rinv.getUMat(ACCESS_READ), ut = t.getUMat(ACCESS_READ);
+
+            k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap),
+                   ocl::KernelArg::PtrReadOnly(uk_rinv), ocl::KernelArg::PtrReadOnly(ut),
+                   dst_tl.x, dst_tl.y, projector_.scale);
+
+            size_t globalsize[2] = { dsize.width, dsize.height };
+            if (k.run(2, globalsize, NULL, true))
+                return Rect(dst_tl, dst_br);
+        }
+    }
+
+    return PlaneWarper::buildMaps(src_size, K, R, T, xmap, ymap);
+}
+
+Point PlaneWarperOcl::warp(InputArray src, InputArray K, InputArray R, InputArray T, int interp_mode, int border_mode, OutputArray dst)
+{
+    UMat uxmap, uymap;
+    Rect dst_roi = buildMaps(src.size(), K, R, T, uxmap, uymap);
+
+    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
+    UMat udst = dst.getUMat();
+    remap(src, udst, uxmap, uymap, interp_mode, border_mode);
+
+    return dst_roi.tl();
+}
+
+/////////////////////////////////////////// SphericalWarperOcl ////////////////////////////////////////
+
+Rect SphericalWarperOcl::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
+{
+    projector_.setCameraParams(K, R);
+
+    Point dst_tl, dst_br;
+    detectResultRoi(src_size, dst_tl, dst_br);
+
+    if (ocl::useOpenCL())
+    {
+        ocl::Kernel k("buildWarpSphericalMaps", ocl::stitching::warpers_oclsrc);
+        if (!k.empty())
+        {
+            Size dsize(dst_br.x - dst_tl.x + 1, dst_br.y - dst_tl.y + 1);
+            xmap.create(dsize, CV_32FC1);
+            ymap.create(dsize, CV_32FC1);
+
+            Mat k_rinv(1, 9, CV_32FC1, projector_.k_rinv);
+            UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ);
+
+            k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap),
+                   ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, projector_.scale);
+
+            size_t globalsize[2] = { dsize.width, dsize.height };
+            if (k.run(2, globalsize, NULL, true))
+                return Rect(dst_tl, dst_br);
+        }
+    }
+
+    return SphericalWarper::buildMaps(src_size, K, R, xmap, ymap);
+}
+
+Point SphericalWarperOcl::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst)
+{
+    UMat uxmap, uymap;
+    Rect dst_roi = buildMaps(src.size(), K, R, uxmap, uymap);
+
+    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
+    UMat udst = dst.getUMat();
+    remap(src, udst, uxmap, uymap, interp_mode, border_mode);
+
+    return dst_roi.tl();
+}
+
+/////////////////////////////////////////// CylindricalWarperOcl ////////////////////////////////////////
+
+Rect CylindricalWarperOcl::buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap)
+{
+    projector_.setCameraParams(K, R);
+
+    Point dst_tl, dst_br;
+    detectResultRoi(src_size, dst_tl, dst_br);
+
+    if (ocl::useOpenCL())
+    {
+        ocl::Kernel k("buildWarpCylindricalMaps", ocl::stitching::warpers_oclsrc);
+        if (!k.empty())
+        {
+            Size dsize(dst_br.x - dst_tl.x + 1, dst_br.y - dst_tl.y + 1);
+            xmap.create(dsize, CV_32FC1);
+            ymap.create(dsize, CV_32FC1);
+
+            Mat k_rinv(1, 9, CV_32FC1, projector_.k_rinv);
+            UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ);
+
+            k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap),
+                   ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, projector_.scale);
+
+            size_t globalsize[2] = { dsize.width, dsize.height };
+            if (k.run(2, globalsize, NULL, true))
+                return Rect(dst_tl, dst_br);
+        }
+    }
+
+    return CylindricalWarper::buildMaps(src_size, K, R, xmap, ymap);
+}
+
+Point CylindricalWarperOcl::warp(InputArray src, InputArray K, InputArray R, int interp_mode, int border_mode, OutputArray dst)
+{
+    UMat uxmap, uymap;
+    Rect dst_roi = buildMaps(src.size(), K, R, uxmap, uymap);
+
+    dst.create(dst_roi.height + 1, dst_roi.width + 1, src.type());
+    UMat udst = dst.getUMat();
+    remap(src, udst, uxmap, uymap, interp_mode, border_mode);
+
+    return dst_roi.tl();
+}
+
+} // namespace detail
+} // namespace cv
diff --git a/modules/stitching/test/ocl/test_warpers.cpp b/modules/stitching/test/ocl/test_warpers.cpp
new file mode 100644
index 000000000..94050e966
--- /dev/null
+++ b/modules/stitching/test/ocl/test_warpers.cpp
@@ -0,0 +1,155 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+#include "opencv2/stitching/warpers.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////////////////// WarperTestBase ///////////////////////////
+
+struct WarperTestBase :
+        public Test, public TestUtils
+{
+    Mat src, dst, xmap, ymap;
+    Mat udst, uxmap, uymap;
+    Mat K, R;
+
+    virtual void generateTestData()
+    {
+        Size size = randomSize(1, MAX_VALUE);
+
+        src = randomMat(size, CV_32FC1, -500, 500);
+
+        K = Mat::eye(3, 3, CV_32FC1);
+        float angle = (float)(30.0 * CV_PI / 180.0);
+        float rotationMatrix[9] = {
+                (float)cos(angle), (float)sin(angle), 0,
+                (float)-sin(angle), (float)cos(angle), 0,
+                0, 0, 1
+        };
+        Mat(3, 3, CV_32FC1, rotationMatrix).copyTo(R);
+    }
+
+    void Near(double threshold = 0.)
+    {
+        EXPECT_MAT_NEAR(xmap, uxmap, threshold);
+        EXPECT_MAT_NEAR(ymap, uymap, threshold);
+        EXPECT_MAT_NEAR(dst, udst, threshold);
+    }
+};
+
+//////////////////////////////// SphericalWarperOcl /////////////////////////////////////////////////
+
+typedef WarperTestBase SphericalWarperOclTest;
+
+OCL_TEST_F(SphericalWarperOclTest, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        Ptr<WarperCreator> creator = makePtr<SphericalWarperOcl>();
+        Ptr<detail::RotationWarper> warper = creator->create(2.0);
+
+        OCL_OFF(warper->buildMaps(src.size(), K, R, xmap, ymap));
+        OCL_ON(warper->buildMaps(src.size(), K, R, uxmap, uymap));
+
+        OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst));
+        OCL_ON(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, udst));
+
+        Near(1e-4);
+    }
+}
+
+//////////////////////////////// CylindricalWarperOcl /////////////////////////////////////////////////
+
+typedef WarperTestBase CylindricalWarperOclTest;
+
+OCL_TEST_F(CylindricalWarperOclTest, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        Ptr<WarperCreator> creator = makePtr<CylindricalWarperOcl>();
+        Ptr<detail::RotationWarper> warper = creator->create(2.0);
+
+        OCL_OFF(warper->buildMaps(src.size(), K, R, xmap, ymap));
+        OCL_ON(warper->buildMaps(src.size(), K, R, uxmap, uymap));
+
+        OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst));
+        OCL_ON(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, udst));
+
+        Near(1e-4);
+    }
+}
+
+//////////////////////////////// PlaneWarperOcl /////////////////////////////////////////////////
+
+typedef WarperTestBase PlaneWarperOclTest;
+
+OCL_TEST_F(PlaneWarperOclTest, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        Ptr<WarperCreator> creator = makePtr<PlaneWarperOcl>();
+        Ptr<detail::RotationWarper> warper = creator->create(2.0);
+
+        OCL_OFF(warper->buildMaps(src.size(), K, R, xmap, ymap));
+        OCL_ON(warper->buildMaps(src.size(), K, R, uxmap, uymap));
+
+        OCL_OFF(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, dst));
+        OCL_ON(warper->warp(src, K, R, INTER_LINEAR, BORDER_REPLICATE, udst));
+
+        Near(1e-4);
+    }
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/superres/CMakeLists.txt b/modules/superres/CMakeLists.txt
index 1182a3c0a..092b1cd40 100644
--- a/modules/superres/CMakeLists.txt
+++ b/modules/superres/CMakeLists.txt
@@ -3,7 +3,6 @@ if(ANDROID OR IOS)
 endif()
 
 set(the_description "Super Resolution")
-ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 -Wundef)
+ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 -Wundef -Wshadow)
 ocv_define_module(superres opencv_imgproc opencv_video
-                  OPTIONAL opencv_highgui opencv_ocl
-                  opencv_cudaarithm opencv_cudafilters opencv_cudawarping opencv_cudaimgproc opencv_cudaoptflow opencv_cudacodec)
+                  OPTIONAL opencv_highgui opencv_cudaarithm opencv_cudafilters opencv_cudawarping opencv_cudaimgproc opencv_cudaoptflow opencv_cudacodec)
diff --git a/modules/superres/include/opencv2/superres.hpp b/modules/superres/include/opencv2/superres.hpp
index 26de781f8..3d96e0f71 100644
--- a/modules/superres/include/opencv2/superres.hpp
+++ b/modules/superres/include/opencv2/superres.hpp
@@ -83,6 +83,8 @@ namespace cv
             virtual void initImpl(Ptr<FrameSource>& frameSource) = 0;
             virtual void processImpl(Ptr<FrameSource>& frameSource, OutputArray output) = 0;
 
+            bool isUmat_;
+
         private:
             Ptr<FrameSource> frameSource_;
             bool firstCall_;
diff --git a/modules/superres/perf/perf_superres.cpp b/modules/superres/perf/perf_superres.cpp
index 810460bd4..e8b3ef754 100644
--- a/modules/superres/perf/perf_superres.cpp
+++ b/modules/superres/perf/perf_superres.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
 
 using namespace std;
 using namespace std::tr1;
@@ -91,37 +92,26 @@ namespace
     class ZeroOpticalFlow : public DenseOpticalFlowExt
     {
     public:
-        void calc(InputArray frame0, InputArray, OutputArray flow1, OutputArray flow2)
+        virtual void calc(InputArray frame0, InputArray, OutputArray flow1, OutputArray flow2)
         {
             cv::Size size = frame0.size();
 
             if (!flow2.needed())
             {
                 flow1.create(size, CV_32FC2);
-
-                if (flow1.kind() == cv::_InputArray::GPU_MAT)
-                    flow1.getGpuMatRef().setTo(cv::Scalar::all(0));
-                else
-                    flow1.getMatRef().setTo(cv::Scalar::all(0));
+                flow1.setTo(cv::Scalar::all(0));
             }
             else
             {
                 flow1.create(size, CV_32FC1);
                 flow2.create(size, CV_32FC1);
 
-                if (flow1.kind() == cv::_InputArray::GPU_MAT)
-                    flow1.getGpuMatRef().setTo(cv::Scalar::all(0));
-                else
-                    flow1.getMatRef().setTo(cv::Scalar::all(0));
-
-                if (flow2.kind() == cv::_InputArray::GPU_MAT)
-                    flow2.getGpuMatRef().setTo(cv::Scalar::all(0));
-                else
-                    flow2.getMatRef().setTo(cv::Scalar::all(0));
+                flow1.setTo(cv::Scalar::all(0));
+                flow2.setTo(cv::Scalar::all(0));
             }
         }
 
-        void collectGarbage()
+        virtual void collectGarbage()
         {
         }
     };
@@ -181,3 +171,48 @@ PERF_TEST_P(Size_MatType, SuperResolution_BTVL1,
         CPU_SANITY_CHECK(dst);
     }
 }
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+typedef Size_MatType SuperResolution_BTVL1;
+
+OCL_PERF_TEST_P(SuperResolution_BTVL1 ,BTVL1,
+            Combine(Values(szSmall64, szSmall128),
+                    Values(MatType(CV_8UC1), MatType(CV_8UC3))))
+{
+    Size_MatType_t params = GetParam();
+    const Size size = get<0>(params);
+    const int type = get<1>(params);
+
+    Mat frame(size, type);
+    UMat dst(1, 1, 0);
+    declare.in(frame, WARMUP_RNG);
+
+    const int scale = 2;
+    const int iterations = 50;
+    const int temporalAreaRadius = 1;
+
+    Ptr<DenseOpticalFlowExt> opticalFlow(new ZeroOpticalFlow);
+    Ptr<SuperResolution> superRes = createSuperResolution_BTVL1();
+
+    superRes->set("scale", scale);
+    superRes->set("iterations", iterations);
+    superRes->set("temporalAreaRadius", temporalAreaRadius);
+    superRes->set("opticalFlow", opticalFlow);
+
+    superRes->setInput(makePtr<OneFrameSource_CPU>(frame));
+
+    // skip first frame
+    superRes->nextFrame(dst);
+
+    OCL_TEST_CYCLE_N(10) superRes->nextFrame(dst);
+
+    SANITY_CHECK_NOTHING();
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/superres/perf/perf_superres_ocl.cpp b/modules/superres/perf/perf_superres_ocl.cpp
deleted file mode 100644
index 04a3f7e85..000000000
--- a/modules/superres/perf/perf_superres_ocl.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "perf_precomp.hpp"
-
-#ifdef HAVE_OPENCV_OCL
-
-#include "opencv2/ocl.hpp"
-using namespace std;
-using namespace testing;
-using namespace perf;
-using namespace cv;
-using namespace cv::superres;
-
-namespace
-{
-    class OneFrameSource_OCL : public FrameSource
-    {
-    public:
-        explicit OneFrameSource_OCL(const ocl::oclMat& frame) : frame_(frame) {}
-
-        void nextFrame(OutputArray frame)
-        {
-            ocl::getOclMatRef(frame) = frame_;
-        }
-        void reset()
-        {
-        }
-
-    private:
-        ocl::oclMat frame_;
-    };
-
-
-    class ZeroOpticalFlowOCL : public DenseOpticalFlowExt
-    {
-    public:
-        void calc(InputArray frame0, InputArray, OutputArray flow1, OutputArray flow2)
-        {
-            ocl::oclMat& frame0_ = ocl::getOclMatRef(frame0);
-            ocl::oclMat& flow1_ = ocl::getOclMatRef(flow1);
-            ocl::oclMat& flow2_ = ocl::getOclMatRef(flow2);
-
-            cv::Size size = frame0_.size();
-
-            if(!flow2.needed())
-            {
-                flow1_.create(size, CV_32FC2);
-                flow1_.setTo(Scalar::all(0));
-            }
-            else
-            {
-                flow1_.create(size, CV_32FC1);
-                flow2_.create(size, CV_32FC1);
-
-                flow1_.setTo(Scalar::all(0));
-                flow2_.setTo(Scalar::all(0));
-            }
-        }
-
-        void collectGarbage()
-        {
-        }
-    };
-}
-
-PERF_TEST_P(Size_MatType, SuperResolution_BTVL1_OCL,
-    Combine(Values(szSmall64, szSmall128),
-    Values(MatType(CV_8UC1), MatType(CV_8UC3))))
-{
-    declare.time(5 * 60);
-
-    const Size size = std::tr1::get<0>(GetParam());
-    const int type = std::tr1::get<1>(GetParam());
-
-    Mat frame(size, type);
-    declare.in(frame, WARMUP_RNG);
-
-    ocl::oclMat frame_ocl;
-    frame_ocl.upload(frame);
-
-
-    const int scale = 2;
-    const int iterations = 50;
-    const int temporalAreaRadius = 1;
-    Ptr<DenseOpticalFlowExt> opticalFlowOcl(new ZeroOpticalFlowOCL);
-
-    Ptr<SuperResolution> superRes_ocl = createSuperResolution_BTVL1_OCL();
-
-    superRes_ocl->set("scale", scale);
-    superRes_ocl->set("iterations", iterations);
-    superRes_ocl->set("temporalAreaRadius", temporalAreaRadius);
-    superRes_ocl->set("opticalFlow", opticalFlowOcl);
-
-    superRes_ocl->setInput(makePtr<OneFrameSource_OCL>(frame_ocl));
-
-    ocl::oclMat dst_ocl;
-    superRes_ocl->nextFrame(dst_ocl);
-
-    TEST_CYCLE_N(10) superRes_ocl->nextFrame(dst_ocl);
-    frame_ocl.release();
-    CPU_SANITY_CHECK(dst_ocl);
-}
-#endif
diff --git a/modules/superres/src/btv_l1.cpp b/modules/superres/src/btv_l1.cpp
index e0ee7db63..1e4aa48a7 100644
--- a/modules/superres/src/btv_l1.cpp
+++ b/modules/superres/src/btv_l1.cpp
@@ -44,6 +44,7 @@
 // Dennis Mitzel, Thomas Pock, Thomas Schoenemann, Daniel Cremers. Video Super Resolution using Duality Based TV-L1 Optical Flow.
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
 
 using namespace cv;
 using namespace cv::superres;
@@ -51,10 +52,17 @@ using namespace cv::superres::detail;
 
 namespace
 {
-    void calcRelativeMotions(const std::vector<Mat>& forwardMotions, const std::vector<Mat>& backwardMotions,
-                             std::vector<Mat>& relForwardMotions, std::vector<Mat>& relBackwardMotions,
-                             int baseIdx, Size size)
+#ifdef HAVE_OPENCL
+
+    bool ocl_calcRelativeMotions(InputArrayOfArrays _forwardMotions, InputArrayOfArrays _backwardMotions,
+                                 OutputArrayOfArrays _relForwardMotions, OutputArrayOfArrays _relBackwardMotions,
+                                 int baseIdx, const Size & size)
     {
+        std::vector<UMat> & forwardMotions = *(std::vector<UMat> *)_forwardMotions.getObj(),
+                & backwardMotions = *(std::vector<UMat> *)_backwardMotions.getObj(),
+                & relForwardMotions = *(std::vector<UMat> *)_relForwardMotions.getObj(),
+                & relBackwardMotions = *(std::vector<UMat> *)_relBackwardMotions.getObj();
+
         const int count = static_cast<int>(forwardMotions.size());
 
         relForwardMotions.resize(count);
@@ -68,20 +76,84 @@ namespace
         for (int i = baseIdx - 1; i >= 0; --i)
         {
             add(relForwardMotions[i + 1], forwardMotions[i], relForwardMotions[i]);
-
             add(relBackwardMotions[i + 1], backwardMotions[i + 1], relBackwardMotions[i]);
         }
 
         for (int i = baseIdx + 1; i < count; ++i)
         {
             add(relForwardMotions[i - 1], backwardMotions[i], relForwardMotions[i]);
+            add(relBackwardMotions[i - 1], forwardMotions[i - 1], relBackwardMotions[i]);
+        }
 
+        return true;
+    }
+
+#endif
+
+    void calcRelativeMotions(InputArrayOfArrays _forwardMotions, InputArrayOfArrays _backwardMotions,
+                             OutputArrayOfArrays _relForwardMotions, OutputArrayOfArrays _relBackwardMotions,
+                             int baseIdx, const Size & size)
+    {
+        CV_OCL_RUN(_forwardMotions.isUMatVector() && _backwardMotions.isUMatVector() &&
+                   _relForwardMotions.isUMatVector() && _relBackwardMotions.isUMatVector(),
+                   ocl_calcRelativeMotions(_forwardMotions, _backwardMotions, _relForwardMotions,
+                                           _relBackwardMotions, baseIdx, size))
+
+        std::vector<Mat> & forwardMotions = *(std::vector<Mat> *)_forwardMotions.getObj(),
+                & backwardMotions = *(std::vector<Mat> *)_backwardMotions.getObj(),
+                & relForwardMotions = *(std::vector<Mat> *)_relForwardMotions.getObj(),
+                & relBackwardMotions = *(std::vector<Mat> *)_relBackwardMotions.getObj();
+
+        const int count = static_cast<int>(forwardMotions.size());
+
+        relForwardMotions.resize(count);
+        relForwardMotions[baseIdx].create(size, CV_32FC2);
+        relForwardMotions[baseIdx].setTo(Scalar::all(0));
+
+        relBackwardMotions.resize(count);
+        relBackwardMotions[baseIdx].create(size, CV_32FC2);
+        relBackwardMotions[baseIdx].setTo(Scalar::all(0));
+
+        for (int i = baseIdx - 1; i >= 0; --i)
+        {
+            add(relForwardMotions[i + 1], forwardMotions[i], relForwardMotions[i]);
+            add(relBackwardMotions[i + 1], backwardMotions[i + 1], relBackwardMotions[i]);
+        }
+
+        for (int i = baseIdx + 1; i < count; ++i)
+        {
+            add(relForwardMotions[i - 1], backwardMotions[i], relForwardMotions[i]);
             add(relBackwardMotions[i - 1], forwardMotions[i - 1], relBackwardMotions[i]);
         }
     }
+#ifdef HAVE_OPENCL
 
-    void upscaleMotions(const std::vector<Mat>& lowResMotions, std::vector<Mat>& highResMotions, int scale)
+    bool ocl_upscaleMotions(InputArrayOfArrays _lowResMotions, OutputArrayOfArrays _highResMotions, int scale)
     {
+        std::vector<UMat> & lowResMotions = *(std::vector<UMat> *)_lowResMotions.getObj(),
+                & highResMotions = *(std::vector<UMat> *)_highResMotions.getObj();
+
+        highResMotions.resize(lowResMotions.size());
+
+        for (size_t i = 0; i < lowResMotions.size(); ++i)
+        {
+            resize(lowResMotions[i], highResMotions[i], Size(), scale, scale, INTER_LINEAR); // TODO
+            multiply(highResMotions[i], Scalar::all(scale), highResMotions[i]);
+        }
+
+        return true;
+    }
+
+#endif
+
+    void upscaleMotions(InputArrayOfArrays _lowResMotions, OutputArrayOfArrays _highResMotions, int scale)
+    {
+        CV_OCL_RUN(_lowResMotions.isUMatVector() && _highResMotions.isUMatVector(),
+                   ocl_upscaleMotions(_lowResMotions, _highResMotions, scale))
+
+        std::vector<Mat> & lowResMotions = *(std::vector<Mat> *)_lowResMotions.getObj(),
+                & highResMotions = *(std::vector<Mat> *)_highResMotions.getObj();
+
         highResMotions.resize(lowResMotions.size());
 
         for (size_t i = 0; i < lowResMotions.size(); ++i)
@@ -91,10 +163,47 @@ namespace
         }
     }
 
-    void buildMotionMaps(const Mat& forwardMotion, const Mat& backwardMotion, Mat& forwardMap, Mat& backwardMap)
+#ifdef HAVE_OPENCL
+
+    bool ocl_buildMotionMaps(InputArray _forwardMotion, InputArray _backwardMotion,
+                             OutputArray _forwardMap, OutputArray _backwardMap)
     {
-        forwardMap.create(forwardMotion.size(), CV_32FC2);
-        backwardMap.create(forwardMotion.size(), CV_32FC2);
+        ocl::Kernel k("buildMotionMaps", ocl::superres::superres_btvl1_oclsrc);
+        if (k.empty())
+            return false;
+
+        UMat forwardMotion = _forwardMotion.getUMat(), backwardMotion = _backwardMotion.getUMat();
+        Size size = forwardMotion.size();
+
+        _forwardMap.create(size, CV_32FC2);
+        _backwardMap.create(size, CV_32FC2);
+
+        UMat forwardMap = _forwardMap.getUMat(), backwardMap = _backwardMap.getUMat();
+
+        k.args(ocl::KernelArg::ReadOnlyNoSize(forwardMotion),
+               ocl::KernelArg::ReadOnlyNoSize(backwardMotion),
+               ocl::KernelArg::WriteOnlyNoSize(forwardMap),
+               ocl::KernelArg::WriteOnly(backwardMap));
+
+        size_t globalsize[2] = { size.width, size.height };
+        return k.run(2, globalsize, NULL, false);
+    }
+
+#endif
+
+    void buildMotionMaps(InputArray _forwardMotion, InputArray _backwardMotion,
+                         OutputArray _forwardMap, OutputArray _backwardMap)
+    {
+        CV_OCL_RUN(_forwardMap.isUMat() && _backwardMap.isUMat(),
+                   ocl_buildMotionMaps(_forwardMotion, _backwardMotion, _forwardMap,
+                                       _backwardMap));
+
+        Mat forwardMotion = _forwardMotion.getMat(), backwardMotion = _backwardMotion.getMat();
+
+        _forwardMap.create(forwardMotion.size(), CV_32FC2);
+        _backwardMap.create(forwardMotion.size(), CV_32FC2);
+
+        Mat forwardMap = _forwardMap.getMat(), backwardMap = _backwardMap.getMat();
 
         for (int y = 0; y < forwardMotion.rows; ++y)
         {
@@ -114,40 +223,73 @@ namespace
     }
 
     template <typename T>
-    void upscaleImpl(const Mat& src, Mat& dst, int scale)
+    void upscaleImpl(InputArray _src, OutputArray _dst, int scale)
     {
-        dst.create(src.rows * scale, src.cols * scale, src.type());
-        dst.setTo(Scalar::all(0));
+        Mat src = _src.getMat();
+        _dst.create(src.rows * scale, src.cols * scale, src.type());
+        _dst.setTo(Scalar::all(0));
+        Mat dst = _dst.getMat();
 
         for (int y = 0, Y = 0; y < src.rows; ++y, Y += scale)
         {
-            const T* srcRow = src.ptr<T>(y);
-            T* dstRow = dst.ptr<T>(Y);
+            const T * const srcRow = src.ptr<T>(y);
+            T * const dstRow = dst.ptr<T>(Y);
 
             for (int x = 0, X = 0; x < src.cols; ++x, X += scale)
                 dstRow[X] = srcRow[x];
         }
     }
 
-    void upscale(const Mat& src, Mat& dst, int scale)
+#ifdef HAVE_OPENCL
+
+    static bool ocl_upscale(InputArray _src, OutputArray _dst, int scale)
     {
-        typedef void (*func_t)(const Mat& src, Mat& dst, int scale);
-        static const func_t funcs[] =
-        {
-            0, upscaleImpl<float>, 0, upscaleImpl<Point3f>
-        };
+        int type = _src.type(), cn = CV_MAT_CN(type);
+        ocl::Kernel k("upscale", ocl::superres::superres_btvl1_oclsrc,
+                      format("-D cn=%d", cn));
+        if (k.empty())
+            return false;
 
-        CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
+        UMat src = _src.getUMat();
+        _dst.create(src.rows * scale, src.cols * scale, type);
+        _dst.setTo(Scalar::all(0));
+        UMat dst = _dst.getUMat();
 
-        const func_t func = funcs[src.channels()];
+        k.args(ocl::KernelArg::ReadOnly(src),
+               ocl::KernelArg::ReadWriteNoSize(dst), scale);
 
-        func(src, dst, scale);
+        size_t globalsize[2] = { src.cols, src.rows };
+        return k.run(2, globalsize, NULL, false);
     }
 
-    float diffSign(float a, float b)
+#endif
+
+    typedef struct _Point4f { float ar[4]; } Point4f;
+
+    void upscale(InputArray _src, OutputArray _dst, int scale)
+    {
+        int cn = _src.channels();
+        CV_Assert( cn == 1 || cn == 3 || cn == 4 );
+
+        CV_OCL_RUN(_dst.isUMat(),
+                   ocl_upscale(_src, _dst, scale))
+
+        typedef void (*func_t)(InputArray src, OutputArray dst, int scale);
+        static const func_t funcs[] =
+        {
+            0, upscaleImpl<float>, 0, upscaleImpl<Point3f>, upscaleImpl<Point4f>
+        };
+
+        const func_t func = funcs[cn];
+        CV_Assert(func != 0);
+        func(_src, _dst, scale);
+    }
+
+    inline float diffSign(float a, float b)
     {
         return a > b ? 1.0f : a < b ? -1.0f : 0.0f;
     }
+
     Point3f diffSign(Point3f a, Point3f b)
     {
         return Point3f(
@@ -157,16 +299,44 @@ namespace
         );
     }
 
-    void diffSign(const Mat& src1, const Mat& src2, Mat& dst)
-    {
-        const int count = src1.cols * src1.channels();
+#ifdef HAVE_OPENCL
 
-        dst.create(src1.size(), src1.type());
+    static bool ocl_diffSign(InputArray _src1, OutputArray _src2, OutputArray _dst)
+    {
+        ocl::Kernel k("diffSign", ocl::superres::superres_btvl1_oclsrc);
+        if (k.empty())
+            return false;
+
+        UMat src1 = _src1.getUMat(), src2 = _src2.getUMat();
+        _dst.create(src1.size(), src1.type());
+        UMat dst = _dst.getUMat();
+
+        int cn = src1.channels();
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src1),
+               ocl::KernelArg::ReadOnlyNoSize(src2),
+               ocl::KernelArg::WriteOnly(dst, cn));
+
+        size_t globalsize[2] = { src1.cols * cn, src1.rows };
+        return k.run(2, globalsize, NULL, false);
+    }
+
+#endif
+
+    void diffSign(InputArray _src1, OutputArray _src2, OutputArray _dst)
+    {
+        CV_OCL_RUN(_dst.isUMat(),
+                   ocl_diffSign(_src1, _src2, _dst))
+
+        Mat src1 = _src1.getMat(), src2 = _src2.getMat();
+        _dst.create(src1.size(), src1.type());
+        Mat dst = _dst.getMat();
+
+        const int count = src1.cols * src1.channels();
 
         for (int y = 0; y < src1.rows; ++y)
         {
-            const float* src1Ptr = src1.ptr<float>(y);
-            const float* src2Ptr = src2.ptr<float>(y);
+            const float * const src1Ptr = src1.ptr<float>(y);
+            const float * const src2Ptr = src2.ptr<float>(y);
             float* dstPtr = dst.ptr<float>(y);
 
             for (int x = 0; x < count; ++x)
@@ -206,8 +376,8 @@ namespace
     {
         for (int i = range.start; i < range.end; ++i)
         {
-            const T* srcRow = src.ptr<T>(i);
-            T* dstRow = dst.ptr<T>(i);
+            const T * const srcRow = src.ptr<T>(i);
+            T * const dstRow = dst.ptr<T>(i);
 
             for(int j = ksize; j < src.cols - ksize; ++j)
             {
@@ -219,19 +389,20 @@ namespace
                     const T* srcRow3 = src.ptr<T>(i + m);
 
                     for (int l = ksize; l + m >= 0; --l, ++ind)
-                    {
-                        dstRow[j] += btvWeights[ind] * (diffSign(srcVal, srcRow3[j + l]) - diffSign(srcRow2[j - l], srcVal));
-                    }
+                        dstRow[j] += btvWeights[ind] * (diffSign(srcVal, srcRow3[j + l])
+                                                        - diffSign(srcRow2[j - l], srcVal));
                 }
             }
         }
     }
 
     template <typename T>
-    void calcBtvRegularizationImpl(const Mat& src, Mat& dst, int btvKernelSize, const std::vector<float>& btvWeights)
+    void calcBtvRegularizationImpl(InputArray _src, OutputArray _dst, int btvKernelSize, const std::vector<float>& btvWeights)
     {
-        dst.create(src.size(), src.type());
-        dst.setTo(Scalar::all(0));
+        Mat src = _src.getMat();
+        _dst.create(src.size(), src.type());
+        _dst.setTo(Scalar::all(0));
+        Mat dst = _dst.getMat();
 
         const int ksize = (btvKernelSize - 1) / 2;
 
@@ -245,17 +416,48 @@ namespace
         parallel_for_(Range(ksize, src.rows - ksize), body);
     }
 
-    void calcBtvRegularization(const Mat& src, Mat& dst, int btvKernelSize, const std::vector<float>& btvWeights)
+#ifdef HAVE_OPENCL
+
+    static bool ocl_calcBtvRegularization(InputArray _src, OutputArray _dst, int btvKernelSize, const UMat & ubtvWeights)
     {
-        typedef void (*func_t)(const Mat& src, Mat& dst, int btvKernelSize, const std::vector<float>& btvWeights);
+        int cn = _src.channels();
+        ocl::Kernel k("calcBtvRegularization", ocl::superres::superres_btvl1_oclsrc,
+                      format("-D cn=%d", cn));
+        if (k.empty())
+            return false;
+
+        UMat src = _src.getUMat();
+        _dst.create(src.size(), src.type());
+        _dst.setTo(Scalar::all(0));
+        UMat dst = _dst.getUMat();
+
+        const int ksize = (btvKernelSize - 1) / 2;
+
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst),
+              ksize, ocl::KernelArg::PtrReadOnly(ubtvWeights));
+
+        size_t globalsize[2] = { src.cols, src.rows };
+        return k.run(2, globalsize, NULL, false);
+    }
+
+#endif
+
+    void calcBtvRegularization(InputArray _src, OutputArray _dst, int btvKernelSize,
+                               const std::vector<float>& btvWeights, const UMat & ubtvWeights)
+    {
+        CV_OCL_RUN(_dst.isUMat(),
+                   ocl_calcBtvRegularization(_src, _dst, btvKernelSize, ubtvWeights))
+        (void)ubtvWeights;
+
+        typedef void (*func_t)(InputArray _src, OutputArray _dst, int btvKernelSize, const std::vector<float>& btvWeights);
         static const func_t funcs[] =
         {
-            0, calcBtvRegularizationImpl<float>, 0, calcBtvRegularizationImpl<Point3f>
+            0, calcBtvRegularizationImpl<float>, 0, calcBtvRegularizationImpl<Point3f>, 0
         };
 
-        const func_t func = funcs[src.channels()];
-
-        func(src, dst, btvKernelSize, btvWeights);
+        const func_t func = funcs[_src.channels()];
+        CV_Assert(func != 0);
+        func(_src, _dst, btvKernelSize, btvWeights);
     }
 
     class BTVL1_Base
@@ -263,9 +465,8 @@ namespace
     public:
         BTVL1_Base();
 
-        void process(const std::vector<Mat>& src, Mat& dst,
-                     const std::vector<Mat>& forwardMotions, const std::vector<Mat>& backwardMotions,
-                     int baseIdx);
+        void process(InputArrayOfArrays src, OutputArray dst, InputArrayOfArrays forwardMotions,
+                     InputArrayOfArrays backwardMotions, int baseIdx);
 
         void collectGarbage();
 
@@ -281,15 +482,21 @@ namespace
         Ptr<DenseOpticalFlowExt> opticalFlow_;
 
     private:
+        bool ocl_process(InputArrayOfArrays src, OutputArray dst, InputArrayOfArrays forwardMotions,
+                         InputArrayOfArrays backwardMotions, int baseIdx);
+
         Ptr<FilterEngine> filter_;
         int curBlurKernelSize_;
         double curBlurSigma_;
         int curSrcType_;
 
         std::vector<float> btvWeights_;
+        UMat ubtvWeights_;
+
         int curBtvKernelSize_;
         double curAlpha_;
 
+        // Mat
         std::vector<Mat> lowResForwardMotions_;
         std::vector<Mat> lowResBackwardMotions_;
 
@@ -303,6 +510,23 @@ namespace
 
         Mat diffTerm_, regTerm_;
         Mat a_, b_, c_;
+
+#ifdef HAVE_OPENCL
+        // UMat
+        std::vector<UMat> ulowResForwardMotions_;
+        std::vector<UMat> ulowResBackwardMotions_;
+
+        std::vector<UMat> uhighResForwardMotions_;
+        std::vector<UMat> uhighResBackwardMotions_;
+
+        std::vector<UMat> uforwardMaps_;
+        std::vector<UMat> ubackwardMaps_;
+
+        UMat uhighRes_;
+
+        UMat udiffTerm_, uregTerm_;
+        UMat ua_, ub_, uc_;
+#endif
     };
 
     BTVL1_Base::BTVL1_Base()
@@ -325,7 +549,101 @@ namespace
         curAlpha_ = -1.0;
     }
 
-    void BTVL1_Base::process(const std::vector<Mat>& src, Mat& dst, const std::vector<Mat>& forwardMotions, const std::vector<Mat>& backwardMotions, int baseIdx)
+#ifdef HAVE_OPENCL
+
+    bool BTVL1_Base::ocl_process(InputArrayOfArrays _src, OutputArray _dst, InputArrayOfArrays _forwardMotions,
+                                 InputArrayOfArrays _backwardMotions, int baseIdx)
+    {
+        std::vector<UMat> & src = *(std::vector<UMat> *)_src.getObj(),
+                & forwardMotions = *(std::vector<UMat> *)_forwardMotions.getObj(),
+                & backwardMotions = *(std::vector<UMat> *)_backwardMotions.getObj();
+
+        // update blur filter and btv weights
+        if (!filter_ || blurKernelSize_ != curBlurKernelSize_ || blurSigma_ != curBlurSigma_ || src[0].type() != curSrcType_)
+        {
+            filter_ = createGaussianFilter(src[0].type(), Size(blurKernelSize_, blurKernelSize_), blurSigma_);
+            curBlurKernelSize_ = blurKernelSize_;
+            curBlurSigma_ = blurSigma_;
+            curSrcType_ = src[0].type();
+        }
+
+        if (btvWeights_.empty() || btvKernelSize_ != curBtvKernelSize_ || alpha_ != curAlpha_)
+        {
+            calcBtvWeights(btvKernelSize_, alpha_, btvWeights_);
+            Mat(btvWeights_, true).copyTo(ubtvWeights_);
+
+            curBtvKernelSize_ = btvKernelSize_;
+            curAlpha_ = alpha_;
+        }
+
+        // calc high res motions
+        calcRelativeMotions(forwardMotions, backwardMotions, ulowResForwardMotions_, ulowResBackwardMotions_, baseIdx, src[0].size());
+
+        upscaleMotions(ulowResForwardMotions_, uhighResForwardMotions_, scale_);
+        upscaleMotions(ulowResBackwardMotions_, uhighResBackwardMotions_, scale_);
+
+        uforwardMaps_.resize(uhighResForwardMotions_.size());
+        ubackwardMaps_.resize(uhighResForwardMotions_.size());
+        for (size_t i = 0; i < uhighResForwardMotions_.size(); ++i)
+            buildMotionMaps(uhighResForwardMotions_[i], uhighResBackwardMotions_[i], uforwardMaps_[i], ubackwardMaps_[i]);
+
+        // initial estimation
+        const Size lowResSize = src[0].size();
+        const Size highResSize(lowResSize.width * scale_, lowResSize.height * scale_);
+
+        resize(src[baseIdx], uhighRes_, highResSize, 0, 0, INTER_LINEAR); // TODO
+
+        // iterations
+        udiffTerm_.create(highResSize, uhighRes_.type());
+        ua_.create(highResSize, uhighRes_.type());
+        ub_.create(highResSize, uhighRes_.type());
+        uc_.create(lowResSize, uhighRes_.type());
+
+        for (int i = 0; i < iterations_; ++i)
+        {
+            udiffTerm_.setTo(Scalar::all(0));
+
+            for (size_t k = 0; k < src.size(); ++k)
+            {
+                // a = M * Ih
+                remap(uhighRes_, ua_, ubackwardMaps_[k], noArray(), INTER_NEAREST);
+                // b = HM * Ih
+                GaussianBlur(ua_, ub_, Size(blurKernelSize_, blurKernelSize_), blurSigma_);
+                // c = DHM * Ih
+                resize(ub_, uc_, lowResSize, 0, 0, INTER_NEAREST);
+
+                diffSign(src[k], uc_, uc_);
+
+                // a = Dt * diff
+                upscale(uc_, ua_, scale_);
+
+                // b = HtDt * diff
+                GaussianBlur(ua_, ub_, Size(blurKernelSize_, blurKernelSize_), blurSigma_);
+                // a = MtHtDt * diff
+                remap(ub_, ua_, uforwardMaps_[k], noArray(), INTER_NEAREST);
+
+                add(udiffTerm_, ua_, udiffTerm_);
+            }
+
+            if (lambda_ > 0)
+            {
+                calcBtvRegularization(uhighRes_, uregTerm_, btvKernelSize_, btvWeights_, ubtvWeights_);
+                addWeighted(udiffTerm_, 1.0, uregTerm_, -lambda_, 0.0, udiffTerm_);
+            }
+
+            addWeighted(uhighRes_, 1.0, udiffTerm_, tau_, 0.0, uhighRes_);
+        }
+
+        Rect inner(btvKernelSize_, btvKernelSize_, uhighRes_.cols - 2 * btvKernelSize_, uhighRes_.rows - 2 * btvKernelSize_);
+        uhighRes_(inner).copyTo(_dst);
+
+        return true;
+    }
+
+#endif
+
+    void BTVL1_Base::process(InputArrayOfArrays _src, OutputArray _dst, InputArrayOfArrays _forwardMotions,
+                             InputArrayOfArrays _backwardMotions, int baseIdx)
     {
         CV_Assert( scale_ > 1 );
         CV_Assert( iterations_ > 0 );
@@ -335,8 +653,15 @@ namespace
         CV_Assert( blurKernelSize_ > 0 );
         CV_Assert( blurSigma_ >= 0.0 );
 
-        // update blur filter and btv weights
+        CV_OCL_RUN(_src.isUMatVector() && _dst.isUMat() && _forwardMotions.isUMatVector() &&
+                   _backwardMotions.isUMatVector(),
+                   ocl_process(_src, _dst, _forwardMotions, _backwardMotions, baseIdx))
 
+        std::vector<Mat> & src = *(std::vector<Mat> *)_src.getObj(),
+                & forwardMotions = *(std::vector<Mat> *)_forwardMotions.getObj(),
+                & backwardMotions = *(std::vector<Mat> *)_backwardMotions.getObj();
+
+        // update blur filter and btv weights
         if (!filter_ || blurKernelSize_ != curBlurKernelSize_ || blurSigma_ != curBlurSigma_ || src[0].type() != curSrcType_)
         {
             filter_ = createGaussianFilter(src[0].type(), Size(blurKernelSize_, blurKernelSize_), blurSigma_);
@@ -353,7 +678,6 @@ namespace
         }
 
         // calc high res motions
-
         calcRelativeMotions(forwardMotions, backwardMotions, lowResForwardMotions_, lowResBackwardMotions_, baseIdx, src[0].size());
 
         upscaleMotions(lowResForwardMotions_, highResForwardMotions_, scale_);
@@ -365,14 +689,12 @@ namespace
             buildMotionMaps(highResForwardMotions_[i], highResBackwardMotions_[i], forwardMaps_[i], backwardMaps_[i]);
 
         // initial estimation
-
         const Size lowResSize = src[0].size();
         const Size highResSize(lowResSize.width * scale_, lowResSize.height * scale_);
 
         resize(src[baseIdx], highRes_, highResSize, 0, 0, INTER_CUBIC);
 
         // iterations
-
         diffTerm_.create(highResSize, highRes_.type());
         a_.create(highResSize, highRes_.type());
         b_.create(highResSize, highRes_.type());
@@ -405,7 +727,7 @@ namespace
 
             if (lambda_ > 0)
             {
-                calcBtvRegularization(highRes_, regTerm_, btvKernelSize_, btvWeights_);
+                calcBtvRegularization(highRes_, regTerm_, btvKernelSize_, btvWeights_, ubtvWeights_);
                 addWeighted(diffTerm_, 1.0, regTerm_, -lambda_, 0.0, diffTerm_);
             }
 
@@ -413,13 +735,14 @@ namespace
         }
 
         Rect inner(btvKernelSize_, btvKernelSize_, highRes_.cols - 2 * btvKernelSize_, highRes_.rows - 2 * btvKernelSize_);
-        highRes_(inner).copyTo(dst);
+        highRes_(inner).copyTo(_dst);
     }
 
     void BTVL1_Base::collectGarbage()
     {
         filter_.release();
 
+        // Mat
         lowResForwardMotions_.clear();
         lowResBackwardMotions_.clear();
 
@@ -436,11 +759,32 @@ namespace
         a_.release();
         b_.release();
         c_.release();
+
+#ifdef HAVE_OPENCL
+        // UMat
+        ulowResForwardMotions_.clear();
+        ulowResBackwardMotions_.clear();
+
+        uhighResForwardMotions_.clear();
+        uhighResBackwardMotions_.clear();
+
+        uforwardMaps_.clear();
+        ubackwardMaps_.clear();
+
+        uhighRes_.release();
+
+        udiffTerm_.release();
+        uregTerm_.release();
+        ua_.release();
+        ub_.release();
+        uc_.release();
+#endif
     }
 
 ////////////////////////////////////////////////////////////////////
 
-    class BTVL1 : public SuperResolution, private BTVL1_Base
+    class BTVL1 :
+            public SuperResolution, private BTVL1_Base
     {
     public:
         AlgorithmInfo* info() const;
@@ -451,14 +795,25 @@ namespace
 
     protected:
         void initImpl(Ptr<FrameSource>& frameSource);
+        bool ocl_initImpl(Ptr<FrameSource>& frameSource);
+
         void processImpl(Ptr<FrameSource>& frameSource, OutputArray output);
+        bool ocl_processImpl(Ptr<FrameSource>& frameSource, OutputArray output);
 
     private:
         int temporalAreaRadius_;
 
         void readNextFrame(Ptr<FrameSource>& frameSource);
-        void processFrame(int idx);
+        bool ocl_readNextFrame(Ptr<FrameSource>& frameSource);
 
+        void processFrame(int idx);
+        bool ocl_processFrame(int idx);
+
+        int storePos_;
+        int procPos_;
+        int outPos_;
+
+        // Mat
         Mat curFrame_;
         Mat prevFrame_;
 
@@ -467,14 +822,25 @@ namespace
         std::vector<Mat> backwardMotions_;
         std::vector<Mat> outputs_;
 
-        int storePos_;
-        int procPos_;
-        int outPos_;
-
         std::vector<Mat> srcFrames_;
         std::vector<Mat> srcForwardMotions_;
         std::vector<Mat> srcBackwardMotions_;
         Mat finalOutput_;
+
+#ifdef HAVE_OPENCL
+        // UMat
+        UMat ucurFrame_;
+        UMat uprevFrame_;
+
+        std::vector<UMat> uframes_;
+        std::vector<UMat> uforwardMotions_;
+        std::vector<UMat> ubackwardMotions_;
+        std::vector<UMat> uoutputs_;
+
+        std::vector<UMat> usrcFrames_;
+        std::vector<UMat> usrcForwardMotions_;
+        std::vector<UMat> usrcBackwardMotions_;
+#endif
     };
 
     CV_INIT_ALGORITHM(BTVL1, "SuperResolution.BTVL1",
@@ -487,7 +853,7 @@ namespace
                       obj.info()->addParam(obj, "blurKernelSize", obj.blurKernelSize_, false, 0, 0, "Gaussian blur kernel size.");
                       obj.info()->addParam(obj, "blurSigma", obj.blurSigma_, false, 0, 0, "Gaussian blur sigma.");
                       obj.info()->addParam(obj, "temporalAreaRadius", obj.temporalAreaRadius_, false, 0, 0, "Radius of the temporal search area.");
-                      obj.info()->addParam<DenseOpticalFlowExt>(obj, "opticalFlow", obj.opticalFlow_, false, 0, 0, "Dense optical flow algorithm."));
+                      obj.info()->addParam<DenseOpticalFlowExt>(obj, "opticalFlow", obj.opticalFlow_, false, 0, 0, "Dense optical flow algorithm."))
 
     BTVL1::BTVL1()
     {
@@ -496,6 +862,7 @@ namespace
 
     void BTVL1::collectGarbage()
     {
+        // Mat
         curFrame_.release();
         prevFrame_.release();
 
@@ -509,10 +876,52 @@ namespace
         srcBackwardMotions_.clear();
         finalOutput_.release();
 
+#ifdef HAVE_OPENCL
+        // UMat
+        ucurFrame_.release();
+        uprevFrame_.release();
+
+        uframes_.clear();
+        uforwardMotions_.clear();
+        ubackwardMotions_.clear();
+        uoutputs_.clear();
+
+        usrcFrames_.clear();
+        usrcForwardMotions_.clear();
+        usrcBackwardMotions_.clear();
+#endif
+
         SuperResolution::collectGarbage();
         BTVL1_Base::collectGarbage();
     }
 
+#ifdef HAVE_OPENCL
+
+    bool BTVL1::ocl_initImpl(Ptr<FrameSource>& frameSource)
+    {
+        const int cacheSize = 2 * temporalAreaRadius_ + 1;
+
+        uframes_.resize(cacheSize);
+        uforwardMotions_.resize(cacheSize);
+        ubackwardMotions_.resize(cacheSize);
+        uoutputs_.resize(cacheSize);
+
+        storePos_ = -1;
+
+        for (int t = -temporalAreaRadius_; t <= temporalAreaRadius_; ++t)
+            readNextFrame(frameSource);
+
+        for (int i = 0; i <= temporalAreaRadius_; ++i)
+            processFrame(i);
+
+        procPos_ = temporalAreaRadius_;
+        outPos_ = -1;
+
+        return true;
+    }
+
+#endif
+
     void BTVL1::initImpl(Ptr<FrameSource>& frameSource)
     {
         const int cacheSize = 2 * temporalAreaRadius_ + 1;
@@ -522,6 +931,9 @@ namespace
         backwardMotions_.resize(cacheSize);
         outputs_.resize(cacheSize);
 
+        CV_OCL_RUN(isUmat_,
+                   ocl_initImpl(frameSource))
+
         storePos_ = -1;
 
         for (int t = -temporalAreaRadius_; t <= temporalAreaRadius_; ++t)
@@ -534,6 +946,18 @@ namespace
         outPos_ = -1;
     }
 
+#ifdef HAVE_OPENCL
+
+    bool BTVL1::ocl_processImpl(Ptr<FrameSource>& /*frameSource*/, OutputArray _output)
+    {
+        const UMat& curOutput = at(outPos_, uoutputs_);
+        curOutput.convertTo(_output, CV_8U);
+
+        return true;
+    }
+
+#endif
+
     void BTVL1::processImpl(Ptr<FrameSource>& frameSource, OutputArray _output)
     {
         if (outPos_ >= storePos_)
@@ -549,11 +973,14 @@ namespace
             ++procPos_;
             processFrame(procPos_);
         }
-
         ++outPos_;
+
+        CV_OCL_RUN(isUmat_,
+                   ocl_processImpl(frameSource, _output))
+
         const Mat& curOutput = at(outPos_, outputs_);
 
-        if (_output.kind() < _InputArray::OPENGL_BUFFER)
+        if (_output.kind() < _InputArray::OPENGL_BUFFER || _output.isUMat())
             curOutput.convertTo(_output, CV_8U);
         else
         {
@@ -562,14 +989,41 @@ namespace
         }
     }
 
+#ifdef HAVE_OPENCL
+
+    bool BTVL1::ocl_readNextFrame(Ptr<FrameSource>& /*frameSource*/)
+    {
+        ucurFrame_.convertTo(at(storePos_, uframes_), CV_32F);
+
+        if (storePos_ > 0)
+        {
+            opticalFlow_->calc(uprevFrame_, ucurFrame_, at(storePos_ - 1, uforwardMotions_));
+            opticalFlow_->calc(ucurFrame_, uprevFrame_, at(storePos_, ubackwardMotions_));
+        }
+
+        ucurFrame_.copyTo(uprevFrame_);
+        return true;
+    }
+
+#endif
+
     void BTVL1::readNextFrame(Ptr<FrameSource>& frameSource)
     {
         frameSource->nextFrame(curFrame_);
-
         if (curFrame_.empty())
             return;
 
+#ifdef HAVE_OPENCL
+        if (isUmat_ && curFrame_.channels() == 1)
+            curFrame_.copyTo(ucurFrame_);
+        else
+            isUmat_ = false;
+#endif
         ++storePos_;
+
+        CV_OCL_RUN(isUmat_,
+                   ocl_readNextFrame(frameSource))
+
         curFrame_.convertTo(at(storePos_, frames_), CV_32F);
 
         if (storePos_ > 0)
@@ -581,8 +1035,47 @@ namespace
         curFrame_.copyTo(prevFrame_);
     }
 
+#ifdef HAVE_OPENCL
+
+    bool BTVL1::ocl_processFrame(int idx)
+    {
+        const int startIdx = std::max(idx - temporalAreaRadius_, 0);
+        const int procIdx = idx;
+        const int endIdx = std::min(startIdx + 2 * temporalAreaRadius_, storePos_);
+
+        const int count = endIdx - startIdx + 1;
+
+        usrcFrames_.resize(count);
+        usrcForwardMotions_.resize(count);
+        usrcBackwardMotions_.resize(count);
+
+        int baseIdx = -1;
+
+        for (int i = startIdx, k = 0; i <= endIdx; ++i, ++k)
+        {
+            if (i == procIdx)
+                baseIdx = k;
+
+            usrcFrames_[k] = at(i, uframes_);
+
+            if (i < endIdx)
+                usrcForwardMotions_[k] = at(i, uforwardMotions_);
+            if (i > startIdx)
+                usrcBackwardMotions_[k] = at(i, ubackwardMotions_);
+        }
+
+        process(usrcFrames_, at(idx, uoutputs_), usrcForwardMotions_, usrcBackwardMotions_, baseIdx);
+
+        return true;
+    }
+
+#endif
+
     void BTVL1::processFrame(int idx)
     {
+        CV_OCL_RUN(isUmat_,
+                   ocl_processFrame(idx))
+
         const int startIdx = std::max(idx - temporalAreaRadius_, 0);
         const int procIdx = idx;
         const int endIdx = std::min(startIdx + 2 * temporalAreaRadius_, storePos_);
diff --git a/modules/superres/src/btv_l1_ocl.cpp b/modules/superres/src/btv_l1_ocl.cpp
deleted file mode 100644
index 7fd6741e8..000000000
--- a/modules/superres/src/btv_l1_ocl.cpp
+++ /dev/null
@@ -1,725 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-//		Jin Ma, jin@multicorewareinc.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-// S. Farsiu , D. Robinson, M. Elad, P. Milanfar. Fast and robust multiframe super resolution.
-// Dennis Mitzel, Thomas Pock, Thomas Schoenemann, Daniel Cremers. Video Super Resolution using Duality Based TV-L1 Optical Flow.
-
-#include "precomp.hpp"
-
-#if !defined(HAVE_OPENCL) || !defined(HAVE_OPENCV_OCL)
-
-cv::Ptr<cv::superres::SuperResolution> cv::superres::createSuperResolution_BTVL1_OCL()
-{
-    CV_Error(cv::Error::StsNotImplemented, "The called functionality is disabled for current build or platform");
-    return Ptr<SuperResolution>();
-}
-
-#else
-#include "opencl_kernels.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::ocl;
-using namespace cv::superres;
-using namespace cv::superres::detail;
-
-static ProgramEntry superres_btvl1 = cv::ocl::superres::superres_btvl1;
-
-namespace cv
-{
-    namespace ocl
-    {
-        float* btvWeights_ = NULL;
-        size_t btvWeights_size = 0;
-        oclMat c_btvRegWeights;
-    }
-}
-
-namespace btv_l1_device_ocl
-{
-    void buildMotionMaps(const oclMat& forwardMotionX, const oclMat& forwardMotionY,
-        const oclMat& backwardMotionX, const oclMat& bacwardMotionY,
-        oclMat& forwardMapX, oclMat& forwardMapY,
-        oclMat& backwardMapX, oclMat& backwardMapY);
-
-    void upscale(const oclMat& src, oclMat& dst, int scale);
-
-    void diffSign(const oclMat& src1, const oclMat& src2, oclMat& dst);
-
-    void calcBtvRegularization(const oclMat& src, oclMat& dst, int ksize);
-}
-
-void btv_l1_device_ocl::buildMotionMaps(const oclMat& forwardMotionX, const oclMat& forwardMotionY,
-    const oclMat& backwardMotionX, const oclMat& backwardMotionY,
-    oclMat& forwardMapX, oclMat& forwardMapY,
-    oclMat& backwardMapX, oclMat& backwardMapY)
-{
-    Context* clCxt = Context::getContext();
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {forwardMapX.cols, forwardMapX.rows, 1};
-
-    int forwardMotionX_step = (int)(forwardMotionX.step/forwardMotionX.elemSize());
-    int forwardMotionY_step = (int)(forwardMotionY.step/forwardMotionY.elemSize());
-    int backwardMotionX_step = (int)(backwardMotionX.step/backwardMotionX.elemSize());
-    int backwardMotionY_step = (int)(backwardMotionY.step/backwardMotionY.elemSize());
-    int forwardMapX_step = (int)(forwardMapX.step/forwardMapX.elemSize());
-    int forwardMapY_step = (int)(forwardMapY.step/forwardMapY.elemSize());
-    int backwardMapX_step = (int)(backwardMapX.step/backwardMapX.elemSize());
-    int backwardMapY_step = (int)(backwardMapY.step/backwardMapY.elemSize());
-
-    String kernel_name = "buildMotionMapsKernel";
-    vector< pair<size_t, const void*> > args;
-
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&forwardMotionX.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&forwardMotionY.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&backwardMotionX.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&backwardMotionY.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&forwardMapX.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&forwardMapY.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&backwardMapX.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&backwardMapY.data));
-
-    args.push_back(make_pair(sizeof(cl_int), (void*)&forwardMotionX.rows));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&forwardMotionY.cols));
-
-    args.push_back(make_pair(sizeof(cl_int), (void*)&forwardMotionX_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&forwardMotionY_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&backwardMotionX_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&backwardMotionY_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&forwardMapX_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&forwardMapY_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&backwardMapX_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&backwardMapY_step));
-
-    openCLExecuteKernel(clCxt, &superres_btvl1, kernel_name, global_thread, local_thread, args, -1, -1);
-}
-
-void btv_l1_device_ocl::upscale(const oclMat& src, oclMat& dst, int scale)
-{
-    Context* clCxt = Context::getContext();
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {src.cols, src.rows, 1};
-
-    int src_step = (int)(src.step/src.elemSize());
-    int dst_step = (int)(dst.step/dst.elemSize());
-
-    String kernel_name = "upscaleKernel";
-    vector< pair<size_t, const void*> > args;
-
-    int cn = src.oclchannels();
-
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&src.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&dst.data));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&dst_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src.rows));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src.cols));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&scale));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&cn));
-
-    openCLExecuteKernel(clCxt, &superres_btvl1, kernel_name, global_thread, local_thread, args, -1, -1);
-
-}
-
-void btv_l1_device_ocl::diffSign(const oclMat& src1, const oclMat& src2, oclMat& dst)
-{
-    Context* clCxt = Context::getContext();
-
-    oclMat src1_ = src1.reshape(1);
-    oclMat src2_ = src2.reshape(1);
-    oclMat dst_ = dst.reshape(1);
-
-    int src1_step = (int)(src1_.step/src1_.elemSize());
-    int src2_step = (int)(src2_.step/src2_.elemSize());
-    int dst_step = (int)(dst_.step/dst_.elemSize());
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {src1_.cols, src1_.rows, 1};
-
-    String kernel_name = "diffSignKernel";
-    vector< pair<size_t, const void*> > args;
-
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&src1_.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&src2_.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&dst_.data));
-
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src1_.rows));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src1_.cols));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&dst_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src1_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src2_step));
-
-    openCLExecuteKernel(clCxt, &superres_btvl1, kernel_name, global_thread, local_thread, args, -1, -1);
-}
-
-void btv_l1_device_ocl::calcBtvRegularization(const oclMat& src, oclMat& dst, int ksize)
-{
-    Context* clCxt = Context::getContext();
-
-    oclMat src_ = src.reshape(1);
-    oclMat dst_ = dst.reshape(1);
-
-    size_t local_thread[] = {32, 8, 1};
-    size_t global_thread[] = {src.cols, src.rows, 1};
-
-    int src_step = (int)(src_.step/src_.elemSize());
-    int dst_step = (int)(dst_.step/dst_.elemSize());
-
-    String kernel_name = "calcBtvRegularizationKernel";
-    vector< pair<size_t, const void*> > args;
-
-    int cn = src.oclchannels();
-
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&src_.data));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&dst_.data));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&dst_step));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src.rows));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&src.cols));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&ksize));
-    args.push_back(make_pair(sizeof(cl_int), (void*)&cn));
-    args.push_back(make_pair(sizeof(cl_mem), (void*)&c_btvRegWeights.data));
-
-    openCLExecuteKernel(clCxt, &superres_btvl1, kernel_name, global_thread, local_thread, args, -1, -1);
-}
-
-namespace
-{
-    void calcRelativeMotions(const vector<pair<oclMat, oclMat> >& forwardMotions, const vector<pair<oclMat, oclMat> >& backwardMotions,
-        vector<pair<oclMat, oclMat> >& relForwardMotions, vector<pair<oclMat, oclMat> >& relBackwardMotions,
-        int baseIdx, Size size)
-    {
-        const int count = static_cast<int>(forwardMotions.size());
-
-        relForwardMotions.resize(count);
-        relForwardMotions[baseIdx].first.create(size, CV_32FC1);
-        relForwardMotions[baseIdx].first.setTo(Scalar::all(0));
-        relForwardMotions[baseIdx].second.create(size, CV_32FC1);
-        relForwardMotions[baseIdx].second.setTo(Scalar::all(0));
-
-        relBackwardMotions.resize(count);
-        relBackwardMotions[baseIdx].first.create(size, CV_32FC1);
-        relBackwardMotions[baseIdx].first.setTo(Scalar::all(0));
-        relBackwardMotions[baseIdx].second.create(size, CV_32FC1);
-        relBackwardMotions[baseIdx].second.setTo(Scalar::all(0));
-
-        for (int i = baseIdx - 1; i >= 0; --i)
-        {
-            ocl::add(relForwardMotions[i + 1].first, forwardMotions[i].first, relForwardMotions[i].first);
-            ocl::add(relForwardMotions[i + 1].second, forwardMotions[i].second, relForwardMotions[i].second);
-
-            ocl::add(relBackwardMotions[i + 1].first, backwardMotions[i + 1].first, relBackwardMotions[i].first);
-            ocl::add(relBackwardMotions[i + 1].second, backwardMotions[i + 1].second, relBackwardMotions[i].second);
-        }
-
-        for (int i = baseIdx + 1; i < count; ++i)
-        {
-            ocl::add(relForwardMotions[i - 1].first, backwardMotions[i].first, relForwardMotions[i].first);
-            ocl::add(relForwardMotions[i - 1].second, backwardMotions[i].second, relForwardMotions[i].second);
-
-            ocl::add(relBackwardMotions[i - 1].first, forwardMotions[i - 1].first, relBackwardMotions[i].first);
-            ocl::add(relBackwardMotions[i - 1].second, forwardMotions[i - 1].second, relBackwardMotions[i].second);
-        }
-    }
-
-    void upscaleMotions(const vector<pair<oclMat, oclMat> >& lowResMotions, vector<pair<oclMat, oclMat> >& highResMotions, int scale)
-    {
-        highResMotions.resize(lowResMotions.size());
-
-        for (size_t i = 0; i < lowResMotions.size(); ++i)
-        {
-            ocl::resize(lowResMotions[i].first, highResMotions[i].first, Size(), scale, scale, INTER_LINEAR);
-            ocl::resize(lowResMotions[i].second, highResMotions[i].second, Size(), scale, scale, INTER_LINEAR);
-
-            ocl::multiply(scale, highResMotions[i].first, highResMotions[i].first);
-            ocl::multiply(scale, highResMotions[i].second, highResMotions[i].second);
-        }
-    }
-
-    void buildMotionMaps(const pair<oclMat, oclMat>& forwardMotion, const pair<oclMat, oclMat>& backwardMotion,
-        pair<oclMat, oclMat>& forwardMap, pair<oclMat, oclMat>& backwardMap)
-    {
-        forwardMap.first.create(forwardMotion.first.size(), CV_32FC1);
-        forwardMap.second.create(forwardMotion.first.size(), CV_32FC1);
-
-        backwardMap.first.create(forwardMotion.first.size(), CV_32FC1);
-        backwardMap.second.create(forwardMotion.first.size(), CV_32FC1);
-
-        btv_l1_device_ocl::buildMotionMaps(forwardMotion.first, forwardMotion.second,
-            backwardMotion.first, backwardMotion.second,
-            forwardMap.first, forwardMap.second,
-            backwardMap.first, backwardMap.second);
-    }
-
-    void upscale(const oclMat& src, oclMat& dst, int scale)
-    {
-        CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
-
-        btv_l1_device_ocl::upscale(src, dst, scale);
-    }
-
-    void diffSign(const oclMat& src1, const oclMat& src2, oclMat& dst)
-    {
-        dst.create(src1.size(), src1.type());
-
-        btv_l1_device_ocl::diffSign(src1, src2, dst);
-    }
-
-    void calcBtvWeights(int btvKernelSize, double alpha, vector<float>& btvWeights)
-    {
-        const size_t size = btvKernelSize * btvKernelSize;
-
-        btvWeights.resize(size);
-
-        const int ksize = (btvKernelSize - 1) / 2;
-        const float alpha_f = static_cast<float>(alpha);
-
-        for (int m = 0, ind = 0; m <= ksize; ++m)
-        {
-            for (int l = ksize; l + m >= 0; --l, ++ind)
-                btvWeights[ind] = pow(alpha_f, std::abs(m) + std::abs(l));
-        }
-
-        btvWeights_ = &btvWeights[0];
-        btvWeights_size = size;
-        Mat btvWeights_mheader(1, static_cast<int>(size), CV_32FC1, btvWeights_);
-        c_btvRegWeights = btvWeights_mheader;
-    }
-
-    void calcBtvRegularization(const oclMat& src, oclMat& dst, int btvKernelSize)
-    {
-        dst.create(src.size(), src.type());
-
-        const int ksize = (btvKernelSize - 1) / 2;
-
-        btv_l1_device_ocl::calcBtvRegularization(src, dst, ksize);
-    }
-
-    class BTVL1_OCL_Base
-    {
-    public:
-        BTVL1_OCL_Base();
-
-        void process(const vector<oclMat>& src, oclMat& dst,
-            const vector<pair<oclMat, oclMat> >& forwardMotions, const vector<pair<oclMat, oclMat> >& backwardMotions,
-            int baseIdx);
-
-        void collectGarbage();
-
-    protected:
-        int scale_;
-        int iterations_;
-        double lambda_;
-        double tau_;
-        double alpha_;
-        int btvKernelSize_;
-        int blurKernelSize_;
-        double blurSigma_;
-        Ptr<DenseOpticalFlowExt> opticalFlow_;
-
-    private:
-        vector<Ptr<cv::ocl::FilterEngine_GPU> > filters_;
-        int curBlurKernelSize_;
-        double curBlurSigma_;
-        int curSrcType_;
-
-        vector<float> btvWeights_;
-        int curBtvKernelSize_;
-        double curAlpha_;
-
-        vector<pair<oclMat, oclMat> > lowResForwardMotions_;
-        vector<pair<oclMat, oclMat> > lowResBackwardMotions_;
-
-        vector<pair<oclMat, oclMat> > highResForwardMotions_;
-        vector<pair<oclMat, oclMat> > highResBackwardMotions_;
-
-        vector<pair<oclMat, oclMat> > forwardMaps_;
-        vector<pair<oclMat, oclMat> > backwardMaps_;
-
-        oclMat highRes_;
-
-        vector<oclMat> diffTerms_;
-        oclMat a_, b_, c_, d_;
-        oclMat regTerm_;
-    };
-
-    BTVL1_OCL_Base::BTVL1_OCL_Base()
-    {
-        scale_ = 4;
-        iterations_ = 180;
-        lambda_ = 0.03;
-        tau_ = 1.3;
-        alpha_ = 0.7;
-        btvKernelSize_ = 7;
-        blurKernelSize_ = 5;
-        blurSigma_ = 0.0;
-        opticalFlow_ = createOptFlow_Farneback_OCL();
-
-        curBlurKernelSize_ = -1;
-        curBlurSigma_ = -1.0;
-        curSrcType_ = -1;
-
-        curBtvKernelSize_ = -1;
-        curAlpha_ = -1.0;
-    }
-
-    void BTVL1_OCL_Base::process(const vector<oclMat>& src, oclMat& dst,
-        const vector<pair<oclMat, oclMat> >& forwardMotions, const vector<pair<oclMat, oclMat> >& backwardMotions,
-        int baseIdx)
-    {
-        CV_Assert( scale_ > 1 );
-        CV_Assert( iterations_ > 0 );
-        CV_Assert( tau_ > 0.0 );
-        CV_Assert( alpha_ > 0.0 );
-        CV_Assert( btvKernelSize_ > 0 && btvKernelSize_ <= 16 );
-        CV_Assert( blurKernelSize_ > 0 );
-        CV_Assert( blurSigma_ >= 0.0 );
-
-        // update blur filter and btv weights
-
-        if (filters_.size() != src.size() || blurKernelSize_ != curBlurKernelSize_ || blurSigma_ != curBlurSigma_ || src[0].type() != curSrcType_)
-        {
-            filters_.resize(src.size());
-            for (size_t i = 0; i < src.size(); ++i)
-                filters_[i] = cv::ocl::createGaussianFilter_GPU(src[0].type(), Size(blurKernelSize_, blurKernelSize_), blurSigma_);
-            curBlurKernelSize_ = blurKernelSize_;
-            curBlurSigma_ = blurSigma_;
-            curSrcType_ = src[0].type();
-        }
-
-        if (btvWeights_.empty() || btvKernelSize_ != curBtvKernelSize_ || alpha_ != curAlpha_)
-        {
-            calcBtvWeights(btvKernelSize_, alpha_, btvWeights_);
-            curBtvKernelSize_ = btvKernelSize_;
-            curAlpha_ = alpha_;
-        }
-
-        // calc motions between input frames
-
-        calcRelativeMotions(forwardMotions, backwardMotions,
-            lowResForwardMotions_, lowResBackwardMotions_,
-            baseIdx, src[0].size());
-
-        upscaleMotions(lowResForwardMotions_, highResForwardMotions_, scale_);
-        upscaleMotions(lowResBackwardMotions_, highResBackwardMotions_, scale_);
-
-        forwardMaps_.resize(highResForwardMotions_.size());
-        backwardMaps_.resize(highResForwardMotions_.size());
-        for (size_t i = 0; i < highResForwardMotions_.size(); ++i)
-        {
-            buildMotionMaps(highResForwardMotions_[i], highResBackwardMotions_[i], forwardMaps_[i], backwardMaps_[i]);
-        }
-        // initial estimation
-
-        const Size lowResSize = src[0].size();
-        const Size highResSize(lowResSize.width * scale_, lowResSize.height * scale_);
-
-        ocl::resize(src[baseIdx], highRes_, highResSize, 0, 0, INTER_LINEAR);
-
-        // iterations
-
-        diffTerms_.resize(src.size());
-        bool d_inited = false;
-        a_.create(highRes_.size(), highRes_.type());
-        b_.create(highRes_.size(), highRes_.type());
-        c_.create(lowResSize, highRes_.type());
-        d_.create(highRes_.rows, highRes_.cols, highRes_.type());
-        for (int i = 0; i < iterations_; ++i)
-        {
-            if(!d_inited)
-            {
-                d_.setTo(0);
-                d_inited = true;
-            }
-            for (size_t k = 0; k < src.size(); ++k)
-            {
-                diffTerms_[k].create(highRes_.size(), highRes_.type());
-                // a = M * Ih
-                ocl::remap(highRes_, a_, backwardMaps_[k].first, backwardMaps_[k].second, INTER_NEAREST, BORDER_CONSTANT, Scalar());
-                // b = HM * Ih
-                filters_[k]->apply(a_, b_, Rect(0,0,-1,-1));
-                // c = DHF * Ih
-                ocl::resize(b_, c_, lowResSize, 0, 0, INTER_NEAREST);
-
-                diffSign(src[k], c_, c_);
-
-                // a = Dt * diff
-                upscale(c_, d_, scale_);
-                // b = HtDt * diff
-                filters_[k]->apply(d_, b_, Rect(0,0,-1,-1));
-                // diffTerm = MtHtDt * diff
-                ocl::remap(b_, diffTerms_[k], forwardMaps_[k].first, forwardMaps_[k].second, INTER_NEAREST, BORDER_CONSTANT, Scalar());
-            }
-
-            if (lambda_ > 0)
-            {
-                calcBtvRegularization(highRes_, regTerm_, btvKernelSize_);
-                ocl::addWeighted(highRes_, 1.0, regTerm_, -tau_ * lambda_, 0.0, highRes_);
-            }
-
-            for (size_t k = 0; k < src.size(); ++k)
-            {
-                ocl::addWeighted(highRes_, 1.0, diffTerms_[k], tau_, 0.0, highRes_);
-            }
-        }
-
-        Rect inner(btvKernelSize_, btvKernelSize_, highRes_.cols - 2 * btvKernelSize_, highRes_.rows - 2 * btvKernelSize_);
-        highRes_(inner).copyTo(dst);
-    }
-
-    void BTVL1_OCL_Base::collectGarbage()
-    {
-        filters_.clear();
-
-        lowResForwardMotions_.clear();
-        lowResBackwardMotions_.clear();
-
-        highResForwardMotions_.clear();
-        highResBackwardMotions_.clear();
-
-        forwardMaps_.clear();
-        backwardMaps_.clear();
-
-        highRes_.release();
-
-        diffTerms_.clear();
-        a_.release();
-        b_.release();
-        c_.release();
-        regTerm_.release();
-        c_btvRegWeights.release();
-    }
-
-    ////////////////////////////////////////////////////////////
-
-    class BTVL1_OCL : public SuperResolution, private BTVL1_OCL_Base
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        BTVL1_OCL();
-
-        void collectGarbage();
-
-    protected:
-        void initImpl(Ptr<FrameSource>& frameSource);
-        void processImpl(Ptr<FrameSource>& frameSource, OutputArray output);
-
-    private:
-        int temporalAreaRadius_;
-
-        void readNextFrame(Ptr<FrameSource>& frameSource);
-        void processFrame(int idx);
-
-        oclMat curFrame_;
-        oclMat prevFrame_;
-
-        vector<oclMat> frames_;
-        vector<pair<oclMat, oclMat> > forwardMotions_;
-        vector<pair<oclMat, oclMat> > backwardMotions_;
-        vector<oclMat> outputs_;
-
-        int storePos_;
-        int procPos_;
-        int outPos_;
-
-        vector<oclMat> srcFrames_;
-        vector<pair<oclMat, oclMat> > srcForwardMotions_;
-        vector<pair<oclMat, oclMat> > srcBackwardMotions_;
-        oclMat finalOutput_;
-    };
-
-    CV_INIT_ALGORITHM(BTVL1_OCL, "SuperResolution.BTVL1_OCL",
-    obj.info()->addParam(obj, "scale", obj.scale_, false, 0, 0, "Scale factor.");
-    obj.info()->addParam(obj, "iterations", obj.iterations_, false, 0, 0, "Iteration count.");
-    obj.info()->addParam(obj, "tau", obj.tau_, false, 0, 0, "Asymptotic value of steepest descent method.");
-    obj.info()->addParam(obj, "lambda", obj.lambda_, false, 0, 0, "Weight parameter to balance data term and smoothness term.");
-    obj.info()->addParam(obj, "alpha", obj.alpha_, false, 0, 0, "Parameter of spacial distribution in Bilateral-TV.");
-    obj.info()->addParam(obj, "btvKernelSize", obj.btvKernelSize_, false, 0, 0, "Kernel size of Bilateral-TV filter.");
-    obj.info()->addParam(obj, "blurKernelSize", obj.blurKernelSize_, false, 0, 0, "Gaussian blur kernel size.");
-    obj.info()->addParam(obj, "blurSigma", obj.blurSigma_, false, 0, 0, "Gaussian blur sigma.");
-    obj.info()->addParam(obj, "temporalAreaRadius", obj.temporalAreaRadius_, false, 0, 0, "Radius of the temporal search area.");
-    obj.info()->addParam<DenseOpticalFlowExt>(obj, "opticalFlow", obj.opticalFlow_, false, 0, 0, "Dense optical flow algorithm."));
-
-    BTVL1_OCL::BTVL1_OCL()
-    {
-        temporalAreaRadius_ = 4;
-    }
-
-    void BTVL1_OCL::collectGarbage()
-    {
-        curFrame_.release();
-        prevFrame_.release();
-
-        frames_.clear();
-        forwardMotions_.clear();
-        backwardMotions_.clear();
-        outputs_.clear();
-
-        srcFrames_.clear();
-        srcForwardMotions_.clear();
-        srcBackwardMotions_.clear();
-        finalOutput_.release();
-
-        SuperResolution::collectGarbage();
-        BTVL1_OCL_Base::collectGarbage();
-    }
-
-    void BTVL1_OCL::initImpl(Ptr<FrameSource>& frameSource)
-    {
-        const int cacheSize = 2 * temporalAreaRadius_ + 1;
-
-        frames_.resize(cacheSize);
-        forwardMotions_.resize(cacheSize);
-        backwardMotions_.resize(cacheSize);
-        outputs_.resize(cacheSize);
-
-        storePos_ = -1;
-
-        for (int t = -temporalAreaRadius_; t <= temporalAreaRadius_; ++t)
-            readNextFrame(frameSource);
-
-        for (int i = 0; i <= temporalAreaRadius_; ++i)
-            processFrame(i);
-
-        procPos_ = temporalAreaRadius_;
-        outPos_ = -1;
-    }
-
-    void BTVL1_OCL::processImpl(Ptr<FrameSource>& frameSource, OutputArray _output)
-    {
-        if (outPos_ >= storePos_)
-        {
-            if(_output.kind() == _InputArray::OCL_MAT)
-            {
-                getOclMatRef(_output).release();
-            }
-            else
-            {
-                _output.release();
-            }
-            return;
-        }
-
-        readNextFrame(frameSource);
-
-        if (procPos_ < storePos_)
-        {
-            ++procPos_;
-            processFrame(procPos_);
-        }
-
-        ++outPos_;
-        const oclMat& curOutput = at(outPos_, outputs_);
-
-        if (_output.kind() == _InputArray::OCL_MAT)
-            curOutput.convertTo(getOclMatRef(_output), CV_8U);
-        else
-        {
-            curOutput.convertTo(finalOutput_, CV_8U);
-            arrCopy(finalOutput_, _output);
-        }
-    }
-
-    void BTVL1_OCL::readNextFrame(Ptr<FrameSource>& frameSource)
-    {
-        curFrame_.release();
-        frameSource->nextFrame(curFrame_);
-
-        if (curFrame_.empty())
-            return;
-
-        ++storePos_;
-        curFrame_.convertTo(at(storePos_, frames_), CV_32F);
-
-        if (storePos_ > 0)
-        {
-            pair<oclMat, oclMat>& forwardMotion = at(storePos_ - 1, forwardMotions_);
-            pair<oclMat, oclMat>& backwardMotion = at(storePos_, backwardMotions_);
-
-            opticalFlow_->calc(prevFrame_, curFrame_, forwardMotion.first, forwardMotion.second);
-            opticalFlow_->calc(curFrame_, prevFrame_, backwardMotion.first, backwardMotion.second);
-        }
-
-        curFrame_.copyTo(prevFrame_);
-    }
-
-    void BTVL1_OCL::processFrame(int idx)
-    {
-        const int startIdx = max(idx - temporalAreaRadius_, 0);
-        const int procIdx = idx;
-        const int endIdx = min(startIdx + 2 * temporalAreaRadius_, storePos_);
-
-        const int count = endIdx - startIdx + 1;
-
-        srcFrames_.resize(count);
-        srcForwardMotions_.resize(count);
-        srcBackwardMotions_.resize(count);
-
-        int baseIdx = -1;
-
-        for (int i = startIdx, k = 0; i <= endIdx; ++i, ++k)
-        {
-            if (i == procIdx)
-                baseIdx = k;
-
-            srcFrames_[k] = at(i, frames_);
-
-            if (i < endIdx)
-                srcForwardMotions_[k] = at(i, forwardMotions_);
-            if (i > startIdx)
-                srcBackwardMotions_[k] = at(i, backwardMotions_);
-        }
-
-        process(srcFrames_, at(idx, outputs_), srcForwardMotions_, srcBackwardMotions_, baseIdx);
-    }
-}
-
-Ptr<SuperResolution> cv::superres::createSuperResolution_BTVL1_OCL()
-{
-    return makePtr<BTVL1_OCL>();
-}
-#endif
diff --git a/modules/superres/src/frame_source.cpp b/modules/superres/src/frame_source.cpp
index 14481b852..c572c09a9 100644
--- a/modules/superres/src/frame_source.cpp
+++ b/modules/superres/src/frame_source.cpp
@@ -115,25 +115,18 @@ namespace
     void CaptureFrameSource::nextFrame(OutputArray _frame)
     {
         if (_frame.kind() == _InputArray::MAT)
-        {
             vc_ >> _frame.getMatRef();
-        }
         else if(_frame.kind() == _InputArray::GPU_MAT)
         {
             vc_ >> frame_;
             arrCopy(frame_, _frame);
         }
-        else if(_frame.kind() == _InputArray::OCL_MAT)
-        {
-            vc_ >> frame_;
-            if(!frame_.empty())
-            {
-                arrCopy(frame_, _frame);
-            }
-        }
+        else if (_frame.isUMat())
+            vc_ >> *(UMat *)_frame.getObj();
         else
         {
-            //should never get here
+            // should never get here
+            CV_Assert(0);
         }
     }
 
diff --git a/modules/superres/src/input_array_utility.cpp b/modules/superres/src/input_array_utility.cpp
index 5b8726747..9f4f22936 100644
--- a/modules/superres/src/input_array_utility.cpp
+++ b/modules/superres/src/input_array_utility.cpp
@@ -62,6 +62,23 @@ Mat cv::superres::arrGetMat(InputArray arr, Mat& buf)
     }
 }
 
+UMat cv::superres::arrGetUMat(InputArray arr, UMat& buf)
+{
+    switch (arr.kind())
+    {
+    case _InputArray::GPU_MAT:
+        arr.getGpuMat().download(buf);
+        return buf;
+
+    case _InputArray::OPENGL_BUFFER:
+        arr.getOGlBuffer().copyTo(buf);
+        return buf;
+
+    default:
+        return arr.getUMat();
+    }
+}
+
 GpuMat cv::superres::arrGetGpuMat(InputArray arr, GpuMat& buf)
 {
     switch (arr.kind())
@@ -108,62 +125,39 @@ namespace
     {
         src.getGpuMat().copyTo(dst.getGpuMatRef());
     }
-#ifdef HAVE_OPENCV_OCL
-    void ocl2mat(InputArray src, OutputArray dst)
-    {
-        dst.getMatRef() = (Mat)ocl::getOclMatRef(src);
-    }
-    void mat2ocl(InputArray src, OutputArray dst)
-    {
-        Mat m = src.getMat();
-        ocl::getOclMatRef(dst) = (ocl::oclMat)m;
-    }
-    void ocl2ocl(InputArray src, OutputArray dst)
-    {
-        ocl::getOclMatRef(src).copyTo(ocl::getOclMatRef(dst));
-    }
-#else
-    void ocl2mat(InputArray, OutputArray)
-    {
-        CV_Error(Error::StsNotImplemented, "The called functionality is disabled for current build or platform");;
-    }
-    void mat2ocl(InputArray, OutputArray)
-    {
-        CV_Error(Error::StsNotImplemented, "The called functionality is disabled for current build or platform");;
-    }
-    void ocl2ocl(InputArray, OutputArray)
-    {
-        CV_Error(Error::StsNotImplemented, "The called functionality is disabled for current build or platform");
-    }
-#endif
 }
 
 void cv::superres::arrCopy(InputArray src, OutputArray dst)
 {
-    typedef void (*func_t)(InputArray src, OutputArray dst);
-    static const func_t funcs[11][11] =
+    if (dst.isUMat() || src.isUMat())
     {
-        {0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu, mat2ocl},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu, mat2ocl},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu, mat2ocl},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu, mat2ocl},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu, mat2ocl},
-        {0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0 /*arr2tex*/, mat2gpu, mat2ocl},
-        {0, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, 0 /*buf2arr*/, buf2arr, 0      },
-        {0, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0 /*tex2arr*/, 0},
-        {0, gpu2mat, gpu2mat, gpu2mat, gpu2mat, gpu2mat, gpu2mat, arr2buf, 0 /*arr2tex*/, gpu2gpu, 0      },
-        {0, ocl2mat, ocl2mat, ocl2mat, ocl2mat, ocl2mat, ocl2mat, 0,       0,             0,       ocl2ocl}
+        src.copyTo(dst);
+        return;
+    }
+
+    typedef void (*func_t)(InputArray src, OutputArray dst);
+    static const func_t funcs[10][10] =
+    {
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+        { 0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0, mat2gpu },
+        { 0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0, mat2gpu },
+        { 0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0, mat2gpu },
+        { 0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0, mat2gpu },
+        { 0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0, mat2gpu },
+        { 0, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, mat2mat, arr2buf, 0, mat2gpu },
+        { 0, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, buf2arr, 0, buf2arr },
+        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+        { 0, gpu2mat, gpu2mat, gpu2mat, gpu2mat, gpu2mat, gpu2mat, arr2buf, 0 , gpu2gpu },
     };
 
     const int src_kind = src.kind() >> _InputArray::KIND_SHIFT;
     const int dst_kind = dst.kind() >> _InputArray::KIND_SHIFT;
 
-    CV_DbgAssert( src_kind >= 0 && src_kind < 11 );
-    CV_DbgAssert( dst_kind >= 0 && dst_kind < 11 );
+    CV_Assert( src_kind >= 0 && src_kind < 10 );
+    CV_Assert( dst_kind >= 0 && dst_kind < 10 );
 
     const func_t func = funcs[src_kind][dst_kind];
-    CV_DbgAssert( func != 0 );
+    CV_Assert( func != 0 );
 
     func(src, dst);
 }
@@ -172,20 +166,21 @@ namespace
 {
     void convertToCn(InputArray src, OutputArray dst, int cn)
     {
-        CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
+        int scn = src.channels();
+        CV_Assert( scn == 1 || scn == 3 || scn == 4 );
         CV_Assert( cn == 1 || cn == 3 || cn == 4 );
 
         static const int codes[5][5] =
         {
-            {-1, -1, -1, -1, -1},
-            {-1, -1, -1, COLOR_GRAY2BGR, COLOR_GRAY2BGRA},
-            {-1, -1, -1, -1, -1},
-            {-1, COLOR_BGR2GRAY, -1, -1, COLOR_BGR2BGRA},
-            {-1, COLOR_BGRA2GRAY, -1, COLOR_BGRA2BGR, -1},
+            { -1, -1, -1, -1, -1 },
+            { -1, -1, -1, COLOR_GRAY2BGR, COLOR_GRAY2BGRA },
+            { -1, -1, -1, -1, -1 },
+            { -1, COLOR_BGR2GRAY, -1, -1, COLOR_BGR2BGRA },
+            { -1, COLOR_BGRA2GRAY, -1, COLOR_BGRA2BGR, -1 }
         };
 
-        const int code = codes[src.channels()][cn];
-        CV_DbgAssert( code >= 0 );
+        const int code = codes[scn][cn];
+        CV_Assert( code >= 0 );
 
         switch (src.kind())
         {
@@ -202,6 +197,7 @@ namespace
             break;
         }
     }
+
     void convertToDepth(InputArray src, OutputArray dst, int depth)
     {
         CV_Assert( src.depth() <= CV_64F );
@@ -226,6 +222,10 @@ namespace
             src.getGpuMat().convertTo(dst.getGpuMatRef(), depth, scale);
             break;
 
+        case _InputArray::UMAT:
+            src.getUMat().convertTo(dst, depth, scale);
+            break;
+
         default:
             src.getMat().convertTo(dst, depth, scale);
             break;
@@ -258,6 +258,31 @@ Mat cv::superres::convertToType(const Mat& src, int type, Mat& buf0, Mat& buf1)
     return buf1;
 }
 
+UMat cv::superres::convertToType(const UMat& src, int type, UMat& buf0, UMat& buf1)
+{
+    if (src.type() == type)
+        return src;
+
+    const int depth = CV_MAT_DEPTH(type);
+    const int cn = CV_MAT_CN(type);
+
+    if (src.depth() == depth)
+    {
+        convertToCn(src, buf0, cn);
+        return buf0;
+    }
+
+    if (src.channels() == cn)
+    {
+        convertToDepth(src, buf1, depth);
+        return buf1;
+    }
+
+    convertToCn(src, buf0, cn);
+    convertToDepth(buf0, buf1, depth);
+    return buf1;
+}
+
 GpuMat cv::superres::convertToType(const GpuMat& src, int type, GpuMat& buf0, GpuMat& buf1)
 {
     if (src.type() == type)
@@ -282,70 +307,3 @@ GpuMat cv::superres::convertToType(const GpuMat& src, int type, GpuMat& buf0, Gp
     convertToDepth(buf0, buf1, depth);
     return buf1;
 }
-#ifdef HAVE_OPENCV_OCL
-namespace
-{
-    // TODO(pengx17): remove these overloaded functions until IntputArray fully supports oclMat
-    void convertToCn(const ocl::oclMat& src, ocl::oclMat& dst, int cn)
-    {
-        CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 );
-        CV_Assert( cn == 1 || cn == 3 || cn == 4 );
-
-        static const int codes[5][5] =
-        {
-            {-1, -1, -1, -1, -1},
-            {-1, -1, -1, COLOR_GRAY2BGR, COLOR_GRAY2BGRA},
-            {-1, -1, -1, -1, -1},
-            {-1, COLOR_BGR2GRAY, -1, -1, COLOR_BGR2BGRA},
-            {-1, COLOR_BGRA2GRAY, -1, COLOR_BGRA2BGR, -1},
-        };
-
-        const int code = codes[src.channels()][cn];
-        CV_DbgAssert( code >= 0 );
-
-        ocl::cvtColor(src, dst, code, cn);
-    }
-    void convertToDepth(const ocl::oclMat& src, ocl::oclMat& dst, int depth)
-    {
-        CV_Assert( src.depth() <= CV_64F );
-        CV_Assert( depth == CV_8U || depth == CV_32F );
-
-        static const double maxVals[] =
-        {
-            std::numeric_limits<uchar>::max(),
-            std::numeric_limits<schar>::max(),
-            std::numeric_limits<ushort>::max(),
-            std::numeric_limits<short>::max(),
-            std::numeric_limits<int>::max(),
-            1.0,
-            1.0,
-        };
-        const double scale = maxVals[depth] / maxVals[src.depth()];
-        src.convertTo(dst, depth, scale);
-    }
-}
-ocl::oclMat cv::superres::convertToType(const ocl::oclMat& src, int type, ocl::oclMat& buf0, ocl::oclMat& buf1)
-{
-    if (src.type() == type)
-        return src;
-
-    const int depth = CV_MAT_DEPTH(type);
-    const int cn = CV_MAT_CN(type);
-
-    if (src.depth() == depth)
-    {
-        convertToCn(src, buf0, cn);
-        return buf0;
-    }
-
-    if (src.channels() == cn)
-    {
-        convertToDepth(src, buf1, depth);
-        return buf1;
-    }
-
-    convertToCn(src, buf0, cn);
-    convertToDepth(buf0, buf1, depth);
-    return buf1;
-}
-#endif
diff --git a/modules/superres/src/input_array_utility.hpp b/modules/superres/src/input_array_utility.hpp
index 6f17da001..3a858fbd7 100644
--- a/modules/superres/src/input_array_utility.hpp
+++ b/modules/superres/src/input_array_utility.hpp
@@ -45,25 +45,20 @@
 
 #include "opencv2/core.hpp"
 #include "opencv2/core/cuda.hpp"
-#ifdef HAVE_OPENCV_OCL
-#include "opencv2/ocl.hpp"
-#endif
 
 namespace cv
 {
     namespace superres
     {
         CV_EXPORTS Mat arrGetMat(InputArray arr, Mat& buf);
+        CV_EXPORTS UMat arrGetUMat(InputArray arr, UMat& buf);
         CV_EXPORTS cuda::GpuMat arrGetGpuMat(InputArray arr, cuda::GpuMat& buf);
 
         CV_EXPORTS void arrCopy(InputArray src, OutputArray dst);
 
         CV_EXPORTS Mat convertToType(const Mat& src, int type, Mat& buf0, Mat& buf1);
+        CV_EXPORTS UMat convertToType(const UMat& src, int type, UMat& buf0, UMat& buf1);
         CV_EXPORTS cuda::GpuMat convertToType(const cuda::GpuMat& src, int type, cuda::GpuMat& buf0, cuda::GpuMat& buf1);
-
-#ifdef HAVE_OPENCV_OCL
-        CV_EXPORTS ocl::oclMat convertToType(const ocl::oclMat& src, int type, ocl::oclMat& buf0, ocl::oclMat& buf1);
-#endif
     }
 }
 
diff --git a/modules/superres/src/opencl/superres_btvl1.cl b/modules/superres/src/opencl/superres_btvl1.cl
index 3c0cff85b..b0e11aacb 100644
--- a/modules/superres/src/opencl/superres_btvl1.cl
+++ b/modules/superres/src/opencl/superres_btvl1.cl
@@ -43,160 +43,137 @@
 //
 //M*/
 
-__kernel void buildMotionMapsKernel(__global float* forwardMotionX,
-                                    __global float* forwardMotionY,
-                                    __global float* backwardMotionX,
-                                    __global float* backwardMotionY,
-                                    __global float* forwardMapX,
-                                    __global float* forwardMapY,
-                                    __global float* backwardMapX,
-                                    __global float* backwardMapY,
-                                    int forwardMotionX_row,
-                                    int forwardMotionX_col,
-                                    int forwardMotionX_step,
-                                    int forwardMotionY_step,
-                                    int backwardMotionX_step,
-                                    int backwardMotionY_step,
-                                    int forwardMapX_step,
-                                    int forwardMapY_step,
-                                    int backwardMapX_step,
-                                    int backwardMapY_step
-                                   )
+#ifndef cn
+#define cn 1
+#endif
+
+#define sz (int)sizeof(float)
+#define src_elem_at(_src, y, step, x) *(__global const float *)(_src + mad24(y, step, (x) * sz))
+#define dst_elem_at(_dst, y, step, x) *(__global float *)(_dst + mad24(y, step, (x) * sz))
+
+__kernel void buildMotionMaps(__global const uchar * forwardMotionPtr, int forwardMotion_step, int forwardMotion_offset,
+                              __global const uchar * backwardMotionPtr, int backwardMotion_step, int backwardMotion_offset,
+                              __global const uchar * forwardMapPtr, int forwardMap_step, int forwardMap_offset,
+                              __global const uchar * backwardMapPtr, int backwardMap_step, int backwardMap_offset,
+                              int rows, int cols)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < forwardMotionX_col && y < forwardMotionX_row)
+    if (x < cols && y < rows)
     {
-        float fx = forwardMotionX[y * forwardMotionX_step + x];
-        float fy = forwardMotionY[y * forwardMotionY_step + x];
+        int forwardMotion_index = mad24(forwardMotion_step, y, (int)sizeof(float2) * x + forwardMotion_offset);
+        int backwardMotion_index = mad24(backwardMotion_step, y, (int)sizeof(float2) * x + backwardMotion_offset);
+        int forwardMap_index = mad24(forwardMap_step, y, (int)sizeof(float2) * x + forwardMap_offset);
+        int backwardMap_index = mad24(backwardMap_step, y, (int)sizeof(float2) * x + backwardMap_offset);
 
-        float bx = backwardMotionX[y * backwardMotionX_step + x];
-        float by = backwardMotionY[y * backwardMotionY_step + x];
+        float2 forwardMotion = *(__global const float2 *)(forwardMotionPtr + forwardMotion_index);
+        float2 backwardMotion = *(__global const float2 *)(backwardMotionPtr + backwardMotion_index);
+        __global float2 * forwardMap = (__global float2 *)(forwardMapPtr + forwardMap_index);
+        __global float2 * backwardMap = (__global float2 *)(backwardMapPtr + backwardMap_index);
 
-        forwardMapX[y * forwardMapX_step + x] = x + bx;
-        forwardMapY[y * forwardMapY_step + x] = y + by;
+        float2 basePoint = (float2)(x, y);
 
-        backwardMapX[y * backwardMapX_step + x] = x + fx;
-        backwardMapY[y * backwardMapY_step + x] = y + fy;
+        forwardMap[0] = basePoint + backwardMotion;
+        backwardMap[0] = basePoint + forwardMotion;
     }
 }
 
-__kernel void upscaleKernel(__global float* src,
-                            __global float* dst,
-                            int src_step,
-                            int dst_step,
-                            int src_row,
-                            int src_col,
-                            int scale,
-                            int channels
-                           )
+__kernel void upscale(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
+                      __global uchar * dstptr, int dst_step, int dst_offset, int scale)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < src_col && y < src_row)
+    if (x < src_cols && y < src_rows)
     {
-        if(channels == 1)
-        {
-            dst[y * scale * dst_step + x * scale] = src[y * src_step + x];
-        }
-        else
-        {
-            vstore4(vload4(0, src + y * channels * src_step + 4 * x), 0, dst + y * channels * scale * dst_step + 4 * x * scale);
-        }
+        int src_index = mad24(y, src_step, sz * x * cn + src_offset);
+        int dst_index = mad24(y * scale, dst_step, sz * x * scale * cn + dst_offset);
+
+        __global const float * src = (__global const float *)(srcptr + src_index);
+        __global float * dst = (__global float *)(dstptr + dst_index);
+
+        #pragma unroll
+        for (int c = 0; c < cn; ++c)
+            dst[c] = src[c];
     }
 }
 
 
-float diffSign(float a, float b)
+inline float diffSign1(float a, float b)
 {
     return a > b ? 1.0f : a < b ? -1.0f : 0.0f;
 }
 
-float4 diffSign4(float4 a, float4 b)
+inline float3 diffSign3(float3 a, float3 b)
 {
-    float4 pos;
+    float3 pos;
     pos.x = a.x > b.x ? 1.0f : a.x < b.x ? -1.0f : 0.0f;
     pos.y = a.y > b.y ? 1.0f : a.y < b.y ? -1.0f : 0.0f;
     pos.z = a.z > b.z ? 1.0f : a.z < b.z ? -1.0f : 0.0f;
-    pos.w = 0.0f;
     return pos;
 }
 
-__kernel void diffSignKernel(__global float* src1,
-                             __global float* src2,
-                             __global float* dst,
-                             int src1_row,
-                             int src1_col,
-                             int dst_step,
-                             int src1_step,
-                             int src2_step)
+__kernel void diffSign(__global const uchar * src1, int src1_step, int src1_offset,
+                       __global const uchar * src2, int src2_step, int src2_offset,
+                       __global uchar * dst, int dst_step, int dst_offset, int rows, int cols)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
 
-    if(x < src1_col && y < src1_row)
-    {
-        dst[y * dst_step + x] = diffSign(src1[y * src1_step + x], src2[y * src2_step + x]);
-    }
+    if (x < cols && y < rows)
+        *(__global float *)(dst + mad24(y, dst_step, sz * x + dst_offset)) =
+            diffSign1(*(__global const float *)(src1 + mad24(y, src1_step, sz * x + src1_offset)),
+                      *(__global const float *)(src2 + mad24(y, src2_step, sz * x + src2_offset)));
 }
 
-__kernel void calcBtvRegularizationKernel(__global float* src,
-        __global float* dst,
-        int src_step,
-        int dst_step,
-        int src_row,
-        int src_col,
-        int ksize,
-        int channels,
-        __constant float* c_btvRegWeights
-                                         )
+__kernel void calcBtvRegularization(__global const uchar * src, int src_step, int src_offset,
+                                    __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                    int ksize, __constant float * c_btvRegWeights)
 {
     int x = get_global_id(0) + ksize;
     int y = get_global_id(1) + ksize;
 
-    if ((y < src_row - ksize) && (x < src_col - ksize))
+    if (y < dst_rows - ksize && x < dst_cols - ksize)
     {
-        if(channels == 1)
-        {
-            const float srcVal = src[y * src_step + x];
-            float dstVal = 0.0f;
+        src += src_offset;
 
-            for (int m = 0, count = 0; m <= ksize; ++m)
+#if cn == 1
+        const float srcVal = src_elem_at(src, y, src_step, x);
+        float dstVal = 0.0f;
+
+        for (int m = 0, count = 0; m <= ksize; ++m)
+            for (int l = ksize; l + m >= 0; --l, ++count)
             {
-                for (int l = ksize; l + m >= 0; --l, ++count)
-                {
-                    dstVal = dstVal + c_btvRegWeights[count] * (diffSign(srcVal, src[(y + m) * src_step + (x + l)]) - diffSign(src[(y - m) * src_step + (x - l)], srcVal));
-                }
+                dstVal += c_btvRegWeights[count] * (diffSign1(srcVal, src_elem_at(src, y + m, src_step, x + l))
+                    - diffSign1(src_elem_at(src, y - m, src_step, x - l), srcVal));
             }
-            dst[y * dst_step + x] = dstVal;
-        }
-        else
+
+        dst_elem_at(dst, y, dst_step, x) = dstVal;
+#elif cn == 3
+        __global const float * src0ptr = (__global const float *)(src + mad24(y, src_step, 3 * sz * x + src_offset));
+        float3 srcVal = (float3)(src0ptr[0], src0ptr[1], src0ptr[2]), dstVal = 0.f;
+
+        for (int m = 0, count = 0; m <= ksize; ++m)
         {
-            float4 srcVal = vload4(0, src + y * src_step + 4 * x);
-            float4 dstVal = 0.f;
-
-            for (int m = 0, count = 0; m <= ksize; ++m)
+            for (int l = ksize; l + m >= 0; --l, ++count)
             {
-                for (int l = ksize; l + m >= 0; --l, ++count)
-                {
-                    float4 src1;
-                    src1.x = src[(y + m) * src_step + 4 * (x + l) + 0];
-                    src1.y = src[(y + m) * src_step + 4 * (x + l) + 1];
-                    src1.z = src[(y + m) * src_step + 4 * (x + l) + 2];
-                    src1.w = src[(y + m) * src_step + 4 * (x + l) + 3];
+                __global const float * src1ptr = (__global const float *)(src + mad24(y + m, src_step, 3 * sz * (x + l) + src_offset));
+                __global const float * src2ptr = (__global const float *)(src + mad24(y - m, src_step, 3 * sz * (x - l) + src_offset));
 
-                    float4 src2;
-                    src2.x = src[(y - m) * src_step + 4 * (x - l) + 0];
-                    src2.y = src[(y - m) * src_step + 4 * (x - l) + 1];
-                    src2.z = src[(y - m) * src_step + 4 * (x - l) + 2];
-                    src2.w = src[(y - m) * src_step + 4 * (x - l) + 3];
+                float3 src1 = (float3)(src1ptr[0], src1ptr[1], src1ptr[2]);
+                float3 src2 = (float3)(src2ptr[0], src2ptr[1], src2ptr[2]);
 
-                    dstVal = dstVal + c_btvRegWeights[count] * (diffSign4(srcVal, src1) - diffSign4(src2, srcVal));
-                }
+                dstVal += c_btvRegWeights[count] * (diffSign3(srcVal, src1) - diffSign3(src2, srcVal));
             }
-            vstore4(dstVal, 0, dst + y * dst_step + 4 * x);
         }
+
+        __global float * dstptr = (__global float *)(dst + mad24(y, dst_step, 3 * sz * x + dst_offset + 0));
+        dstptr[0] = dstVal.x;
+        dstptr[1] = dstVal.y;
+        dstptr[2] = dstVal.z;
+#else
+#error "Number of channels should be either 1 of 3"
+#endif
     }
 }
diff --git a/modules/superres/src/optical_flow.cpp b/modules/superres/src/optical_flow.cpp
index 0389a78fd..2f77cd786 100644
--- a/modules/superres/src/optical_flow.cpp
+++ b/modules/superres/src/optical_flow.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencv2/core/opencl/ocl_defs.hpp"
 
 using namespace cv;
 using namespace cv::cuda;
@@ -61,21 +62,66 @@ namespace
         void collectGarbage();
 
     protected:
-        virtual void impl(const Mat& input0, const Mat& input1, OutputArray dst) = 0;
+        virtual void impl(InputArray input0, InputArray input1, OutputArray dst) = 0;
 
     private:
+        bool ocl_calc(InputArray frame0, InputArray frame1, OutputArray flow1, OutputArray flow2);
+
         int work_type_;
+
+        // Mat
         Mat buf_[6];
         Mat flow_;
         Mat flows_[2];
+
+        // UMat
+        UMat ubuf_[6];
+        UMat uflow_;
+        std::vector<UMat> uflows_;
     };
 
-    CpuOpticalFlow::CpuOpticalFlow(int work_type) : work_type_(work_type)
+    CpuOpticalFlow::CpuOpticalFlow(int work_type) :
+        work_type_(work_type)
     {
     }
 
+    bool CpuOpticalFlow::ocl_calc(InputArray _frame0, InputArray _frame1, OutputArray _flow1, OutputArray _flow2)
+    {
+        UMat frame0 = arrGetUMat(_frame0, ubuf_[0]);
+        UMat frame1 = arrGetUMat(_frame1, ubuf_[1]);
+
+        CV_Assert( frame1.type() == frame0.type() );
+        CV_Assert( frame1.size() == frame0.size() );
+
+        UMat input0 = convertToType(frame0, work_type_, ubuf_[2], ubuf_[3]);
+        UMat input1 = convertToType(frame1, work_type_, ubuf_[4], ubuf_[5]);
+
+        if (!_flow2.needed())
+        {
+            impl(input0, input1, _flow1);
+            return true;
+        }
+
+        impl(input0, input1, uflow_);
+
+        if (!_flow2.needed())
+            arrCopy(uflow_, _flow1);
+        else
+        {
+            split(uflow_, uflows_);
+
+            arrCopy(uflows_[0], _flow1);
+            arrCopy(uflows_[1], _flow2);
+        }
+
+        return true;
+    }
+
     void CpuOpticalFlow::calc(InputArray _frame0, InputArray _frame1, OutputArray _flow1, OutputArray _flow2)
     {
+        CV_OCL_RUN(_flow1.isUMat() && (_flow2.isUMat() || !_flow2.needed()),
+                   ocl_calc(_frame0, _frame1, _flow1, _flow2))
+
         Mat frame0 = arrGetMat(_frame0, buf_[0]);
         Mat frame1 = arrGetMat(_frame1, buf_[1]);
 
@@ -94,9 +140,7 @@ namespace
         impl(input0, input1, flow_);
 
         if (!_flow2.needed())
-        {
             arrCopy(flow_, _flow1);
-        }
         else
         {
             split(flow_, flows_);
@@ -108,11 +152,19 @@ namespace
 
     void CpuOpticalFlow::collectGarbage()
     {
+        // Mat
         for (int i = 0; i < 6; ++i)
             buf_[i].release();
         flow_.release();
         flows_[0].release();
         flows_[1].release();
+
+        // UMat
+        for (int i = 0; i < 6; ++i)
+            ubuf_[i].release();
+        uflow_.release();
+        uflows_[0].release();
+        uflows_[1].release();
     }
 }
 
@@ -129,7 +181,7 @@ namespace
         Farneback();
 
     protected:
-        void impl(const Mat& input0, const Mat& input1, OutputArray dst);
+        void impl(InputArray input0, InputArray input1, OutputArray dst);
 
     private:
         double pyrScale_;
@@ -148,7 +200,7 @@ namespace
                       obj.info()->addParam(obj, "numIters", obj.numIters_);
                       obj.info()->addParam(obj, "polyN", obj.polyN_);
                       obj.info()->addParam(obj, "polySigma", obj.polySigma_);
-                      obj.info()->addParam(obj, "flags", obj.flags_));
+                      obj.info()->addParam(obj, "flags", obj.flags_))
 
     Farneback::Farneback() : CpuOpticalFlow(CV_8UC1)
     {
@@ -161,7 +213,7 @@ namespace
         flags_ = 0;
     }
 
-    void Farneback::impl(const Mat& input0, const Mat& input1, OutputArray dst)
+    void Farneback::impl(InputArray input0, InputArray input1, OutputArray dst)
     {
         calcOpticalFlowFarneback(input0, input1, (InputOutputArray)dst, pyrScale_,
                                  numLevels_, winSize_, numIters_,
@@ -187,7 +239,7 @@ namespace
         Simple();
 
     protected:
-        void impl(const Mat& input0, const Mat& input1, OutputArray dst);
+        void impl(InputArray input0, InputArray input1, OutputArray dst);
 
     private:
         int layers_;
@@ -218,7 +270,7 @@ namespace
                       obj.info()->addParam(obj, "upscaleAveragingRadius", obj.upscaleAveragingRadius_);
                       obj.info()->addParam(obj, "upscaleSigmaDist", obj.upscaleSigmaDist_);
                       obj.info()->addParam(obj, "upscaleSigmaColor", obj.upscaleSigmaColor_);
-                      obj.info()->addParam(obj, "speedUpThr", obj.speedUpThr_));
+                      obj.info()->addParam(obj, "speedUpThr", obj.speedUpThr_))
 
     Simple::Simple() : CpuOpticalFlow(CV_8UC3)
     {
@@ -237,11 +289,9 @@ namespace
         speedUpThr_ = 10;
     }
 
-    void Simple::impl(const Mat& _input0, const Mat& _input1, OutputArray dst)
+    void Simple::impl(InputArray _input0, InputArray _input1, OutputArray _dst)
     {
-        Mat input0 = _input0;
-        Mat input1 = _input1;
-        calcOpticalFlowSF(input0, input1, dst.getMatRef(),
+        calcOpticalFlowSF(_input0, _input1, _dst,
                           layers_,
                           averagingBlockSize_,
                           maxFlow_,
@@ -278,7 +328,7 @@ namespace
         void collectGarbage();
 
     protected:
-        void impl(const Mat& input0, const Mat& input1, OutputArray dst);
+        void impl(InputArray input0, InputArray input1, OutputArray dst);
 
     private:
         double tau_;
@@ -301,7 +351,7 @@ namespace
                       obj.info()->addParam(obj, "warps", obj.warps_);
                       obj.info()->addParam(obj, "epsilon", obj.epsilon_);
                       obj.info()->addParam(obj, "iterations", obj.iterations_);
-                      obj.info()->addParam(obj, "useInitialFlow", obj.useInitialFlow_));
+                      obj.info()->addParam(obj, "useInitialFlow", obj.useInitialFlow_))
 
     DualTVL1::DualTVL1() : CpuOpticalFlow(CV_8UC1)
     {
@@ -316,7 +366,7 @@ namespace
         useInitialFlow_ = alg_->getBool("useInitialFlow");
     }
 
-    void DualTVL1::impl(const Mat& input0, const Mat& input1, OutputArray dst)
+    void DualTVL1::impl(InputArray input0, InputArray input1, OutputArray dst)
     {
         alg_->set("tau", tau_);
         alg_->set("lambda", lambda_);
@@ -472,7 +522,7 @@ namespace
                       obj.info()->addParam(obj, "scaleFactor", obj.scaleFactor_, false, 0, 0, "Pyramid scale factor");
                       obj.info()->addParam(obj, "innerIterations", obj.innerIterations_, false, 0, 0, "Number of lagged non-linearity iterations (inner loop)");
                       obj.info()->addParam(obj, "outerIterations", obj.outerIterations_, false, 0, 0, "Number of warping iterations (number of pyramid levels)");
-                      obj.info()->addParam(obj, "solverIterations", obj.solverIterations_, false, 0, 0, "Number of linear system solver iterations"));
+                      obj.info()->addParam(obj, "solverIterations", obj.solverIterations_, false, 0, 0, "Number of linear system solver iterations"))
 
     Brox_CUDA::Brox_CUDA() : GpuOpticalFlow(CV_32FC1), alg_(0.197f, 50.0f, 0.8f, 10, 77, 10)
     {
@@ -536,7 +586,7 @@ namespace
     CV_INIT_ALGORITHM(PyrLK_CUDA, "DenseOpticalFlowExt.PyrLK_CUDA",
                       obj.info()->addParam(obj, "winSize", obj.winSize_);
                       obj.info()->addParam(obj, "maxLevel", obj.maxLevel_);
-                      obj.info()->addParam(obj, "iterations", obj.iterations_));
+                      obj.info()->addParam(obj, "iterations", obj.iterations_))
 
     PyrLK_CUDA::PyrLK_CUDA() : GpuOpticalFlow(CV_8UC1)
     {
@@ -603,7 +653,7 @@ namespace
                       obj.info()->addParam(obj, "numIters", obj.numIters_);
                       obj.info()->addParam(obj, "polyN", obj.polyN_);
                       obj.info()->addParam(obj, "polySigma", obj.polySigma_);
-                      obj.info()->addParam(obj, "flags", obj.flags_));
+                      obj.info()->addParam(obj, "flags", obj.flags_))
 
     Farneback_CUDA::Farneback_CUDA() : GpuOpticalFlow(CV_8UC1)
     {
@@ -679,7 +729,7 @@ namespace
                       obj.info()->addParam(obj, "warps", obj.warps_);
                       obj.info()->addParam(obj, "epsilon", obj.epsilon_);
                       obj.info()->addParam(obj, "iterations", obj.iterations_);
-                      obj.info()->addParam(obj, "useInitialFlow", obj.useInitialFlow_));
+                      obj.info()->addParam(obj, "useInitialFlow", obj.useInitialFlow_))
 
     DualTVL1_CUDA::DualTVL1_CUDA() : GpuOpticalFlow(CV_8UC1)
     {
@@ -720,269 +770,3 @@ Ptr<DenseOpticalFlowExt> cv::superres::createOptFlow_DualTVL1_CUDA()
 }
 
 #endif // HAVE_OPENCV_CUDAOPTFLOW
-#ifdef HAVE_OPENCV_OCL
-
-namespace
-{
-    class oclOpticalFlow : public DenseOpticalFlowExt
-    {
-    public:
-        explicit oclOpticalFlow(int work_type);
-
-        void calc(InputArray frame0, InputArray frame1, OutputArray flow1, OutputArray flow2);
-        void collectGarbage();
-
-    protected:
-        virtual void impl(const cv::ocl::oclMat& input0, const cv::ocl::oclMat& input1, cv::ocl::oclMat& dst1, cv::ocl::oclMat& dst2) = 0;
-
-    private:
-        int work_type_;
-        cv::ocl::oclMat buf_[6];
-        cv::ocl::oclMat u_, v_, flow_;
-    };
-
-    oclOpticalFlow::oclOpticalFlow(int work_type) : work_type_(work_type)
-    {
-    }
-
-    void oclOpticalFlow::calc(InputArray frame0, InputArray frame1, OutputArray flow1, OutputArray flow2)
-    {
-        ocl::oclMat& _frame0 = ocl::getOclMatRef(frame0);
-        ocl::oclMat& _frame1 = ocl::getOclMatRef(frame1);
-        ocl::oclMat& _flow1  = ocl::getOclMatRef(flow1);
-        ocl::oclMat& _flow2  = ocl::getOclMatRef(flow2);
-
-        CV_Assert( _frame1.type() == _frame0.type() );
-        CV_Assert( _frame1.size() == _frame0.size() );
-
-        cv::ocl::oclMat input0_ = convertToType(_frame0, work_type_, buf_[2], buf_[3]);
-        cv::ocl::oclMat input1_ = convertToType(_frame1, work_type_, buf_[4], buf_[5]);
-
-        impl(input0_, input1_, u_, v_);//go to tvl1 algorithm
-
-        u_.copyTo(_flow1);
-        v_.copyTo(_flow2);
-    }
-
-    void oclOpticalFlow::collectGarbage()
-    {
-        for (int i = 0; i < 6; ++i)
-            buf_[i].release();
-        u_.release();
-        v_.release();
-        flow_.release();
-    }
-}
-///////////////////////////////////////////////////////////////////
-// PyrLK_OCL
-
-namespace
-{
-    class PyrLK_OCL : public oclOpticalFlow
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        PyrLK_OCL();
-
-        void collectGarbage();
-
-    protected:
-        void impl(const ocl::oclMat& input0, const ocl::oclMat& input1, ocl::oclMat& dst1, ocl::oclMat& dst2);
-
-    private:
-        int winSize_;
-        int maxLevel_;
-        int iterations_;
-
-        ocl::PyrLKOpticalFlow alg_;
-    };
-
-    CV_INIT_ALGORITHM(PyrLK_OCL, "DenseOpticalFlowExt.PyrLK_OCL",
-        obj.info()->addParam(obj, "winSize", obj.winSize_);
-    obj.info()->addParam(obj, "maxLevel", obj.maxLevel_);
-    obj.info()->addParam(obj, "iterations", obj.iterations_));
-
-    PyrLK_OCL::PyrLK_OCL() : oclOpticalFlow(CV_8UC1)
-    {
-        winSize_ = alg_.winSize.width;
-        maxLevel_ = alg_.maxLevel;
-        iterations_ = alg_.iters;
-    }
-
-    void PyrLK_OCL::impl(const cv::ocl::oclMat& input0, const cv::ocl::oclMat& input1, cv::ocl::oclMat& dst1, cv::ocl::oclMat& dst2)
-    {
-        alg_.winSize.width = winSize_;
-        alg_.winSize.height = winSize_;
-        alg_.maxLevel = maxLevel_;
-        alg_.iters = iterations_;
-
-        alg_.dense(input0, input1, dst1, dst2);
-    }
-
-    void PyrLK_OCL::collectGarbage()
-    {
-        alg_.releaseMemory();
-        oclOpticalFlow::collectGarbage();
-    }
-}
-
-Ptr<DenseOpticalFlowExt> cv::superres::createOptFlow_PyrLK_OCL()
-{
-    return makePtr<PyrLK_OCL>();
-}
-
-///////////////////////////////////////////////////////////////////
-// DualTVL1_OCL
-
-namespace
-{
-    class DualTVL1_OCL : public oclOpticalFlow
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        DualTVL1_OCL();
-
-        void collectGarbage();
-
-    protected:
-        void impl(const cv::ocl::oclMat& input0, const cv::ocl::oclMat& input1, cv::ocl::oclMat& dst1, cv::ocl::oclMat& dst2);
-
-    private:
-        double tau_;
-        double lambda_;
-        double theta_;
-        int nscales_;
-        int warps_;
-        double epsilon_;
-        int iterations_;
-        bool useInitialFlow_;
-
-        ocl::OpticalFlowDual_TVL1_OCL alg_;
-    };
-
-    CV_INIT_ALGORITHM(DualTVL1_OCL, "DenseOpticalFlowExt.DualTVL1_OCL",
-    obj.info()->addParam(obj, "tau", obj.tau_);
-    obj.info()->addParam(obj, "lambda", obj.lambda_);
-    obj.info()->addParam(obj, "theta", obj.theta_);
-    obj.info()->addParam(obj, "nscales", obj.nscales_);
-    obj.info()->addParam(obj, "warps", obj.warps_);
-    obj.info()->addParam(obj, "epsilon", obj.epsilon_);
-    obj.info()->addParam(obj, "iterations", obj.iterations_);
-    obj.info()->addParam(obj, "useInitialFlow", obj.useInitialFlow_));
-
-    DualTVL1_OCL::DualTVL1_OCL() : oclOpticalFlow(CV_8UC1)
-    {
-        tau_ = alg_.tau;
-        lambda_ = alg_.lambda;
-        theta_ = alg_.theta;
-        nscales_ = alg_.nscales;
-        warps_ = alg_.warps;
-        epsilon_ = alg_.epsilon;
-        iterations_ = alg_.iterations;
-        useInitialFlow_ = alg_.useInitialFlow;
-    }
-
-    void DualTVL1_OCL::impl(const cv::ocl::oclMat& input0, const cv::ocl::oclMat& input1, cv::ocl::oclMat& dst1, cv::ocl::oclMat& dst2)
-    {
-        alg_.tau = tau_;
-        alg_.lambda = lambda_;
-        alg_.theta = theta_;
-        alg_.nscales = nscales_;
-        alg_.warps = warps_;
-        alg_.epsilon = epsilon_;
-        alg_.iterations = iterations_;
-        alg_.useInitialFlow = useInitialFlow_;
-
-        alg_(input0, input1, dst1, dst2);
-
-    }
-
-    void DualTVL1_OCL::collectGarbage()
-    {
-        alg_.collectGarbage();
-        oclOpticalFlow::collectGarbage();
-    }
-}
-
-Ptr<DenseOpticalFlowExt> cv::superres::createOptFlow_DualTVL1_OCL()
-{
-    return makePtr<DualTVL1_OCL>();
-}
-
-///////////////////////////////////////////////////////////////////
-// FarneBack
-
-namespace
-{
-    class FarneBack_OCL : public oclOpticalFlow
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        FarneBack_OCL();
-
-        void collectGarbage();
-
-    protected:
-        void impl(const cv::ocl::oclMat& input0, const cv::ocl::oclMat& input1, cv::ocl::oclMat& dst1, cv::ocl::oclMat& dst2);
-
-    private:
-        double pyrScale_;
-        int numLevels_;
-        int winSize_;
-        int numIters_;
-        int polyN_;
-        double polySigma_;
-        int flags_;
-
-        ocl::FarnebackOpticalFlow alg_;
-    };
-
-    CV_INIT_ALGORITHM(FarneBack_OCL, "DenseOpticalFlowExt.FarneBack_OCL",
-        obj.info()->addParam(obj, "pyrScale", obj.pyrScale_);
-    obj.info()->addParam(obj, "numLevels", obj.numLevels_);
-    obj.info()->addParam(obj, "winSize", obj.winSize_);
-    obj.info()->addParam(obj, "numIters", obj.numIters_);
-    obj.info()->addParam(obj, "polyN", obj.polyN_);
-    obj.info()->addParam(obj, "polySigma", obj.polySigma_);
-    obj.info()->addParam(obj, "flags", obj.flags_));
-
-    FarneBack_OCL::FarneBack_OCL() : oclOpticalFlow(CV_8UC1)
-    {
-        pyrScale_ = alg_.pyrScale;
-        numLevels_ = alg_.numLevels;
-        winSize_ = alg_.winSize;
-        numIters_ = alg_.numIters;
-        polyN_ = alg_.polyN;
-        polySigma_ = alg_.polySigma;
-        flags_ = alg_.flags;
-    }
-
-    void FarneBack_OCL::impl(const cv::ocl::oclMat& input0, const cv::ocl::oclMat& input1, cv::ocl::oclMat& dst1, cv::ocl::oclMat& dst2)
-    {
-        alg_.pyrScale = pyrScale_;
-        alg_.numLevels = numLevels_;
-        alg_.winSize = winSize_;
-        alg_.numIters = numIters_;
-        alg_.polyN = polyN_;
-        alg_.polySigma = polySigma_;
-        alg_.flags = flags_;
-
-        alg_(input0, input1, dst1, dst2);
-    }
-
-    void FarneBack_OCL::collectGarbage()
-    {
-        alg_.releaseMemory();
-        oclOpticalFlow::collectGarbage();
-    }
-}
-
-Ptr<DenseOpticalFlowExt> cv::superres::createOptFlow_Farneback_OCL()
-{
-    return makePtr<FarneBack_OCL>();
-}
-
-#endif
diff --git a/modules/superres/src/precomp.hpp b/modules/superres/src/precomp.hpp
index 0681bfa28..c3aeb665d 100644
--- a/modules/superres/src/precomp.hpp
+++ b/modules/superres/src/precomp.hpp
@@ -82,10 +82,6 @@
 #  include "opencv2/cudacodec.hpp"
 #endif
 
-#ifdef HAVE_OPENCV_OCL
-    #include "opencv2/ocl/private/util.hpp"
-#endif
-
 #ifdef HAVE_OPENCV_HIGHGUI
     #include "opencv2/highgui.hpp"
 #endif
diff --git a/modules/superres/src/super_resolution.cpp b/modules/superres/src/super_resolution.cpp
index 031a59b29..215416dd7 100644
--- a/modules/superres/src/super_resolution.cpp
+++ b/modules/superres/src/super_resolution.cpp
@@ -54,16 +54,20 @@ cv::superres::SuperResolution::SuperResolution()
 {
     frameSource_ = createFrameSource_Empty();
     firstCall_ = true;
+    isUmat_ = false;
 }
 
 void cv::superres::SuperResolution::setInput(const Ptr<FrameSource>& frameSource)
 {
     frameSource_ = frameSource;
     firstCall_ = true;
+    isUmat_ = false;
 }
 
 void cv::superres::SuperResolution::nextFrame(OutputArray frame)
 {
+    isUmat_ = frame.isUMat();
+
     if (firstCall_)
     {
         initImpl(frameSource_);
@@ -77,6 +81,7 @@ void cv::superres::SuperResolution::reset()
 {
     frameSource_->reset();
     firstCall_ = true;
+    isUmat_ = false;
 }
 
 void cv::superres::SuperResolution::collectGarbage()
diff --git a/modules/superres/test/test_precomp.hpp b/modules/superres/test/test_precomp.hpp
index 47334e324..553481818 100644
--- a/modules/superres/test/test_precomp.hpp
+++ b/modules/superres/test/test_precomp.hpp
@@ -52,9 +52,9 @@
 #define __OPENCV_TEST_PRECOMP_HPP__
 
 #include "opencv2/opencv_modules.hpp"
-#include "opencv2/core.hpp"
-#include "opencv2/imgproc.hpp"
+#include "opencv2/core/ocl.hpp"
 #include "opencv2/ts.hpp"
+#include "opencv2/imgproc.hpp"
 #include "opencv2/superres.hpp"
 #include "cvconfig.h"
 #include "input_array_utility.hpp"
diff --git a/modules/superres/test/test_superres.cpp b/modules/superres/test/test_superres.cpp
index 95baefda9..980c8ed60 100644
--- a/modules/superres/test/test_superres.cpp
+++ b/modules/superres/test/test_superres.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
 
 class AllignedFrameSource : public cv::superres::FrameSource
 {
@@ -52,6 +53,7 @@ public:
 
 private:
     cv::Ptr<cv::superres::FrameSource> base_;
+
     cv::Mat origFrame_;
     int scale_;
 };
@@ -67,9 +69,7 @@ void AllignedFrameSource::nextFrame(cv::OutputArray frame)
     base_->nextFrame(origFrame_);
 
     if (origFrame_.rows % scale_ == 0 && origFrame_.cols % scale_ == 0)
-    {
         cv::superres::arrCopy(origFrame_, frame);
-    }
     else
     {
         cv::Rect ROI(0, 0, (origFrame_.cols / scale_) * scale_, (origFrame_.rows / scale_) * scale_);
@@ -92,6 +92,7 @@ public:
 
 private:
     cv::Ptr<cv::superres::FrameSource> base_;
+
     cv::Mat origFrame_;
     cv::Mat blurred_;
     cv::Mat deg_;
@@ -104,28 +105,25 @@ DegradeFrameSource::DegradeFrameSource(const cv::Ptr<cv::superres::FrameSource>&
     CV_Assert( base_ );
 }
 
-void addGaussNoise(cv::Mat& image, double sigma)
+static void addGaussNoise(cv::OutputArray _image, double sigma)
 {
-    cv::Mat noise(image.size(), CV_32FC(image.channels()));
+    int type = _image.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    cv::Mat noise(_image.size(), CV_32FC(cn));
     cvtest::TS::ptr()->get_rng().fill(noise, cv::RNG::NORMAL, 0.0, sigma);
 
-    cv::addWeighted(image, 1.0, noise, 1.0, 0.0, image, image.depth());
+    cv::addWeighted(_image, 1.0, noise, 1.0, 0.0, _image, depth);
 }
 
-void addSpikeNoise(cv::Mat& image, int frequency)
+static void addSpikeNoise(cv::OutputArray _image, int frequency)
 {
-    cv::Mat_<uchar> mask(image.size(), 0);
+    cv::Mat_<uchar> mask(_image.size(), 0);
 
     for (int y = 0; y < mask.rows; ++y)
-    {
         for (int x = 0; x < mask.cols; ++x)
-        {
             if (cvtest::TS::ptr()->get_rng().uniform(0, frequency) < 1)
                 mask(y, x) = 255;
-        }
-    }
 
-    image.setTo(cv::Scalar::all(255), mask);
+    _image.setTo(cv::Scalar::all(255), mask);
 }
 
 void DegradeFrameSource::nextFrame(cv::OutputArray frame)
@@ -146,7 +144,7 @@ void DegradeFrameSource::reset()
     base_->reset();
 }
 
-double MSSIM(const cv::Mat& i1, const cv::Mat& i2)
+double MSSIM(cv::InputArray _i1, cv::InputArray _i2)
 {
     const double C1 = 6.5025;
     const double C2 = 58.5225;
@@ -154,8 +152,8 @@ double MSSIM(const cv::Mat& i1, const cv::Mat& i2)
     const int depth = CV_32F;
 
     cv::Mat I1, I2;
-    i1.convertTo(I1, depth);
-    i2.convertTo(I2, depth);
+    _i1.getMat().convertTo(I1, depth);
+    _i2.getMat().convertTo(I2, depth);
 
     cv::Mat I2_2  = I2.mul(I2); // I2^2
     cv::Mat I1_2  = I1.mul(I1); // I1^2
@@ -201,7 +199,7 @@ double MSSIM(const cv::Mat& i1, const cv::Mat& i2)
     // mssim = average of ssim map
     cv::Scalar mssim = cv::mean(ssim_map);
 
-    if (i1.channels() == 1)
+    if (_i1.channels() == 1)
         return mssim[0];
 
     return (mssim[0] + mssim[1] + mssim[3]) / 3;
@@ -210,9 +208,11 @@ double MSSIM(const cv::Mat& i1, const cv::Mat& i2)
 class SuperResolution : public testing::Test
 {
 public:
+    template <typename T>
     void RunTest(cv::Ptr<cv::superres::SuperResolution> superRes);
 };
 
+template <typename T>
 void SuperResolution::RunTest(cv::Ptr<cv::superres::SuperResolution> superRes)
 {
     const std::string inputVideoName = cvtest::TS::ptr()->get_data_path() + "car.avi";
@@ -245,7 +245,8 @@ void SuperResolution::RunTest(cv::Ptr<cv::superres::SuperResolution> superRes)
     double srAvgMSSIM = 0.0;
     const int count = 10;
 
-    cv::Mat goldFrame, superResFrame;
+    cv::Mat goldFrame;
+    T superResFrame;
     for (int i = 0; i < count; ++i)
     {
         goldSource->nextFrame(goldFrame);
@@ -266,23 +267,28 @@ void SuperResolution::RunTest(cv::Ptr<cv::superres::SuperResolution> superRes)
 
 TEST_F(SuperResolution, BTVL1)
 {
-    RunTest(cv::superres::createSuperResolution_BTVL1());
+    RunTest<cv::Mat>(cv::superres::createSuperResolution_BTVL1());
 }
 
 #if defined(HAVE_CUDA) && defined(HAVE_OPENCV_CUDAARITHM) && defined(HAVE_OPENCV_CUDAWARPING) && defined(HAVE_OPENCV_CUDAFILTERS)
 
 TEST_F(SuperResolution, BTVL1_CUDA)
 {
-    RunTest(cv::superres::createSuperResolution_BTVL1_CUDA());
+    RunTest<cv::Mat>(cv::superres::createSuperResolution_BTVL1_CUDA());
 }
 
 #endif
 
-#if defined(HAVE_OPENCV_OCL) && defined(HAVE_OPENCL)
+#ifdef HAVE_OPENCL
 
-TEST_F(SuperResolution, BTVL1_OCL)
+namespace cvtest {
+namespace ocl {
+
+OCL_TEST_F(SuperResolution, BTVL1)
 {
-    RunTest(cv::superres::createSuperResolution_BTVL1_OCL());
+    RunTest<cv::UMat>(cv::superres::createSuperResolution_BTVL1());
 }
 
+} } // namespace cvtest::ocl
+
 #endif
diff --git a/modules/ts/CMakeLists.txt b/modules/ts/CMakeLists.txt
index 0f9c3fe77..3e1b5a05a 100644
--- a/modules/ts/CMakeLists.txt
+++ b/modules/ts/CMakeLists.txt
@@ -7,10 +7,6 @@ endif()
 set(OPENCV_MODULE_TYPE STATIC)
 set(OPENCV_MODULE_IS_PART_OF_WORLD FALSE)
 
-if(HAVE_CUDA)
-  ocv_include_directories(${CUDA_INCLUDE_DIRS})
-endif()
-
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
 
 ocv_add_module(ts opencv_core opencv_imgproc opencv_highgui)
diff --git a/modules/ts/include/opencv2/ts.hpp b/modules/ts/include/opencv2/ts.hpp
index 8e898af7e..72a7ae684 100644
--- a/modules/ts/include/opencv2/ts.hpp
+++ b/modules/ts/include/opencv2/ts.hpp
@@ -4,6 +4,8 @@
 #include "opencv2/core/cvdef.h"
 #include <stdarg.h> // for va_list
 
+#include "cvconfig.h"
+
 #ifdef HAVE_WINRT
     #pragma warning(disable:4447) // Disable warning 'main' signature found without threading model
 #endif
@@ -548,6 +550,15 @@ CV_EXPORTS void printVersionInfo(bool useStdOut = true);
 #endif
 #endif
 
+#if defined(HAVE_OPENCL) && !defined(CV_BUILD_OCL_MODULE)
+namespace cvtest { namespace ocl {
+void dumpOpenCLDevice();
+}}
+#define TEST_DUMP_OCL_INFO cvtest::ocl::dumpOpenCLDevice();
+#else
+#define TEST_DUMP_OCL_INFO
+#endif
+
 #define CV_TEST_MAIN(resourcesubdir, ...) \
 int main(int argc, char **argv) \
 { \
@@ -555,6 +566,7 @@ int main(int argc, char **argv) \
     ::testing::InitGoogleTest(&argc, argv); \
     cvtest::printVersionInfo(); \
     __CV_TEST_EXEC_ARGS(__VA_ARGS__) \
+    TEST_DUMP_OCL_INFO \
     return RUN_ALL_TESTS(); \
 }
 
diff --git a/modules/ts/include/opencv2/ts/ocl_perf.hpp b/modules/ts/include/opencv2/ts/ocl_perf.hpp
index 52f815d1c..c2e860067 100644
--- a/modules/ts/include/opencv2/ts/ocl_perf.hpp
+++ b/modules/ts/include/opencv2/ts/ocl_perf.hpp
@@ -45,46 +45,68 @@
 #include "ocl_test.hpp"
 #include "ts_perf.hpp"
 
-#ifdef HAVE_OPENCL
-
 namespace cvtest {
 namespace ocl {
 
 using namespace perf;
 
+using std::tr1::get;
+using std::tr1::tuple;
+
 #define OCL_PERF_STRATEGY PERF_STRATEGY_SIMPLE
 
+#define OCL_PERF_TEST(fixture, name) SIMPLE_PERF_TEST(fixture, name)
 #define OCL_PERF_TEST_P(fixture, name, params) SIMPLE_PERF_TEST_P(fixture, name, params)
 
-#define SIMPLE_PERF_TEST_P(fixture, name, params)\
-    class OCL##_##fixture##_##name : public fixture {\
-    public:\
-        OCL##_##fixture##_##name() {}\
-    protected:\
-        virtual void PerfTestBody();\
-    };\
-    TEST_P(OCL##_##fixture##_##name, name){ declare.strategy(OCL_PERF_STRATEGY); RunPerfTestBody(); }\
-    INSTANTIATE_TEST_CASE_P(/*none*/, OCL##_##fixture##_##name, params);\
+#define SIMPLE_PERF_TEST(fixture, name) \
+    class OCL##_##fixture##_##name : \
+        public ::perf::TestBase \
+    { \
+    public: \
+        OCL##_##fixture##_##name() { } \
+    protected: \
+        virtual void PerfTestBody(); \
+    }; \
+    TEST_F(OCL##_##fixture##_##name, name) { declare.strategy(OCL_PERF_STRATEGY); RunPerfTestBody(); } \
     void OCL##_##fixture##_##name::PerfTestBody()
 
+#define SIMPLE_PERF_TEST_P(fixture, name, params) \
+    class OCL##_##fixture##_##name : \
+        public fixture \
+    { \
+    public: \
+        OCL##_##fixture##_##name() { } \
+    protected: \
+        virtual void PerfTestBody(); \
+    }; \
+    TEST_P(OCL##_##fixture##_##name, name) { declare.strategy(OCL_PERF_STRATEGY); RunPerfTestBody(); } \
+    INSTANTIATE_TEST_CASE_P(/*none*/, OCL##_##fixture##_##name, params); \
+    void OCL##_##fixture##_##name::PerfTestBody()
 
-#define OCL_SIZE_1000 Size(1000, 1000)
-#define OCL_SIZE_2000 Size(2000, 2000)
-#define OCL_SIZE_4000 Size(4000, 4000)
+#define OCL_SIZE_1 szVGA
+#define OCL_SIZE_2 sz720p
+#define OCL_SIZE_3 sz1080p
+#define OCL_SIZE_4 sz2160p
 
-#define OCL_TEST_SIZES ::testing::Values(OCL_SIZE_1000, OCL_SIZE_2000, OCL_SIZE_4000)
+#define OCL_TEST_SIZES ::testing::Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, OCL_SIZE_4)
 #define OCL_TEST_TYPES ::testing::Values(CV_8UC1, CV_32FC1, CV_8UC4, CV_32FC4)
+#define OCL_TEST_TYPES_14 OCL_TEST_TYPES
+#define OCL_TEST_TYPES_134 ::testing::Values(CV_8UC1, CV_32FC1, CV_8UC3, CV_32FC3, CV_8UC4, CV_32FC4)
 
 #define OCL_PERF_ENUM ::testing::Values
 
 // TODO Replace finish call to dstUMat.wait()
 #define OCL_TEST_CYCLE() \
-    for (; startTimer(), next(); cvtest::ocl::perf::safeFinish(), stopTimer())
+    for (cvtest::ocl::perf::safeFinish(); startTimer(), next(); cvtest::ocl::perf::safeFinish(), stopTimer())
+
+#define OCL_TEST_CYCLE_N(n) \
+    for(declare.iterations(n), cvtest::ocl::perf::safeFinish(); startTimer(), next(); cvtest::ocl::perf::safeFinish(), stopTimer())
 
 #define OCL_TEST_CYCLE_MULTIRUN(runsNum) \
-    for (declare.runs(runsNum); startTimer(), next(); cvtest::ocl::perf::safeFinish(), stopTimer()) \
+    for (declare.runs(runsNum), cvtest::ocl::perf::safeFinish(); startTimer(), next(); cvtest::ocl::perf::safeFinish(), stopTimer()) \
         for (int r = 0; r < runsNum; cvtest::ocl::perf::safeFinish(), ++r)
 
+
 namespace perf {
 
 // Check for current device limitation
@@ -97,7 +119,7 @@ CV_EXPORTS void randu(InputOutputArray dst);
 inline void safeFinish()
 {
     if (cv::ocl::useOpenCL())
-        cv::ocl::finish2();
+        cv::ocl::finish();
 }
 
 } // namespace perf
@@ -106,6 +128,4 @@ using namespace perf;
 } // namespace cvtest::ocl
 } // namespace cvtest
 
-#endif // HAVE_OPENCL
-
 #endif // __OPENCV_TS_OCL_PERF_HPP__
diff --git a/modules/ts/include/opencv2/ts/ocl_test.hpp b/modules/ts/include/opencv2/ts/ocl_test.hpp
index 57220c7a7..169e34fdc 100644
--- a/modules/ts/include/opencv2/ts/ocl_test.hpp
+++ b/modules/ts/include/opencv2/ts/ocl_test.hpp
@@ -42,11 +42,8 @@
 #ifndef __OPENCV_TS_OCL_TEST_HPP__
 #define __OPENCV_TS_OCL_TEST_HPP__
 
-#include "cvconfig.h" // to get definition of HAVE_OPENCL
 #include "opencv2/opencv_modules.hpp"
 
-#ifdef HAVE_OPENCL
-
 #include "opencv2/ts.hpp"
 
 #include "opencv2/highgui.hpp"
@@ -60,54 +57,29 @@ namespace ocl {
 using namespace cv;
 using namespace testing;
 
-namespace traits {
-
-template <typename T>
-struct GetMatForRead
-{
-};
-template <>
-struct GetMatForRead<Mat>
-{
-    static const Mat get(const Mat& m) { return m; }
-};
-template <>
-struct GetMatForRead<UMat>
-{
-    static const Mat get(const UMat& m) { return m.getMat(ACCESS_READ); }
-};
-
-} // namespace traits
-
-template <typename T>
-const Mat getMatForRead(const T& mat)
-{
-    return traits::GetMatForRead<T>::get(mat);
-}
-
 extern int test_loop_times;
 
 #define MAX_VALUE 357
 
 #define EXPECT_MAT_NORM(mat, eps) \
 { \
-    EXPECT_LE(checkNorm(mat), eps) \
+    EXPECT_LE(TestUtils::checkNorm(mat), eps) \
 }
 
 #define EXPECT_MAT_NEAR(mat1, mat2, eps) \
 { \
-   ASSERT_EQ(mat1.type(), mat2.type()); \
-   ASSERT_EQ(mat1.size(), mat2.size()); \
-   EXPECT_LE(checkNorm(mat1, mat2), eps) \
-       << cv::format("Size: %d x %d", mat1.size().width, mat1.size().height) << std::endl; \
+    ASSERT_EQ(mat1.type(), mat2.type()); \
+    ASSERT_EQ(mat1.size(), mat2.size()); \
+    EXPECT_LE(TestUtils::checkNorm(mat1, mat2), eps) \
+        << "Size: " << mat1.size() << std::endl; \
 }
 
 #define EXPECT_MAT_NEAR_RELATIVE(mat1, mat2, eps) \
 { \
-   ASSERT_EQ(mat1.type(), mat2.type()); \
-   ASSERT_EQ(mat1.size(), mat2.size()); \
-   EXPECT_LE(checkNormRelative(mat1, mat2), eps) \
-       << cv::format("Size: %d x %d", mat1.size().width, mat1.size().height) << std::endl; \
+    ASSERT_EQ(mat1.type(), mat2.type()); \
+    ASSERT_EQ(mat1.size(), mat2.size()); \
+    EXPECT_LE(TestUtils::checkNormRelative(mat1, mat2), eps) \
+        << "Size: " << mat1.size() << std::endl; \
 }
 
 #define OCL_EXPECT_MATS_NEAR(name, eps) \
@@ -134,8 +106,8 @@ extern int test_loop_times;
 { \
     ASSERT_EQ(mat1.type(), mat2.type()); \
     ASSERT_EQ(mat1.size(), mat2.size()); \
-    EXPECT_LE(checkSimilarity(mat1, mat2), eps); \
-        << cv::format("Size: %d x %d", mat1.size().width, mat1.size().height) << std::endl; \
+    EXPECT_LE(checkSimilarity(mat1, mat2), eps) \
+        << "Size: " << mat1.size() << std::endl; \
 }
 
 using perf::MatDepth;
@@ -227,54 +199,22 @@ struct CV_EXPORTS TestUtils
     // If the two vectors are not equal, it will return the difference in vector size
     // Else it will return (total diff of each 1 and 2 rects covered pixels)/(total 1 rects covered pixels)
     // The smaller, the better matched
-    static double checkRectSimilarity(cv::Size sz, std::vector<cv::Rect>& ob1, std::vector<cv::Rect>& ob2);
+    static double checkRectSimilarity(const cv::Size & sz, std::vector<cv::Rect>& ob1, std::vector<cv::Rect>& ob2);
 
     //! read image from testdata folder.
-
     static cv::Mat readImage(const String &fileName, int flags = cv::IMREAD_COLOR);
     static cv::Mat readImageType(const String &fname, int type);
 
-    static double checkNorm(const cv::Mat &m);
-    static double checkNorm(const cv::Mat &m1, const cv::Mat &m2);
-    static double checkSimilarity(const cv::Mat &m1, const cv::Mat &m2);
-    static inline double checkNormRelative(const Mat &m1, const Mat &m2)
+    static double checkNorm(InputArray m);
+    static double checkNorm(InputArray m1, InputArray m2);
+    static double checkSimilarity(InputArray m1, InputArray m2);
+    static void showDiff(InputArray _src, InputArray _gold, InputArray _actual, double eps, bool alwaysShow);
+
+    static inline double checkNormRelative(InputArray m1, InputArray m2)
     {
-        return cv::norm(m1, m2, cv::NORM_INF) /
+        return cv::norm(m1.getMat(), m2.getMat(), cv::NORM_INF) /
                 std::max((double)std::numeric_limits<float>::epsilon(),
-                         (double)std::max(cv::norm(m1, cv::NORM_INF), norm(m2, cv::NORM_INF)));
-    }
-    static void showDiff(const Mat& src, const Mat& gold, const Mat& actual, double eps, bool alwaysShow = false);
-
-    template <typename T1>
-    static double checkNorm(const T1& m)
-    {
-        return checkNorm(getMatForRead(m));
-    }
-    template <typename T1, typename T2>
-    static double checkNorm(const T1& m1, const T2& m2)
-    {
-        return checkNorm(getMatForRead(m1), getMatForRead(m2));
-    }
-    template <typename T1, typename T2>
-    static double checkSimilarity(const T1& m1, const T2& m2)
-    {
-        return checkSimilarity(getMatForRead(m1), getMatForRead(m2));
-    }
-    template <typename T1, typename T2>
-    static inline double checkNormRelative(const T1& m1, const T2& m2)
-    {
-        const Mat _m1 = getMatForRead(m1);
-        const Mat _m2 = getMatForRead(m2);
-        return checkNormRelative(_m1, _m2);
-    }
-
-    template <typename T1, typename T2, typename T3>
-    static void showDiff(const T1& src, const T2& gold, const T3& actual, double eps, bool alwaysShow = false)
-    {
-        const Mat _src = getMatForRead(src);
-        const Mat _gold = getMatForRead(gold);
-        const Mat _actual = getMatForRead(actual);
-        showDiff(_src, _gold, _actual, eps, alwaysShow);
+                         (double)std::max(cv::norm(m1.getMat(), cv::NORM_INF), norm(m2.getMat(), cv::NORM_INF)));
     }
 };
 
@@ -318,6 +258,8 @@ IMPLEMENT_PARAM_CLASS(Channels, int)
 #endif // IMPLEMENT_PARAM_CLASS
 
 #define OCL_TEST_P TEST_P
+#define OCL_TEST_F(name, ...) typedef name OCL_##name; TEST_F(OCL_##name, __VA_ARGS__)
+#define OCL_TEST(name, ...) TEST(OCL_##name, __VA_ARGS__)
 
 #define OCL_OFF(fn) cv::ocl::setUseOpenCL(false); fn
 #define OCL_ON(fn) cv::ocl::setUseOpenCL(true); fn
@@ -332,8 +274,6 @@ CV_ENUM(BorderType, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_WR
 #define OCL_INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \
     INSTANTIATE_TEST_CASE_P(OCL_ ## prefix, test_case_name, generator)
 
-}} // namespace cvtest::ocl
-
-#endif // HAVE_OPENCL
+} } // namespace cvtest::ocl
 
 #endif // __OPENCV_TS_OCL_TEST_HPP__
diff --git a/modules/ts/include/opencv2/ts/ts_perf.hpp b/modules/ts/include/opencv2/ts/ts_perf.hpp
index 499b53cad..e3b6481d1 100644
--- a/modules/ts/include/opencv2/ts/ts_perf.hpp
+++ b/modules/ts/include/opencv2/ts/ts_perf.hpp
@@ -164,6 +164,7 @@ class CV_EXPORTS Regression
 {
 public:
     static Regression& add(TestBase* test, const std::string& name, cv::InputArray array, double eps = DBL_EPSILON, ERROR_TYPE err = ERROR_ABSOLUTE);
+    static Regression& addMoments(TestBase* test, const std::string& name, const cv::Moments & array, double eps = DBL_EPSILON, ERROR_TYPE err = ERROR_ABSOLUTE);
     static Regression& addKeypoints(TestBase* test, const std::string& name, const std::vector<cv::KeyPoint>& array, double eps = DBL_EPSILON, ERROR_TYPE err = ERROR_ABSOLUTE);
     static Regression& addMatches(TestBase* test, const std::string& name, const std::vector<cv::DMatch>& array, double eps = DBL_EPSILON, ERROR_TYPE err = ERROR_ABSOLUTE);
     static void Init(const std::string& testSuitName, const std::string& ext = ".xml");
@@ -201,9 +202,10 @@ private:
 };
 
 #define SANITY_CHECK(array, ...) ::perf::Regression::add(this, #array, array , ## __VA_ARGS__)
+#define SANITY_CHECK_MOMENTS(array, ...) ::perf::Regression::addMoments(this, #array, array , ## __VA_ARGS__)
 #define SANITY_CHECK_KEYPOINTS(array, ...) ::perf::Regression::addKeypoints(this, #array, array , ## __VA_ARGS__)
 #define SANITY_CHECK_MATCHES(array, ...) ::perf::Regression::addMatches(this, #array, array , ## __VA_ARGS__)
-#define SANITY_CHECK_NOTHING() this->setVerified();
+#define SANITY_CHECK_NOTHING() this->setVerified()
 
 class CV_EXPORTS GpuPerf
 {
@@ -253,7 +255,7 @@ enum PERF_STRATEGY
 {
     PERF_STRATEGY_DEFAULT = -1,
     PERF_STRATEGY_BASE = 0,
-    PERF_STRATEGY_SIMPLE = 1,
+    PERF_STRATEGY_SIMPLE = 1
 };
 
 
@@ -377,6 +379,9 @@ template<typename T> class TestBaseWithParam: public TestBase, public ::testing:
 typedef std::tr1::tuple<cv::Size, MatType> Size_MatType_t;
 typedef TestBaseWithParam<Size_MatType_t> Size_MatType;
 
+typedef std::tr1::tuple<cv::Size, MatDepth> Size_MatDepth_t;
+typedef TestBaseWithParam<Size_MatDepth_t> Size_MatDepth;
+
 /*****************************************************************************************\
 *                              Print functions for googletest                             *
 \*****************************************************************************************/
@@ -387,6 +392,7 @@ CV_EXPORTS void PrintTo(const MatType& t, std::ostream* os);
 namespace cv
 {
 
+CV_EXPORTS void PrintTo(const String& str, ::std::ostream* os);
 CV_EXPORTS void PrintTo(const Size& sz, ::std::ostream* os);
 
 } //namespace cv
@@ -504,6 +510,15 @@ CV_EXPORTS void PrintTo(const Size& sz, ::std::ostream* os);
 #endif
 #endif
 
+#if defined(HAVE_OPENCL) && !defined(CV_BUILD_OCL_MODULE)
+namespace cvtest { namespace ocl {
+void dumpOpenCLDevice();
+}}
+#define TEST_DUMP_OCL_INFO cvtest::ocl::dumpOpenCLDevice();
+#else
+#define TEST_DUMP_OCL_INFO
+#endif
+
 #define CV_PERF_TEST_MAIN_INTERNALS(modulename, impls, ...)	\
     ::perf::Regression::Init(#modulename); \
     ::perf::TestBase::Init(std::vector<std::string>(impls, impls + sizeof impls / sizeof *impls), \
@@ -513,6 +528,7 @@ CV_EXPORTS void PrintTo(const Size& sz, ::std::ostream* os);
     ::testing::Test::RecordProperty("cv_module_name", #modulename); \
     ::perf::TestBase::RecordRunParameters(); \
     __CV_TEST_EXEC_ARGS(__VA_ARGS__) \
+    TEST_DUMP_OCL_INFO \
     return RUN_ALL_TESTS();
 
 // impls must be an array, not a pointer; "plain" should always be one of the implementations
diff --git a/modules/ts/misc/chart.py b/modules/ts/misc/chart.py
index 39a60eb2e..2663c7875 100755
--- a/modules/ts/misc/chart.py
+++ b/modules/ts/misc/chart.py
@@ -168,7 +168,7 @@ if __name__ == "__main__":
             print >> sys.stderr, "%4s:   %s" % (i, name)
             i += 1
         if names1:
-            print >> sys.stderr, "Other suits in this log (can not be choosen):"
+            print >> sys.stderr, "Other suits in this log (can not be chosen):"
             for name in sorted(names1):
                 print >> sys.stderr, "%4s:   %s" % (i, name)
                 i += 1
diff --git a/modules/ts/misc/xls-report.py b/modules/ts/misc/xls-report.py
index e71a7f66c..6b90b5924 100755
--- a/modules/ts/misc/xls-report.py
+++ b/modules/ts/misc/xls-report.py
@@ -97,6 +97,9 @@ re_data_type = re.compile(r'^ (?: 8 | 16 | 32 | 64 ) [USF] C [1234] $', re.VERBO
 
 time_style = xlwt.easyxf(num_format_str='#0.00')
 no_time_style = xlwt.easyxf('pattern: pattern solid, fore_color gray25')
+failed_style = xlwt.easyxf('pattern: pattern solid, fore_color red')
+noimpl_style = xlwt.easyxf('pattern: pattern solid, fore_color orange')
+style_dict = {"failed": failed_style, "noimpl":noimpl_style}
 
 speedup_style = time_style
 good_speedup_style = xlwt.easyxf('font: color green', num_format_str='#0.00')
@@ -328,7 +331,7 @@ def main():
 
                 for c in config_names:
                     if c in configs:
-                        sheet.write(row, col, configs[c], time_style)
+                        sheet.write(row, col, configs[c], style_dict.get(configs[c], time_style))
                     else:
                         sheet.write(row, col, None, no_time_style)
                     col += 1
diff --git a/modules/ts/src/cuda_perf.cpp b/modules/ts/src/cuda_perf.cpp
index 61e9e3401..c5c278142 100644
--- a/modules/ts/src/cuda_perf.cpp
+++ b/modules/ts/src/cuda_perf.cpp
@@ -44,10 +44,6 @@
 #include "opencv2/ts/cuda_perf.hpp"
 #include "opencv2/core/cuda.hpp"
 
-#ifdef HAVE_CUDA
-    #include <cuda_runtime.h>
-#endif
-
 using namespace cv;
 using namespace std;
 
@@ -260,44 +256,8 @@ namespace perf
     void printCudaInfo()
     {
         printOsInfo();
-    #ifndef HAVE_CUDA
-        printf("[----------]\n[ GPU INFO ] \tOpenCV was built without CUDA support.\n[----------]\n"), fflush(stdout);
-    #else
-        int driver;
-        cudaDriverGetVersion(&driver);
-
-        printf("[----------]\n"), fflush(stdout);
-        printf("[ GPU INFO ] \tCUDA Driver  version: %d.\n", driver), fflush(stdout);
-        printf("[ GPU INFO ] \tCUDA Runtime version: %d.\n", CUDART_VERSION), fflush(stdout);
-        printf("[----------]\n"), fflush(stdout);
-
-        printf("[----------]\n"), fflush(stdout);
-        printf("[ GPU INFO ] \tCUDA module was compiled for the following GPU archs.\n"), fflush(stdout);
-        printf("[      BIN ] \t%s.\n", CUDA_ARCH_BIN), fflush(stdout);
-        printf("[      PTX ] \t%s.\n", CUDA_ARCH_PTX), fflush(stdout);
-        printf("[----------]\n"), fflush(stdout);
-
-        printf("[----------]\n"), fflush(stdout);
-        int deviceCount = cv::cuda::getCudaEnabledDeviceCount();
-        printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount), fflush(stdout);
-        printf("[----------]\n"), fflush(stdout);
-
-        for (int i = 0; i < deviceCount; ++i)
-        {
-            cv::cuda::DeviceInfo info(i);
-
-            printf("[----------]\n"), fflush(stdout);
-            printf("[ DEVICE   ] \t# %d %s.\n", i, info.name()), fflush(stdout);
-            printf("[          ] \tCompute capability: %d.%d\n", (int)info.majorVersion(), (int)info.minorVersion()), fflush(stdout);
-            printf("[          ] \tMulti Processor Count:  %d\n", info.multiProcessorCount()), fflush(stdout);
-            printf("[          ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)), fflush(stdout);
-            printf("[          ] \tFree  memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory()  / 1024.0) / 1024.0)), fflush(stdout);
-            if (!info.isCompatible())
-                printf("[ GPU INFO ] \tThis device is NOT compatible with current CUDA module build\n");
-            printf("[----------]\n"), fflush(stdout);
-        }
-
-    #endif
+        for (int i = 0; i < cv::cuda::getCudaEnabledDeviceCount(); i++)
+            cv::cuda::printCudaDeviceInfo(i);
     }
 
     struct KeypointIdxCompare
diff --git a/modules/ts/src/ocl_perf.cpp b/modules/ts/src/ocl_perf.cpp
index 9151f8889..8dacf219f 100644
--- a/modules/ts/src/ocl_perf.cpp
+++ b/modules/ts/src/ocl_perf.cpp
@@ -43,8 +43,6 @@
 
 #include "opencv2/ts/ocl_perf.hpp"
 
-#ifdef HAVE_OPENCL
-
 namespace cvtest {
 namespace ocl {
 
@@ -53,45 +51,33 @@ namespace perf {
 void checkDeviceMaxMemoryAllocSize(const Size& size, int type, int factor)
 {
     assert(factor > 0);
+
     if (!cv::ocl::useOpenCL())
         return;
-    int cn = CV_MAT_CN(type);
-    int cn_ocl = cn == 3 ? 4 : cn;
-    int type_ocl = CV_MAKE_TYPE(CV_MAT_DEPTH(type), cn_ocl);
-    size_t memSize = size.area() * CV_ELEM_SIZE(type_ocl);
+
+    size_t memSize = size.area() * CV_ELEM_SIZE(type);
     const cv::ocl::Device& dev = cv::ocl::Device::getDefault();
+
     if (memSize * factor >= dev.maxMemAllocSize())
-    {
         throw ::perf::TestBase::PerfSkipTestException();
-    }
 }
 
 void randu(InputOutputArray dst)
 {
     if (dst.depth() == CV_8U)
-    {
         cv::randu(dst, 0, 256);
-    }
     else if (dst.depth() == CV_8S)
-    {
         cv::randu(dst, -128, 128);
-    }
     else if (dst.depth() == CV_16U)
-    {
         cv::randu(dst, 0, 1024);
-    }
     else if (dst.depth() == CV_32F || dst.depth() == CV_64F)
-    {
         cv::randu(dst, -1.0, 1.0);
-    }
-    else // (dst.depth() == CV_16S || dst.depth() == CV_32S)
-    {
+    else if (dst.depth() == CV_16S || dst.depth() == CV_32S)
         cv::randu(dst, -4096, 4096);
-    }
+    else
+        CV_Error(Error::StsUnsupportedFormat, "Unsupported format");
 }
 
 } // namespace perf
 
-}} // namespace cvtest::ocl
-
-#endif // HAVE_OPENCL
+} } // namespace cvtest::ocl
diff --git a/modules/ts/src/ocl_test.cpp b/modules/ts/src/ocl_test.cpp
index d2ee77199..7da04f656 100644
--- a/modules/ts/src/ocl_test.cpp
+++ b/modules/ts/src/ocl_test.cpp
@@ -43,8 +43,6 @@
 
 #include "opencv2/ts/ocl_test.hpp"
 
-#ifdef HAVE_OPENCL
-
 namespace cvtest {
 namespace ocl {
 
@@ -52,6 +50,142 @@ using namespace cv;
 
 int test_loop_times = 1; // TODO Read from command line / environment
 
+
+#define DUMP_PROPERTY_XML(propertyName, propertyValue) \
+    do { \
+        std::stringstream ssName, ssValue;\
+        ssName << propertyName;\
+        ssValue << (propertyValue); \
+        ::testing::Test::RecordProperty(ssName.str(), ssValue.str()); \
+    } while (false)
+
+#define DUMP_MESSAGE_STDOUT(msg) \
+    do { \
+        std::cout << msg << std::endl; \
+    } while (false)
+
+static std::string bytesToStringRepr(size_t value)
+{
+    size_t b = value % 1024;
+    value /= 1024;
+
+    size_t kb = value % 1024;
+    value /= 1024;
+
+    size_t mb = value % 1024;
+    value /= 1024;
+
+    size_t gb = value;
+
+    std::ostringstream stream;
+
+    if (gb > 0)
+        stream << gb << " GB ";
+    if (mb > 0)
+        stream << mb << " MB ";
+    if (kb > 0)
+        stream << kb << " kB ";
+    if (b > 0)
+        stream << b << " B";
+
+    return stream.str();
+}
+
+void dumpOpenCLDevice()
+{
+    using namespace cv::ocl;
+
+    try
+    {
+        std::vector<PlatformInfo> platforms;
+        cv::ocl::getPlatfomsInfo(platforms);
+        if (platforms.size() > 0)
+        {
+            DUMP_MESSAGE_STDOUT("OpenCL Platforms: ");
+            for (size_t i = 0; i < platforms.size(); i++)
+            {
+                const PlatformInfo* platform = &platforms[i];
+                DUMP_MESSAGE_STDOUT("    " << platform->name().c_str());
+                Device current_device;
+                for (int j = 0; j < platform->deviceNumber(); j++)
+                {
+                    platform->getDevice(current_device, j);
+                    const char* deviceTypeStr = current_device.type() == Device::TYPE_CPU
+                        ? ("CPU") : (current_device.type() == Device::TYPE_GPU ? current_device.hostUnifiedMemory() ? "iGPU" : "dGPU" : "unknown");
+                    DUMP_MESSAGE_STDOUT( "        " << deviceTypeStr << ": " << current_device.name().c_str() << " (" << current_device.version().c_str() << ")");
+                    DUMP_PROPERTY_XML( cv::format("cv_ocl_platform_%d_device_%d", (int)i, (int)j ),
+                        cv::format("(Platform=%s)(Type=%s)(Name=%s)(Version=%s)",
+                        platform->name().c_str(), deviceTypeStr, current_device.name().c_str(), current_device.version().c_str()) );
+                }
+            }
+        }
+        else
+        {
+            DUMP_MESSAGE_STDOUT("OpenCL is not available");
+            DUMP_PROPERTY_XML("cv_ocl", "not available");
+            return;
+        }
+
+        const Device& device = Device::getDefault();
+        DUMP_MESSAGE_STDOUT("Current OpenCL device: ");
+
+#if 0
+        DUMP_MESSAGE_STDOUT("    Platform = "<< device.getPlatform().name());
+        DUMP_PROPERTY_XML("cv_ocl_current_platformName", device.getPlatform().name());
+#endif
+
+        const char* deviceTypeStr = device.type() == Device::TYPE_CPU
+            ? ("CPU") : (device.type() == Device::TYPE_GPU ? device.hostUnifiedMemory() ? "iGPU" : "dGPU" : "unknown");
+        DUMP_MESSAGE_STDOUT("    Type = "<< deviceTypeStr);
+        DUMP_PROPERTY_XML("cv_ocl_current_deviceType", deviceTypeStr);
+
+        DUMP_MESSAGE_STDOUT("    Name = "<< device.name());
+        DUMP_PROPERTY_XML("cv_ocl_current_deviceName", device.name());
+
+        DUMP_MESSAGE_STDOUT("    Version = " << device.version());
+        DUMP_PROPERTY_XML("cv_ocl_current_deviceVersion", device.version());
+
+        DUMP_MESSAGE_STDOUT("    Compute units = "<< device.maxComputeUnits());
+        DUMP_PROPERTY_XML("cv_ocl_current_maxComputeUnits", device.maxComputeUnits());
+
+        DUMP_MESSAGE_STDOUT("    Max work group size = "<< device.maxWorkGroupSize());
+        DUMP_PROPERTY_XML("cv_ocl_current_maxWorkGroupSize", device.maxWorkGroupSize());
+
+        std::string localMemorySizeStr = bytesToStringRepr(device.localMemSize());
+        DUMP_MESSAGE_STDOUT("    Local memory size = " << localMemorySizeStr);
+        DUMP_PROPERTY_XML("cv_ocl_current_localMemSize", device.localMemSize());
+
+        std::string maxMemAllocSizeStr = bytesToStringRepr(device.maxMemAllocSize());
+        DUMP_MESSAGE_STDOUT("    Max memory allocation size = "<< maxMemAllocSizeStr);
+        DUMP_PROPERTY_XML("cv_ocl_current_maxMemAllocSize", device.maxMemAllocSize());
+
+        const char* doubleSupportStr = device.doubleFPConfig() > 0 ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Double support = "<< doubleSupportStr);
+        DUMP_PROPERTY_XML("cv_ocl_current_haveDoubleSupport", device.doubleFPConfig() > 0);
+
+        const char* isUnifiedMemoryStr = device.hostUnifiedMemory() ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Host unified memory = "<< isUnifiedMemoryStr);
+        DUMP_PROPERTY_XML("cv_ocl_current_hostUnifiedMemory", device.hostUnifiedMemory());
+
+        const char* haveAmdBlasStr = haveAmdBlas() ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Has AMD Blas = "<< haveAmdBlasStr);
+        DUMP_PROPERTY_XML("cv_ocl_current_AmdBlas", haveAmdBlas());
+
+        const char* haveAmdFftStr = haveAmdFft() ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Has AMD Fft = "<< haveAmdFftStr);
+        DUMP_PROPERTY_XML("cv_ocl_current_AmdFft", haveAmdFft());
+    }
+    catch (...)
+    {
+        DUMP_MESSAGE_STDOUT("Exception. Can't dump OpenCL info");
+        DUMP_MESSAGE_STDOUT("OpenCL device not available");
+        DUMP_PROPERTY_XML("cv_ocl", "not available");
+    }
+}
+#undef DUMP_MESSAGE_STDOUT
+#undef DUMP_PROPERTY_XML
+
+
 Mat TestUtils::readImage(const String &fileName, int flags)
 {
     return cv::imread(cvtest::TS::ptr()->get_data_path() + fileName, flags);
@@ -70,41 +204,39 @@ Mat TestUtils::readImageType(const String &fname, int type)
     return src;
 }
 
-double TestUtils::checkNorm(const Mat &m)
+double TestUtils::checkNorm(InputArray m)
 {
-    return norm(m, NORM_INF);
+    return norm(m.getMat(), NORM_INF);
 }
 
-double TestUtils::checkNorm(const Mat &m1, const Mat &m2)
+double TestUtils::checkNorm(InputArray m1, InputArray m2)
 {
-    return norm(m1, m2, NORM_INF);
+    return norm(m1.getMat(), m2.getMat(), NORM_INF);
 }
 
-double TestUtils::checkSimilarity(const Mat &m1, const Mat &m2)
+double TestUtils::checkSimilarity(InputArray m1, InputArray m2)
 {
     Mat diff;
-    matchTemplate(m1, m2, diff, CV_TM_CCORR_NORMED);
+    matchTemplate(m1.getMat(), m2.getMat(), diff, CV_TM_CCORR_NORMED);
     return std::abs(diff.at<float>(0, 0) - 1.f);
 }
 
-double TestUtils::checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vector<Rect>& ob2)
+double TestUtils::checkRectSimilarity(const Size & sz, std::vector<Rect>& ob1, std::vector<Rect>& ob2)
 {
     double final_test_result = 0.0;
     size_t sz1 = ob1.size();
     size_t sz2 = ob2.size();
 
-    if(sz1 != sz2)
-    {
+    if (sz1 != sz2)
         return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
-    }
     else
     {
-        if(sz1==0 && sz2==0)
+        if (sz1 == 0 && sz2 == 0)
             return 0;
         cv::Mat cpu_result(sz, CV_8UC1);
         cpu_result.setTo(0);
 
-        for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
+        for (vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
         {
             cv::Mat cpu_result_roi(cpu_result, *r);
             cpu_result_roi.setTo(1);
@@ -124,7 +256,7 @@ double TestUtils::checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vect
         cv::Mat result_;
         multiply(cpu_result, gpu_result, result_);
         int result = cv::countNonZero(result_ > 0);
-        if(cpu_area!=0 && result!=0)
+        if (cpu_area!=0 && result!=0)
             final_test_result = 1.0 - (double)result/(double)cpu_area;
         else if(cpu_area==0 && result!=0)
             final_test_result = -1;
@@ -132,8 +264,10 @@ double TestUtils::checkRectSimilarity(Size sz, std::vector<Rect>& ob1, std::vect
     return final_test_result;
 }
 
-void TestUtils::showDiff(const Mat& src, const Mat& gold, const Mat& actual, double eps, bool alwaysShow)
+void TestUtils::showDiff(InputArray _src, InputArray _gold, InputArray _actual, double eps, bool alwaysShow)
 {
+    Mat src = _src.getMat(), actual = _actual.getMat(), gold = _gold.getMat();
+
     Mat diff, diff_thresh;
     absdiff(gold, actual, diff);
     diff.convertTo(diff, CV_32F);
@@ -161,6 +295,4 @@ void TestUtils::showDiff(const Mat& src, const Mat& gold, const Mat& actual, dou
     }
 }
 
-}} // namespace cvtest::ocl
-
-#endif // HAVE_OPENCL
+} } // namespace cvtest::ocl
diff --git a/modules/ts/src/ts_arrtest.cpp b/modules/ts/src/ts_arrtest.cpp
index a3a058c51..8ba0b3786 100644
--- a/modules/ts/src/ts_arrtest.cpp
+++ b/modules/ts/src/ts_arrtest.cpp
@@ -123,7 +123,7 @@ void ArrayTest::get_test_array_types_and_sizes( int /*test_case_idx*/, vector<ve
 }
 
 
-static const int icvTsTypeToDepth[] =
+static const unsigned int icvTsTypeToDepth[] =
 {
     IPL_DEPTH_8U, IPL_DEPTH_8S, IPL_DEPTH_16U, IPL_DEPTH_16S,
     IPL_DEPTH_32S, IPL_DEPTH_32F, IPL_DEPTH_64F
diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp
index 0472815bf..89c91b98f 100644
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
@@ -116,7 +116,7 @@ Mat randomMat(RNG& rng, Size size, int type, double minVal, double maxVal, bool
 
     Mat m(size0, type);
 
-    rng.fill(m, RNG::UNIFORM, Scalar::all(minVal), Scalar::all(maxVal));
+    rng.fill(m, RNG::UNIFORM, minVal, maxVal);
     if( size0 == size )
         return m;
     return m(Rect((size0.width-size.width)/2, (size0.height-size.height)/2, size.width, size.height));
@@ -142,7 +142,7 @@ Mat randomMat(RNG& rng, const vector<int>& size, int type, double minVal, double
 
     Mat m(dims, &size0[0], type);
 
-    rng.fill(m, RNG::UNIFORM, Scalar::all(minVal), Scalar::all(maxVal));
+    rng.fill(m, RNG::UNIFORM, minVal, maxVal);
     if( eqsize )
         return m;
     return m(&r[0]);
@@ -2897,7 +2897,7 @@ static std::ostream& operator << (std::ostream& out, const MatPart& m)
 }
 
 MatComparator::MatComparator(double _maxdiff, int _context)
-    : maxdiff(_maxdiff), context(_context) {}
+    : maxdiff(_maxdiff), realmaxdiff(DBL_MAX), context(_context) {}
 
 ::testing::AssertionResult
 MatComparator::operator()(const char* expr1, const char* expr2,
diff --git a/modules/ts/src/ts_perf.cpp b/modules/ts/src/ts_perf.cpp
index 08f2ed5c7..8060e271f 100644
--- a/modules/ts/src/ts_perf.cpp
+++ b/modules/ts/src/ts_perf.cpp
@@ -115,6 +115,14 @@ Regression& Regression::add(TestBase* test, const std::string& name, cv::InputAr
     return instance()(name, array, eps, err);
 }
 
+Regression& Regression::addMoments(TestBase* test, const std::string& name, const cv::Moments& array, double eps, ERROR_TYPE err)
+{
+    int len = (int)sizeof(cv::Moments) / sizeof(double);
+    cv::Mat m(1, len, CV_64F, (void*)&array);
+
+    return Regression::add(test, name, m, eps, err);
+}
+
 Regression& Regression::addKeypoints(TestBase* test, const std::string& name, const std::vector<cv::KeyPoint>& array, double eps, ERROR_TYPE err)
 {
     int len = (int)array.size();
@@ -268,7 +276,8 @@ std::string Regression::getCurrentTestNodeName()
 
 bool Regression::isVector(cv::InputArray a)
 {
-    return a.kind() == cv::_InputArray::STD_VECTOR_MAT || a.kind() == cv::_InputArray::STD_VECTOR_VECTOR;
+    return a.kind() == cv::_InputArray::STD_VECTOR_MAT || a.kind() == cv::_InputArray::STD_VECTOR_VECTOR ||
+           a.kind() == cv::_InputArray::STD_VECTOR_UMAT;
 }
 
 double Regression::getElem(cv::Mat& m, int y, int x, int cn)
@@ -846,6 +855,9 @@ int64 TestBase::_calibrate()
 #endif
 TestBase::TestBase(): testStrategy(PERF_STRATEGY_DEFAULT), declare(this)
 {
+    lastTime = totalTime = timeLimit = 0;
+    nIters = currentIter = runsPerIteration = 0;
+    verified = false;
 }
 #ifdef _MSC_VER
 # pragma warning(pop)
@@ -866,17 +878,29 @@ void TestBase::declareArray(SizeVector& sizes, cv::InputOutputArray a, WarmUpTyp
 void TestBase::warmup(cv::InputOutputArray a, WarmUpType wtype)
 {
     if (a.empty())
-    {
         return;
-    }
     else if (a.isUMat())
     {
-        return; // TODO current warmup_impl is not useful for GPU-based data
+        if (wtype == WARMUP_RNG || wtype == WARMUP_WRITE)
+        {
+            int depth = a.depth();
+            if (depth == CV_8U)
+                cv::randu(a, 0, 256);
+            else if (depth == CV_8S)
+                cv::randu(a, -128, 128);
+            else if (depth == CV_16U)
+                cv::randu(a, 0, 1024);
+            else if (depth == CV_32F || depth == CV_64F)
+                cv::randu(a, -1.0, 1.0);
+            else if (depth == CV_16S || depth == CV_32S)
+                cv::randu(a, -4096, 4096);
+            else
+                CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported format");
+        }
+        return;
     }
     else if (a.kind() != cv::_InputArray::STD_VECTOR_MAT && a.kind() != cv::_InputArray::STD_VECTOR_VECTOR)
-    {
         warmup_impl(a.getMat(), wtype);
-    }
     else
     {
         size_t total = a.total();
@@ -1180,7 +1204,7 @@ void TestBase::validateMetrics()
         double mean = metrics.mean * 1000.0f / metrics.frequency;
         double stddev = metrics.stddev * 1000.0f / metrics.frequency;
         double percents = stddev / mean * 100.f;
-        printf("    samples = %d, mean = %.2f, stddev = %.2f (%.1f%%)\n", (int)metrics.samples, mean, stddev, percents);
+        printf("[ PERFSTAT ]    (samples = %d, mean = %.2f, stddev = %.2f (%.1f%%))\n", (int)metrics.samples, mean, stddev, percents);
     }
     else
     {
@@ -1592,6 +1616,11 @@ void PrintTo(const MatType& t, ::std::ostream* os)
 \*****************************************************************************************/
 namespace cv {
 
+void PrintTo(const String& str, ::std::ostream* os)
+{
+    *os << str;
+}
+
 void PrintTo(const Size& sz, ::std::ostream* os)
 {
     *os << /*"Size:" << */sz.width << "x" << sz.height;
diff --git a/modules/video/perf/opencl/perf_bgfg_mog2.cpp b/modules/video/perf/opencl/perf_bgfg_mog2.cpp
new file mode 100644
index 000000000..50814bf81
--- /dev/null
+++ b/modules/video/perf/opencl/perf_bgfg_mog2.cpp
@@ -0,0 +1,120 @@
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+#if defined(HAVE_XINE)     || \
+defined(HAVE_GSTREAMER)    || \
+defined(HAVE_QUICKTIME)    || \
+defined(HAVE_AVFOUNDATION) || \
+defined(HAVE_FFMPEG)       || \
+defined(WIN32)
+
+#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
+#else
+#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
+#endif
+
+#if BUILD_WITH_VIDEO_INPUT_SUPPORT
+
+namespace cvtest {
+namespace ocl {
+
+//////////////////////////// Mog2//////////////////////////
+
+typedef tuple<string, int> VideoMOG2ParamType;
+typedef TestBaseWithParam<VideoMOG2ParamType> MOG2_Apply;
+typedef TestBaseWithParam<VideoMOG2ParamType> MOG2_GetBackgroundImage;
+
+static void cvtFrameFmt(vector<Mat>& input, vector<Mat>& output)
+{
+    for(int i = 0; i< (int)(input.size()); i++)
+    {
+        cvtColor(input[i], output[i], COLOR_RGB2GRAY);
+    }
+}
+
+static void prepareData(VideoCapture& cap, int cn, vector<Mat>& frame_buffer)
+{
+    cv::Mat frame;
+    std::vector<Mat> frame_buffer_init;
+    int nFrame = (int)frame_buffer.size();
+    for(int i = 0; i < nFrame; i++)
+    {
+        cap >> frame;
+        ASSERT_FALSE(frame.empty());
+        frame_buffer_init.push_back(frame);
+    }
+
+    if(cn == 1)
+        cvtFrameFmt(frame_buffer_init, frame_buffer);
+    else
+        frame_buffer = frame_buffer_init;
+}
+
+OCL_PERF_TEST_P(MOG2_Apply, Mog2, Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"), Values(1,3)))
+{
+    VideoMOG2ParamType params = GetParam();
+
+    const string inputFile = getDataPath(get<0>(params));
+
+    const int cn = get<1>(params);
+    int nFrame = 5;
+
+    vector<Mat> frame_buffer(nFrame);
+
+    cv::VideoCapture cap(inputFile);
+    ASSERT_TRUE(cap.isOpened());
+    prepareData(cap, cn, frame_buffer);
+
+    UMat u_foreground;
+
+    OCL_TEST_CYCLE()
+    {
+        Ptr<cv::BackgroundSubtractorMOG2> mog2 = createBackgroundSubtractorMOG2();
+        mog2->setDetectShadows(false);
+        u_foreground.release();
+        for (int i = 0; i < nFrame; i++)
+        {
+            mog2->apply(frame_buffer[i], u_foreground);
+        }
+    }
+    SANITY_CHECK(u_foreground);
+}
+
+OCL_PERF_TEST_P(MOG2_GetBackgroundImage, Mog2, Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"), Values(3)))
+{
+    VideoMOG2ParamType params = GetParam();
+
+    const string inputFile = getDataPath(get<0>(params));
+
+    const int cn = get<1>(params);
+    int nFrame = 5;
+
+    vector<Mat> frame_buffer(nFrame);
+
+    cv::VideoCapture cap(inputFile);
+    ASSERT_TRUE(cap.isOpened());
+    prepareData(cap, cn, frame_buffer);
+
+    UMat u_foreground, u_background;
+
+    OCL_TEST_CYCLE()
+    {
+        Ptr<cv::BackgroundSubtractorMOG2> mog2 = createBackgroundSubtractorMOG2();
+        mog2->setDetectShadows(false);
+        u_foreground.release();
+        u_background.release();
+        for (int i = 0; i < nFrame; i++)
+        {
+            mog2->apply(frame_buffer[i], u_foreground);
+        }
+        mog2->getBackgroundImage(u_background);
+    }
+    SANITY_CHECK(u_background);
+}
+
+}}// namespace cvtest::ocl
+
+    #endif
+#endif
\ No newline at end of file
diff --git a/modules/video/perf/opencl/perf_motempl.cpp b/modules/video/perf/opencl/perf_motempl.cpp
new file mode 100644
index 000000000..795685721
--- /dev/null
+++ b/modules/video/perf/opencl/perf_motempl.cpp
@@ -0,0 +1,36 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////// UpdateMotionHistory ////////////////////////
+
+typedef TestBaseWithParam<Size> UpdateMotionHistoryFixture;
+
+OCL_PERF_TEST_P(UpdateMotionHistoryFixture, UpdateMotionHistory, OCL_TEST_SIZES)
+{
+    const Size size = GetParam();
+    checkDeviceMaxMemoryAllocSize(size, CV_32FC1);
+
+    UMat silhouette(size, CV_8UC1), mhi(size, CV_32FC1);
+    randu(silhouette, -5, 5);
+    declare.in(mhi, WARMUP_RNG);
+
+    OCL_TEST_CYCLE() cv::updateMotionHistory(silhouette, mhi, 1, 0.5);
+
+    SANITY_CHECK(mhi);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/video/perf/opencl/perf_optflow_farneback.cpp b/modules/video/perf/opencl/perf_optflow_farneback.cpp
new file mode 100644
index 000000000..a17ed4dd9
--- /dev/null
+++ b/modules/video/perf/opencl/perf_optflow_farneback.cpp
@@ -0,0 +1,114 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Fangfang Bai, fangfang@multicorewareinc.com
+//    Jin Ma,       jin@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+
+using std::tr1::make_tuple;
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+///////////// FarnebackOpticalFlow ////////////////////////
+CV_ENUM(farneFlagType, 0, OPTFLOW_FARNEBACK_GAUSSIAN)
+
+typedef tuple< tuple<int, double>, farneFlagType, bool > FarnebackOpticalFlowParams;
+typedef TestBaseWithParam<FarnebackOpticalFlowParams> FarnebackOpticalFlowFixture;
+
+OCL_PERF_TEST_P(FarnebackOpticalFlowFixture, FarnebackOpticalFlow,
+                ::testing::Combine(
+                    ::testing::Values(
+                                      make_tuple<int, double>(5, 1.1),
+                                      make_tuple<int, double>(7, 1.5)
+                                     ),
+                    farneFlagType::all(),
+                    ::testing::Bool()
+                    )
+                )
+{
+    Mat frame0 = imread(getDataPath("gpu/opticalflow/rubberwhale1.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty()) << "can't load rubberwhale1.png";
+
+    Mat frame1 = imread(getDataPath("gpu/opticalflow/rubberwhale2.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty()) << "can't load rubberwhale2.png";
+
+    const Size srcSize = frame0.size();
+
+    const int numLevels = 5;
+    const int winSize = 13;
+    const int numIters = 10;
+
+    const FarnebackOpticalFlowParams params = GetParam();
+    const tuple<int, double> polyParams = get<0>(params);
+    const int polyN = get<0>(polyParams);
+    const double polySigma = get<1>(polyParams);
+    const double pyrScale = 0.5;
+    int flags = get<1>(params);
+    const bool useInitFlow = get<2>(params);
+    const double eps = 0.1;
+
+    UMat uFrame0; frame0.copyTo(uFrame0);
+    UMat uFrame1; frame1.copyTo(uFrame1);
+    UMat uFlow(srcSize, CV_32FC2);
+    declare.in(uFrame0, uFrame1, WARMUP_READ).out(uFlow, WARMUP_READ);
+    if (useInitFlow)
+    {
+        cv::calcOpticalFlowFarneback(uFrame0, uFrame1, uFlow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags);
+        flags |= OPTFLOW_USE_INITIAL_FLOW;
+    }
+
+    OCL_TEST_CYCLE()
+            cv::calcOpticalFlowFarneback(uFrame0, uFrame1, uFlow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags);
+
+
+    SANITY_CHECK(uFlow, eps, ERROR_RELATIVE);
+}
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
\ No newline at end of file
diff --git a/modules/ocl/perf/perf_kalman.cpp b/modules/video/perf/opencl/perf_optflow_pyrlk.cpp
similarity index 56%
rename from modules/ocl/perf/perf_kalman.cpp
rename to modules/video/perf/opencl/perf_optflow_pyrlk.cpp
index 946444ad9..b2492e114 100644
--- a/modules/ocl/perf/perf_kalman.cpp
+++ b/modules/video/perf/opencl/perf_optflow_pyrlk.cpp
@@ -45,59 +45,58 @@
 //M*/
 
 #include "perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
 
-#ifdef HAVE_CLAMDBLAS
+using std::tr1::make_tuple;
 
-using namespace perf;
-using namespace std;
-using namespace cv::ocl;
-using namespace cv;
-using std::tr1::tuple;
-using std::tr1::get;
+#ifdef HAVE_OPENCL
 
-///////////// Kalman Filter ////////////////////////
+namespace cvtest {
+namespace ocl {
 
-typedef tuple<int> KalmanFilterType;
-typedef TestBaseWithParam<KalmanFilterType> KalmanFilterFixture;
+///////////// FarnebackOpticalFlow ////////////////////////
+CV_ENUM(farneFlagType, 0, OPTFLOW_FARNEBACK_GAUSSIAN)
 
-PERF_TEST_P(KalmanFilterFixture, KalmanFilter,
-    ::testing::Values(1000, 1500))
+typedef tuple< int > PyrLKOpticalFlowParams;
+typedef TestBaseWithParam<PyrLKOpticalFlowParams> PyrLKOpticalFlowFixture;
+
+OCL_PERF_TEST_P(PyrLKOpticalFlowFixture, PyrLKOpticalFlow,
+                ::testing::Values(1000, 2000, 4000)
+                )
 {
-    KalmanFilterType params = GetParam();
-    const int dim = get<0>(params);
+    Mat frame0 = imread(getDataPath("gpu/opticalflow/rubberwhale1.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty()) << "can't load rubberwhale1.png";
 
-    cv::Mat sample(dim, 1, CV_32FC1), dresult;
-    randu(sample, -1, 1);
+    Mat frame1 = imread(getDataPath("gpu/opticalflow/rubberwhale2.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty()) << "can't load rubberwhale2.png";
 
-    cv::Mat statePre_;
+    UMat uFrame0; frame0.copyTo(uFrame0);
+    UMat uFrame1; frame1.copyTo(uFrame1);
 
-    if (RUN_PLAIN_IMPL)
-    {
-        cv::KalmanFilter kalman;
-        TEST_CYCLE()
-        {
-            kalman.init(dim, dim);
-            kalman.correct(sample);
-            kalman.predict();
-        }
-        statePre_ = kalman.statePre;
-    }
-    else if(RUN_OCL_IMPL)
-    {
-        cv::ocl::oclMat dsample(sample);
-        cv::ocl::KalmanFilter kalman_ocl;
-        OCL_TEST_CYCLE()
-        {
-            kalman_ocl.init(dim, dim);
-            kalman_ocl.correct(dsample);
-            kalman_ocl.predict();
-        }
-        kalman_ocl.statePre.download(statePre_);
-    }
-    else
-        OCL_PERF_ELSE
+    const Size winSize = Size(21, 21);
+    const int maxLevel = 3;
+    const TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 0.01);
+    const int flags = 0;
+    const float minEigThreshold = 1e-4f;
+    const double eps = 1.0;
 
-    SANITY_CHECK(statePre_);
+    const PyrLKOpticalFlowParams params = GetParam();
+    const int pointsCount = get<0>(params);
+
+    vector<Point2f> pts, nextPts;
+    vector<unsigned char> status;
+    vector<float> err;
+    goodFeaturesToTrack(frame0, pts, pointsCount, 0.01, 0.0);
+    Mat ptsMat(1, static_cast<int>(pts.size()), CV_32FC2, (void *)&pts[0]);
+
+    declare.in(uFrame0, uFrame1, WARMUP_READ);
+    UMat uNextPts, uStatus, uErr;
+    OCL_TEST_CYCLE()
+        cv::calcOpticalFlowPyrLK(uFrame0, uFrame1, pts, uNextPts, uStatus, uErr, winSize, maxLevel, criteria, flags, minEigThreshold);
+
+    SANITY_CHECK(uNextPts, eps);
 }
 
-#endif // HAVE_CLAMDBLAS
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
\ No newline at end of file
diff --git a/modules/video/perf/perf_optflowpyrlk.cpp b/modules/video/perf/perf_optflowpyrlk.cpp
index 339cbd035..08ffd04d4 100644
--- a/modules/video/perf/perf_optflowpyrlk.cpp
+++ b/modules/video/perf/perf_optflowpyrlk.cpp
@@ -178,7 +178,7 @@ PERF_TEST_P(Path_Idx_Cn_NPoints_WSize_Deriv, OpticalFlowPyrLK_self, testing::Com
     SANITY_CHECK(err, 2);
 }
 
-CV_ENUM(PyrBorderMode, BORDER_DEFAULT, BORDER_TRANSPARENT);
+CV_ENUM(PyrBorderMode, BORDER_DEFAULT, BORDER_TRANSPARENT)
 typedef tr1::tuple<std::string, int, bool, PyrBorderMode, bool> Path_Win_Deriv_Border_Reuse_t;
 typedef TestBaseWithParam<Path_Win_Deriv_Border_Reuse_t> Path_Win_Deriv_Border_Reuse;
 
diff --git a/modules/video/src/bgfg_gaussmix2.cpp b/modules/video/src/bgfg_gaussmix2.cpp
index 485e34d26..1e6ee0d88 100644
--- a/modules/video/src/bgfg_gaussmix2.cpp
+++ b/modules/video/src/bgfg_gaussmix2.cpp
@@ -83,6 +83,7 @@
 ///////////*/
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
 
 namespace cv
 {
@@ -141,6 +142,8 @@ public:
         fCT = defaultfCT2;
         nShadowDetection =  defaultnShadowDetection2;
         fTau = defaultfTau;
+
+        opencl_ON = true;
     }
     //! the full constructor that takes the length of the history,
     // the number of gaussian mixtures, the background ratio parameter and the noise strength
@@ -165,6 +168,8 @@ public:
         nShadowDetection =  defaultnShadowDetection2;
         fTau = defaultfTau;
         name_ = "BackgroundSubtractor.MOG2";
+
+        opencl_ON = true;
     }
     //! the destructor
     ~BackgroundSubtractorMOG2Impl() {}
@@ -184,14 +189,44 @@ public:
         int nchannels = CV_MAT_CN(frameType);
         CV_Assert( nchannels <= CV_CN_MAX );
 
-        // for each gaussian mixture of each pixel bg model we store ...
-        // the mixture weight (w),
-        // the mean (nchannels values) and
-        // the covariance
-        bgmodel.create( 1, frameSize.height*frameSize.width*nmixtures*(2 + nchannels), CV_32F );
-        //make the array for keeping track of the used modes per pixel - all zeros at start
-        bgmodelUsedModes.create(frameSize,CV_8U);
-        bgmodelUsedModes = Scalar::all(0);
+        if (ocl::useOpenCL() && opencl_ON)
+        {
+            kernel_apply.create("mog2_kernel", ocl::video::bgfg_mog2_oclsrc, format("-D CN=%d -D NMIXTURES=%d", nchannels, nmixtures));
+            kernel_getBg.create("getBackgroundImage2_kernel", ocl::video::bgfg_mog2_oclsrc, format( "-D CN=%d -D NMIXTURES=%d", nchannels, nmixtures));
+
+            if (kernel_apply.empty() || kernel_getBg.empty())
+                opencl_ON = false;
+        }
+        else opencl_ON = false;
+
+        if (opencl_ON)
+        {
+            u_weight.create(frameSize.height * nmixtures, frameSize.width, CV_32FC1);
+            u_weight.setTo(Scalar::all(0));
+
+            u_variance.create(frameSize.height * nmixtures, frameSize.width, CV_32FC1);
+            u_variance.setTo(Scalar::all(0));
+
+            if (nchannels==3)
+                nchannels=4;
+            u_mean.create(frameSize.height * nmixtures, frameSize.width, CV_32FC(nchannels)); //4 channels
+            u_mean.setTo(Scalar::all(0));
+
+            //make the array for keeping track of the used modes per pixel - all zeros at start
+            u_bgmodelUsedModes.create(frameSize, CV_32FC1);
+            u_bgmodelUsedModes.setTo(cv::Scalar::all(0));
+        }
+        else
+        {
+            // for each gaussian mixture of each pixel bg model we store ...
+            // the mixture weight (w),
+            // the mean (nchannels values) and
+            // the covariance
+            bgmodel.create( 1, frameSize.height*frameSize.width*nmixtures*(2 + nchannels), CV_32F );
+            //make the array for keeping track of the used modes per pixel - all zeros at start
+            bgmodelUsedModes.create(frameSize,CV_8U);
+            bgmodelUsedModes = Scalar::all(0);
+        }
     }
 
     virtual AlgorithmInfo* info() const { return 0; }
@@ -271,6 +306,19 @@ protected:
     int frameType;
     Mat bgmodel;
     Mat bgmodelUsedModes;//keep track of number of modes per pixel
+
+    //for OCL
+
+    mutable bool opencl_ON;
+
+    UMat u_weight;
+    UMat u_variance;
+    UMat u_mean;
+    UMat u_bgmodelUsedModes;
+
+    mutable ocl::Kernel kernel_apply;
+    mutable ocl::Kernel kernel_getBg;
+
     int nframes;
     int history;
     int nmixtures;
@@ -321,6 +369,9 @@ protected:
     //See: Prati,Mikic,Trivedi,Cucchiarra,"Detecting Moving Shadows...",IEEE PAMI,2003.
 
     String name_;
+
+    bool ocl_getBackgroundImage(OutputArray backgroundImage) const;
+    bool ocl_apply(InputArray _image, OutputArray _fgmask, double learningRate=-1);
 };
 
 struct GaussBGStatModel2Params
@@ -685,14 +736,100 @@ public:
     uchar shadowVal;
 };
 
+#ifdef HAVE_OPENCL
+
+bool BackgroundSubtractorMOG2Impl::ocl_apply(InputArray _image, OutputArray _fgmask, double learningRate)
+{
+    ++nframes;
+    learningRate = learningRate >= 0 && nframes > 1 ? learningRate : 1./std::min( 2*nframes, history );
+    CV_Assert(learningRate >= 0);
+
+    UMat fgmask(_image.size(), CV_32SC1);
+
+    fgmask.setTo(cv::Scalar::all(1));
+
+    const double alpha1 = 1.0f - learningRate;
+
+    int detectShadows_flag = 0;
+    if(bShadowDetection)
+        detectShadows_flag = 1;
+
+    UMat frame = _image.getUMat();
+
+    float varMax = MAX(fVarMin, fVarMax);
+    float varMin = MIN(fVarMin, fVarMax);
+
+    int idxArg = 0;
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::ReadOnly(frame));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::ReadWriteNoSize(u_bgmodelUsedModes));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::ReadWriteNoSize(u_weight));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::ReadWriteNoSize(u_mean));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::ReadWriteNoSize(u_variance));
+    idxArg = kernel_apply.set(idxArg, ocl::KernelArg::WriteOnlyNoSize(fgmask));
+
+    idxArg = kernel_apply.set(idxArg, (float)learningRate);        //alphaT
+    idxArg = kernel_apply.set(idxArg, (float)alpha1);
+    idxArg = kernel_apply.set(idxArg, (float)(-learningRate*fCT));   //prune
+    idxArg = kernel_apply.set(idxArg, detectShadows_flag);
+
+    idxArg = kernel_apply.set(idxArg, (float)varThreshold); //c_Tb
+    idxArg = kernel_apply.set(idxArg, backgroundRatio);     //c_TB
+    idxArg = kernel_apply.set(idxArg, varThresholdGen);     //c_Tg
+    idxArg = kernel_apply.set(idxArg, varMin);
+    idxArg = kernel_apply.set(idxArg, varMax);
+    idxArg = kernel_apply.set(idxArg, fVarInit);
+    idxArg = kernel_apply.set(idxArg, fTau);
+    kernel_apply.set(idxArg, nShadowDetection);
+
+    size_t globalsize[] = {frame.cols, frame.rows, 1};
+
+    if (!(kernel_apply.run(2, globalsize, NULL, true)))
+        return false;
+
+    _fgmask.create(_image.size(),CV_8U);
+    UMat temp = _fgmask.getUMat();
+    fgmask.convertTo(temp, CV_8U);
+
+    return true;
+}
+
+bool BackgroundSubtractorMOG2Impl::ocl_getBackgroundImage(OutputArray _backgroundImage) const
+{
+    CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3);
+
+    _backgroundImage.create(frameSize, frameType);
+    UMat dst = _backgroundImage.getUMat();
+
+    int idxArg = 0;
+    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::ReadOnly(u_bgmodelUsedModes));
+    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::ReadOnlyNoSize(u_weight));
+    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::ReadOnlyNoSize(u_mean));
+    idxArg = kernel_getBg.set(idxArg, ocl::KernelArg::WriteOnlyNoSize(dst));
+    kernel_getBg.set(idxArg, backgroundRatio);
+
+    size_t globalsize[2] = {u_bgmodelUsedModes.cols, u_bgmodelUsedModes.rows};
+
+    return kernel_getBg.run(2, globalsize, NULL, false);
+}
+
+#endif
+
 void BackgroundSubtractorMOG2Impl::apply(InputArray _image, OutputArray _fgmask, double learningRate)
 {
-    Mat image = _image.getMat();
-    bool needToInitialize = nframes == 0 || learningRate >= 1 || image.size() != frameSize || image.type() != frameType;
+    bool needToInitialize = nframes == 0 || learningRate >= 1 || _image.size() != frameSize || _image.type() != frameType;
 
     if( needToInitialize )
-        initialize(image.size(), image.type());
+        initialize(_image.size(), _image.type());
 
+    if (opencl_ON)
+    {
+        CV_OCL_RUN(opencl_ON, ocl_apply(_image, _fgmask, learningRate))
+
+        opencl_ON = false;
+        initialize(_image.size(), _image.type());
+    }
+
+    Mat image = _image.getMat();
     _fgmask.create( image.size(), CV_8U );
     Mat fgmask = _fgmask.getMat();
 
@@ -714,6 +851,14 @@ void BackgroundSubtractorMOG2Impl::apply(InputArray _image, OutputArray _fgmask,
 
 void BackgroundSubtractorMOG2Impl::getBackgroundImage(OutputArray backgroundImage) const
 {
+    if (opencl_ON)
+    {
+        CV_OCL_RUN(opencl_ON, ocl_getBackgroundImage(backgroundImage))
+
+        opencl_ON = false;
+        return;
+    }
+
     int nchannels = CV_MAT_CN(frameType);
     CV_Assert( nchannels == 3 );
     Mat meanBackground(frameSize, CV_8UC3, Scalar::all(0));
@@ -765,7 +910,6 @@ void BackgroundSubtractorMOG2Impl::getBackgroundImage(OutputArray backgroundImag
     }
 }
 
-
 Ptr<BackgroundSubtractorMOG2> createBackgroundSubtractorMOG2(int _history, double _varThreshold,
                                                              bool _bShadowDetection)
 {
@@ -774,4 +918,4 @@ Ptr<BackgroundSubtractorMOG2> createBackgroundSubtractorMOG2(int _history, doubl
 
 }
 
-/* End of file. */
+/* End of file. */
\ No newline at end of file
diff --git a/modules/video/src/camshift.cpp b/modules/video/src/camshift.cpp
index 9ba02381d..5449a1b47 100644
--- a/modules/video/src/camshift.cpp
+++ b/modules/video/src/camshift.cpp
@@ -43,15 +43,25 @@
 
 int cv::meanShift( InputArray _probImage, Rect& window, TermCriteria criteria )
 {
-    Mat mat = _probImage.getMat();
+    Size size;
+    int cn;
+    Mat mat;
+    UMat umat;
+    bool isUMat = _probImage.isUMat();
+
+    if (isUMat)
+        umat = _probImage.getUMat(), cn = umat.channels(), size = umat.size();
+    else
+        mat = _probImage.getMat(), cn = mat.channels(), size = mat.size();
+
     Rect cur_rect = window;
 
-    CV_Assert( mat.channels() == 1 );
+    CV_Assert( cn == 1 );
 
     if( window.height <= 0 || window.width <= 0 )
         CV_Error( Error::StsBadArg, "Input window has non-positive sizes" );
 
-    window = window & Rect(0, 0, mat.cols, mat.rows);
+    window = window & Rect(0, 0, size.width, size.height);
 
     double eps = (criteria.type & TermCriteria::EPS) ? std::max(criteria.epsilon, 0.) : 1.;
     eps = cvRound(eps*eps);
@@ -59,16 +69,16 @@ int cv::meanShift( InputArray _probImage, Rect& window, TermCriteria criteria )
 
     for( i = 0; i < niters; i++ )
     {
-        cur_rect = cur_rect & Rect(0, 0, mat.cols, mat.rows);
+        cur_rect = cur_rect & Rect(0, 0, size.width, size.height);
         if( cur_rect == Rect() )
         {
-            cur_rect.x = mat.cols/2;
-            cur_rect.y = mat.rows/2;
+            cur_rect.x = size.width/2;
+            cur_rect.y = size.height/2;
         }
         cur_rect.width = std::max(cur_rect.width, 1);
         cur_rect.height = std::max(cur_rect.height, 1);
 
-        Moments m = moments(mat(cur_rect));
+        Moments m = isUMat ? moments(umat(cur_rect)) : moments(mat(cur_rect));
 
         // Calculating center of mass
         if( fabs(m.m00) < DBL_EPSILON )
@@ -77,8 +87,8 @@ int cv::meanShift( InputArray _probImage, Rect& window, TermCriteria criteria )
         int dx = cvRound( m.m10/m.m00 - window.width*0.5 );
         int dy = cvRound( m.m01/m.m00 - window.height*0.5 );
 
-        int nx = std::min(std::max(cur_rect.x + dx, 0), mat.cols - cur_rect.width);
-        int ny = std::min(std::max(cur_rect.y + dy, 0), mat.rows - cur_rect.height);
+        int nx = std::min(std::max(cur_rect.x + dx, 0), size.width - cur_rect.width);
+        int ny = std::min(std::max(cur_rect.y + dy, 0), size.height - cur_rect.height);
 
         dx = nx - cur_rect.x;
         dy = ny - cur_rect.y;
@@ -99,9 +109,17 @@ cv::RotatedRect cv::CamShift( InputArray _probImage, Rect& window,
                               TermCriteria criteria )
 {
     const int TOLERANCE = 10;
-    Mat mat = _probImage.getMat();
+    Size size;
+    Mat mat;
+    UMat umat;
+    bool isUMat = _probImage.isUMat();
 
-    meanShift( mat, window, criteria );
+    if (isUMat)
+        umat = _probImage.getUMat(), size = umat.size();
+    else
+        mat = _probImage.getMat(), size = mat.size();
+
+    meanShift( _probImage, window, criteria );
 
     window.x -= TOLERANCE;
     if( window.x < 0 )
@@ -112,15 +130,15 @@ cv::RotatedRect cv::CamShift( InputArray _probImage, Rect& window,
         window.y = 0;
 
     window.width += 2 * TOLERANCE;
-    if( window.x + window.width > mat.cols )
-        window.width = mat.cols - window.x;
+    if( window.x + window.width > size.width )
+        window.width = size.width - window.x;
 
     window.height += 2 * TOLERANCE;
-    if( window.y + window.height > mat.rows )
-        window.height = mat.rows - window.y;
+    if( window.y + window.height > size.height )
+        window.height = size.height - window.y;
 
     // Calculating moments in new center mass
-    Moments m = moments( mat(window) );
+    Moments m = isUMat ? moments(umat(window)) : moments(mat(window));
 
     double m00 = m.m00, m10 = m.m10, m01 = m.m01;
     double mu11 = m.mu11, mu20 = m.mu20, mu02 = m.mu02;
@@ -164,19 +182,19 @@ cv::RotatedRect cv::CamShift( InputArray _probImage, Rect& window,
     int t1 = cvRound( fabs( width * sn ));
 
     t0 = MAX( t0, t1 ) + 2;
-    window.width = MIN( t0, (mat.cols - _xc) * 2 );
+    window.width = MIN( t0, (size.width - _xc) * 2 );
 
     t0 = cvRound( fabs( length * sn ));
     t1 = cvRound( fabs( width * cs ));
 
     t0 = MAX( t0, t1 ) + 2;
-    window.height = MIN( t0, (mat.rows - _yc) * 2 );
+    window.height = MIN( t0, (size.height - _yc) * 2 );
 
     window.x = MAX( 0, _xc - window.width / 2 );
     window.y = MAX( 0, _yc - window.height / 2 );
 
-    window.width = MIN( mat.cols - window.x, window.width );
-    window.height = MIN( mat.rows - window.y, window.height );
+    window.width = MIN( size.width - window.x, window.width );
+    window.height = MIN( size.height - window.y, window.height );
 
     RotatedRect box;
     box.size.height = (float)length;
diff --git a/modules/video/src/kalman.cpp b/modules/video/src/kalman.cpp
index 03d88f50d..2c7b9e788 100644
--- a/modules/video/src/kalman.cpp
+++ b/modules/video/src/kalman.cpp
@@ -127,4 +127,4 @@ const Mat& KalmanFilter::correct(const Mat& measurement)
     return statePost;
 }
 
-};
+}
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 155737ba7..598e69c88 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -43,6 +43,7 @@
 #include <float.h>
 #include <stdio.h>
 #include "lkpyramid.hpp"
+#include "opencl_kernels.hpp"
 
 #define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
 
@@ -590,6 +591,265 @@ int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Si
     return maxLevel;
 }
 
+namespace cv
+{
+    class PyrLKOpticalFlow
+    {
+        struct dim3
+        {
+            unsigned int x, y, z;
+            dim3() : x(0), y(0), z(0) { }
+        };
+    public:
+        PyrLKOpticalFlow()
+        {
+            winSize = Size(21, 21);
+            maxLevel = 3;
+            iters = 30;
+            derivLambda = 0.5;
+            useInitialFlow = false;
+
+            waveSize = 0;
+        }
+
+        bool checkParam()
+        {
+            iters = std::min(std::max(iters, 0), 100);
+
+            derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);
+            if (derivLambda < 0)
+                return false;
+            if (maxLevel < 0 || winSize.width <= 2 || winSize.height <= 2)
+                return false;
+            calcPatchSize();
+            if (patch.x <= 0 || patch.x >= 6 || patch.y <= 0 || patch.y >= 6)
+                return false;
+            if (!initWaveSize())
+                return false;
+            return true;
+        }
+
+        bool sparse(const UMat &prevImg, const UMat &nextImg, const UMat &prevPts, UMat &nextPts, UMat &status, UMat &err)
+        {
+            if (!checkParam())
+                return false;
+
+            UMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
+            UMat temp2 = nextPts.reshape(1);
+            multiply(1.0f / (1 << maxLevel) /2.0f, temp1, temp2);
+
+            status.setTo(Scalar::all(1));
+
+            // build the image pyramids.
+            std::vector<UMat> prevPyr; prevPyr.resize(maxLevel + 1);
+            std::vector<UMat> nextPyr; nextPyr.resize(maxLevel + 1);
+
+            prevImg.convertTo(prevPyr[0], CV_32F);
+            nextImg.convertTo(nextPyr[0], CV_32F);
+
+            for (int level = 1; level <= maxLevel; ++level)
+            {
+                pyrDown(prevPyr[level - 1], prevPyr[level]);
+                pyrDown(nextPyr[level - 1], nextPyr[level]);
+            }
+
+            // dI/dx ~ Ix, dI/dy ~ Iy
+            for (int level = maxLevel; level >= 0; level--)
+            {
+                if (!lkSparse_run(prevPyr[level], nextPyr[level], prevPts,
+                                  nextPts, status, err,
+                                  prevPts.cols, level))
+                    return false;
+            }
+            return true;
+        }
+
+        Size winSize;
+        int maxLevel;
+        int iters;
+        double derivLambda;
+        bool useInitialFlow;
+
+    private:
+        int waveSize;
+        bool initWaveSize()
+        {
+            waveSize = 1;
+            if (isDeviceCPU())
+                return true;
+
+            ocl::Kernel kernel;
+            if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, ""))
+                return false;
+            waveSize = (int)kernel.preferedWorkGroupSizeMultiple();
+            return true;
+        }
+        dim3 patch;
+        void calcPatchSize()
+        {
+            dim3 block;
+
+            if (winSize.width > 32 && winSize.width > 2 * winSize.height)
+            {
+                block.x = 32;
+                block.y = 8;
+            }
+            else
+            {
+                block.x = 16;
+                block.y = 16;
+            }
+
+            patch.x = (winSize.width  + block.x - 1) / block.x;
+            patch.y = (winSize.height + block.y - 1) / block.y;
+
+            block.z = patch.z = 1;
+        }
+
+        #define SAFE_KERNEL_SET_ARG(idx, arg) \
+        {\
+            int idxNew = kernel.set(idx, arg);\
+            if (-1 == idxNew)\
+            {\
+                printf("lkSparse_run can't setup argument index = %d to kernel\n", idx);\
+                return false;\
+            }\
+            idx = idxNew;\
+        }
+        bool lkSparse_run(UMat &I, UMat &J, const UMat &prevPts, UMat &nextPts, UMat &status, UMat& err,
+            int ptcount, int level)
+        {
+            size_t localThreads[3]  = { 8, 8};
+            size_t globalThreads[3] = { 8 * ptcount, 8};
+            char calcErr = (0 == level) ? 1 : 0;
+
+            cv::String build_options;
+            if (isDeviceCPU())
+                build_options = " -D CPU";
+            else
+                build_options = cv::format("-D WAVE_SIZE=%d", waveSize);
+
+            ocl::Kernel kernel;
+            if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, build_options))
+                return false;
+
+            ocl::Image2D imageI(I);
+            ocl::Image2D imageJ(J);
+            int idxArg = 0;
+#if 0
+            idxArg = kernel.set(idxArg, imageI); //image2d_t I
+            idxArg = kernel.set(idxArg, imageJ); //image2d_t J
+            idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(prevPts)); // __global const float2* prevPts
+            idxArg = kernel.set(idxArg, (int)prevPts.step); // int prevPtsStep
+            idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(nextPts)); // __global const float2* nextPts
+            idxArg = kernel.set(idxArg, (int)nextPts.step); //  int nextPtsStep
+            idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(status)); // __global uchar* status
+            idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(err)); // __global float* err
+            idxArg = kernel.set(idxArg, (int)level); // const int level
+            idxArg = kernel.set(idxArg, (int)I.rows); // const int rows
+            idxArg = kernel.set(idxArg, (int)I.cols); // const int cols
+            idxArg = kernel.set(idxArg, (int)patch.x); // int PATCH_X
+            idxArg = kernel.set(idxArg, (int)patch.y); // int PATCH_Y
+            idxArg = kernel.set(idxArg, (int)winSize.width); // int c_winSize_x
+            idxArg = kernel.set(idxArg, (int)winSize.height); // int c_winSize_y
+            idxArg = kernel.set(idxArg, (int)iters); // int c_iters
+            idxArg = kernel.set(idxArg, (char)calcErr); //char calcErr
+#else
+            SAFE_KERNEL_SET_ARG(idxArg, imageI); //image2d_t I
+            SAFE_KERNEL_SET_ARG(idxArg, imageJ); //image2d_t J
+            SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadOnly(prevPts)); // __global const float2* prevPts
+            SAFE_KERNEL_SET_ARG(idxArg, (int)prevPts.step); // int prevPtsStep
+            SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(nextPts)); // __global const float2* nextPts
+            SAFE_KERNEL_SET_ARG(idxArg, (int)nextPts.step); //  int nextPtsStep
+            SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(status)); // __global uchar* status
+            SAFE_KERNEL_SET_ARG(idxArg, ocl::KernelArg::PtrReadWrite(err)); // __global float* err
+            SAFE_KERNEL_SET_ARG(idxArg, (int)level); // const int level
+            SAFE_KERNEL_SET_ARG(idxArg, (int)I.rows); // const int rows
+            SAFE_KERNEL_SET_ARG(idxArg, (int)I.cols); // const int cols
+            SAFE_KERNEL_SET_ARG(idxArg, (int)patch.x); // int PATCH_X
+            SAFE_KERNEL_SET_ARG(idxArg, (int)patch.y); // int PATCH_Y
+            SAFE_KERNEL_SET_ARG(idxArg, (int)winSize.width); // int c_winSize_x
+            SAFE_KERNEL_SET_ARG(idxArg, (int)winSize.height); // int c_winSize_y
+            SAFE_KERNEL_SET_ARG(idxArg, (int)iters); // int c_iters
+            SAFE_KERNEL_SET_ARG(idxArg, (char)calcErr); //char calcErr
+#endif
+
+            return kernel.run(2, globalThreads, localThreads, true);
+        }
+    private:
+        inline static bool isDeviceCPU()
+        {
+            return (cv::ocl::Device::TYPE_CPU == cv::ocl::Device::getDefault().type());
+        }
+    };
+
+
+    static bool ocl_calcOpticalFlowPyrLK(InputArray _prevImg, InputArray _nextImg,
+                                  InputArray _prevPts, InputOutputArray _nextPts,
+                                  OutputArray _status, OutputArray _err,
+                                  Size winSize, int maxLevel,
+                                  TermCriteria criteria,
+                                  int flags/*, double minEigThreshold*/ )
+    {
+        if (0 != (OPTFLOW_LK_GET_MIN_EIGENVALS & flags))
+            return false;
+        if (!cv::ocl::Device::getDefault().imageSupport())
+            return false;
+        if (_nextImg.size() != _prevImg.size())
+            return false;
+        int typePrev = _prevImg.type();
+        int typeNext = _nextImg.type();
+        if ((1 != CV_MAT_CN(typePrev)) || (1 != CV_MAT_CN(typeNext)))
+            return false;
+        if ((0 != CV_MAT_DEPTH(typePrev)) || (0 != CV_MAT_DEPTH(typeNext)))
+            return false;
+
+        if (_prevPts.empty() || _prevPts.type() != CV_32FC2 || (!_prevPts.isContinuous()))
+            return false;
+        if ((1 != _prevPts.size().height) && (1 != _prevPts.size().width))
+            return false;
+        size_t npoints = _prevPts.total();
+        bool useInitialFlow  = (0 != (flags & OPTFLOW_USE_INITIAL_FLOW));
+        if (useInitialFlow)
+        {
+            if (_nextPts.empty() || _nextPts.type() != CV_32FC2 || (!_prevPts.isContinuous()))
+                return false;
+            if ((1 != _nextPts.size().height) && (1 != _nextPts.size().width))
+                return false;
+            if (_nextPts.total() != npoints)
+                return false;
+        }
+        else
+        {
+            _nextPts.create(_prevPts.size(), _prevPts.type());
+        }
+
+        PyrLKOpticalFlow opticalFlow;
+        opticalFlow.winSize     = winSize;
+        opticalFlow.maxLevel    = maxLevel;
+        opticalFlow.iters       = criteria.maxCount;
+        opticalFlow.derivLambda = criteria.epsilon;
+        opticalFlow.useInitialFlow  = useInitialFlow;
+
+        if (!opticalFlow.checkParam())
+            return false;
+
+        UMat umatErr;
+        if (_err.needed())
+        {
+            _err.create((int)npoints, 1, CV_32FC1);
+            umatErr = _err.getUMat();
+        }
+        else
+            umatErr.create((int)npoints, 1, CV_32FC1);
+
+        _status.create((int)npoints, 1, CV_8UC1);
+        UMat umatNextPts = _nextPts.getUMat();
+        UMat umatStatus = _status.getUMat();
+        return opticalFlow.sparse(_prevImg.getUMat(), _nextImg.getUMat(), _prevPts.getUMat(), umatNextPts, umatStatus, umatErr);
+    }
+};
+
 void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
                            InputArray _prevPts, InputOutputArray _nextPts,
                            OutputArray _status, OutputArray _err,
@@ -597,6 +857,10 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
                            TermCriteria criteria,
                            int flags, double minEigThreshold )
 {
+    bool use_opencl = ocl::useOpenCL() && (_prevImg.isUMat() || _nextImg.isUMat());
+    if ( use_opencl && ocl_calcOpticalFlowPyrLK(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err, winSize, maxLevel, criteria, flags/*, minEigThreshold*/))
+        return;
+
     Mat prevPtsMat = _prevPts.getMat();
     const int derivDepth = DataType<cv::detail::deriv_type>::depth;
 
diff --git a/modules/video/src/motempl.cpp b/modules/video/src/motempl.cpp
index aa6d12d8d..3fc87e657 100644
--- a/modules/video/src/motempl.cpp
+++ b/modules/video/src/motempl.cpp
@@ -40,34 +40,62 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace  cv {
+
+static bool ocl_updateMotionHistory( InputArray _silhouette, InputOutputArray _mhi,
+                                     float timestamp, float delbound )
+{
+    ocl::Kernel k("updateMotionHistory", ocl::video::updatemotionhistory_oclsrc);
+    if (k.empty())
+        return false;
+
+    UMat silh = _silhouette.getUMat(), mhi = _mhi.getUMat();
+
+    k.args(ocl::KernelArg::ReadOnlyNoSize(silh), ocl::KernelArg::ReadWrite(mhi),
+           timestamp, delbound);
+
+    size_t globalsize[2] = { silh.cols, silh.rows };
+    return k.run(2, globalsize, NULL, false);
+}
+
+}
+
+#endif
 
 void cv::updateMotionHistory( InputArray _silhouette, InputOutputArray _mhi,
                               double timestamp, double duration )
 {
+    CV_Assert( _silhouette.type() == CV_8UC1 && _mhi.type() == CV_32FC1 );
+    CV_Assert( _silhouette.sameSize(_mhi) );
+
+    float ts = (float)timestamp;
+    float delbound = (float)(timestamp - duration);
+
+    CV_OCL_RUN(_mhi.isUMat() && _mhi.dims() <= 2,
+               ocl_updateMotionHistory(_silhouette, _mhi, ts, delbound))
+
     Mat silh = _silhouette.getMat(), mhi = _mhi.getMat();
-
-    CV_Assert( silh.type() == CV_8U && mhi.type() == CV_32F );
-    CV_Assert( silh.size() == mhi.size() );
-
     Size size = silh.size();
+
     if( silh.isContinuous() && mhi.isContinuous() )
     {
         size.width *= size.height;
         size.height = 1;
     }
 
-    float ts = (float)timestamp;
-    float delbound = (float)(timestamp - duration);
-    int x, y;
 #if CV_SSE2
     volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2);
 #endif
 
-    for( y = 0; y < size.height; y++ )
+    for(int y = 0; y < size.height; y++ )
     {
         const uchar* silhData = silh.ptr<uchar>(y);
         float* mhiData = mhi.ptr<float>(y);
-        x = 0;
+        int x = 0;
 
 #if CV_SSE2
         if( useSIMD )
diff --git a/modules/video/src/opencl/bgfg_mog2.cl b/modules/video/src/opencl/bgfg_mog2.cl
new file mode 100644
index 000000000..f895b5be7
--- /dev/null
+++ b/modules/video/src/opencl/bgfg_mog2.cl
@@ -0,0 +1,272 @@
+#if CN==1
+
+#define T_MEAN float
+#define F_ZERO (0.0f)
+#define cnMode 1
+
+#define frameToMean(a, b) (b) = *(a);
+#define meanToFrame(a, b) *b = convert_uchar_sat(a);
+
+inline float sqr(float val)
+{
+    return val * val;
+}
+
+inline float sum(float val)
+{
+    return val;
+}
+
+#else
+
+#define T_MEAN float4
+#define F_ZERO (0.0f, 0.0f, 0.0f, 0.0f)
+#define cnMode 4
+
+#define meanToFrame(a, b)\
+    b[0] = convert_uchar_sat(a.x); \
+    b[1] = convert_uchar_sat(a.y); \
+    b[2] = convert_uchar_sat(a.z);
+
+#define frameToMean(a, b)\
+    b.x = a[0]; \
+    b.y = a[1]; \
+    b.z = a[2]; \
+    b.w = 0.0f;
+
+inline float sqr(const float4 val)
+{
+    return val.x * val.x + val.y * val.y + val.z * val.z;
+}
+
+inline float sum(const float4 val)
+{
+    return (val.x + val.y + val.z);
+}
+
+inline void swap4(__global float4* ptr, int x, int y, int k, int rows, int ptr_step)
+{
+    float4 val = ptr[(k * rows + y) * ptr_step + x];
+    ptr[(k * rows + y) * ptr_step + x] = ptr[((k + 1) * rows + y) * ptr_step + x];
+    ptr[((k + 1) * rows + y) * ptr_step + x] = val;
+}
+
+#endif
+
+inline void swap(__global float* ptr, int x, int y, int k, int rows, int ptr_step)
+{
+    float val = ptr[(k * rows + y) * ptr_step + x];
+    ptr[(k * rows + y) * ptr_step + x] = ptr[((k + 1) * rows + y) * ptr_step + x];
+    ptr[((k + 1) * rows + y) * ptr_step + x] = val;
+}
+
+__kernel void mog2_kernel(__global const uchar* frame, int frame_step, int frame_offset, int frame_row, int frame_col, //uchar || uchar3
+                          __global uchar* modesUsed, int modesUsed_step, int modesUsed_offset,                         //int
+                          __global uchar* weight, int weight_step, int weight_offset,                                  //float
+                          __global uchar* mean, int mean_step, int mean_offset,                                        //T_MEAN=float || float4
+                          __global uchar* variance, int var_step, int var_offset,                                      //float
+                          __global uchar* fgmask, int fgmask_step, int fgmask_offset,                                  //int
+                          float alphaT, float alpha1, float prune,
+                          int detectShadows_flag,
+                          float c_Tb, float c_TB, float c_Tg, float c_varMin,                     //constants
+                          float c_varMax, float c_varInit, float c_tau, uchar c_shadowVal)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    weight_step/= sizeof(float);
+    var_step   /= sizeof(float);
+    mean_step  /= (sizeof(float)*cnMode);
+
+    if( x < frame_col && y < frame_row)
+    {
+        __global const uchar* _frame = (frame + mad24( y, frame_step, x*CN + frame_offset));
+        T_MEAN pix;
+        frameToMean(_frame, pix);
+
+        bool background = false; // true - the pixel classified as background
+
+        bool fitsPDF = false; //if it remains zero a new GMM mode will be added
+
+        __global int* _modesUsed = (__global int*)(modesUsed + mad24( y, modesUsed_step, x*(int)(sizeof(int))));
+        int nmodes = _modesUsed[0];
+        int nNewModes = nmodes; //current number of modes in GMM
+
+        float totalWeight = 0.0f;
+
+        __global float* _weight = (__global float*)(weight);
+        __global float* _variance = (__global float*)(variance);
+        __global T_MEAN* _mean = (__global T_MEAN*)(mean);
+
+        for (int mode = 0; mode < nmodes; ++mode)
+        {
+
+            float c_weight = alpha1 * _weight[(mode * frame_row + y) * weight_step + x] + prune;
+
+            if (!fitsPDF)
+            {
+                float c_var = _variance[(mode * frame_row + y) * var_step + x];
+
+                T_MEAN c_mean = _mean[(mode * frame_row + y) * mean_step + x];
+
+                T_MEAN diff = c_mean - pix;
+                float dist2 = sqr(diff);
+
+                if (totalWeight < c_TB && dist2 < c_Tb * c_var)
+                    background = true;
+
+                if (dist2 < c_Tg * c_var)
+                {
+                    fitsPDF = true;
+                    c_weight += alphaT;
+                    float k = alphaT / c_weight;
+
+                    _mean[(mode * frame_row + y) * mean_step + x] = c_mean - k * diff;
+
+                    float varnew = c_var + k * (dist2 - c_var);
+                    varnew = fmax(varnew, c_varMin);
+                    varnew = fmin(varnew, c_varMax);
+
+                    _variance[(mode * frame_row + y) * var_step + x] = varnew;
+                    for (int i = mode; i > 0; --i)
+                    {
+                        if (c_weight < _weight[((i - 1) * frame_row + y) * weight_step + x])
+                            break;
+                        swap(_weight, x, y, i - 1, frame_row, weight_step);
+                        swap(_variance, x, y, i - 1, frame_row, var_step);
+                        #if (CN==1)
+                        swap(_mean, x, y, i - 1, frame_row, mean_step);
+                        #else
+                        swap4(_mean, x, y, i - 1, frame_row, mean_step);
+                        #endif
+                    }
+                }
+            } // !fitsPDF
+
+            if (c_weight < -prune)
+            {
+                c_weight = 0.0f;
+                nmodes--;
+            }
+
+            _weight[(mode * frame_row + y) * weight_step + x] = c_weight; //update weight by the calculated value
+            totalWeight += c_weight;
+        }
+
+        totalWeight = 1.f / totalWeight;
+        for (int mode = 0; mode < nmodes; ++mode)
+            _weight[(mode * frame_row + y) * weight_step + x] *= totalWeight;
+
+        nmodes = nNewModes;
+
+        if (!fitsPDF)
+        {
+            int mode = nmodes == (NMIXTURES) ? (NMIXTURES) - 1 : nmodes++;
+
+            if (nmodes == 1)
+                _weight[(mode * frame_row + y) * weight_step + x] = 1.f;
+            else
+            {
+                _weight[(mode * frame_row + y) * weight_step + x] = alphaT;
+
+                for (int i = 0; i < nmodes - 1; ++i)
+                    _weight[(i * frame_row + y) * weight_step + x] *= alpha1;
+            }
+
+            _mean[(mode * frame_row + y) * mean_step + x] = pix;
+            _variance[(mode * frame_row + y) * var_step + x] = c_varInit;
+
+            for (int i = nmodes - 1; i > 0; --i)
+            {
+                if (alphaT < _weight[((i - 1) * frame_row + y) * weight_step + x])
+                    break;
+
+                swap(_weight, x, y, i - 1, frame_row, weight_step);
+                swap(_variance, x, y, i - 1, frame_row, var_step);
+                #if (CN==1)
+                swap(_mean, x, y, i - 1, frame_row, mean_step);
+                #else
+                swap4(_mean, x, y, i - 1, frame_row, mean_step);
+                #endif
+            }
+        }
+
+        _modesUsed[0] = nmodes;
+        bool isShadow = false;
+        if (detectShadows_flag && !background)
+        {
+            float tWeight = 0.0f;
+
+            for (int mode = 0; mode < nmodes; ++mode)
+            {
+                T_MEAN c_mean = _mean[(mode * frame_row + y) * mean_step + x];
+
+                T_MEAN pix_mean = pix * c_mean;
+
+                float numerator = sum(pix_mean);
+                float denominator = sqr(c_mean);
+
+                if (denominator == 0)
+                    break;
+
+                if (numerator <= denominator && numerator >= c_tau * denominator)
+                {
+                    float a = numerator / denominator;
+
+                    T_MEAN dD = a * c_mean - pix;
+
+                    if (sqr(dD) < c_Tb * _variance[(mode * frame_row + y) * var_step + x] * a * a)
+                    {
+                        isShadow = true;
+                        break;
+                    }
+                }
+
+                tWeight += _weight[(mode * frame_row + y) * weight_step + x];
+                if (tWeight > c_TB)
+                    break;
+            }
+        }
+        __global int* _fgmask = (__global int*)(fgmask + mad24(y, fgmask_step, x*(int)(sizeof(int)) + fgmask_offset));
+        *_fgmask = background ? 0 : isShadow ? c_shadowVal : 255;
+    }
+}
+
+__kernel void getBackgroundImage2_kernel(__global const uchar* modesUsed, int modesUsed_step, int modesUsed_offset, int modesUsed_row, int modesUsed_col,
+                                         __global const uchar* weight, int weight_step, int weight_offset,
+                                         __global const uchar* mean, int mean_step, int mean_offset,
+                                         __global uchar* dst, int dst_step, int dst_offset,
+                                         float c_TB)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if(x < modesUsed_col && y < modesUsed_row)
+    {
+        __global int* _modesUsed = (__global int*)(modesUsed + mad24( y, modesUsed_step, x*(int)(sizeof(int))));
+        int nmodes = _modesUsed[0];
+
+        T_MEAN meanVal = (T_MEAN)F_ZERO;
+
+        float totalWeight = 0.0f;
+
+        for (int mode = 0; mode < nmodes; ++mode)
+        {
+            __global const float* _weight = (__global const float*)(weight + mad24(mode * modesUsed_row + y, weight_step, x*(int)(sizeof(float))));
+            float c_weight = _weight[0];
+
+            __global const T_MEAN* _mean = (__global const T_MEAN*)(mean + mad24(mode * modesUsed_row + y, mean_step, x*(int)(sizeof(float))*cnMode));
+            T_MEAN c_mean = _mean[0];
+            meanVal = meanVal + c_weight * c_mean;
+
+            totalWeight += c_weight;
+
+            if(totalWeight > c_TB)
+                break;
+        }
+
+        meanVal = meanVal * (1.f / totalWeight);
+        __global uchar* _dst = dst + y * dst_step + x*CN + dst_offset;
+        meanToFrame(meanVal, _dst);
+    }
+}
\ No newline at end of file
diff --git a/modules/ocl/src/opencl/optical_flow_farneback.cl b/modules/video/src/opencl/optical_flow_farneback.cl
similarity index 55%
rename from modules/ocl/src/opencl/optical_flow_farneback.cl
rename to modules/video/src/opencl/optical_flow_farneback.cl
index 4725662c6..778583943 100644
--- a/modules/ocl/src/opencl/optical_flow_farneback.cl
+++ b/modules/video/src/opencl/optical_flow_farneback.cl
@@ -56,28 +56,37 @@
 #define polyN 5
 #endif
 
-__kernel void polynomialExpansion(__global float * dst,
-                                  __global __const float * src,
+#if USE_DOUBLE
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+#define TYPE double
+#define VECTYPE double4
+#else
+#define TYPE float
+#define VECTYPE float4
+#endif
+
+__kernel void polynomialExpansion(__global __const float * src, int srcStep,
+                                  __global float * dst, int dstStep,
+                                  const int rows, const  int cols,
                                   __global __const float * c_g,
                                   __global __const float * c_xg,
                                   __global __const float * c_xxg,
                                   __local float * smem,
-                                  const float4 ig,
-                                  const int height, const int width,
-                                  int dstStep, int srcStep)
+                                  const VECTYPE ig)
 {
     const int y = get_global_id(1);
     const int x = bx * (bdx - 2*polyN) + tx - polyN;
 
-    dstStep /= sizeof(*dst);
-    srcStep /= sizeof(*src);
-
     int xWarped;
     __local float *row = smem + tx;
 
-    if (y < height && y >= 0)
+    if (y < rows && y >= 0)
     {
-        xWarped = min(max(x, 0), width - 1);
+        xWarped = min(max(x, 0), cols - 1);
 
         row[0] = src[mad24(y, srcStep, xWarped)] * c_g[0];
         row[bdx] = 0.f;
@@ -87,7 +96,7 @@ __kernel void polynomialExpansion(__global float * dst,
         for (int k = 1; k <= polyN; ++k)
         {
             float t0 = src[mad24(max(y - k, 0), srcStep, xWarped)];
-            float t1 = src[mad24(min(y + k, height - 1), srcStep, xWarped)];
+            float t1 = src[mad24(min(y + k, rows - 1), srcStep, xWarped)];
 
             row[0] += c_g[k] * (t0 + t1);
             row[bdx] += c_xg[k] * (t1 - t0);
@@ -97,12 +106,12 @@ __kernel void polynomialExpansion(__global float * dst,
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (y < height && y >= 0 && tx >= polyN && tx + polyN < bdx && x < width)
+    if (y < rows && y >= 0 && tx >= polyN && tx + polyN < bdx && x < cols)
     {
-        float b1 = c_g[0] * row[0];
-        float b3 = c_g[0] * row[bdx];
-        float b5 = c_g[0] * row[2*bdx];
-        float b2 = 0, b4 = 0, b6 = 0;
+        TYPE b1 = c_g[0] * row[0];
+        TYPE b3 = c_g[0] * row[bdx];
+        TYPE b5 = c_g[0] * row[2*bdx];
+        TYPE b2 = 0, b4 = 0, b6 = 0;
 
 #pragma unroll
         for (int k = 1; k <= polyN; ++k)
@@ -115,11 +124,11 @@ __kernel void polynomialExpansion(__global float * dst,
             b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
         }
 
-        dst[mad24(y, dstStep, xWarped)] = b3*ig.s0;
-        dst[mad24(height + y, dstStep, xWarped)] = b2*ig.s0;
-        dst[mad24(2*height + y, dstStep, xWarped)] = b1*ig.s1 + b5*ig.s2;
-        dst[mad24(3*height + y, dstStep, xWarped)] = b1*ig.s1 + b4*ig.s2;
-        dst[mad24(4*height + y, dstStep, xWarped)] = b6*ig.s3;
+        dst[mad24(y, dstStep, xWarped)] = (float)(b3*ig.s0);
+        dst[mad24(rows + y, dstStep, xWarped)] = (float)(b2*ig.s0);
+        dst[mad24(2*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b5*ig.s2);
+        dst[mad24(3*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b4*ig.s2);
+        dst[mad24(4*rows + y, dstStep, xWarped)] = (float)(b6*ig.s3);
     }
 }
 
@@ -133,11 +142,6 @@ inline int idx_row_high(const int y, const int last_row)
     return abs(last_row - abs(last_row - y)) % (last_row + 1);
 }
 
-inline int idx_row(const int y, const int last_row)
-{
-    return idx_row_low(idx_row_high(y, last_row), last_row);
-}
-
 inline int idx_col_low(const int x, const int last_col)
 {
     return abs(x) % (last_col + 1);
@@ -153,39 +157,33 @@ inline int idx_col(const int x, const int last_col)
     return idx_col_low(idx_col_high(x, last_col), last_col);
 }
 
-__kernel void gaussianBlur(__global float * dst,
-                           __global const float * src,
-                           __global const float * c_gKer,
-                           __local float * smem,
-                           const int height,  const int width,
-                           int dstStep, int srcStep,
-                           const int ksizeHalf)
+__kernel void gaussianBlur(__global const float * src, int srcStep,
+                           __global float * dst, int dstStep, const int rows, const  int cols,
+                           __global const float * c_gKer, const int ksizeHalf,
+                           __local float * smem)
 {
     const int y = get_global_id(1);
     const int x = get_global_id(0);
 
-    dstStep /= sizeof(*dst);
-    srcStep /= sizeof(*src);
-
     __local float *row = smem + ty * (bdx + 2*ksizeHalf);
 
-    if (y < height)
+    if (y < rows)
     {
         // Vertical pass
         for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
         {
             int xExt = (int)(bx * bdx) + i - ksizeHalf;
-            xExt = idx_col(xExt, width - 1);
+            xExt = idx_col(xExt, cols - 1);
             row[i] = src[mad24(y, srcStep, xExt)] * c_gKer[0];
             for (int j = 1; j <= ksizeHalf; ++j)
-                row[i] += (src[mad24(idx_row_low(y - j, height - 1), srcStep, xExt)]
-                           + src[mad24(idx_row_high(y + j, height - 1), srcStep, xExt)]) * c_gKer[j];
+                row[i] += (src[mad24(idx_row_low(y - j, rows - 1), srcStep, xExt)]
+                           + src[mad24(idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];
         }
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (y < height && y >= 0 && x < width && x >= 0)
+    if (y < rows && y >= 0 && x < cols && x >= 0)
     {
         // Horizontal pass
         row += tx + ksizeHalf;
@@ -197,237 +195,42 @@ __kernel void gaussianBlur(__global float * dst,
     }
 }
 
-__constant float c_border[BORDER_SIZE + 1] = { 0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f };
-
-__kernel void updateMatrices(__global float * M,
-                             __global const float * flowx, __global const float * flowy,
-                             __global const float * R0, __global const float * R1,
-                             const int height, const int width,
-                             int mStep, int xStep,  int yStep, int R0Step, int R1Step)
+__kernel void gaussianBlur5(__global const float * src, int srcStep,
+                            __global float * dst, int dstStep,
+                            const int rows, const  int cols,
+                            __global const float * c_gKer, const int ksizeHalf,
+                            __local float * smem)
 {
     const int y = get_global_id(1);
     const int x = get_global_id(0);
 
-    mStep /= sizeof(*M);
-    xStep /= sizeof(*flowx);
-    yStep /= sizeof(*flowy);
-    R0Step /= sizeof(*R0);
-    R1Step /= sizeof(*R1);
-
-    if (y < height && y >= 0 && x < width && x >= 0)
-    {
-        float dx = flowx[mad24(y, xStep, x)];
-        float dy = flowy[mad24(y, yStep, x)];
-        float fx = x + dx;
-        float fy = y + dy;
-
-        int x1 = convert_int(floor(fx));
-        int y1 = convert_int(floor(fy));
-        fx -= x1;
-        fy -= y1;
-
-        float r2, r3, r4, r5, r6;
-
-        if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
-        {
-            float a00 = (1.f - fx) * (1.f - fy);
-            float a01 = fx * (1.f - fy);
-            float a10 = (1.f - fx) * fy;
-            float a11 = fx * fy;
-
-            r2 = a00 * R1[mad24(y1, R1Step, x1)] +
-                 a01 * R1[mad24(y1, R1Step, x1 + 1)] +
-                 a10 * R1[mad24(y1 + 1, R1Step, x1)] +
-                 a11 * R1[mad24(y1 + 1, R1Step, x1 + 1)];
-
-            r3 = a00 * R1[mad24(height + y1, R1Step, x1)] +
-                 a01 * R1[mad24(height + y1, R1Step, x1 + 1)] +
-                 a10 * R1[mad24(height + y1 + 1, R1Step, x1)] +
-                 a11 * R1[mad24(height + y1 + 1, R1Step, x1 + 1)];
-
-            r4 = a00 * R1[mad24(2*height + y1, R1Step, x1)] +
-                 a01 * R1[mad24(2*height + y1, R1Step, x1 + 1)] +
-                 a10 * R1[mad24(2*height + y1 + 1, R1Step, x1)] +
-                 a11 * R1[mad24(2*height + y1 + 1, R1Step, x1 + 1)];
-
-            r5 = a00 * R1[mad24(3*height + y1, R1Step, x1)] +
-                 a01 * R1[mad24(3*height + y1, R1Step, x1 + 1)] +
-                 a10 * R1[mad24(3*height + y1 + 1, R1Step, x1)] +
-                 a11 * R1[mad24(3*height + y1 + 1, R1Step, x1 + 1)];
-
-            r6 = a00 * R1[mad24(4*height + y1, R1Step, x1)] +
-                 a01 * R1[mad24(4*height + y1, R1Step, x1 + 1)] +
-                 a10 * R1[mad24(4*height + y1 + 1, R1Step, x1)] +
-                 a11 * R1[mad24(4*height + y1 + 1, R1Step, x1 + 1)];
-
-            r4 = (R0[mad24(2*height + y, R0Step, x)] + r4) * 0.5f;
-            r5 = (R0[mad24(3*height + y, R0Step, x)] + r5) * 0.5f;
-            r6 = (R0[mad24(4*height + y, R0Step, x)] + r6) * 0.25f;
-        }
-        else
-        {
-            r2 = r3 = 0.f;
-            r4 = R0[mad24(2*height + y, R0Step, x)];
-            r5 = R0[mad24(3*height + y, R0Step, x)];
-            r6 = R0[mad24(4*height + y, R0Step, x)] * 0.5f;
-        }
-
-        r2 = (R0[mad24(y, R0Step, x)] - r2) * 0.5f;
-        r3 = (R0[mad24(height + y, R0Step, x)] - r3) * 0.5f;
-
-        r2 += r4*dy + r6*dx;
-        r3 += r6*dy + r5*dx;
-
-        float scale =
-            c_border[min(x, BORDER_SIZE)] *
-            c_border[min(y, BORDER_SIZE)] *
-            c_border[min(width - x - 1, BORDER_SIZE)] *
-            c_border[min(height - y - 1, BORDER_SIZE)];
-
-        r2 *= scale;
-        r3 *= scale;
-        r4 *= scale;
-        r5 *= scale;
-        r6 *= scale;
-
-        M[mad24(y, mStep, x)] = r4*r4 + r6*r6;
-        M[mad24(height + y, mStep, x)] = (r4 + r5)*r6;
-        M[mad24(2*height + y, mStep, x)] = r5*r5 + r6*r6;
-        M[mad24(3*height + y, mStep, x)] = r4*r2 + r6*r3;
-        M[mad24(4*height + y, mStep, x)] = r6*r2 + r5*r3;
-    }
-}
-
-__kernel void boxFilter5(__global float * dst,
-                         __global const float * src,
-                         __local float * smem,
-                         const int height,  const int width,
-                         int dstStep, int srcStep,
-                         const int ksizeHalf)
-{
-    const int y = get_global_id(1);
-    const int x = get_global_id(0);
-
-    const float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
-    const int smw = bdx + 2*ksizeHalf; // shared memory "width"
-    __local float *row = smem + 5 * ty * smw;
-
-    dstStep /= sizeof(*dst);
-    srcStep /= sizeof(*src);
-
-    if (y < height)
-    {
-        // Vertical pass
-        for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
-        {
-            int xExt = (int)(bx * bdx) + i - ksizeHalf;
-            xExt = min(max(xExt, 0), width - 1);
-
-#pragma unroll
-            for (int k = 0; k < 5; ++k)
-                row[k*smw + i] = src[mad24(k*height + y, srcStep, xExt)];
-
-            for (int j = 1; j <= ksizeHalf; ++j)
-#pragma unroll
-                for (int k = 0; k < 5; ++k)
-                    row[k*smw + i] +=
-                        src[mad24(k*height + max(y - j, 0), srcStep, xExt)] +
-                        src[mad24(k*height + min(y + j, height - 1), srcStep, xExt)];
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (y < height && y >= 0 && x < width && x >= 0)
-    {
-        // Horizontal pass
-
-        row += tx + ksizeHalf;
-        float res[5];
-
-#pragma unroll
-        for (int k = 0; k < 5; ++k)
-            res[k] = row[k*smw];
-
-        for (int i = 1; i <= ksizeHalf; ++i)
-#pragma unroll
-            for (int k = 0; k < 5; ++k)
-                res[k] += row[k*smw - i] + row[k*smw + i];
-
-#pragma unroll
-        for (int k = 0; k < 5; ++k)
-            dst[mad24(k*height + y, dstStep, x)] = res[k] * boxAreaInv;
-    }
-}
-
-__kernel void updateFlow(__global float4 * flowx, __global float4 * flowy,
-                         __global const float4 * M,
-                         const int height, const int width,
-                         int xStep, int yStep, int mStep)
-{
-    const int y = get_global_id(1);
-    const int x = get_global_id(0);
-
-    xStep /= sizeof(*flowx);
-    yStep /= sizeof(*flowy);
-    mStep /= sizeof(*M);
-
-    if (y < height && y >= 0 && x < width && x >= 0)
-    {
-        float4 g11 = M[mad24(y, mStep, x)];
-        float4 g12 = M[mad24(height + y, mStep, x)];
-        float4 g22 = M[mad24(2*height + y, mStep, x)];
-        float4 h1 =  M[mad24(3*height + y, mStep, x)];
-        float4 h2 =  M[mad24(4*height + y, mStep, x)];
-
-        float4 detInv = (float4)(1.f) / (g11*g22 - g12*g12 + (float4)(1e-3f));
-
-        flowx[mad24(y, xStep, x)] = (g11*h2 - g12*h1) * detInv;
-        flowy[mad24(y, yStep, x)] = (g22*h1 - g12*h2) * detInv;
-    }
-}
-
-__kernel void gaussianBlur5(__global float * dst,
-                            __global const float * src,
-                            __global const float * c_gKer,
-                            __local float * smem,
-                            const int height,  const int width,
-                            int dstStep, int srcStep,
-                            const int ksizeHalf)
-{
-    const int y = get_global_id(1);
-    const int x = get_global_id(0);
-
-    const int smw = bdx + 2*ksizeHalf; // shared memory "width"
+    const int smw = bdx + 2*ksizeHalf; // shared memory "cols"
     __local volatile float *row = smem + 5 * ty * smw;
 
-    dstStep /= sizeof(*dst);
-    srcStep /= sizeof(*src);
-
-    if (y < height)
+    if (y < rows)
     {
         // Vertical pass
         for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
         {
             int xExt = (int)(bx * bdx) + i - ksizeHalf;
-            xExt = idx_col(xExt, width - 1);
+            xExt = idx_col(xExt, cols - 1);
 
 #pragma unroll
             for (int k = 0; k < 5; ++k)
-                row[k*smw + i] = src[mad24(k*height + y, srcStep, xExt)] * c_gKer[0];
+                row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)] * c_gKer[0];
 
             for (int j = 1; j <= ksizeHalf; ++j)
 #pragma unroll
                 for (int k = 0; k < 5; ++k)
                     row[k*smw + i] +=
-                        (src[mad24(k*height + idx_row_low(y - j, height - 1), srcStep, xExt)] +
-                         src[mad24(k*height + idx_row_high(y + j, height - 1), srcStep, xExt)]) * c_gKer[j];
+                        (src[mad24(k*rows + idx_row_low(y - j, rows - 1), srcStep, xExt)] +
+                         src[mad24(k*rows + idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];
         }
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (y < height && y >= 0 && x < width && x >= 0)
+    if (y < rows && y >= 0 && x < cols && x >= 0)
     {
         // Horizontal pass
 
@@ -445,6 +248,182 @@ __kernel void gaussianBlur5(__global float * dst,
 
 #pragma unroll
         for (int k = 0; k < 5; ++k)
-            dst[mad24(k*height + y, dstStep, x)] = res[k];
+            dst[mad24(k*rows + y, dstStep, x)] = res[k];
+    }
+}
+__constant float c_border[BORDER_SIZE + 1] = { 0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f };
+
+__kernel void updateMatrices(__global const float * flowx, int xStep,
+                             __global const float * flowy, int yStep,
+                             const int rows, const int cols,
+                             __global const float * R0, int R0Step,
+                             __global const float * R1, int R1Step,
+                             __global float * M, int mStep)
+{
+    const int y = get_global_id(1);
+    const int x = get_global_id(0);
+
+    if (y < rows && y >= 0 && x < cols && x >= 0)
+    {
+        float dx = flowx[mad24(y, xStep, x)];
+        float dy = flowy[mad24(y, yStep, x)];
+        float fx = x + dx;
+        float fy = y + dy;
+
+        int x1 = convert_int(floor(fx));
+        int y1 = convert_int(floor(fy));
+        fx -= x1;
+        fy -= y1;
+
+        float r2, r3, r4, r5, r6;
+
+        if (x1 >= 0 && y1 >= 0 && x1 < cols - 1 && y1 < rows - 1)
+        {
+            float a00 = (1.f - fx) * (1.f - fy);
+            float a01 = fx * (1.f - fy);
+            float a10 = (1.f - fx) * fy;
+            float a11 = fx * fy;
+
+            r2 = a00 * R1[mad24(y1, R1Step, x1)] +
+                 a01 * R1[mad24(y1, R1Step, x1 + 1)] +
+                 a10 * R1[mad24(y1 + 1, R1Step, x1)] +
+                 a11 * R1[mad24(y1 + 1, R1Step, x1 + 1)];
+
+            r3 = a00 * R1[mad24(rows + y1, R1Step, x1)] +
+                 a01 * R1[mad24(rows + y1, R1Step, x1 + 1)] +
+                 a10 * R1[mad24(rows + y1 + 1, R1Step, x1)] +
+                 a11 * R1[mad24(rows + y1 + 1, R1Step, x1 + 1)];
+
+            r4 = a00 * R1[mad24(2*rows + y1, R1Step, x1)] +
+                 a01 * R1[mad24(2*rows + y1, R1Step, x1 + 1)] +
+                 a10 * R1[mad24(2*rows + y1 + 1, R1Step, x1)] +
+                 a11 * R1[mad24(2*rows + y1 + 1, R1Step, x1 + 1)];
+
+            r5 = a00 * R1[mad24(3*rows + y1, R1Step, x1)] +
+                 a01 * R1[mad24(3*rows + y1, R1Step, x1 + 1)] +
+                 a10 * R1[mad24(3*rows + y1 + 1, R1Step, x1)] +
+                 a11 * R1[mad24(3*rows + y1 + 1, R1Step, x1 + 1)];
+
+            r6 = a00 * R1[mad24(4*rows + y1, R1Step, x1)] +
+                 a01 * R1[mad24(4*rows + y1, R1Step, x1 + 1)] +
+                 a10 * R1[mad24(4*rows + y1 + 1, R1Step, x1)] +
+                 a11 * R1[mad24(4*rows + y1 + 1, R1Step, x1 + 1)];
+
+            r4 = (R0[mad24(2*rows + y, R0Step, x)] + r4) * 0.5f;
+            r5 = (R0[mad24(3*rows + y, R0Step, x)] + r5) * 0.5f;
+            r6 = (R0[mad24(4*rows + y, R0Step, x)] + r6) * 0.25f;
+        }
+        else
+        {
+            r2 = r3 = 0.f;
+            r4 = R0[mad24(2*rows + y, R0Step, x)];
+            r5 = R0[mad24(3*rows + y, R0Step, x)];
+            r6 = R0[mad24(4*rows + y, R0Step, x)] * 0.5f;
+        }
+
+        r2 = (R0[mad24(y, R0Step, x)] - r2) * 0.5f;
+        r3 = (R0[mad24(rows + y, R0Step, x)] - r3) * 0.5f;
+
+        r2 += r4*dy + r6*dx;
+        r3 += r6*dy + r5*dx;
+
+        float scale =
+            c_border[min(x, BORDER_SIZE)] *
+            c_border[min(y, BORDER_SIZE)] *
+            c_border[min(cols - x - 1, BORDER_SIZE)] *
+            c_border[min(rows - y - 1, BORDER_SIZE)];
+
+        r2 *= scale;
+        r3 *= scale;
+        r4 *= scale;
+        r5 *= scale;
+        r6 *= scale;
+
+        M[mad24(y, mStep, x)] = r4*r4 + r6*r6;
+        M[mad24(rows + y, mStep, x)] = (r4 + r5)*r6;
+        M[mad24(2*rows + y, mStep, x)] = r5*r5 + r6*r6;
+        M[mad24(3*rows + y, mStep, x)] = r4*r2 + r6*r3;
+        M[mad24(4*rows + y, mStep, x)] = r6*r2 + r5*r3;
+    }
+}
+
+__kernel void boxFilter5(__global const float * src, int srcStep,
+                         __global float * dst, int dstStep,
+                         const int rows, const  int cols,
+                         const int ksizeHalf,
+                         __local float * smem)
+{
+    const int y = get_global_id(1);
+    const int x = get_global_id(0);
+
+    const float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
+    const int smw = bdx + 2*ksizeHalf; // shared memory "width"
+    __local float *row = smem + 5 * ty * smw;
+
+    if (y < rows)
+    {
+        // Vertical pass
+        for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
+        {
+            int xExt = (int)(bx * bdx) + i - ksizeHalf;
+            xExt = min(max(xExt, 0), cols - 1);
+
+#pragma unroll
+            for (int k = 0; k < 5; ++k)
+                row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)];
+
+            for (int j = 1; j <= ksizeHalf; ++j)
+#pragma unroll
+                for (int k = 0; k < 5; ++k)
+                    row[k*smw + i] +=
+                        src[mad24(k*rows + max(y - j, 0), srcStep, xExt)] +
+                        src[mad24(k*rows + min(y + j, rows - 1), srcStep, xExt)];
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (y < rows && y >= 0 && x < cols && x >= 0)
+    {
+        // Horizontal pass
+
+        row += tx + ksizeHalf;
+        float res[5];
+
+#pragma unroll
+        for (int k = 0; k < 5; ++k)
+            res[k] = row[k*smw];
+
+        for (int i = 1; i <= ksizeHalf; ++i)
+#pragma unroll
+            for (int k = 0; k < 5; ++k)
+                res[k] += row[k*smw - i] + row[k*smw + i];
+
+#pragma unroll
+        for (int k = 0; k < 5; ++k)
+            dst[mad24(k*rows + y, dstStep, x)] = res[k] * boxAreaInv;
+    }
+}
+
+__kernel void updateFlow(__global const float * M, int mStep,
+                         __global float * flowx, int xStep,
+                         __global float * flowy, int yStep,
+                         const int rows, const int cols)
+{
+    const int y = get_global_id(1);
+    const int x = get_global_id(0);
+
+    if (y < rows && y >= 0 && x < cols && x >= 0)
+    {
+        float g11 = M[mad24(y, mStep, x)];
+        float g12 = M[mad24(rows + y, mStep, x)];
+        float g22 = M[mad24(2*rows + y, mStep, x)];
+        float h1 =  M[mad24(3*rows + y, mStep, x)];
+        float h2 =  M[mad24(4*rows + y, mStep, x)];
+
+        float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);
+
+        flowx[mad24(y, xStep, x)] = (g11*h2 - g12*h1) * detInv;
+        flowy[mad24(y, yStep, x)] = (g22*h1 - g12*h2) * detInv;
     }
 }
diff --git a/modules/ocl/src/opencl/pyrlk.cl b/modules/video/src/opencl/pyrlk.cl
similarity index 54%
rename from modules/ocl/src/opencl/pyrlk.cl
rename to modules/video/src/opencl/pyrlk.cl
index 303d26892..c01855490 100644
--- a/modules/ocl/src/opencl/pyrlk.cl
+++ b/modules/video/src/opencl/pyrlk.cl
@@ -52,7 +52,7 @@
 #endif
 #ifdef CPU
 
-static void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local float* smem2,  __local float* smem3, int tid)
+inline void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local float* smem2,  __local float* smem3, int tid)
 {
     smem1[tid] = val1;
     smem2[tid] = val2;
@@ -71,7 +71,7 @@ static void reduce3(float val1, float val2, float val3,  __local float* smem1,
     }
 }
 
-static void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
+inline void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
 {
     smem1[tid] = val1;
     smem2[tid] = val2;
@@ -88,7 +88,7 @@ static void reduce2(float val1, float val2, volatile __local float* smem1, volat
     }
 }
 
-static void reduce1(float val1, volatile __local float* smem1, int tid)
+inline void reduce1(float val1, volatile __local float* smem1, int tid)
 {
     smem1[tid] = val1;
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -103,7 +103,7 @@ static void reduce1(float val1, volatile __local float* smem1, int tid)
     }
 }
 #else
-static void reduce3(float val1, float val2, float val3,
+inline void reduce3(float val1, float val2, float val3,
              __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
 {
     smem1[tid] = val1;
@@ -150,7 +150,7 @@ static void reduce3(float val1, float val2, float val3,
     barrier(CLK_LOCAL_MEM_FENCE);
 }
 
-static void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
+inline void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
 {
     smem1[tid] = val1;
     smem2[tid] = val2;
@@ -189,7 +189,7 @@ static void reduce2(float val1, float val2, __local volatile float* smem1, __loc
     barrier(CLK_LOCAL_MEM_FENCE);
 }
 
-static void reduce1(float val1, __local volatile float* smem1, int tid)
+inline void reduce1(float val1, __local volatile float* smem1, int tid)
 {
     smem1[tid] = val1;
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -225,7 +225,7 @@ static void reduce1(float val1, __local volatile float* smem1, int tid)
 // Image read mode
 __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
 
-static void SetPatch(image2d_t I, float x, float y,
+inline void SetPatch(image2d_t I, float x, float y,
               float* Pch, float* Dx, float* Dy,
               float* A11, float* A12, float* A22)
 {
@@ -262,7 +262,7 @@ inline void GetError(image2d_t J, const float x, const float y, const float* Pch
     *errval += fabs(diff);
 }
 
-static void SetPatch4(image2d_t I, const float x, const float y,
+inline void SetPatch4(image2d_t I, const float x, const float y,
                float4* Pch, float4* Dx, float4* Dy,
                float* A11, float* A12, float* A22)
 {
@@ -285,7 +285,7 @@ static void SetPatch4(image2d_t I, const float x, const float y,
     *A22 += sqIdx.x + sqIdx.y + sqIdx.z;
 }
 
-static void GetPatch4(image2d_t J, const float x, const float y,
+inline void GetPatch4(image2d_t J, const float x, const float y,
                const float4* Pch, const float4* Dx, const float4* Dy,
                float* b1, float* b2)
 {
@@ -297,16 +297,16 @@ static void GetPatch4(image2d_t J, const float x, const float y,
     *b2 += xdiff.x + xdiff.y + xdiff.z;
 }
 
-static void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval)
+inline void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval)
 {
     float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch;
     *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
 }
 
 #define	GRIDSIZE	3
-__kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
-                             __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
-                             const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
+__kernel void lkSparse(image2d_t I, image2d_t J,
+                       __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
+                       const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
 {
     __local float smem1[BUFFER];
     __local float smem2[BUFFER];
@@ -580,440 +580,4 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
         if (calcErr)
             err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);
     }
-}
-
-
-__kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
-                             __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
-                             const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
-{
-    __local float smem1[BUFFER];
-    __local float smem2[BUFFER];
-    __local float smem3[BUFFER];
-
-    unsigned int xid=get_local_id(0);
-    unsigned int yid=get_local_id(1);
-    unsigned int gid=get_group_id(0);
-    unsigned int xsize=get_local_size(0);
-    unsigned int ysize=get_local_size(1);
-    int xBase, yBase, k;
-
-    float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
-
-    const int tid = mad24(yid, xsize, xid);
-
-    float2 nextPt = prevPts[gid]/(float2)(1<<level);
-
-    if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
-    {
-        if (tid == 0 && level == 0)
-        {
-            status[gid] = 0;
-        }
-
-        return;
-    }
-
-    nextPt -= c_halfWin;
-
-    // extract the patch from the first image, compute covariation matrix of derivatives
-
-    float A11 = 0.0f;
-    float A12 = 0.0f;
-    float A22 = 0.0f;
-
-    float4 I_patch[8];
-    float4 dIdx_patch[8];
-    float4 dIdy_patch[8];
-    float4 I_add,Dx_add,Dy_add;
-
-    yBase=yid;
-    {
-        xBase=xid;
-        SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                  &I_patch[0], &dIdx_patch[0], &dIdy_patch[0],
-                  &A11, &A12, &A22);
-
-
-        xBase+=xsize;
-        SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                  &I_patch[1], &dIdx_patch[1], &dIdy_patch[1],
-                  &A11, &A12, &A22);
-
-        xBase+=xsize;
-        if(xBase<c_winSize_x)
-            SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[2], &dIdx_patch[2], &dIdy_patch[2],
-                      &A11, &A12, &A22);
-
-    }
-    yBase+=ysize;
-    {
-        xBase=xid;
-        SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                  &I_patch[3], &dIdx_patch[3], &dIdy_patch[3],
-                  &A11, &A12, &A22);
-
-
-        xBase+=xsize;
-        SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                  &I_patch[4], &dIdx_patch[4], &dIdy_patch[4],
-                  &A11, &A12, &A22);
-
-        xBase+=xsize;
-        if(xBase<c_winSize_x)
-            SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[5], &dIdx_patch[5], &dIdy_patch[5],
-                      &A11, &A12, &A22);
-    }
-    yBase+=ysize;
-    if(yBase<c_winSize_y)
-    {
-        xBase=xid;
-        SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                  &I_patch[6], &dIdx_patch[6], &dIdy_patch[6],
-                  &A11, &A12, &A22);
-
-
-        xBase+=xsize;
-        SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                  &I_patch[7], &dIdx_patch[7], &dIdy_patch[7],
-                  &A11, &A12, &A22);
-
-        xBase+=xsize;
-        if(xBase<c_winSize_x)
-            SetPatch4(I, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_add, &Dx_add, &Dy_add,
-                      &A11, &A12, &A22);
-    }
-
-    reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
-
-    A11 = smem1[0];
-    A12 = smem2[0];
-    A22 = smem3[0];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    float D = A11 * A22 - A12 * A12;
-
-    if (D < 1.192092896e-07f)
-    {
-        if (tid == 0 && level == 0)
-            status[gid] = 0;
-
-        return;
-    }
-
-    A11 /= D;
-    A12 /= D;
-    A22 /= D;
-
-    nextPt = nextPts[gid] * 2.0f - c_halfWin;
-
-    for (k = 0; k < c_iters; ++k)
-    {
-        if (nextPt.x < -c_halfWin.x || nextPt.x >= cols || nextPt.y < -c_halfWin.y || nextPt.y >= rows)
-        {
-            if (tid == 0 && level == 0)
-                status[gid] = 0;
-            return;
-        }
-
-        float b1 = 0;
-        float b2 = 0;
-
-        yBase=yid;
-        {
-            xBase=xid;
-            GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[0], &dIdx_patch[0], &dIdy_patch[0],
-                      &b1, &b2);
-
-
-            xBase+=xsize;
-            GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[1], &dIdx_patch[1], &dIdy_patch[1],
-                      &b1, &b2);
-
-            xBase+=xsize;
-            if(xBase<c_winSize_x)
-                GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                          &I_patch[2], &dIdx_patch[2], &dIdy_patch[2],
-                          &b1, &b2);
-        }
-        yBase+=ysize;
-        {
-            xBase=xid;
-            GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[3], &dIdx_patch[3], &dIdy_patch[3],
-                      &b1, &b2);
-
-
-            xBase+=xsize;
-            GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[4], &dIdx_patch[4], &dIdy_patch[4],
-                      &b1, &b2);
-
-            xBase+=xsize;
-            if(xBase<c_winSize_x)
-                GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                          &I_patch[5], &dIdx_patch[5], &dIdy_patch[5],
-                          &b1, &b2);
-        }
-        yBase+=ysize;
-        if(yBase<c_winSize_y)
-        {
-            xBase=xid;
-            GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[6], &dIdx_patch[6], &dIdy_patch[6],
-                      &b1, &b2);
-
-
-            xBase+=xsize;
-            GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[7], &dIdx_patch[7], &dIdy_patch[7],
-                      &b1, &b2);
-
-            xBase+=xsize;
-            if(xBase<c_winSize_x)
-                GetPatch4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                          &I_add, &Dx_add, &Dy_add,
-                          &b1, &b2);
-        }
-
-        reduce2(b1, b2, smem1, smem2, tid);
-
-        b1 = smem1[0];
-        b2 = smem2[0];
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        float2 delta;
-        delta.x = A12 * b2 - A22 * b1;
-        delta.y = A12 * b1 - A11 * b2;
-
-        nextPt +=delta;
-
-        if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)
-            break;
-    }
-
-    D = 0.0f;
-    if (calcErr)
-    {
-        yBase=yid;
-        {
-            xBase=xid;
-            GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[0], &D);
-
-
-            xBase+=xsize;
-            GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[1], &D);
-
-            xBase+=xsize;
-            if(xBase<c_winSize_x)
-                GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                          &I_patch[2], &D);
-        }
-        yBase+=ysize;
-        {
-            xBase=xid;
-            GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[3], &D);
-
-
-            xBase+=xsize;
-            GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[4], &D);
-
-            xBase+=xsize;
-            if(xBase<c_winSize_x)
-                GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                          &I_patch[5], &D);
-        }
-        yBase+=ysize;
-        if(yBase<c_winSize_y)
-        {
-            xBase=xid;
-            GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[6], &D);
-
-
-            xBase+=xsize;
-            GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                      &I_patch[7], &D);
-
-            xBase+=xsize;
-            if(xBase<c_winSize_x)
-                GetError4(J, nextPt.x + xBase + 0.5f, nextPt.y + yBase + 0.5f,
-                          &I_add, &D);
-        }
-
-        reduce1(D, smem1, tid);
-    }
-
-    if (tid == 0)
-    {
-        nextPt += c_halfWin;
-        nextPts[gid] = nextPt;
-
-        if (calcErr)
-            err[gid] = smem1[0] / (float)(3 * c_winSize_x * c_winSize_y);
-    }
-}
-
-__kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uStep, __global float* v, int vStep, __global const float* prevU, int prevUStep, __global const float* prevV, int prevVStep,
-                            const int rows, const int cols, /*__global float* err, int errStep, int cn,*/ int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
-{
-    int c_halfWin_x = (c_winSize_x - 1) / 2;
-    int c_halfWin_y = (c_winSize_y - 1) / 2;
-
-    const int patchWidth  = get_local_size(0) + 2 * c_halfWin_x;
-    const int patchHeight = get_local_size(1) + 2 * c_halfWin_y;
-
-    __local int smem[8192];
-
-    __local int* I_patch = smem;
-    __local int* dIdx_patch = I_patch + patchWidth * patchHeight;
-    __local int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
-
-    const int xBase = get_group_id(0) * get_local_size(0);
-    const int yBase = get_group_id(1) * get_local_size(1);
-
-    sampler_t sampleri    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
-
-    for (int i = get_local_id(1); i < patchHeight; i += get_local_size(1))
-    {
-        for (int j = get_local_id(0); j < patchWidth; j += get_local_size(0))
-        {
-            float x = xBase - c_halfWin_x + j + 0.5f;
-            float y = yBase - c_halfWin_y + i + 0.5f;
-
-            I_patch[i * patchWidth + j] = read_imagei(I, sampleri, (float2)(x, y)).x;
-
-            // Sharr Deriv
-
-            dIdx_patch[i * patchWidth + j] = 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x+1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y+1)).x -
-                                             (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x-1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x);
-
-            dIdy_patch[i * patchWidth + j] = 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y+1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y+1)).x -
-                                             (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y-1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x);
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // extract the patch from the first image, compute covariation matrix of derivatives
-
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (x >= cols || y >= rows)
-        return;
-
-    int A11i = 0;
-    int A12i = 0;
-    int A22i = 0;
-
-    for (int i = 0; i < c_winSize_y; ++i)
-    {
-        for (int j = 0; j < c_winSize_x; ++j)
-        {
-            int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
-            int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
-
-            A11i += dIdx * dIdx;
-            A12i += dIdx * dIdy;
-            A22i += dIdy * dIdy;
-        }
-    }
-
-    float A11 = A11i;
-    float A12 = A12i;
-    float A22 = A22i;
-
-    float D = A11 * A22 - A12 * A12;
-
-    //if (calcErr && GET_MIN_EIGENVALS)
-    //    (err + y * errStep)[x] = minEig;
-
-    if (D < 1.192092896e-07f)
-    {
-        //if (calcErr)
-        //    err(y, x) = 3.402823466e+38f;
-
-        return;
-    }
-
-    D = 1.f / D;
-
-    A11 *= D;
-    A12 *= D;
-    A22 *= D;
-
-    float2 nextPt;
-    nextPt.x = x + prevU[y/2 * prevUStep / 4 + x/2] * 2.0f;
-    nextPt.y = y + prevV[y/2 * prevVStep / 4 + x/2] * 2.0f;
-
-    for (int k = 0; k < c_iters; ++k)
-    {
-        if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
-        {
-            //if (calcErr)
-            //    err(y, x) = 3.402823466e+38f;
-
-            return;
-        }
-
-        int b1 = 0;
-        int b2 = 0;
-
-        for (int i = 0; i < c_winSize_y; ++i)
-        {
-            for (int j = 0; j < c_winSize_x; ++j)
-            {
-                int iI = I_patch[(get_local_id(1) + i) * patchWidth + get_local_id(0) + j];
-                int iJ = read_imagei(J, sampler, (float2)(nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f)).x;
-
-                int diff = (iJ - iI) * 32;
-
-                int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
-                int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
-
-                b1 += diff * dIdx;
-                b2 += diff * dIdy;
-            }
-        }
-
-        float2 delta;
-        delta.x = A12 * b2 - A22 * b1;
-        delta.y = A12 * b1 - A11 * b2;
-
-        nextPt.x += delta.x;
-        nextPt.y += delta.y;
-
-        if (fabs(delta.x) < 0.01f && fabs(delta.y) < 0.01f)
-            break;
-    }
-
-    u[y * uStep / 4 + x] = nextPt.x - x;
-    v[y * vStep / 4 + x] = nextPt.y - y;
-
-    if (calcErr)
-    {
-        int errval = 0;
-
-        for (int i = 0; i < c_winSize_y; ++i)
-        {
-            for (int j = 0; j < c_winSize_x; ++j)
-            {
-                int iI = I_patch[(get_local_id(1) + i) * patchWidth + get_local_id(0) + j];
-                int iJ = read_imagei(J, sampler, (float2)(nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f)).x;
-
-                errval += abs(iJ - iI);
-            }
-        }
-
-        //err[y * errStep / 4 + x] = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
-    }
-}
+}
\ No newline at end of file
diff --git a/modules/video/src/opencl/updatemotionhistory.cl b/modules/video/src/opencl/updatemotionhistory.cl
new file mode 100644
index 000000000..913e40b26
--- /dev/null
+++ b/modules/video/src/opencl/updatemotionhistory.cl
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+__kernel void updateMotionHistory(__global const uchar * silh, int silh_step, int silh_offset,
+                                  __global uchar * mhiptr, int mhi_step, int mhi_offset, int mhi_rows, int mhi_cols,
+                                  float timestamp, float delbound)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < mhi_cols && y < mhi_rows)
+    {
+        int silh_index = mad24(y, silh_step, silh_offset + x);
+        int mhi_index = mad24(y, mhi_step, mhi_offset + x * (int)sizeof(float));
+
+        silh += silh_index;
+        __global float * mhi = (__global float *)(mhiptr + mhi_index);
+
+        float val = mhi[0];
+        val = silh[0] ? timestamp : val < delbound ? 0 : val;
+        mhi[0] = val;
+    }
+}
diff --git a/modules/video/src/optflowgf.cpp b/modules/video/src/optflowgf.cpp
index 19e96885b..c0b1d88f1 100644
--- a/modules/video/src/optflowgf.cpp
+++ b/modules/video/src/optflowgf.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "precomp.hpp"
+#include "opencl_kernels.hpp"
 
 //
 // 2D dense optical flow algorithm from the following paper:
@@ -52,47 +53,40 @@ namespace cv
 {
 
 static void
-FarnebackPolyExp( const Mat& src, Mat& dst, int n, double sigma )
+FarnebackPrepareGaussian(int n, double sigma, float *g, float *xg, float *xxg,
+                         double &ig11, double &ig03, double &ig33, double &ig55)
 {
-    int k, x, y;
-
-    CV_Assert( src.type() == CV_32FC1 );
-    int width = src.cols;
-    int height = src.rows;
-    AutoBuffer<float> kbuf(n*6 + 3), _row((width + n*2)*3);
-    float* g = kbuf + n;
-    float* xg = g + n*2 + 1;
-    float* xxg = xg + n*2 + 1;
-    float *row = (float*)_row + n*3;
-
     if( sigma < FLT_EPSILON )
         sigma = n*0.3;
 
     double s = 0.;
-    for( x = -n; x <= n; x++ )
+    for (int x = -n; x <= n; x++)
     {
         g[x] = (float)std::exp(-x*x/(2*sigma*sigma));
         s += g[x];
     }
 
     s = 1./s;
-    for( x = -n; x <= n; x++ )
+    for (int x = -n; x <= n; x++)
     {
         g[x] = (float)(g[x]*s);
         xg[x] = (float)(x*g[x]);
         xxg[x] = (float)(x*x*g[x]);
     }
 
-    Mat_<double> G = Mat_<double>::zeros(6, 6);
+    Mat_<double> G(6, 6);
+    G.setTo(0);
 
-    for( y = -n; y <= n; y++ )
-        for( x = -n; x <= n; x++ )
+    for (int y = -n; y <= n; y++)
+    {
+        for (int x = -n; x <= n; x++)
         {
             G(0,0) += g[y]*g[x];
             G(1,1) += g[y]*g[x]*x*x;
             G(3,3) += g[y]*g[x]*x*x*x*x;
             G(5,5) += g[y]*g[x]*x*x*y*y;
         }
+    }
 
     //G[0][0] = 1.;
     G(2,2) = G(0,3) = G(0,4) = G(3,0) = G(4,0) = G(1,1);
@@ -107,7 +101,29 @@ FarnebackPolyExp( const Mat& src, Mat& dst, int n, double sigma )
     // [ e           z    ]
     // [                u ]
     Mat_<double> invG = G.inv(DECOMP_CHOLESKY);
-    double ig11 = invG(1,1), ig03 = invG(0,3), ig33 = invG(3,3), ig55 = invG(5,5);
+
+    ig11 = invG(1,1);
+    ig03 = invG(0,3);
+    ig33 = invG(3,3);
+    ig55 = invG(5,5);
+}
+
+static void
+FarnebackPolyExp( const Mat& src, Mat& dst, int n, double sigma )
+{
+    int k, x, y;
+
+    CV_Assert( src.type() == CV_32FC1 );
+    int width = src.cols;
+    int height = src.rows;
+    AutoBuffer<float> kbuf(n*6 + 3), _row((width + n*2)*3);
+    float* g = kbuf + n;
+    float* xg = g + n*2 + 1;
+    float* xxg = xg + n*2 + 1;
+    float *row = (float*)_row + n*3;
+    double ig11, ig03, ig33, ig55;
+
+    FarnebackPrepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
 
     dst.create( height, width, CV_32FC(5));
 
@@ -563,10 +579,506 @@ FarnebackUpdateFlow_GaussianBlur( const Mat& _R0, const Mat& _R1,
 
 }
 
+namespace cv
+{
+class FarnebackOpticalFlow
+{
+public:
+    FarnebackOpticalFlow()
+    {
+        numLevels = 5;
+        pyrScale = 0.5;
+        fastPyramids = false;
+        winSize = 13;
+        numIters = 10;
+        polyN = 5;
+        polySigma = 1.1;
+        flags = 0;
+    }
+
+    int numLevels;
+    double pyrScale;
+    bool fastPyramids;
+    int winSize;
+    int numIters;
+    int polyN;
+    double polySigma;
+    int flags;
+
+    bool operator ()(const UMat &frame0, const UMat &frame1, UMat &flowx, UMat &flowy)
+    {
+        CV_Assert(frame0.channels() == 1 && frame1.channels() == 1);
+        CV_Assert(frame0.size() == frame1.size());
+        CV_Assert(polyN == 5 || polyN == 7);
+        CV_Assert(!fastPyramids || std::abs(pyrScale - 0.5) < 1e-6);
+
+        const int min_size = 32;
+
+        Size size = frame0.size();
+        UMat prevFlowX, prevFlowY, curFlowX, curFlowY;
+
+        flowx.create(size, CV_32F);
+        flowy.create(size, CV_32F);
+        UMat flowx0 = flowx;
+        UMat flowy0 = flowy;
+
+        // Crop unnecessary levels
+        double scale = 1;
+        int numLevelsCropped = 0;
+        for (; numLevelsCropped < numLevels; numLevelsCropped++)
+        {
+            scale *= pyrScale;
+            if (size.width*scale < min_size || size.height*scale < min_size)
+                break;
+        }
+
+        frame0.convertTo(frames_[0], CV_32F);
+        frame1.convertTo(frames_[1], CV_32F);
+
+        if (fastPyramids)
+        {
+            // Build Gaussian pyramids using pyrDown()
+            pyramid0_.resize(numLevelsCropped + 1);
+            pyramid1_.resize(numLevelsCropped + 1);
+            pyramid0_[0] = frames_[0];
+            pyramid1_[0] = frames_[1];
+            for (int i = 1; i <= numLevelsCropped; ++i)
+            {
+                pyrDown(pyramid0_[i - 1], pyramid0_[i]);
+                pyrDown(pyramid1_[i - 1], pyramid1_[i]);
+            }
+        }
+
+        setPolynomialExpansionConsts(polyN, polySigma);
+
+        for (int k = numLevelsCropped; k >= 0; k--)
+        {
+            scale = 1;
+            for (int i = 0; i < k; i++)
+                scale *= pyrScale;
+
+            double sigma = (1./scale - 1) * 0.5;
+            int smoothSize = cvRound(sigma*5) | 1;
+            smoothSize = std::max(smoothSize, 3);
+
+            int width = cvRound(size.width*scale);
+            int height = cvRound(size.height*scale);
+
+            if (fastPyramids)
+            {
+                width = pyramid0_[k].cols;
+                height = pyramid0_[k].rows;
+            }
+
+            if (k > 0)
+            {
+                curFlowX.create(height, width, CV_32F);
+                curFlowY.create(height, width, CV_32F);
+            }
+            else
+            {
+                curFlowX = flowx0;
+                curFlowY = flowy0;
+            }
+
+            if (prevFlowX.empty())
+            {
+                if (flags & cv::OPTFLOW_USE_INITIAL_FLOW)
+                {
+                    resize(flowx0, curFlowX, Size(width, height), 0, 0, INTER_LINEAR);
+                    resize(flowy0, curFlowY, Size(width, height), 0, 0, INTER_LINEAR);
+                    multiply(scale, curFlowX, curFlowX);
+                    multiply(scale, curFlowY, curFlowY);
+                }
+                else
+                {
+                    curFlowX.setTo(0);
+                    curFlowY.setTo(0);
+                }
+            }
+            else
+            {
+                resize(prevFlowX, curFlowX, Size(width, height), 0, 0, INTER_LINEAR);
+                resize(prevFlowY, curFlowY, Size(width, height), 0, 0, INTER_LINEAR);
+                multiply(1./pyrScale, curFlowX, curFlowX);
+                multiply(1./pyrScale, curFlowY, curFlowY);
+            }
+
+            UMat M = allocMatFromBuf(5*height, width, CV_32F, M_);
+            UMat bufM = allocMatFromBuf(5*height, width, CV_32F, bufM_);
+            UMat R[2] =
+            {
+                allocMatFromBuf(5*height, width, CV_32F, R_[0]),
+                allocMatFromBuf(5*height, width, CV_32F, R_[1])
+            };
+
+            if (fastPyramids)
+            {
+                if (!polynomialExpansionOcl(pyramid0_[k], R[0]))
+                    return false;
+                if (!polynomialExpansionOcl(pyramid1_[k], R[1]))
+                    return false;
+            }
+            else
+            {
+                UMat blurredFrame[2] =
+                {
+                    allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[0]),
+                    allocMatFromBuf(size.height, size.width, CV_32F, blurredFrame_[1])
+                };
+                UMat pyrLevel[2] =
+                {
+                    allocMatFromBuf(height, width, CV_32F, pyrLevel_[0]),
+                    allocMatFromBuf(height, width, CV_32F, pyrLevel_[1])
+                };
+
+                setGaussianBlurKernel(smoothSize, sigma);
+
+                for (int i = 0; i < 2; i++)
+                {
+                    if (!gaussianBlurOcl(frames_[i], smoothSize/2, blurredFrame[i]))
+                        return false;
+                    resize(blurredFrame[i], pyrLevel[i], Size(width, height), INTER_LINEAR);
+                    if (!polynomialExpansionOcl(pyrLevel[i], R[i]))
+                        return false;
+                }
+            }
+
+            if (!updateMatricesOcl(curFlowX, curFlowY, R[0], R[1], M))
+                return false;
+
+            if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
+                setGaussianBlurKernel(winSize, winSize/2*0.3f);
+            for (int i = 0; i < numIters; i++)
+            {
+                if (flags & OPTFLOW_FARNEBACK_GAUSSIAN)
+                {
+                    if (!updateFlow_gaussianBlur(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1))
+                        return false;
+                }
+                else
+                {
+                    if (!updateFlow_boxFilter(R[0], R[1], curFlowX, curFlowY, M, bufM, winSize, i < numIters-1))
+                        return false;
+                }
+            }
+
+            prevFlowX = curFlowX;
+            prevFlowY = curFlowY;
+        }
+
+        flowx = curFlowX;
+        flowy = curFlowY;
+        return true;
+    }
+
+    void releaseMemory()
+    {
+        frames_[0].release();
+        frames_[1].release();
+        pyrLevel_[0].release();
+        pyrLevel_[1].release();
+        M_.release();
+        bufM_.release();
+        R_[0].release();
+        R_[1].release();
+        blurredFrame_[0].release();
+        blurredFrame_[1].release();
+        pyramid0_.clear();
+        pyramid1_.clear();
+    }
+private:
+    UMat m_g;
+    UMat m_xg;
+    UMat m_xxg;
+
+    double m_igd[4];
+    float  m_ig[4];
+    void setPolynomialExpansionConsts(int n, double sigma)
+    {
+        std::vector<float> buf(n*6 + 3);
+        float* g = &buf[0] + n;
+        float* xg = g + n*2 + 1;
+        float* xxg = xg + n*2 + 1;
+
+        FarnebackPrepareGaussian(n, sigma, g, xg, xxg, m_igd[0], m_igd[1], m_igd[2], m_igd[3]);
+
+        cv::Mat t_g(1, n + 1, CV_32FC1, g);     t_g.copyTo(m_g);
+        cv::Mat t_xg(1, n + 1, CV_32FC1, xg);   t_xg.copyTo(m_xg);
+        cv::Mat t_xxg(1, n + 1, CV_32FC1, xxg); t_xxg.copyTo(m_xxg);
+
+        m_ig[0] = static_cast<float>(m_igd[0]);
+        m_ig[1] = static_cast<float>(m_igd[1]);
+        m_ig[2] = static_cast<float>(m_igd[2]);
+        m_ig[3] = static_cast<float>(m_igd[3]);
+    }
+private:
+    UMat m_gKer;
+    inline void setGaussianBlurKernel(int smoothSize, double sigma)
+    {
+        Mat g = getGaussianKernel(smoothSize, sigma, CV_32F);
+        Mat gKer(1, smoothSize/2 + 1, CV_32FC1, g.ptr<float>(smoothSize/2));
+        gKer.copyTo(m_gKer);
+    }
+private:
+    UMat frames_[2];
+    UMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
+    std::vector<UMat> pyramid0_, pyramid1_;
+
+    static UMat allocMatFromBuf(int rows, int cols, int type, UMat &mat)
+    {
+        if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)
+            return mat(Rect(0, 0, cols, rows));
+        return mat = UMat(rows, cols, type);
+    }
+private:
+#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
+
+    bool gaussianBlurOcl(const UMat &src, int ksizeHalf, UMat &dst)
+    {
+#ifdef ANDROID
+        size_t localsize[2] = { 128, 1};
+#else
+        size_t localsize[2] = { 256, 1};
+#endif
+        size_t globalsize[2] = { src.cols, src.rows};
+        int smem_size = (int)((localsize[0] + 2*ksizeHalf) * sizeof(float));
+        ocl::Kernel kernel;
+        if (!kernel.create("gaussianBlur", cv::ocl::video::optical_flow_farneback_oclsrc, ""))
+            return false;
+
+        CV_Assert(dst.size() == src.size());
+        int idxArg = 0;
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
+        idxArg = kernel.set(idxArg, (int)(src.step / src.elemSize()));
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
+        idxArg = kernel.set(idxArg, (int)(dst.step / dst.elemSize()));
+        idxArg = kernel.set(idxArg, dst.rows);
+        idxArg = kernel.set(idxArg, dst.cols);
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(m_gKer));
+        idxArg = kernel.set(idxArg, (int)ksizeHalf);
+        kernel.set(idxArg, (void *)NULL, smem_size);
+        return kernel.run(2, globalsize, localsize, false);
+    }
+    bool gaussianBlur5Ocl(const UMat &src, int ksizeHalf, UMat &dst)
+    {
+        int height = src.rows / 5;
+#ifdef ANDROID
+        size_t localsize[2] = { 128, 1};
+#else
+        size_t localsize[2] = { 256, 1};
+#endif
+        size_t globalsize[2] = { src.cols, height};
+        int smem_size = (int)((localsize[0] + 2*ksizeHalf) * 5 * sizeof(float));
+        ocl::Kernel kernel;
+        if (!kernel.create("gaussianBlur5", cv::ocl::video::optical_flow_farneback_oclsrc, ""))
+            return false;
+
+        int idxArg = 0;
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
+        idxArg = kernel.set(idxArg, (int)(src.step / src.elemSize()));
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
+        idxArg = kernel.set(idxArg, (int)(dst.step / dst.elemSize()));
+        idxArg = kernel.set(idxArg, height);
+        idxArg = kernel.set(idxArg, src.cols);
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(m_gKer));
+        idxArg = kernel.set(idxArg, (int)ksizeHalf);
+        kernel.set(idxArg, (void *)NULL, smem_size);
+        return kernel.run(2, globalsize, localsize, false);
+    }
+    bool polynomialExpansionOcl(const UMat &src, UMat &dst)
+    {
+#ifdef ANDROID
+        size_t localsize[2] = { 128, 1};
+#else
+        size_t localsize[2] = { 256, 1};
+#endif
+        size_t globalsize[2] = { DIVUP(src.cols, localsize[0] - 2*polyN) * localsize[0], src.rows};
+
+#if 0
+        const cv::ocl::Device &device = cv::ocl::Device::getDefault();
+        bool useDouble = (0 != device.doubleFPConfig());
+
+        cv::String build_options = cv::format("-D polyN=%d -D USE_DOUBLE=%d", polyN, useDouble ? 1 : 0);
+#else
+        cv::String build_options = cv::format("-D polyN=%d", polyN);
+#endif
+        ocl::Kernel kernel;
+        if (!kernel.create("polynomialExpansion", cv::ocl::video::optical_flow_farneback_oclsrc, build_options))
+            return false;
+
+        int smem_size = (int)(3 * localsize[0] * sizeof(float));
+        int idxArg = 0;
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
+        idxArg = kernel.set(idxArg, (int)(src.step / src.elemSize()));
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
+        idxArg = kernel.set(idxArg, (int)(dst.step / dst.elemSize()));
+        idxArg = kernel.set(idxArg, src.rows);
+        idxArg = kernel.set(idxArg, src.cols);
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(m_g));
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(m_xg));
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(m_xxg));
+        idxArg = kernel.set(idxArg, (void *)NULL, smem_size);
+        kernel.set(idxArg, (void *)m_ig, 4 * sizeof(float));
+        return kernel.run(2, globalsize, localsize, false);
+    }
+    bool boxFilter5Ocl(const UMat &src, int ksizeHalf, UMat &dst)
+    {
+        int height = src.rows / 5;
+#ifdef ANDROID
+        size_t localsize[2] = { 128, 1};
+#else
+        size_t localsize[2] = { 256, 1};
+#endif
+        size_t globalsize[2] = { src.cols, height};
+
+        ocl::Kernel kernel;
+        if (!kernel.create("boxFilter5", cv::ocl::video::optical_flow_farneback_oclsrc, ""))
+            return false;
+
+        int smem_size = (int)((localsize[0] + 2*ksizeHalf) * 5 * sizeof(float));
+
+        int idxArg = 0;
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(src));
+        idxArg = kernel.set(idxArg, (int)(src.step / src.elemSize()));
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst));
+        idxArg = kernel.set(idxArg, (int)(dst.step / dst.elemSize()));
+        idxArg = kernel.set(idxArg, height);
+        idxArg = kernel.set(idxArg, src.cols);
+        idxArg = kernel.set(idxArg, (int)ksizeHalf);
+        kernel.set(idxArg, (void *)NULL, smem_size);
+        return kernel.run(2, globalsize, localsize, false);
+    }
+
+    bool updateFlowOcl(const UMat &M, UMat &flowx, UMat &flowy)
+    {
+#ifdef ANDROID
+        size_t localsize[2] = { 32, 4};
+#else
+        size_t localsize[2] = { 32, 8};
+#endif
+        size_t globalsize[2] = { flowx.cols, flowx.rows};
+
+        ocl::Kernel kernel;
+        if (!kernel.create("updateFlow", cv::ocl::video::optical_flow_farneback_oclsrc, ""))
+            return false;
+
+        int idxArg = 0;
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(M));
+        idxArg = kernel.set(idxArg, (int)(M.step / M.elemSize()));
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(flowx));
+        idxArg = kernel.set(idxArg, (int)(flowx.step / flowx.elemSize()));
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(flowy));
+        idxArg = kernel.set(idxArg, (int)(flowy.step / flowy.elemSize()));
+        idxArg = kernel.set(idxArg, (int)flowy.rows);
+        kernel.set(idxArg, (int)flowy.cols);
+        return kernel.run(2, globalsize, localsize, false);
+    }
+    bool updateMatricesOcl(const UMat &flowx, const UMat &flowy, const UMat &R0, const UMat &R1, UMat &M)
+    {
+#ifdef ANDROID
+        size_t localsize[2] = { 32, 4};
+#else
+        size_t localsize[2] = { 32, 8};
+#endif
+        size_t globalsize[2] = { flowx.cols, flowx.rows};
+
+        ocl::Kernel kernel;
+        if (!kernel.create("updateMatrices", cv::ocl::video::optical_flow_farneback_oclsrc, ""))
+            return false;
+
+        int idxArg = 0;
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(flowx));
+        idxArg = kernel.set(idxArg, (int)(flowx.step / flowx.elemSize()));
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(flowy));
+        idxArg = kernel.set(idxArg, (int)(flowy.step / flowy.elemSize()));
+        idxArg = kernel.set(idxArg, (int)flowx.rows);
+        idxArg = kernel.set(idxArg, (int)flowx.cols);
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(R0));
+        idxArg = kernel.set(idxArg, (int)(R0.step / R0.elemSize()));
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(R1));
+        idxArg = kernel.set(idxArg, (int)(R1.step / R1.elemSize()));
+        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(M));
+        kernel.set(idxArg, (int)(M.step / M.elemSize()));
+        return kernel.run(2, globalsize, localsize, false);
+    }
+
+    bool updateFlow_boxFilter(
+        const UMat& R0, const UMat& R1, UMat& flowx, UMat &flowy,
+        UMat& M, UMat &bufM, int blockSize, bool updateMatrices)
+    {
+        if (!boxFilter5Ocl(M, blockSize/2, bufM))
+            return false;
+        swap(M, bufM);
+        if (!updateFlowOcl(M, flowx, flowy))
+            return false;
+        if (updateMatrices)
+            if (!updateMatricesOcl(flowx, flowy, R0, R1, M))
+                return false;
+        return true;
+    }
+    bool updateFlow_gaussianBlur(
+        const UMat& R0, const UMat& R1, UMat& flowx, UMat& flowy,
+        UMat& M, UMat &bufM, int blockSize, bool updateMatrices)
+    {
+        if (!gaussianBlur5Ocl(M, blockSize/2, bufM))
+            return false;
+        swap(M, bufM);
+        if (!updateFlowOcl(M, flowx, flowy))
+            return false;
+        if (updateMatrices)
+            if (!updateMatricesOcl(flowx, flowy, R0, R1, M))
+                return false;
+        return true;
+    }
+};
+
+static bool ocl_calcOpticalFlowFarneback( InputArray _prev0, InputArray _next0,
+                            InputOutputArray _flow0, double pyr_scale, int levels, int winsize,
+                            int iterations, int poly_n, double poly_sigma, int flags )
+{
+    if ((5 != poly_n) && (7 != poly_n))
+        return false;
+    if (_next0.size() != _prev0.size())
+        return false;
+    int typePrev = _prev0.type();
+    int typeNext = _next0.type();
+    if ((1 != CV_MAT_CN(typePrev)) || (1 != CV_MAT_CN(typeNext)))
+        return false;
+
+    FarnebackOpticalFlow opticalFlow;
+    opticalFlow.numLevels   = levels;
+    opticalFlow.pyrScale    = pyr_scale;
+    opticalFlow.fastPyramids= false;
+    opticalFlow.winSize     = winsize;
+    opticalFlow.numIters    = iterations;
+    opticalFlow.polyN       = poly_n;
+    opticalFlow.polySigma   = poly_sigma;
+    opticalFlow.flags       = flags;
+
+    std::vector<UMat> flowar;
+    if (!_flow0.empty())
+        split(_flow0, flowar);
+    else
+    {
+        flowar.push_back(UMat());
+        flowar.push_back(UMat());
+    }
+    if (!opticalFlow(_prev0.getUMat(), _next0.getUMat(), flowar[0], flowar[1]))
+        return false;
+    merge(flowar, _flow0);
+    return true;
+}
+}
+
 void cv::calcOpticalFlowFarneback( InputArray _prev0, InputArray _next0,
                                InputOutputArray _flow0, double pyr_scale, int levels, int winsize,
                                int iterations, int poly_n, double poly_sigma, int flags )
 {
+    bool use_opencl = ocl::useOpenCL() && _flow0.isUMat();
+    if( use_opencl && ocl_calcOpticalFlowFarneback(_prev0, _next0, _flow0, pyr_scale, levels, winsize, iterations, poly_n, poly_sigma, flags))
+        return;
+
     Mat prev0 = _prev0.getMat(), next0 = _next0.getMat();
     const int min_size = 32;
     const Mat* img[2] = { &prev0, &next0 };
diff --git a/modules/video/src/precomp.hpp b/modules/video/src/precomp.hpp
index 43ff772cc..ba0c93112 100644
--- a/modules/video/src/precomp.hpp
+++ b/modules/video/src/precomp.hpp
@@ -46,6 +46,7 @@
 #include "opencv2/video.hpp"
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/private.hpp"
+#include "opencv2/core/ocl.hpp"
 
 #ifdef HAVE_TEGRA_OPTIMIZATION
 #include "opencv2/video/video_tegra.hpp"
diff --git a/modules/video/src/simpleflow.cpp b/modules/video/src/simpleflow.cpp
index 765a34270..66f4c41bd 100644
--- a/modules/video/src/simpleflow.cpp
+++ b/modules/video/src/simpleflow.cpp
@@ -429,6 +429,7 @@ static inline float extrapolateValueInRect(int height, int width,
   if (r == height && c == 0) { return v21;}
   if (r == height && c == width) { return v22;}
 
+  CV_Assert(height > 0 && width > 0);
   float qr = float(r) / height;
   float pr = 1.0f - qr;
   float qc = float(c) / width;
diff --git a/modules/video/src/tvl1flow.cpp b/modules/video/src/tvl1flow.cpp
index 8d5993275..ddcc32b40 100644
--- a/modules/video/src/tvl1flow.cpp
+++ b/modules/video/src/tvl1flow.cpp
@@ -947,7 +947,7 @@ CV_INIT_ALGORITHM(OpticalFlowDual_TVL1, "DenseOpticalFlow.DualTVL1",
                                        "inner iterations (between outlier filtering) used in the numerical scheme");
                   obj.info()->addParam(obj, "outerIterations", obj.outerIterations, false, 0, 0,
                                        "outer iterations (number of inner loops) used in the numerical scheme");
-                  obj.info()->addParam(obj, "useInitialFlow", obj.useInitialFlow));
+                  obj.info()->addParam(obj, "useInitialFlow", obj.useInitialFlow))
 
 } // namespace
 
diff --git a/modules/video/test/ocl/test_bgfg_mog2.cpp b/modules/video/test/ocl/test_bgfg_mog2.cpp
new file mode 100644
index 000000000..bfb1621fe
--- /dev/null
+++ b/modules/video/test/ocl/test_bgfg_mog2.cpp
@@ -0,0 +1,136 @@
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+#if defined(HAVE_XINE)     || \
+defined(HAVE_GSTREAMER)    || \
+defined(HAVE_QUICKTIME)    || \
+defined(HAVE_AVFOUNDATION) || \
+defined(HAVE_FFMPEG)       || \
+defined(WIN32)
+
+#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 1
+#else
+#  define BUILD_WITH_VIDEO_INPUT_SUPPORT 0
+#endif
+
+#if BUILD_WITH_VIDEO_INPUT_SUPPORT
+
+namespace cvtest {
+namespace ocl {
+
+//////////////////////////Mog2_Update///////////////////////////////////
+
+namespace
+{
+    IMPLEMENT_PARAM_CLASS(UseGray, bool)
+    IMPLEMENT_PARAM_CLASS(DetectShadow, bool)
+}
+
+PARAM_TEST_CASE(Mog2_Update, UseGray, DetectShadow)
+{
+    bool useGray;
+    bool detectShadow;
+    virtual void SetUp()
+    {
+        useGray = GET_PARAM(0);
+        detectShadow = GET_PARAM(1);
+    }
+};
+
+OCL_TEST_P(Mog2_Update, Accuracy)
+{
+    string inputFile = string(TS::ptr()->get_data_path()) + "video/768x576.avi";
+    VideoCapture cap(inputFile);
+    ASSERT_TRUE(cap.isOpened());
+
+    Ptr<BackgroundSubtractorMOG2> mog2_cpu = createBackgroundSubtractorMOG2();
+    Ptr<BackgroundSubtractorMOG2> mog2_ocl = createBackgroundSubtractorMOG2();
+
+    mog2_cpu->setDetectShadows(detectShadow);
+    mog2_ocl->setDetectShadows(detectShadow);
+
+    Mat frame, foreground;
+    UMat u_foreground;
+
+    for (int i = 0; i < 10; ++i)
+    {
+        cap >> frame;
+        ASSERT_FALSE(frame.empty());
+
+        if (useGray)
+        {
+            Mat temp;
+            cvtColor(frame, temp, COLOR_BGR2GRAY);
+            swap(temp, frame);
+        }
+
+        OCL_OFF(mog2_cpu->apply(frame, foreground));
+        OCL_ON (mog2_ocl->apply(frame, u_foreground));
+
+        if (detectShadow)
+            EXPECT_MAT_SIMILAR(foreground, u_foreground, 15e-3)
+        else
+            EXPECT_MAT_NEAR(foreground, u_foreground, 0);
+    }
+}
+
+//////////////////////////Mog2_getBackgroundImage///////////////////////////////////
+
+PARAM_TEST_CASE(Mog2_getBackgroundImage, DetectShadow)
+{
+    bool detectShadow;
+    virtual void SetUp()
+    {
+        detectShadow = GET_PARAM(0);
+    }
+};
+
+OCL_TEST_P(Mog2_getBackgroundImage, Accuracy)
+{
+    string inputFile = string(TS::ptr()->get_data_path()) + "video/768x576.avi";
+    VideoCapture cap(inputFile);
+    ASSERT_TRUE(cap.isOpened());
+
+    Ptr<BackgroundSubtractorMOG2> mog2_cpu = createBackgroundSubtractorMOG2();
+    Ptr<BackgroundSubtractorMOG2> mog2_ocl = createBackgroundSubtractorMOG2();
+
+    mog2_cpu->setDetectShadows(detectShadow);
+    mog2_ocl->setDetectShadows(detectShadow);
+
+    Mat frame, foreground;
+    UMat u_foreground;
+
+    for (int i = 0; i < 10; ++i)
+    {
+        cap >> frame;
+        ASSERT_FALSE(frame.empty());
+
+        OCL_OFF(mog2_cpu->apply(frame, foreground));
+        OCL_ON (mog2_ocl->apply(frame, u_foreground));
+    }
+
+    Mat background;
+    OCL_OFF(mog2_cpu->getBackgroundImage(background));
+
+    UMat u_background;
+    OCL_ON (mog2_ocl->getBackgroundImage(u_background));
+
+    EXPECT_MAT_NEAR(background, u_background, 1.0);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////
+
+OCL_INSTANTIATE_TEST_CASE_P(OCL_Video, Mog2_Update, Combine(
+                                    Values(UseGray(true), UseGray(false)),
+                                    Values(DetectShadow(true), DetectShadow(false)))
+                           );
+
+OCL_INSTANTIATE_TEST_CASE_P(OCL_Video, Mog2_getBackgroundImage, (Values(DetectShadow(true), DetectShadow(false)))
+                           );
+
+}}// namespace cvtest::ocl
+
+    #endif
+#endif
\ No newline at end of file
diff --git a/modules/video/test/ocl/test_motempl.cpp b/modules/video/test/ocl/test_motempl.cpp
new file mode 100644
index 000000000..7b4c22755
--- /dev/null
+++ b/modules/video/test/ocl/test_motempl.cpp
@@ -0,0 +1,67 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+PARAM_TEST_CASE(UpdateMotionHistory, bool)
+{
+    double timestamp, duration;
+    bool use_roi;
+
+    TEST_DECLARE_INPUT_PARAMETER(silhouette)
+    TEST_DECLARE_OUTPUT_PARAMETER(mhi)
+
+    virtual void SetUp()
+    {
+        use_roi = GET_PARAM(0);
+    }
+
+    virtual void generateTestData()
+    {
+        Size roiSize = randomSize(1, MAX_VALUE);
+        Border silhouetteBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(silhouette, silhouette_roi, roiSize, silhouetteBorder, CV_8UC1, -11, 11);
+
+        Border mhiBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+        randomSubMat(mhi, mhi_roi, roiSize, mhiBorder, CV_32FC1, 0, 1);
+
+        timestamp = randomDouble(0, 1);
+        duration = randomDouble(0, 1);
+        if (timestamp < duration)
+            std::swap(timestamp, duration);
+
+        UMAT_UPLOAD_INPUT_PARAMETER(silhouette)
+        UMAT_UPLOAD_OUTPUT_PARAMETER(mhi)
+    }
+};
+
+OCL_TEST_P(UpdateMotionHistory, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::updateMotionHistory(silhouette_roi, mhi_roi, timestamp, duration));
+        OCL_ON(cv::updateMotionHistory(usilhouette_roi, umhi_roi, timestamp, duration));
+
+        OCL_EXPECT_MATS_NEAR(mhi, 0)
+    }
+}
+
+//////////////////////////////////////// Instantiation /////////////////////////////////////////
+
+OCL_INSTANTIATE_TEST_CASE_P(Video, UpdateMotionHistory, Values(false, true));
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
diff --git a/modules/video/test/ocl/test_optflow_farneback.cpp b/modules/video/test/ocl/test_optflow_farneback.cpp
new file mode 100644
index 000000000..c2d13e006
--- /dev/null
+++ b/modules/video/test/ocl/test_optflow_farneback.cpp
@@ -0,0 +1,120 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest {
+namespace ocl {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// FarnebackOpticalFlow
+namespace
+{
+    IMPLEMENT_PARAM_CLASS(PyrScale, double)
+    IMPLEMENT_PARAM_CLASS(PolyN, int)
+    CV_FLAGS(FarnebackOptFlowFlags, 0, OPTFLOW_FARNEBACK_GAUSSIAN)
+    IMPLEMENT_PARAM_CLASS(UseInitFlow, bool)
+}
+
+PARAM_TEST_CASE(FarnebackOpticalFlow, PyrScale, PolyN, FarnebackOptFlowFlags, UseInitFlow)
+{
+    int numLevels;
+    int winSize;
+    int numIters;
+    double pyrScale;
+    int polyN;
+    int flags;
+    bool useInitFlow;
+
+    virtual void SetUp()
+    {
+        numLevels = 5;
+        winSize = 13;
+        numIters = 10;
+        pyrScale = GET_PARAM(0);
+        polyN = GET_PARAM(1);
+        flags = GET_PARAM(2);
+        useInitFlow = GET_PARAM(3);
+    }
+};
+
+OCL_TEST_P(FarnebackOpticalFlow, Mat)
+{
+    cv::Mat frame0 = readImage("optflow/RubberWhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage("optflow/RubberWhale2.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    double polySigma = polyN <= 5 ? 1.1 : 1.5;
+
+    cv::Mat flow; cv::UMat uflow;
+    if (useInitFlow)
+    {
+        OCL_ON(cv::calcOpticalFlowFarneback(frame0, frame1, uflow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags));
+        uflow.copyTo(flow);
+        flags |= cv::OPTFLOW_USE_INITIAL_FLOW;
+    }
+    OCL_OFF(cv::calcOpticalFlowFarneback(frame0, frame1, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags));
+    OCL_ON(cv::calcOpticalFlowFarneback(frame0, frame1, uflow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags));
+
+    EXPECT_MAT_SIMILAR(flow, uflow, 0.1)
+}
+
+
+OCL_INSTANTIATE_TEST_CASE_P(Video, FarnebackOpticalFlow,
+                            Combine(
+                                Values(PyrScale(0.3), PyrScale(0.5), PyrScale(0.8)),
+                                Values(PolyN(5), PolyN(7)),
+                                Values(FarnebackOptFlowFlags(0), FarnebackOptFlowFlags(cv::OPTFLOW_FARNEBACK_GAUSSIAN)),
+                                Values(UseInitFlow(false), UseInitFlow(true))
+                                )
+                           );
+
+
+} } // namespace cvtest::ocl
+
+#endif // HAVE_OPENCL
\ No newline at end of file
diff --git a/modules/video/test/ocl/test_optflowpyrlk.cpp b/modules/video/test/ocl/test_optflowpyrlk.cpp
new file mode 100644
index 000000000..94195eabe
--- /dev/null
+++ b/modules/video/test/ocl/test_optflowpyrlk.cpp
@@ -0,0 +1,143 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include "test_precomp.hpp"
+#include "opencv2/ts/ocl_test.hpp"
+
+
+#ifdef HAVE_OPENCL
+
+
+namespace cvtest {
+namespace ocl {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// PyrLKOpticalFlow
+
+PARAM_TEST_CASE(PyrLKOpticalFlow, int, int)
+{
+    Size winSize;
+    int maxLevel;
+    TermCriteria criteria;
+    int flags;
+    double minEigThreshold;
+
+    virtual void SetUp()
+    {
+        winSize = Size(GET_PARAM(0), GET_PARAM(0));
+        maxLevel = GET_PARAM(1);
+        criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 0.01);
+        flags = 0;
+        minEigThreshold = 1e-4f;
+    }
+};
+
+OCL_TEST_P(PyrLKOpticalFlow, Mat)
+{
+    static const int npoints = 1000;
+    static const float eps = 0.03f;
+
+    cv::Mat frame0 = readImage("optflow/RubberWhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+    UMat umatFrame0; frame0.copyTo(umatFrame0);
+
+    cv::Mat frame1 = readImage("optflow/RubberWhale2.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+    UMat umatFrame1; frame1.copyTo(umatFrame1);
+
+    std::vector<cv::Point2f> pts;
+    cv::goodFeaturesToTrack(frame0, pts, npoints, 0.01, 0.0);
+
+    std::vector<cv::Point2f> cpuNextPts;
+    std::vector<unsigned char> cpuStatusCPU;
+    std::vector<float> cpuErr;
+    OCL_OFF(cv::calcOpticalFlowPyrLK(frame0, frame1, pts, cpuNextPts, cpuStatusCPU, cpuErr, winSize, maxLevel, criteria, flags, minEigThreshold));
+
+    UMat umatNextPts, umatStatus, umatErr;
+    OCL_ON(cv::calcOpticalFlowPyrLK(umatFrame0, umatFrame1, pts, umatNextPts, umatStatus, umatErr, winSize, maxLevel, criteria, flags, minEigThreshold));
+    std::vector<cv::Point2f> nextPts; umatNextPts.reshape(2, 1).copyTo(nextPts);
+    std::vector<unsigned char> status; umatStatus.reshape(1, 1).copyTo(status);
+    std::vector<float> err; umatErr.reshape(1, 1).copyTo(err);
+
+    ASSERT_EQ(cpuNextPts.size(), nextPts.size());
+    ASSERT_EQ(cpuStatusCPU.size(), status.size());
+
+    size_t mistmatch = 0;
+    for (size_t i = 0; i < nextPts.size(); ++i)
+    {
+        if (status[i] != cpuStatusCPU[i])
+        {
+            ++mistmatch;
+            continue;
+        }
+
+        if (status[i])
+        {
+            cv::Point2i a = nextPts[i];
+            cv::Point2i b = cpuNextPts[i];
+
+            bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
+            float errdiff = 0.0f;
+
+            if (!eq || errdiff > 1e-1)
+                ++mistmatch;
+        }
+    }
+
+    double bad_ratio = static_cast<double>(mistmatch) / (nextPts.size());
+
+    ASSERT_LE(bad_ratio, eps);
+}
+
+OCL_INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlow,
+                            Combine(
+                                Values(21, 25),
+                                Values(3, 5)
+                                )
+                           );
+
+} } // namespace cvtest::ocl
+
+
+#endif // HAVE_OPENCL
\ No newline at end of file
diff --git a/modules/video/test/test_accum.cpp b/modules/video/test/test_accum.cpp
index ef6c05f98..6895bb4ea 100644
--- a/modules/video/test/test_accum.cpp
+++ b/modules/video/test/test_accum.cpp
@@ -100,7 +100,7 @@ double CV_AccumBaseTest::get_success_error_level( int /*test_case_idx*/, int /*i
 class CV_AccTest : public CV_AccumBaseTest
 {
 public:
-    CV_AccTest() {};
+    CV_AccTest() { }
 protected:
     void run_func();
     void prepare_to_validation( int );
diff --git a/modules/video/test/test_ecc.cpp b/modules/video/test/test_ecc.cpp
index 4065e6182..6b60a181f 100644
--- a/modules/video/test/test_ecc.cpp
+++ b/modules/video/test/test_ecc.cpp
@@ -107,7 +107,7 @@ protected:
     bool testTranslation(int);
 };
 
-CV_ECC_Test_Translation::CV_ECC_Test_Translation(){};
+CV_ECC_Test_Translation::CV_ECC_Test_Translation(){}
 
 bool CV_ECC_Test_Translation::testTranslation(int from)
 {
@@ -256,7 +256,7 @@ protected:
     bool testAffine(int);
 };
 
-CV_ECC_Test_Affine::CV_ECC_Test_Affine(){};
+CV_ECC_Test_Affine::CV_ECC_Test_Affine(){}
 
 
 bool CV_ECC_Test_Affine::testAffine(int from)
@@ -332,7 +332,7 @@ protected:
     bool testHomography(int);
 };
 
-CV_ECC_Test_Homography::CV_ECC_Test_Homography(){};
+CV_ECC_Test_Homography::CV_ECC_Test_Homography(){}
 
 bool CV_ECC_Test_Homography::testHomography(int from)
 {
diff --git a/modules/viz/doc/viz3d.rst b/modules/viz/doc/viz3d.rst
index 2f8dfcd9d..d0e24ae8e 100644
--- a/modules/viz/doc/viz3d.rst
+++ b/modules/viz/doc/viz3d.rst
@@ -13,7 +13,7 @@ viz::makeTransformToGlobal
 --------------------------
 Takes coordinate frame data and builds transform to global coordinate frame.
 
-.. ocv:function:: Affine3f viz::makeTransformToGlobal(const Vec3f& axis_x, const Vec3f& axis_y, const Vec3f& axis_z, const Vec3f& origin = Vec3f::all(0))
+.. ocv:function:: Affine3d viz::makeTransformToGlobal(const Vec3f& axis_x, const Vec3f& axis_y, const Vec3f& axis_z, const Vec3f& origin = Vec3f::all(0))
 
     :param axis_x: X axis vector in global coordinate frame.
     :param axis_y: Y axis vector in global coordinate frame.
@@ -26,7 +26,7 @@ viz::makeCameraPose
 -------------------
 Constructs camera pose from position, focal_point and up_vector (see gluLookAt() for more infromation).
 
-.. ocv:function:: Affine3f makeCameraPose(const Vec3f& position, const Vec3f& focal_point, const Vec3f& y_dir)
+.. ocv:function:: Affine3d makeCameraPose(const Vec3f& position, const Vec3f& focal_point, const Vec3f& y_dir)
 
     :param position: Position of the camera in global coordinate frame.
     :param focal_point: Focal point of the camera in global coordinate frame.
@@ -34,11 +34,11 @@ Constructs camera pose from position, focal_point and up_vector (see gluLookAt()
 
 This function returns pose of the camera in global coordinate frame.
 
-viz::get
---------
+viz::getWindowByName
+--------------------
 Retrieves a window by its name.
 
-.. ocv:function:: Viz3d get(const String &window_name)
+.. ocv:function:: Viz3d getWindowByName(const String &window_name)
 
     :param window_name: Name of the window that is to be retrieved.
 
@@ -51,8 +51,8 @@ This function returns a :ocv:class:`Viz3d` object with the given name.
           .. code-block:: cpp
 
                 /// window and window_2 are the same windows.
-                viz::Viz3d window   = viz::get("myWindow");
-                viz::Viz3d window_2 = viz::get("Viz - myWindow");
+                viz::Viz3d window   = viz::getWindowByName("myWindow");
+                viz::Viz3d window_2 = viz::getWindowByName("Viz - myWindow");
 
 viz::isNan
 ----------
@@ -94,19 +94,21 @@ The Viz3d class represents a 3D visualizer window. This class is implicitly shar
         Viz3d& operator=(const Viz3d&);
         ~Viz3d();
 
-        void showWidget(const String &id, const Widget &widget, const Affine3f &pose = Affine3f::Identity());
+        void showWidget(const String &id, const Widget &widget, const Affine3d &pose = Affine3d::Identity());
         void removeWidget(const String &id);
         Widget getWidget(const String &id) const;
         void removeAllWidgets();
 
-        void setWidgetPose(const String &id, const Affine3f &pose);
-        void updateWidgetPose(const String &id, const Affine3f &pose);
-        Affine3f getWidgetPose(const String &id) const;
+        void setWidgetPose(const String &id, const Affine3d &pose);
+        void updateWidgetPose(const String &id, const Affine3d &pose);
+        Affine3d getWidgetPose(const String &id) const;
+
+        void showImage(InputArray image, const Size& window_size = Size(-1, -1));
 
         void setCamera(const Camera &camera);
         Camera getCamera() const;
-        Affine3f getViewerPose();
-        void setViewerPose(const Affine3f &pose);
+        Affine3d getViewerPose();
+        void setViewerPose(const Affine3d &pose);
 
         void resetCameraViewpoint (const String &id);
         void resetCamera();
@@ -132,8 +134,6 @@ The Viz3d class represents a 3D visualizer window. This class is implicitly shar
         void setRenderingProperty(const String &id, int property, double value);
         double getRenderingProperty(const String &id, int property);
 
-        void setDesiredUpdateRate(double rate);
-        double getDesiredUpdateRate();
 
         void setRepresentation(int representation);
     private:
@@ -152,7 +152,7 @@ viz::Viz3d::showWidget
 ----------------------
 Shows a widget in the window.
 
-.. ocv:function:: void Viz3d::showWidget(const String &id, const Widget &widget, const Affine3f &pose = Affine3f::Identity())
+.. ocv:function:: void Viz3d::showWidget(const String &id, const Widget &widget, const Affine3d &pose = Affine3d::Identity())
 
     :param id: A unique id for the widget.
     :param widget: The widget to be displayed in the window.
@@ -182,11 +182,20 @@ Removes all widgets from the window.
 
 .. ocv:function:: void removeAllWidgets()
 
+viz::Viz3d::showImage
+---------------------
+Removed all widgets and displays image scaled to whole window area.
+
+.. ocv:function:: void showImage(InputArray image, const Size& window_size = Size(-1, -1))
+
+    :param image: Image to be displayed.
+    :param size: Size of Viz3d window. Default value means no change.
+
 viz::Viz3d::setWidgetPose
 -------------------------
 Sets pose of a widget in the window.
 
-.. ocv:function:: void setWidgetPose(const String &id, const Affine3f &pose)
+.. ocv:function:: void setWidgetPose(const String &id, const Affine3d &pose)
 
     :param id: The id of the widget whose pose will be set.
     :param pose: The new pose of the widget.
@@ -195,7 +204,7 @@ viz::Viz3d::updateWidgetPose
 ----------------------------
 Updates pose of a widget in the window by pre-multiplying its current pose.
 
-.. ocv:function:: void updateWidgetPose(const String &id, const Affine3f &pose)
+.. ocv:function:: void updateWidgetPose(const String &id, const Affine3d &pose)
 
     :param id: The id of the widget whose pose will be updated.
     :param pose: The pose that the current pose of the widget will be pre-multiplied by.
@@ -204,7 +213,7 @@ viz::Viz3d::getWidgetPose
 -------------------------
 Returns the current pose of a widget in the window.
 
-.. ocv:function:: Affine3f getWidgetPose(const String &id) const
+.. ocv:function:: Affine3d getWidgetPose(const String &id) const
 
     :param id: The id of the widget whose pose will be returned.
 
@@ -226,13 +235,13 @@ viz::Viz3d::getViewerPose
 -------------------------
 Returns the current pose of the viewer.
 
-..ocv:function:: Affine3f getViewerPose()
+..ocv:function:: Affine3d getViewerPose()
 
 viz::Viz3d::setViewerPose
 -------------------------
 Sets pose of the viewer.
 
-.. ocv:function:: void setViewerPose(const Affine3f &pose)
+.. ocv:function:: void setViewerPose(const Affine3d &pose)
 
     :param pose: The new pose of the viewer.
 
@@ -414,20 +423,6 @@ Returns rendering property of a widget.
         * **SHADING_GOURAUD**
         * **SHADING_PHONG**
 
-viz::Viz3d::setDesiredUpdateRate
---------------------------------
-Sets desired update rate of the window.
-
-.. ocv:function:: void setDesiredUpdateRate(double rate)
-
-    :param rate: Desired update rate. The default is 30.
-
-viz::Viz3d::getDesiredUpdateRate
---------------------------------
-Returns desired update rate of the window.
-
-.. ocv:function:: double getDesiredUpdateRate()
-
 viz::Viz3d::setRepresentation
 -----------------------------
 Sets geometry representation of the widgets to surface, wireframe or points.
@@ -468,33 +463,33 @@ This class a represents BGR color. ::
         static Color gray();
     };
 
-viz::Mesh3d
+viz::Mesh
 -----------
-.. ocv:class:: Mesh3d
+.. ocv:class:: Mesh
 
 This class wraps mesh attributes, and it can load a mesh from a ``ply`` file. ::
 
-    class CV_EXPORTS Mesh3d
+    class CV_EXPORTS Mesh
     {
     public:
 
-        Mat cloud, colors;
+        Mat cloud, colors, normals;
+
+        //! Raw integer list of the form: (n,id1,id2,...,idn, n,id1,id2,...,idn, ...)
+        //! where n is the number of points in the poligon, and id is a zero-offset index into an associated cloud.
         Mat polygons;
 
         //! Loads mesh from a given ply file
-        static Mesh3d loadMesh(const String& file);
-
-    private:
-        /* hidden */
+        static Mesh load(const String& file);
     };
 
-viz::Mesh3d::loadMesh
+viz::Mesh::load
 ---------------------
 Loads a mesh from a ``ply`` file.
 
-.. ocv:function:: static Mesh3d loadMesh(const String& file)
+.. ocv:function:: static Mesh load(const String& file)
 
-    :param file: File name.
+    :param file: File name (for no only PLY is supported)
 
 
 viz::KeyboardEvent
@@ -506,40 +501,28 @@ This class represents a keyboard event. ::
     class CV_EXPORTS KeyboardEvent
     {
     public:
-        static const unsigned int Alt   = 1;
-        static const unsigned int Ctrl  = 2;
-        static const unsigned int Shift = 4;
+        enum { ALT = 1, CTRL = 2, SHIFT = 4 };
+        enum Action { KEY_UP = 0, KEY_DOWN = 1 };
 
-        //! Create a keyboard event
-        //! - Note that action is true if key is pressed, false if released
-        KeyboardEvent (bool action, const std::string& key_sym, unsigned char key, bool alt, bool ctrl, bool shift);
+        KeyboardEvent(Action action, const String& symbol, unsigned char code, int modifiers);
 
-        bool isAltPressed () const;
-        bool isCtrlPressed () const;
-        bool isShiftPressed () const;
-
-        unsigned char getKeyCode () const;
-
-        const String& getKeySym () const;
-        bool keyDown () const;
-        bool keyUp () const;
-
-    protected:
-        /* hidden */
+        Action action;
+        String symbol;
+        unsigned char code;
+        int modifiers;
     };
 
 viz::KeyboardEvent::KeyboardEvent
 ---------------------------------
 Constructs a KeyboardEvent.
 
-.. ocv:function:: KeyboardEvent (bool action, const std::string& key_sym, unsigned char key, bool alt, bool ctrl, bool shift)
+.. ocv:function:: KeyboardEvent (Action action, const String& symbol, unsigned char code, Modifiers modifiers)
+
+    :param action: Signals if key is pressed or released.
+    :param symbol: Name of the key.
+    :param code: Code of the key.
+    :param modifiers: Signals if ``alt``, ``ctrl`` or ``shift`` are pressed or their combination.
 
-    :param action: If true, key is pressed. If false, key is released.
-    :param key_sym: Name of the key.
-    :param key: Code of the key.
-    :param alt: If true, ``alt`` is pressed.
-    :param ctrl: If true, ``ctrl`` is pressed.
-    :param shift: If true, ``shift`` is pressed.
 
 viz::MouseEvent
 ---------------
@@ -553,26 +536,24 @@ This class represents a mouse event. ::
         enum Type { MouseMove = 1, MouseButtonPress, MouseButtonRelease, MouseScrollDown, MouseScrollUp, MouseDblClick } ;
         enum MouseButton { NoButton = 0, LeftButton, MiddleButton, RightButton, VScroll } ;
 
-        MouseEvent (const Type& type, const MouseButton& button, const Point& p, bool alt, bool ctrl, bool shift);
+        MouseEvent(const Type& type, const MouseButton& button, const Point& pointer, int modifiers);
 
         Type type;
         MouseButton button;
         Point pointer;
-        unsigned int key_state;
+        int modifiers;
     };
 
 viz::MouseEvent::MouseEvent
 ---------------------------
 Constructs a MouseEvent.
 
-.. ocv:function:: MouseEvent (const Type& type, const MouseButton& button, const Point& p, bool alt, bool ctrl, bool shift)
+.. ocv:function:: MouseEvent (const Type& type, const MouseButton& button, const Point& p, Modifiers modifiers)
 
     :param type: Type of the event. This can be **MouseMove**, **MouseButtonPress**, **MouseButtonRelease**, **MouseScrollDown**, **MouseScrollUp**, **MouseDblClick**.
     :param button: Mouse button. This can be **NoButton**, **LeftButton**, **MiddleButton**, **RightButton**, **VScroll**.
     :param p: Position of the event.
-    :param alt: If true, ``alt`` is pressed.
-    :param ctrl: If true, ``ctrl`` is pressed.
-    :param shift: If true, ``shift`` is pressed.
+    :param modifiers: Signals if ``alt``, ``ctrl`` or ``shift`` are pressed or their combination.
 
 viz::Camera
 -----------
@@ -585,24 +566,24 @@ that can extract the intrinsic parameters from ``field of view``, ``intrinsic ma
     class CV_EXPORTS Camera
     {
     public:
-        Camera(float f_x, float f_y, float c_x, float c_y, const Size &window_size);
-        Camera(const Vec2f &fov, const Size &window_size);
-        Camera(const cv::Matx33f &K, const Size &window_size);
-        Camera(const cv::Matx44f &proj, const Size &window_size);
+        Camera(double f_x, double f_y, double c_x, double c_y, const Size &window_size);
+        Camera(const Vec2d &fov, const Size &window_size);
+        Camera(const Matx33d &K, const Size &window_size);
+        Camera(const Matx44d &proj, const Size &window_size);
 
-        inline const Vec2d & getClip() const { return clip_; }
-        inline void setClip(const Vec2d &clip) { clip_ = clip; }
+        inline const Vec2d & getClip() const;
+        inline void setClip(const Vec2d &clip);
 
-        inline const Size & getWindowSize() const { return window_size_; }
+        inline const Size & getWindowSize() const;
         void setWindowSize(const Size &window_size);
 
-        inline const Vec2f & getFov() const { return fov_; }
-        inline void setFov(const Vec2f & fov) { fov_ = fov; }
+        inline const Vec2d & getFov() const;
+        inline void setFov(const Vec2d & fov);
 
-        inline const Vec2f & getPrincipalPoint() const { return principal_point_; }
-        inline const Vec2f & getFocalLength() const { return focal_; }
+        inline const Vec2d & getPrincipalPoint() const;
+        inline const Vec2d & getFocalLength() const;
 
-        void computeProjectionMatrix(Matx44f &proj) const;
+        void computeProjectionMatrix(Matx44d &proj) const;
 
         static Camera KinectCamera(const Size &window_size);
 
@@ -614,7 +595,7 @@ viz::Camera::Camera
 -------------------
 Constructs a Camera.
 
-.. ocv:function:: Camera(float f_x, float f_y, float c_x, float c_y, const Size &window_size)
+.. ocv:function:: Camera(double f_x, double f_y, double c_x, double c_y, const Size &window_size)
 
     :param f_x: Horizontal focal length.
     :param f_y: Vertical focal length.
@@ -622,19 +603,19 @@ Constructs a Camera.
     :param c_y: y coordinate of the principal point.
     :param window_size: Size of the window. This together with focal length and principal point determines the field of view.
 
-.. ocv:function:: Camera(const Vec2f &fov, const Size &window_size)
+.. ocv:function:: Camera(const Vec2d &fov, const Size &window_size)
 
     :param fov: Field of view (horizontal, vertical)
     :param window_size: Size of the window.
 
     Principal point is at the center of the window by default.
 
-.. ocv:function:: Camera(const cv::Matx33f &K, const Size &window_size)
+.. ocv:function:: Camera(const Matx33d &K, const Size &window_size)
 
     :param K: Intrinsic matrix of the camera.
     :param window_size: Size of the window. This together with intrinsic matrix determines the field of view.
 
-.. ocv:function:: Camera(const cv::Matx44f &proj, const Size &window_size)
+.. ocv:function:: Camera(const Matx44d &proj, const Size &window_size)
 
     :param proj: Projection matrix of the camera.
     :param window_size: Size of the window. This together with projection matrix determines the field of view.
@@ -643,7 +624,7 @@ viz::Camera::computeProjectionMatrix
 ------------------------------------
 Computes projection matrix using intrinsic parameters of the camera.
 
-.. ocv:function:: void computeProjectionMatrix(Matx44f &proj) const
+.. ocv:function:: void computeProjectionMatrix(Matx44d &proj) const
 
     :param proj: Output projection matrix.
 
diff --git a/modules/viz/doc/widget.rst b/modules/viz/doc/widget.rst
index bbbfef63a..9ed28a775 100644
--- a/modules/viz/doc/widget.rst
+++ b/modules/viz/doc/widget.rst
@@ -170,20 +170,23 @@ Base class of all 3D widgets. ::
     public:
         Widget3D() {}
 
-        void setPose(const Affine3f &pose);
-        void updatePose(const Affine3f &pose);
-        Affine3f getPose() const;
+        //! widget position manipulation, i.e. place where it is rendered.
+        void setPose(const Affine3d &pose);
+        void updatePose(const Affine3d &pose);
+        Affine3d getPose() const;
+
+        //! updates internal widget data, i.e. points, normals, etc.
+        void applyTransform(const Affine3d &transform);
 
         void setColor(const Color &color);
-    private:
-        /* hidden */
+
     };
 
 viz::Widget3D::setPose
 ----------------------
 Sets pose of the widget.
 
-.. ocv:function:: void setPose(const Affine3f &pose)
+.. ocv:function:: void setPose(const Affine3d &pose)
 
     :param pose: The new pose of the widget.
 
@@ -191,7 +194,7 @@ viz::Widget3D::updateWidgetPose
 -------------------------------
 Updates pose of the widget by pre-multiplying its current pose.
 
-.. ocv:function:: void updateWidgetPose(const Affine3f &pose)
+.. ocv:function:: void updateWidgetPose(const Affine3d &pose)
 
     :param pose: The pose that the current pose of the widget will be pre-multiplied by.
 
@@ -199,7 +202,16 @@ viz::Widget3D::getPose
 ----------------------
 Returns the current pose of the widget.
 
-.. ocv:function:: Affine3f getWidgetPose() const
+.. ocv:function:: Affine3d getWidgetPose() const
+
+
+viz::Widget3D::applyTransform
+-------------------------------
+Transforms internal widget data (i.e. points, normals) using the given transform.
+
+.. ocv:function::  void applyTransform(const Affine3d &transform)
+
+    :param transform: Specified transformation to apply.
 
 viz::Widget3D::setColor
 -----------------------
@@ -262,27 +274,31 @@ This 3D Widget defines a finite plane. ::
     class CV_EXPORTS WPlane : public Widget3D
     {
     public:
-        WPlane(const Vec4f& coefs, float size = 1.0, const Color &color = Color::white());
-        WPlane(const Vec4f& coefs, const Point3f& pt, float size = 1.0, const Color &color = Color::white());
-    private:
-        /* hidden */
+        //! created default plane with center point at origin and normal oriented along z-axis
+        WPlane(const Size2d& size = Size2d(1.0, 1.0), const Color &color = Color::white());
+
+        //! repositioned plane
+        WPlane(const Point3d& center, const Vec3d& normal, const Vec3d& new_plane_yaxis,const Size2d& size = Size2d(1.0, 1.0), const Color &color = Color::white());
     };
 
 viz::WPlane::WPlane
 -------------------
-Constructs a WPlane.
+Constructs a default plane with center point at origin and normal oriented along z-axis.
 
-.. ocv:function:: WPlane(const Vec4f& coefs, float size = 1.0, const Color &color = Color::white())
+.. ocv:function:: WPlane(const Size2d& size = Size2d(1.0, 1.0), const Color &color = Color::white())
 
-    :param coefs: Plane coefficients as in (A,B,C,D) where Ax + By + Cz + D = 0.
-    :param size: Size of the plane.
+    :param size: Size of the plane
     :param color: :ocv:class:`Color` of the plane.
 
-.. ocv:function:: WPlane(const Vec4f& coefs, const Point3f& pt, float size = 1.0, const Color &color = Color::white())
+viz::WPlane::WPlane
+-------------------
+Constructs a repositioned plane
 
-    :param coefs: Plane coefficients as in (A,B,C,D) where Ax + By + Cz + D = 0.
-    :param pt: Position of the plane.
-    :param size: Size of the plane.
+.. ocv:function:: WPlane(const Point3d& center, const Vec3d& normal, const Vec3d& new_yaxis,const Size2d& size = Size2d(1.0, 1.0), const Color &color = Color::white())
+
+    :param center: Center of the plane
+    :param normal: Plane normal orientation
+    :param new_yaxis: Up-vector. New orientation of plane y-axis.
     :param color: :ocv:class:`Color` of the plane.
 
 viz::WSphere
@@ -294,14 +310,14 @@ This 3D Widget defines a sphere. ::
     class CV_EXPORTS WSphere : public Widget3D
     {
     public:
-        WSphere(const cv::Point3f &center, float radius, int sphere_resolution = 10, const Color &color = Color::white())
+        WSphere(const cv::Point3f &center, double radius, int sphere_resolution = 10, const Color &color = Color::white())
     };
 
 viz::WSphere::WSphere
 ---------------------
 Constructs a WSphere.
 
-.. ocv:function:: WSphere(const cv::Point3f &center, float radius, int sphere_resolution = 10, const Color &color = Color::white())
+.. ocv:function:: WSphere(const cv::Point3f &center, double radius, int sphere_resolution = 10, const Color &color = Color::white())
 
     :param center: Center of the sphere.
     :param radius: Radius of the sphere.
@@ -317,14 +333,14 @@ This 3D Widget defines an arrow. ::
     class CV_EXPORTS WArrow : public Widget3D
     {
     public:
-        WArrow(const Point3f& pt1, const Point3f& pt2, float thickness = 0.03, const Color &color = Color::white());
+        WArrow(const Point3f& pt1, const Point3f& pt2, double thickness = 0.03, const Color &color = Color::white());
     };
 
 viz::WArrow::WArrow
 -----------------------------
 Constructs an WArrow.
 
-.. ocv:function:: WArrow(const Point3f& pt1, const Point3f& pt2, float thickness = 0.03, const Color &color = Color::white())
+.. ocv:function:: WArrow(const Point3f& pt1, const Point3f& pt2, double thickness = 0.03, const Color &color = Color::white())
 
     :param pt1: Start point of the arrow.
     :param pt2: End point of the arrow.
@@ -342,20 +358,75 @@ This 3D Widget defines a circle. ::
     class CV_EXPORTS WCircle : public Widget3D
     {
     public:
-        WCircle(const Point3f& pt, float radius, float thickness = 0.01, const Color &color = Color::white());
+        //! creates default planar circle centred at origin with plane normal along z-axis
+        WCircle(double radius, double thickness = 0.01, const Color &color = Color::white());
+
+        //! creates repositioned circle
+        WCircle(double radius, const Point3d& center, const Vec3d& normal, double thickness = 0.01, const Color &color = Color::white());
     };
 
 viz::WCircle::WCircle
 -------------------------------
-Constructs a WCircle.
+Constructs default planar circle centred at origin with plane normal along z-axis
 
-.. ocv:function:: WCircle(const Point3f& pt, float radius, float thickness = 0.01, const Color &color = Color::white())
+.. ocv:function:: WCircle(double radius, double thickness = 0.01, const Color &color = Color::white())
 
-    :param pt: Center of the circle.
     :param radius: Radius of the circle.
     :param thickness: Thickness of the circle.
     :param color: :ocv:class:`Color` of the circle.
 
+viz::WCircle::WCircle
+-------------------------------
+Constructs repositioned planar circle.
+
+.. ocv:function:: WCircle(double radius, const Point3d& center, const Vec3d& normal, double thickness = 0.01, const Color &color = Color::white())
+
+    :param radius: Radius of the circle.
+    :param center: Center of the circle.
+    :param normal: Normal of the plane in which the circle lies.
+    :param thickness: Thickness of the circle.
+    :param color: :ocv:class:`Color` of the circle.
+
+
+viz::WCone
+-------------------------------
+.. ocv:class:: WCone
+
+This 3D Widget defines a cone. ::
+
+    class CV_EXPORTS WCone : public Widget3D
+    {
+    public:
+        //! create default cone, oriented along x-axis with center of its base located at origin
+        WCone(double lenght, double radius, int resolution = 6.0, const Color &color = Color::white());
+
+        //! creates repositioned cone
+        WCone(double radius, const Point3d& center, const Point3d& tip, int resolution = 6.0, const Color &color = Color::white());
+    };
+
+viz::WCone::WCone
+-------------------------------
+Constructs default cone oriented along x-axis with center of its base located at origin
+
+.. ocv:function:: WCone(double length, double radius, int resolution = 6.0, const Color &color = Color::white())
+
+    :param length: Length of the cone.
+    :param radius: Radius of the cone.
+    :param resolution: Resolution of the cone.
+    :param color: :ocv:class:`Color` of the cone.
+
+viz::WCone::WCone
+-------------------------------
+Constructs repositioned planar cone.
+
+.. ocv:function:: WCone(double radius, const Point3d& center, const Point3d& tip, int resolution = 6.0, const Color &color = Color::white())
+
+    :param radius: Radius of the cone.
+    :param center: Center of the cone base.
+    :param tip: Tip of the cone.
+    :param resolution: Resolution of the cone.
+    :param color: :ocv:class:`Color` of the cone.
+
 viz::WCylinder
 --------------
 .. ocv:class:: WCylinder
@@ -365,17 +436,17 @@ This 3D Widget defines a cylinder. ::
     class CV_EXPORTS WCylinder : public Widget3D
     {
     public:
-        WCylinder(const Point3f& pt_on_axis, const Point3f& axis_direction, float radius, int numsides = 30, const Color &color = Color::white());
+        WCylinder(const Point3d& axis_point1, const Point3d& axis_point2, double radius, int numsides = 30, const Color &color = Color::white());
     };
 
 viz::WCylinder::WCylinder
 -----------------------------------
 Constructs a WCylinder.
 
-.. ocv:function:: WCylinder(const Point3f& pt_on_axis, const Point3f& axis_direction, float radius, int numsides = 30, const Color &color = Color::white())
+.. ocv:function:: WCylinder(const Point3f& pt_on_axis, const Point3f& axis_direction, double radius, int numsides = 30, const Color &color = Color::white())
 
-    :param pt_on_axis: A point on the axis of the cylinder.
-    :param axis_direction: Direction of the axis of the cylinder.
+    :param axis_point1: A point1 on the axis of the cylinder.
+    :param axis_point2: A point2 on the axis of the cylinder.
     :param radius: Radius of the cylinder.
     :param numsides: Resolution of the cylinder.
     :param color: :ocv:class:`Color` of the cylinder.
@@ -416,14 +487,14 @@ This 3D Widget represents a coordinate system. ::
     class CV_EXPORTS WCoordinateSystem : public Widget3D
     {
     public:
-        WCoordinateSystem(float scale = 1.0);
+        WCoordinateSystem(double scale = 1.0);
     };
 
 viz::WCoordinateSystem::WCoordinateSystem
 ---------------------------------------------------
 Constructs a WCoordinateSystem.
 
-.. ocv:function:: WCoordinateSystem(float scale = 1.0)
+.. ocv:function:: WCoordinateSystem(double scale = 1.0)
 
     :param scale: Determines the size of the axes.
 
@@ -437,9 +508,6 @@ This 3D Widget defines a poly line. ::
     {
     public:
         WPolyLine(InputArray points, const Color &color = Color::white());
-
-    private:
-        /* hidden */
     };
 
 viz::WPolyLine::WPolyLine
@@ -460,30 +528,32 @@ This 3D Widget defines a grid. ::
     class CV_EXPORTS WGrid : public Widget3D
     {
     public:
-        //! Creates grid at the origin
-        WGrid(const Vec2i &dimensions, const Vec2d &spacing, const Color &color = Color::white());
-        //! Creates grid based on the plane equation
-        WGrid(const Vec4f &coeffs, const Vec2i &dimensions, const Vec2d &spacing, const Color &color = Color::white());
-    private:
-        /* hidden */
+        //! Creates grid at the origin and normal oriented along z-axis
+        WGrid(const Vec2i &cells = Vec2i::all(10), const Vec2d &cells_spacing = Vec2d::all(1.0), const Color &color = Color::white());
+
+        //! Creates repositioned grid
+        WGrid(const Point3d& center, const Vec3d& normal, const Vec3d& new_yaxis,
+              const Vec2i &cells = Vec2i::all(10), const Vec2d &cells_spacing = Vec2d::all(1.0), const Color &color = Color::white());
     };
 
 viz::WGrid::WGrid
 ---------------------------
 Constructs a WGrid.
 
-.. ocv:function:: WGrid(const Vec2i &dimensions, const Vec2d &spacing, const Color &color = Color::white())
+.. ocv:function::  WGrid(const Vec2i &cells = Vec2i::all(10), const Vec2d &cells_spacing = Vec2d::all(1.0), const Color &color = Color::white())
 
-    :param dimensions: Number of columns and rows, respectively.
-    :param spacing: Size of each column and row, respectively.
+    :param cells: Number of cell columns and rows, respectively.
+    :param cells_spacing: Size of each cell, respectively.
     :param color: :ocv:class:`Color` of the grid.
 
-.. ocv:function:  WGrid(const Vec4f &coeffs, const Vec2i &dimensions, const Vec2d &spacing, const Color &color = Color::white())
+.. ocv:function:  WGrid(const Point3d& center, const Vec3d& normal, const Vec3d& new_yaxis, Vec2i &cells, const Vec2d &cells_spacing, const Color &color;
 
-    :param coeffs: Plane coefficients as in (A,B,C,D) where Ax + By + Cz + D = 0.
-    :param dimensions: Number of columns and rows, respectively.
-    :param spacing: Size of each column and row, respectively.
-    :param color: :ocv:class:`Color` of the grid.
+    :param center: Center of the grid
+    :param normal: Grid normal orientation
+    :param new_yaxis: Up-vector. New orientation of grid y-axis.
+    :param cells: Number of cell columns and rows, respectively.
+    :param cells_spacing: Size of each cell, respectively.
+    :param color: :ocv:class:`Color` of the grid..
 
 viz::WText3D
 ------------
@@ -494,7 +564,7 @@ This 3D Widget represents 3D text. The text always faces the camera. ::
     class CV_EXPORTS WText3D : public Widget3D
     {
     public:
-        WText3D(const String &text, const Point3f &position, float text_scale = 1.0, bool face_camera = true, const Color &color = Color::white());
+        WText3D(const String &text, const Point3f &position, double text_scale = 1.0, bool face_camera = true, const Color &color = Color::white());
 
         void setText(const String &text);
         String getText() const;
@@ -504,7 +574,7 @@ viz::WText3D::WText3D
 -------------------------------
 Constructs a WText3D.
 
-.. ocv:function:: WText3D(const String &text, const Point3f &position, float text_scale = 1.0, bool face_camera = true, const Color &color = Color::white())
+.. ocv:function:: WText3D(const String &text, const Point3f &position, double text_scale = 1.0, bool face_camera = true, const Color &color = Color::white())
 
     :param text: Text content of the widget.
     :param position: Position of the text.
@@ -575,16 +645,16 @@ This 2D Widget represents an image overlay. ::
     class CV_EXPORTS WImageOverlay : public Widget2D
     {
     public:
-        WImageOverlay(const Mat &image, const Rect &rect);
+        WImageOverlay(InputArray image, const Rect &rect);
 
-        void setImage(const Mat &image);
+        void setImage(InputArray image);
     };
 
 viz::WImageOverlay::WImageOverlay
 ---------------------------------
 Constructs an WImageOverlay.
 
-.. ocv:function:: WImageOverlay(const Mat &image, const Rect &rect)
+.. ocv:function:: WImageOverlay(InputArray image, const Rect &rect)
 
     :param image: BGR or Gray-Scale image.
     :param rect: Image is scaled and positioned based on rect.
@@ -593,7 +663,7 @@ viz::WImageOverlay::setImage
 ----------------------------
 Sets the image content of the widget.
 
-.. ocv:function:: void setImage(const Mat &image)
+.. ocv:function:: void setImage(InputArray image)
 
     :param image: BGR or Gray-Scale image.
 
@@ -607,23 +677,23 @@ This 3D Widget represents an image in 3D space. ::
     {
     public:
         //! Creates 3D image at the origin
-        WImage3D(const Mat &image, const Size &size);
+        WImage3D(InputArray image, const Size2d &size);
         //! Creates 3D image at a given position, pointing in the direction of the normal, and having the up_vector orientation
-        WImage3D(const Vec3f &position, const Vec3f &normal, const Vec3f &up_vector, const Mat &image, const Size &size);
+        WImage3D(InputArray image, const Size2d &size, const Vec3d &position, const Vec3d &normal, const Vec3d &up_vector);
 
-        void setImage(const Mat &image);
+        void setImage(InputArray image);
     };
 
 viz::WImage3D::WImage3D
 -----------------------
 Constructs an WImage3D.
 
-.. ocv:function:: WImage3D(const Mat &image, const Size &size)
+.. ocv:function:: WImage3D(InputArray image, const Size2d &size)
 
     :param image: BGR or Gray-Scale image.
     :param size: Size of the image.
 
-.. ocv:function:: WImage3D(const Vec3f &position, const Vec3f &normal, const Vec3f &up_vector, const Mat &image, const Size &size)
+.. ocv:function:: WImage3D(InputArray image, const Size2d &size, const Vec3d &position, const Vec3d &normal, const Vec3d &up_vector)
 
     :param position: Position of the image.
     :param normal: Normal of the plane that represents the image.
@@ -635,7 +705,7 @@ viz::WImage3D::setImage
 -----------------------
 Sets the image content of the widget.
 
-.. ocv:function:: void setImage(const Mat &image)
+.. ocv:function:: void setImage(InputArray image)
 
     :param image: BGR or Gray-Scale image.
 
@@ -649,15 +719,15 @@ This 3D Widget represents camera position in a scene by its axes or viewing frus
     {
     public:
         //! Creates camera coordinate frame (axes) at the origin
-        WCameraPosition(float scale = 1.0);
+        WCameraPosition(double scale = 1.0);
         //! Creates frustum based on the intrinsic marix K at the origin
-        WCameraPosition(const Matx33f &K, float scale = 1.0, const Color &color = Color::white());
+        WCameraPosition(const Matx33d &K, double scale = 1.0, const Color &color = Color::white());
         //! Creates frustum based on the field of view at the origin
-        WCameraPosition(const Vec2f &fov, float scale = 1.0, const Color &color = Color::white());
+        WCameraPosition(const Vec2d &fov, double scale = 1.0, const Color &color = Color::white());
         //! Creates frustum and display given image at the far plane
-        WCameraPosition(const Matx33f &K, const Mat &img, float scale = 1.0, const Color &color = Color::white());
+        WCameraPosition(const Matx33d &K, InputArray image, double scale = 1.0, const Color &color = Color::white());
         //! Creates frustum and display given image at the far plane
-        WCameraPosition(const Vec2f &fov, const Mat &img, float scale = 1.0, const Color &color = Color::white());
+        WCameraPosition(const Vec2d &fov, InputArray image, double scale = 1.0, const Color &color = Color::white());
     };
 
 viz::WCameraPosition::WCameraPosition
@@ -666,7 +736,7 @@ Constructs a WCameraPosition.
 
 - **Display camera coordinate frame.**
 
-    .. ocv:function:: WCameraPosition(float scale = 1.0)
+    .. ocv:function:: WCameraPosition(double scale = 1.0)
 
         Creates camera coordinate frame at the origin.
 
@@ -676,7 +746,7 @@ Constructs a WCameraPosition.
 
 - **Display the viewing frustum.**
 
-    .. ocv:function:: WCameraPosition(const Matx33f &K, float scale = 1.0, const Color &color = Color::white())
+    .. ocv:function:: WCameraPosition(const Matx33d &K, double scale = 1.0, const Color &color = Color::white())
 
         :param K: Intrinsic matrix of the camera.
         :param scale: Scale of the frustum.
@@ -684,7 +754,7 @@ Constructs a WCameraPosition.
 
         Creates viewing frustum of the camera based on its intrinsic matrix K.
 
-    .. ocv:function:: WCameraPosition(const Vec2f &fov, float scale = 1.0, const Color &color = Color::white())
+    .. ocv:function:: WCameraPosition(const Vec2d &fov, double scale = 1.0, const Color &color = Color::white())
 
         :param fov: Field of view of the camera (horizontal, vertical).
         :param scale: Scale of the frustum.
@@ -698,7 +768,7 @@ Constructs a WCameraPosition.
 
 - **Display image on the far plane of the viewing frustum.**
 
-    .. ocv:function:: WCameraPosition(const Matx33f &K, const Mat &img, float scale = 1.0, const Color &color = Color::white())
+    .. ocv:function:: WCameraPosition(const Matx33d &K, InputArray image, double scale = 1.0, const Color &color = Color::white())
 
         :param K: Intrinsic matrix of the camera.
         :param img: BGR or Gray-Scale image that is going to be displayed on the far plane of the frustum.
@@ -707,7 +777,7 @@ Constructs a WCameraPosition.
 
         Creates viewing frustum of the camera based on its intrinsic matrix K, and displays image on the far end plane.
 
-    .. ocv:function:: WCameraPosition(const Vec2f &fov, const Mat &img, float scale = 1.0, const Color &color = Color::white())
+    .. ocv:function:: WCameraPosition(const Vec2d &fov, InputArray image, double scale = 1.0, const Color &color = Color::white())
 
         :param fov: Field of view of the camera (horizontal, vertical).
         :param img: BGR or Gray-Scale image that is going to be displayed on the far plane of the frustum.
@@ -729,81 +799,91 @@ This 3D Widget represents a trajectory. ::
     class CV_EXPORTS WTrajectory : public Widget3D
     {
     public:
-        enum {DISPLAY_FRAMES = 1, DISPLAY_PATH = 2};
+        enum {FRAMES = 1, PATH = 2, BOTH = FRAMES + PATH};
 
         //! Displays trajectory of the given path either by coordinate frames or polyline
-        WTrajectory(const std::vector<Affine3f> &path, int display_mode = WTrajectory::DISPLAY_PATH, const Color &color = Color::white(), float scale = 1.0);
-        //! Displays trajectory of the given path by frustums
-        WTrajectory(const std::vector<Affine3f> &path, const Matx33f &K, float scale = 1.0, const Color &color = Color::white());
-        //! Displays trajectory of the given path by frustums
-        WTrajectory(const std::vector<Affine3f> &path, const Vec2f &fov, float scale = 1.0, const Color &color = Color::white());
-
-    private:
-        /* hidden */
+        WTrajectory(InputArray path, int display_mode = WTrajectory::PATH, double scale = 1.0, const Color &color = Color::white(),;
     };
 
 viz::WTrajectory::WTrajectory
 -----------------------------
 Constructs a WTrajectory.
 
-.. ocv:function:: WTrajectory(const std::vector<Affine3f> &path, int display_mode = WTrajectory::DISPLAY_PATH, const Color &color = Color::white(), float scale = 1.0)
+.. ocv:function:: WTrajectory(InputArray path, int display_mode = WTrajectory::PATH, double scale = 1.0, const Color &color = Color::white())
 
-    :param path: List of poses on a trajectory.
-    :param display_mode: Display mode. This can be DISPLAY_PATH, DISPLAY_FRAMES, DISPLAY_PATH & DISPLAY_FRAMES.
-    :param color: :ocv:class:`Color` of the polyline that represents path. Frames are not affected.
+    :param path: List of poses on a trajectory. Takes std::vector<Affine<T>> with T == [float | double]
+    :param display_mode: Display mode. This can be PATH, FRAMES, and BOTH.
     :param scale: Scale of the frames. Polyline is not affected.
+    :param color: :ocv:class:`Color` of the polyline that represents path. Frames are not affected.
 
     Displays trajectory of the given path as follows:
 
-    * DISPLAY_PATH : Displays a poly line that represents the path.
-    * DISPLAY_FRAMES : Displays coordinate frames at each pose.
-    * DISPLAY_PATH & DISPLAY_FRAMES : Displays both poly line and coordinate frames.
+    * PATH : Displays a poly line that represents the path.
+    * FRAMES : Displays coordinate frames at each pose.
+    * PATH & FRAMES : Displays both poly line and coordinate frames.
 
-.. ocv:function:: WTrajectory(const std::vector<Affine3f> &path, const Matx33f &K, float scale = 1.0, const Color &color = Color::white())
+viz::WTrajectoryFrustums
+------------------------
+.. ocv:class:: WTrajectoryFrustums
 
-    :param path: List of poses on a trajectory.
+This 3D Widget represents a trajectory. ::
+
+    class CV_EXPORTS WTrajectoryFrustums : public Widget3D
+    {
+    public:
+        //! Displays trajectory of the given path by frustums
+        WTrajectoryFrustums(InputArray path, const Matx33d &K, double scale = 1.0, const Color &color = Color::white());
+        //! Displays trajectory of the given path by frustums
+        WTrajectoryFrustums(InputArray path, const Vec2d &fov, double scale = 1.0, const Color &color = Color::white());
+    };
+
+viz::WTrajectoryFrustums::WTrajectoryFrustums
+---------------------------------------------
+Constructs a WTrajectoryFrustums.
+
+.. ocv:function:: WTrajectoryFrustums(const std::vector<Affine3d> &path, const Matx33d &K, double scale = 1.0, const Color &color = Color::white())
+
+    :param path: List of poses on a trajectory. Takes std::vector<Affine<T>> with T == [float | double]
     :param K: Intrinsic matrix of the camera.
     :param scale: Scale of the frustums.
     :param color: :ocv:class:`Color` of the frustums.
 
     Displays frustums at each pose of the trajectory.
 
-.. ocv:function:: WTrajectory(const std::vector<Affine3f> &path, const Vec2f &fov, float scale = 1.0, const Color &color = Color::white())
+.. ocv:function:: WTrajectoryFrustums(const std::vector<Affine3d> &path, const Vec2d &fov, double scale = 1.0, const Color &color = Color::white())
 
-    :param path: List of poses on a trajectory.
+    :param path: List of poses on a trajectory. Takes std::vector<Affine<T>> with T == [float | double]
     :param fov: Field of view of the camera (horizontal, vertical).
     :param scale: Scale of the frustums.
     :param color: :ocv:class:`Color` of the frustums.
 
     Displays frustums at each pose of the trajectory.
 
-viz::WSpheresTrajectory
+viz::WTrajectorySpheres
 -----------------------
-.. ocv:class:: WSpheresTrajectory
+.. ocv:class:: WTrajectorySpheres
 
 This 3D Widget represents a trajectory using spheres and lines, where spheres represent the positions of the camera, and lines
 represent the direction from previous position to the current. ::
 
-    class CV_EXPORTS WSpheresTrajectory : public Widget3D
+    class CV_EXPORTS WTrajectorySpheres : public Widget3D
     {
     public:
-        WSpheresTrajectory(const std::vector<Affine3f> &path, float line_length = 0.05f,
-                    float init_sphere_radius = 0.021, sphere_radius = 0.007,
-                    Color &line_color = Color::white(), const Color &sphere_color = Color::white());
+        WTrajectorySpheres(InputArray path, double line_length = 0.05, double radius = 0.007,
+                               const Color &from = Color::red(), const Color &to = Color::white());
     };
 
-viz::WSpheresTrajectory::WSpheresTrajectory
+viz::WTrajectorySpheres::WTrajectorySpheres
 -------------------------------------------
-Constructs a WSpheresTrajectory.
+Constructs a WTrajectorySpheres.
 
-.. ocv:function:: WSpheresTrajectory(const std::vector<Affine3f> &path, float line_length = 0.05f, float init_sphere_radius = 0.021, float sphere_radius = 0.007, const Color &line_color = Color::white(), const Color &sphere_color = Color::white())
+.. ocv:function:: WTrajectorySpheres(InputArray path, double line_length = 0.05, double radius = 0.007, const Color &from = Color::red(), const Color &to = Color::white())
 
-    :param path: List of poses on a trajectory.
-    :param line_length: Length of the lines.
-    :param init_sphere_radius: Radius of the first sphere which represents the initial position of the camera.
-    :param sphere_radius: Radius of the rest of the spheres.
-    :param line_color: :ocv:class:`Color` of the lines.
-    :param sphere_color: :ocv:class:`Color` of the spheres.
+    :param path: List of poses on a trajectory. Takes std::vector<Affine<T>> with T == [float | double]
+    :param line_length: Max length of the lines which point to previous position
+    :param sphere_radius: Radius of the spheres.
+    :param from: :ocv:class:`Color` for first sphere.
+    :param to: :ocv:class:`Color` for last sphere. Intermediate spheres will have interpolated color.
 
 viz::WCloud
 -----------
@@ -818,9 +898,6 @@ This 3D Widget defines a point cloud. ::
         WCloud(InputArray cloud, InputArray colors);
         //! All points in cloud have the same color
         WCloud(InputArray cloud, const Color &color = Color::white());
-
-    private:
-        /* hidden */
     };
 
 viz::WCloud::WCloud
@@ -855,12 +932,9 @@ This 3D Widget defines a collection of clouds. ::
         WCloudCollection();
 
         //! Each point in cloud is mapped to a color in colors
-        void addCloud(InputArray cloud, InputArray colors, const Affine3f &pose = Affine3f::Identity());
+        void addCloud(InputArray cloud, InputArray colors, const Affine3d &pose = Affine3d::Identity());
         //! All points in cloud have the same color
-        void addCloud(InputArray cloud, const Color &color = Color::white(), Affine3f &pose = Affine3f::Identity());
-
-    private:
-        /* hidden */
+        void addCloud(InputArray cloud, const Color &color = Color::white(), Affine3d &pose = Affine3d::Identity());
     };
 
 viz::WCloudCollection::WCloudCollection
@@ -873,7 +947,7 @@ viz::WCloudCollection::addCloud
 -------------------------------
 Adds a cloud to the collection.
 
-.. ocv:function:: void addCloud(InputArray cloud, InputArray colors, const Affine3f &pose = Affine3f::Identity())
+.. ocv:function:: void addCloud(InputArray cloud, InputArray colors, const Affine3d &pose = Affine3d::Identity())
 
     :param cloud: Point set which can be of type: ``CV_32FC3``, ``CV_32FC4``, ``CV_64FC3``, ``CV_64FC4``.
     :param colors: Set of colors. It has to be of the same size with cloud.
@@ -881,7 +955,7 @@ Adds a cloud to the collection.
 
     Points in the cloud belong to mask when they are set to (NaN, NaN, NaN).
 
-.. ocv:function:: void addCloud(InputArray cloud, const Color &color = Color::white(), const Affine3f &pose = Affine3f::Identity())
+.. ocv:function:: void addCloud(InputArray cloud, const Color &color = Color::white(), const Affine3d &pose = Affine3d::Identity())
 
     :param cloud: Point set which can be of type: ``CV_32FC3``, ``CV_32FC4``, ``CV_64FC3``, ``CV_64FC4``.
     :param colors: A single :ocv:class:`Color` for the whole cloud.
@@ -900,17 +974,14 @@ This 3D Widget represents normals of a point cloud. ::
     class CV_EXPORTS WCloudNormals : public Widget3D
     {
     public:
-        WCloudNormals(InputArray cloud, InputArray normals, int level = 100, float scale = 0.02f, const Color &color = Color::white());
-
-    private:
-        /* hidden */
+        WCloudNormals(InputArray cloud, InputArray normals, int level = 100, double scale = 0.02f, const Color &color = Color::white());
     };
 
 viz::WCloudNormals::WCloudNormals
 ---------------------------------
 Constructs a WCloudNormals.
 
-.. ocv:function:: WCloudNormals(InputArray cloud, InputArray normals, int level = 100, float scale = 0.02f, const Color &color = Color::white())
+.. ocv:function:: WCloudNormals(InputArray cloud, InputArray normals, int level = 100, double scale = 0.02f, const Color &color = Color::white())
 
     :param cloud: Point set which can be of type: ``CV_32FC3``, ``CV_32FC4``, ``CV_64FC3``, ``CV_64FC4``.
     :param normals: A set of normals that has to be of same type with cloud.
@@ -929,16 +1000,21 @@ This 3D Widget defines a mesh. ::
     class CV_EXPORTS WMesh : public Widget3D
     {
     public:
-        WMesh(const Mesh3d &mesh);
-
-    private:
-        /* hidden */
+        WMesh(const Mesh &mesh);
+        WMesh(InputArray cloud, InputArray polygons, InputArray colors = noArray(), InputArray normals = noArray());
     };
 
 viz::WMesh::WMesh
 -----------------
 Constructs a WMesh.
 
-.. ocv:function:: WMesh(const Mesh3d &mesh)
+.. ocv:function:: WMesh(const Mesh &mesh)
 
-    :param mesh: :ocv:class:`Mesh3d` object that will be displayed.
+    :param mesh: :ocv:class:`Mesh` object that will be displayed.
+
+.. ocv:function:: WMesh(InputArray cloud, InputArray polygons, InputArray colors = noArray(), InputArray normals = noArray())
+
+    :param cloud: Points of the mesh object.
+    :param polygons: Points of the mesh object.
+    :param colors: Point colors.
+    :param normals: Point normals.
diff --git a/modules/viz/include/opencv2/viz.hpp b/modules/viz/include/opencv2/viz.hpp
index d4f08af72..6fa6249e3 100644
--- a/modules/viz/include/opencv2/viz.hpp
+++ b/modules/viz/include/opencv2/viz.hpp
@@ -41,9 +41,6 @@
 //  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
 #ifndef __OPENCV_VIZ_HPP__
@@ -52,46 +49,6 @@
 #include <opencv2/viz/types.hpp>
 #include <opencv2/viz/widgets.hpp>
 #include <opencv2/viz/viz3d.hpp>
-
-namespace cv
-{
-    namespace viz
-    {
-        //! takes coordiante frame data and builds transfrom to global coordinate frame
-        CV_EXPORTS Affine3f makeTransformToGlobal(const Vec3f& axis_x, const Vec3f& axis_y, const Vec3f& axis_z, const Vec3f& origin = Vec3f::all(0));
-
-        //! constructs camera pose from position, focal_point and up_vector (see gluLookAt() for more infromation)
-        CV_EXPORTS Affine3f makeCameraPose(const Vec3f& position, const Vec3f& focal_point, const Vec3f& y_dir);
-
-        //! retrieves a window by its name. If no window with such name, then it creates new.
-        CV_EXPORTS Viz3d get(const String &window_name);
-
-        //! Unregisters all Viz windows from internal database. After it 'get()' will create new windows instead getting existing from the database.
-        CV_EXPORTS void unregisterAllWindows();
-
-        //! checks float value for Nan
-        inline bool isNan(float x)
-        {
-            unsigned int *u = reinterpret_cast<unsigned int *>(&x);
-            return ((u[0] & 0x7f800000) == 0x7f800000) && (u[0] & 0x007fffff);
-        }
-
-        //! checks double value for Nan
-        inline bool isNan(double x)
-        {
-            unsigned int *u = reinterpret_cast<unsigned int *>(&x);
-            return (u[1] & 0x7ff00000) == 0x7ff00000 && (u[0] != 0 || (u[1] & 0x000fffff) != 0);
-        }
-
-        //! checks vectors for Nans
-        template<typename _Tp, int cn> inline bool isNan(const Vec<_Tp, cn>& v)
-        { return isNan(v.val[0]) || isNan(v.val[1]) || isNan(v.val[2]); }
-
-        //! checks point for Nans
-        template<typename _Tp> inline bool isNan(const Point3_<_Tp>& p)
-        { return isNan(p.x) || isNan(p.y) || isNan(p.z); }
-
-    } /* namespace viz */
-} /* namespace cv */
+#include <opencv2/viz/vizcore.hpp>
 
 #endif /* __OPENCV_VIZ_HPP__ */
diff --git a/modules/viz/include/opencv2/viz/types.hpp b/modules/viz/include/opencv2/viz/types.hpp
index 682006f95..3c3571b83 100644
--- a/modules/viz/include/opencv2/viz/types.hpp
+++ b/modules/viz/include/opencv2/viz/types.hpp
@@ -41,9 +41,6 @@
 //  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
 #ifndef __OPENCV_VIZ_TYPES_HPP__
@@ -77,49 +74,99 @@ namespace cv
             static Color white();
 
             static Color gray();
+
+            static Color mlab();
+
+            static Color navy();
+            static Color olive();
+            static Color maroon();
+            static Color teal();
+            static Color rose();
+            static Color azure();
+            static Color lime();
+            static Color gold();
+            static Color brown();
+            static Color orange();
+            static Color chartreuse();
+            static Color orange_red();
+            static Color purple();
+            static Color indigo();
+
+            static Color pink();
+            static Color cherry();
+            static Color bluberry();
+            static Color raspberry();
+            static Color silver();
+            static Color violet();
+            static Color apricot();
+            static Color turquoise();
+            static Color celestial_blue();
+            static Color amethyst();
+
+            static Color not_set();
         };
 
-        class CV_EXPORTS Mesh3d
+        class CV_EXPORTS Mesh
         {
         public:
+            Mat cloud, colors, normals;
 
-            Mat cloud, colors;
+            //! Raw integer list of the form: (n,id1,id2,...,idn, n,id1,id2,...,idn, ...)
+            //! where n is the number of points in the poligon, and id is a zero-offset index into an associated cloud.
             Mat polygons;
 
-            //! Loads mesh from a given ply file
-            static cv::viz::Mesh3d loadMesh(const String& file);
+            Mat texture, tcoords;
+
+            //! Loads mesh from a given ply file (no texture load support for now)
+            static Mesh load(const String& file);
+        };
+
+        class CV_EXPORTS Camera
+        {
+        public:
+            Camera(double fx, double fy, double cx, double cy, const Size &window_size);
+            explicit Camera(const Vec2d &fov, const Size &window_size);
+            explicit Camera(const Matx33d &K, const Size &window_size);
+            explicit Camera(const Matx44d &proj, const Size &window_size);
+
+            const Vec2d & getClip() const { return clip_; }
+            void setClip(const Vec2d &clip) { clip_ = clip; }
+
+            const Size & getWindowSize() const { return window_size_; }
+            void setWindowSize(const Size &window_size);
+
+            const Vec2d& getFov() const { return fov_; }
+            void setFov(const Vec2d& fov) { fov_ = fov; }
+
+            const Vec2d& getPrincipalPoint() const { return principal_point_; }
+            const Vec2d& getFocalLength() const { return focal_; }
+
+            void computeProjectionMatrix(Matx44d &proj) const;
+
+            static Camera KinectCamera(const Size &window_size);
 
         private:
-            struct loadMeshImpl;
+            void init(double fx, double fy, double cx, double cy, const Size &window_size);
+
+            Vec2d clip_;
+            Vec2d fov_;
+            Size window_size_;
+            Vec2d principal_point_;
+            Vec2d focal_;
         };
 
         class CV_EXPORTS KeyboardEvent
         {
         public:
-            static const unsigned int Alt   = 1;
-            static const unsigned int Ctrl  = 2;
-            static const unsigned int Shift = 4;
+            enum { NONE = 0, ALT = 1, CTRL = 2, SHIFT = 4 };
+            enum Action { KEY_UP = 0, KEY_DOWN = 1 };
 
-            //! Create a keyboard event
-            //! - Note that action is true if key is pressed, false if released
-            KeyboardEvent(bool action, const String& key_sym, unsigned char key, bool alt, bool ctrl, bool shift);
+            KeyboardEvent(Action action, const String& symbol, unsigned char code, int modifiers);
 
-            bool isAltPressed() const;
-            bool isCtrlPressed() const;
-            bool isShiftPressed() const;
-
-            unsigned char getKeyCode() const;
-
-            const String& getKeySym() const;
-            bool keyDown() const;
-            bool keyUp() const;
-
-        protected:
-
-            bool action_;
-            unsigned int modifiers_;
-            unsigned char key_code_;
-            String key_sym_;
+            Action action;
+            String symbol;
+            unsigned char code;
+            int modifiers;
         };
 
         class CV_EXPORTS MouseEvent
@@ -128,46 +175,12 @@ namespace cv
             enum Type { MouseMove = 1, MouseButtonPress, MouseButtonRelease, MouseScrollDown, MouseScrollUp, MouseDblClick } ;
             enum MouseButton { NoButton = 0, LeftButton, MiddleButton, RightButton, VScroll } ;
 
-            MouseEvent(const Type& type, const MouseButton& button, const Point& p, bool alt, bool ctrl, bool shift);
+            MouseEvent(const Type& type, const MouseButton& button, const Point& pointer, int modifiers);
 
             Type type;
             MouseButton button;
             Point pointer;
-            unsigned int key_state;
-        };
-
-        class CV_EXPORTS Camera
-        {
-        public:
-            Camera(float f_x, float f_y, float c_x, float c_y, const Size &window_size);
-            Camera(const Vec2f &fov, const Size &window_size);
-            Camera(const cv::Matx33f &K, const Size &window_size);
-            Camera(const cv::Matx44f &proj, const Size &window_size);
-
-            inline const Vec2d & getClip() const { return clip_; }
-            inline void setClip(const Vec2d &clip) { clip_ = clip; }
-
-            inline const Size & getWindowSize() const { return window_size_; }
-            void setWindowSize(const Size &window_size);
-
-            inline const Vec2f & getFov() const { return fov_; }
-            inline void setFov(const Vec2f & fov) { fov_ = fov; }
-
-            inline const Vec2f & getPrincipalPoint() const { return principal_point_; }
-            inline const Vec2f & getFocalLength() const { return focal_; }
-
-            void computeProjectionMatrix(Matx44f &proj) const;
-
-            static Camera KinectCamera(const Size &window_size);
-
-        private:
-            void init(float f_x, float f_y, float c_x, float c_y, const Size &window_size);
-
-            Vec2d clip_;
-            Vec2f fov_;
-            Size window_size_;
-            Vec2f principal_point_;
-            Vec2f focal_;
+            int modifiers;
         };
     } /* namespace viz */
 } /* namespace cv */
@@ -180,15 +193,44 @@ inline cv::viz::Color::Color(double _gray) : Scalar(_gray, _gray, _gray) {}
 inline cv::viz::Color::Color(double _blue, double _green, double _red) : Scalar(_blue, _green, _red) {}
 inline cv::viz::Color::Color(const Scalar& color) : Scalar(color) {}
 
-inline cv::viz::Color cv::viz::Color::black()   { return Color(  0,   0, 0); }
-inline cv::viz::Color cv::viz::Color::green()   { return Color(  0, 255, 0); }
-inline cv::viz::Color cv::viz::Color::blue()    { return Color(255,   0, 0); }
-inline cv::viz::Color cv::viz::Color::cyan()    { return Color(255, 255, 0); }
+inline cv::viz::Color cv::viz::Color::black()   { return Color(  0,   0,   0); }
+inline cv::viz::Color cv::viz::Color::green()   { return Color(  0, 255,   0); }
+inline cv::viz::Color cv::viz::Color::blue()    { return Color(255,   0,   0); }
+inline cv::viz::Color cv::viz::Color::cyan()    { return Color(255, 255,   0); }
 inline cv::viz::Color cv::viz::Color::red()     { return Color(  0,   0, 255); }
 inline cv::viz::Color cv::viz::Color::yellow()  { return Color(  0, 255, 255); }
 inline cv::viz::Color cv::viz::Color::magenta() { return Color(255,   0, 255); }
 inline cv::viz::Color cv::viz::Color::white()   { return Color(255, 255, 255); }
 inline cv::viz::Color cv::viz::Color::gray()    { return Color(128, 128, 128); }
 
+inline cv::viz::Color cv::viz::Color::mlab()    { return Color(255, 128, 128); }
+
+inline cv::viz::Color cv::viz::Color::navy()       { return Color(0,     0, 128); }
+inline cv::viz::Color cv::viz::Color::olive()      { return Color(0,   128, 128); }
+inline cv::viz::Color cv::viz::Color::maroon()     { return Color(0,     0, 128); }
+inline cv::viz::Color cv::viz::Color::teal()       { return Color(128, 128,   0); }
+inline cv::viz::Color cv::viz::Color::rose()       { return Color(128,   0, 255); }
+inline cv::viz::Color cv::viz::Color::azure()      { return Color(255, 128,   0); }
+inline cv::viz::Color cv::viz::Color::lime()       { return Color(0,   255, 191); }
+inline cv::viz::Color cv::viz::Color::gold()       { return Color(0,   215, 255); }
+inline cv::viz::Color cv::viz::Color::brown()      { return Color(0,    75, 150); }
+inline cv::viz::Color cv::viz::Color::orange()     { return Color(0,   165, 255); }
+inline cv::viz::Color cv::viz::Color::chartreuse() { return Color(0,   255, 128); }
+inline cv::viz::Color cv::viz::Color::orange_red() { return Color(0,    69, 255); }
+inline cv::viz::Color cv::viz::Color::purple()     { return Color(128,   0, 128); }
+inline cv::viz::Color cv::viz::Color::indigo()     { return Color(130,   0,  75); }
+
+inline cv::viz::Color cv::viz::Color::pink()           { return Color(203, 192, 255); }
+inline cv::viz::Color cv::viz::Color::cherry()         { return Color( 99,  29, 222); }
+inline cv::viz::Color cv::viz::Color::bluberry()       { return Color(247, 134,  79); }
+inline cv::viz::Color cv::viz::Color::raspberry()      { return Color( 92,  11, 227); }
+inline cv::viz::Color cv::viz::Color::silver()         { return Color(192, 192, 192); }
+inline cv::viz::Color cv::viz::Color::violet()         { return Color(226,  43, 138); }
+inline cv::viz::Color cv::viz::Color::apricot()        { return Color(177, 206, 251); }
+inline cv::viz::Color cv::viz::Color::turquoise()      { return Color(208, 224,  64); }
+inline cv::viz::Color cv::viz::Color::celestial_blue() { return Color(208, 151,  73); }
+inline cv::viz::Color cv::viz::Color::amethyst()       { return Color(204, 102, 153); }
+
+inline cv::viz::Color cv::viz::Color::not_set()        { return Color(-1, -1, -1); }
 
 #endif
diff --git a/modules/viz/include/opencv2/viz/viz3d.hpp b/modules/viz/include/opencv2/viz/viz3d.hpp
index f19709eb5..7cb7d0cea 100644
--- a/modules/viz/include/opencv2/viz/viz3d.hpp
+++ b/modules/viz/include/opencv2/viz/viz3d.hpp
@@ -41,9 +41,6 @@
 //  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
 #ifndef __OPENCV_VIZ_VIZ3D_HPP__
@@ -64,6 +61,7 @@ namespace cv
         class CV_EXPORTS Viz3d
         {
         public:
+            typedef cv::viz::Color Color;
             typedef void (*KeyboardCallback)(const KeyboardEvent&, void*);
             typedef void (*MouseCallback)(const MouseEvent&, void*);
 
@@ -72,19 +70,21 @@ namespace cv
             Viz3d& operator=(const Viz3d&);
             ~Viz3d();
 
-            void showWidget(const String &id, const Widget &widget, const Affine3f &pose = Affine3f::Identity());
+            void showWidget(const String &id, const Widget &widget, const Affine3d &pose = Affine3d::Identity());
             void removeWidget(const String &id);
             Widget getWidget(const String &id) const;
             void removeAllWidgets();
 
-            void setWidgetPose(const String &id, const Affine3f &pose);
-            void updateWidgetPose(const String &id, const Affine3f &pose);
-            Affine3f getWidgetPose(const String &id) const;
+            void showImage(InputArray image, const Size& window_size = Size(-1, -1));
+
+            void setWidgetPose(const String &id, const Affine3d &pose);
+            void updateWidgetPose(const String &id, const Affine3d &pose);
+            Affine3d getWidgetPose(const String &id) const;
 
             void setCamera(const Camera &camera);
             Camera getCamera() const;
-            Affine3f getViewerPose();
-            void setViewerPose(const Affine3f &pose);
+            Affine3d getViewerPose();
+            void setViewerPose(const Affine3d &pose);
 
             void resetCameraViewpoint(const String &id);
             void resetCamera();
@@ -96,13 +96,16 @@ namespace cv
             void setWindowSize(const Size &window_size);
             String getWindowName() const;
             void saveScreenshot(const String &file);
-            void setWindowPosition(int x, int y);
-            void setFullScreen(bool mode);
-            void setBackgroundColor(const Color& color = Color::black());
+            void setWindowPosition(const Point& window_position);
+            void setFullScreen(bool mode = true);
+            void setBackgroundColor(const Color& color = Color::black(), const Color& color2 = Color::not_set());
+            void setBackgroundTexture(InputArray image = noArray());
+            void setBackgroundMeshLab();
 
             void spin();
             void spinOnce(int time = 1, bool force_redraw = false);
             bool wasStopped() const;
+            void close();
 
             void registerKeyboardCallback(KeyboardCallback callback, void* cookie = 0);
             void registerMouseCallback(MouseCallback callback, void* cookie = 0);
@@ -110,9 +113,6 @@ namespace cv
             void setRenderingProperty(const String &id, int property, double value);
             double getRenderingProperty(const String &id, int property);
 
-            void setDesiredUpdateRate(double rate);
-            double getDesiredUpdateRate();
-
             void setRepresentation(int representation);
         private:
 
diff --git a/modules/viz/include/opencv2/viz/vizcore.hpp b/modules/viz/include/opencv2/viz/vizcore.hpp
new file mode 100644
index 000000000..0fde95b2f
--- /dev/null
+++ b/modules/viz/include/opencv2/viz/vizcore.hpp
@@ -0,0 +1,127 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Authors:
+//  * Ozan Tonkal, ozantonkal@gmail.com
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
+//M*/
+
+#ifndef __OPENCV_VIZCORE_HPP__
+#define __OPENCV_VIZCORE_HPP__
+
+#include <opencv2/viz/types.hpp>
+#include <opencv2/viz/widgets.hpp>
+#include <opencv2/viz/viz3d.hpp>
+
+namespace cv
+{
+    namespace viz
+    {
+        //! takes coordiante frame data and builds transfrom to global coordinate frame
+        CV_EXPORTS Affine3d makeTransformToGlobal(const Vec3d& axis_x, const Vec3d& axis_y, const Vec3d& axis_z, const Vec3d& origin = Vec3d::all(0));
+
+        //! constructs camera pose from position, focal_point and up_vector (see gluLookAt() for more infromation)
+        CV_EXPORTS Affine3d makeCameraPose(const Vec3d& position, const Vec3d& focal_point, const Vec3d& y_dir);
+
+        //! retrieves a window by its name. If no window with such name, then it creates new.
+        CV_EXPORTS Viz3d getWindowByName(const String &window_name);
+
+        //! Unregisters all Viz windows from internal database. After it 'getWindowByName()' will create new windows instead getting existing from the database.
+        CV_EXPORTS void unregisterAllWindows();
+
+        //! Displays image in specified window
+        CV_EXPORTS Viz3d imshow(const String& window_name, InputArray image, const Size& window_size = Size(-1, -1));
+
+        //! checks float value for Nan
+        inline bool isNan(float x)
+        {
+            unsigned int *u = reinterpret_cast<unsigned int *>(&x);
+            return ((u[0] & 0x7f800000) == 0x7f800000) && (u[0] & 0x007fffff);
+        }
+
+        //! checks double value for Nan
+        inline bool isNan(double x)
+        {
+            unsigned int *u = reinterpret_cast<unsigned int *>(&x);
+            return (u[1] & 0x7ff00000) == 0x7ff00000 && (u[0] != 0 || (u[1] & 0x000fffff) != 0);
+        }
+
+        //! checks vectors for Nans
+        template<typename _Tp, int cn> inline bool isNan(const Vec<_Tp, cn>& v)
+        { return isNan(v.val[0]) || isNan(v.val[1]) || isNan(v.val[2]); }
+
+        //! checks point for Nans
+        template<typename _Tp> inline bool isNan(const Point3_<_Tp>& p)
+        { return isNan(p.x) || isNan(p.y) || isNan(p.z); }
+
+
+        ///////////////////////////////////////////////////////////////////////////////////////////////
+        /// Read/write clouds. Supported formats: ply, xyz, obj and stl (readonly)
+
+        CV_EXPORTS void writeCloud(const String& file, InputArray cloud, InputArray colors = noArray(), InputArray normals = noArray(), bool binary = false);
+        CV_EXPORTS Mat  readCloud (const String& file, OutputArray colors = noArray(), OutputArray normals = noArray());
+
+        ///////////////////////////////////////////////////////////////////////////////////////////////
+        /// Reads mesh. Only ply format is supported now and no texture load support
+
+        CV_EXPORTS Mesh readMesh(const String& file);
+
+        ///////////////////////////////////////////////////////////////////////////////////////////////
+        /// Read/write poses and trajectories
+
+        CV_EXPORTS bool readPose(const String& file, Affine3d& pose, const String& tag = "pose");
+        CV_EXPORTS void writePose(const String& file, const Affine3d& pose, const String& tag = "pose");
+
+        //! takes vector<Affine3<T>> with T = float/dobule and writes to a sequence of files with given filename format
+        CV_EXPORTS void writeTrajectory(InputArray traj, const String& files_format = "pose%05d.xml", int start = 0, const String& tag = "pose");
+
+        //! takes vector<Affine3<T>> with T = float/dobule and loads poses from sequence of files
+        CV_EXPORTS void readTrajectory(OutputArray traj, const String& files_format = "pose%05d.xml", int start = 0, int end = INT_MAX, const String& tag = "pose");
+
+
+        ///////////////////////////////////////////////////////////////////////////////////////////////
+        /// Computing normals for mesh
+
+        CV_EXPORTS void computeNormals(const Mesh& mesh, OutputArray normals);
+
+    } /* namespace viz */
+} /* namespace cv */
+
+#endif /* __OPENCV_VIZCORE_HPP__ */
diff --git a/modules/viz/include/opencv2/viz/widget_accessor.hpp b/modules/viz/include/opencv2/viz/widget_accessor.hpp
index 394cfa858..29352a214 100644
--- a/modules/viz/include/opencv2/viz/widget_accessor.hpp
+++ b/modules/viz/include/opencv2/viz/widget_accessor.hpp
@@ -41,9 +41,6 @@
 //  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
 #ifndef __OPENCV_VIZ_WIDGET_ACCESSOR_HPP__
@@ -69,4 +66,4 @@ namespace cv
     }
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/modules/viz/include/opencv2/viz/widgets.hpp b/modules/viz/include/opencv2/viz/widgets.hpp
index 07b335899..2c49b9d0e 100644
--- a/modules/viz/include/opencv2/viz/widgets.hpp
+++ b/modules/viz/include/opencv2/viz/widgets.hpp
@@ -41,9 +41,6 @@
 //  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
 #ifndef __OPENCV_VIZ_WIDGETS_HPP__
@@ -68,14 +65,14 @@ namespace cv
             SHADING
         };
 
-        enum RenderingRepresentationProperties
+        enum RepresentationValues
         {
             REPRESENTATION_POINTS,
             REPRESENTATION_WIREFRAME,
             REPRESENTATION_SURFACE
         };
 
-        enum ShadingRepresentationProperties
+        enum ShadingValues
         {
             SHADING_FLAT,
             SHADING_GOURAUD,
@@ -114,13 +111,15 @@ namespace cv
         public:
             Widget3D() {}
 
-            void setPose(const Affine3f &pose);
-            void updatePose(const Affine3f &pose);
-            Affine3f getPose() const;
+            //! widget position manipulation, i.e. place where it is rendered
+            void setPose(const Affine3d &pose);
+            void updatePose(const Affine3d &pose);
+            Affine3d getPose() const;
+
+            //! update internal widget data, i.e. points, normals, etc.
+            void applyTransform(const Affine3d &transform);
 
             void setColor(const Color &color);
-        private:
-            struct MatrixConverter;
 
         };
 
@@ -134,92 +133,94 @@ namespace cv
             void setColor(const Color &color);
         };
 
+        /////////////////////////////////////////////////////////////////////////////
+        /// Simple widgets
+
         class CV_EXPORTS WLine : public Widget3D
         {
         public:
-            WLine(const Point3f &pt1, const Point3f &pt2, const Color &color = Color::white());
+            WLine(const Point3d &pt1, const Point3d &pt2, const Color &color = Color::white());
         };
 
         class CV_EXPORTS WPlane : public Widget3D
         {
         public:
-            WPlane(const Vec4f& coefs, float size = 1.f, const Color &color = Color::white());
-            WPlane(const Vec4f& coefs, const Point3f& pt, float size = 1.f, const Color &color = Color::white());
-        private:
-            struct SetSizeImpl;
+            //! created default plane with center point at origin and normal oriented along z-axis
+            WPlane(const Size2d& size = Size2d(1.0, 1.0), const Color &color = Color::white());
+
+            //! repositioned plane
+            WPlane(const Point3d& center, const Vec3d& normal, const Vec3d& new_yaxis,
+                   const Size2d& size = Size2d(1.0, 1.0), const Color &color = Color::white());
         };
 
         class CV_EXPORTS WSphere : public Widget3D
         {
         public:
-            WSphere(const cv::Point3f &center, float radius, int sphere_resolution = 10, const Color &color = Color::white());
+            WSphere(const cv::Point3d &center, double radius, int sphere_resolution = 10, const Color &color = Color::white());
         };
 
         class CV_EXPORTS WArrow : public Widget3D
         {
         public:
-            WArrow(const Point3f& pt1, const Point3f& pt2, float thickness = 0.03f, const Color &color = Color::white());
+            WArrow(const Point3d& pt1, const Point3d& pt2, double thickness = 0.03, const Color &color = Color::white());
         };
 
         class CV_EXPORTS WCircle : public Widget3D
         {
         public:
-            WCircle(const Point3f& pt, float radius, float thickness = 0.01f, const Color &color = Color::white());
+            //! creates default planar circle centred at origin with plane normal along z-axis
+            WCircle(double radius, double thickness = 0.01, const Color &color = Color::white());
+
+            //! creates repositioned circle
+            WCircle(double radius, const Point3d& center, const Vec3d& normal, double thickness = 0.01, const Color &color = Color::white());
+        };
+
+        class CV_EXPORTS WCone : public Widget3D
+        {
+        public:
+            //! create default cone, oriented along x-axis with center of its base located at origin
+            WCone(double length, double radius, int resolution = 6.0, const Color &color = Color::white());
+
+            //! creates repositioned cone
+            WCone(double radius, const Point3d& center, const Point3d& tip, int resolution = 6.0, const Color &color = Color::white());
         };
 
         class CV_EXPORTS WCylinder : public Widget3D
         {
         public:
-            WCylinder(const Point3f& pt_on_axis, const Point3f& axis_direction, float radius, int numsides = 30, const Color &color = Color::white());
+            WCylinder(const Point3d& axis_point1, const Point3d& axis_point2, double radius, int numsides = 30, const Color &color = Color::white());
         };
 
         class CV_EXPORTS WCube : public Widget3D
         {
         public:
-            WCube(const Point3f& pt_min, const Point3f& pt_max, bool wire_frame = true, const Color &color = Color::white());
-        };
-
-        class CV_EXPORTS WCoordinateSystem : public Widget3D
-        {
-        public:
-            WCoordinateSystem(float scale = 1.f);
+            WCube(const Point3d& min_point = Vec3d::all(-0.5), const Point3d& max_point = Vec3d::all(0.5),
+                  bool wire_frame = true, const Color &color = Color::white());
         };
 
         class CV_EXPORTS WPolyLine : public Widget3D
         {
         public:
             WPolyLine(InputArray points, const Color &color = Color::white());
-
-        private:
-            struct CopyImpl;
         };
 
-        class CV_EXPORTS WGrid : public Widget3D
+        /////////////////////////////////////////////////////////////////////////////
+        /// Text and image widgets
+
+        class CV_EXPORTS WText : public Widget2D
         {
         public:
-            //! Creates grid at the origin
-            WGrid(const Vec2i &dimensions, const Vec2d &spacing, const Color &color = Color::white());
-            //! Creates grid based on the plane equation
-            WGrid(const Vec4f &coeffs, const Vec2i &dimensions, const Vec2d &spacing, const Color &color = Color::white());
-
-        private:
-            struct GridImpl;
-
-        };
-
-        class CV_EXPORTS WText3D : public Widget3D
-        {
-        public:
-            WText3D(const String &text, const Point3f &position, float text_scale = 1.f, bool face_camera = true, const Color &color = Color::white());
+            WText(const String &text, const Point &pos, int font_size = 20, const Color &color = Color::white());
 
             void setText(const String &text);
             String getText() const;
         };
 
-        class CV_EXPORTS WText : public Widget2D
+        class CV_EXPORTS WText3D : public Widget3D
         {
         public:
-            WText(const String &text, const Point2i &pos, int font_size = 10, const Color &color = Color::white());
+            //! creates text label in 3D. If face_camera = false, text plane normal is oriented along z-axis. Use widget pose to orient it properly
+            WText3D(const String &text, const Point3d &position, double text_scale = 1., bool face_camera = true, const Color &color = Color::white());
 
             void setText(const String &text);
             String getText() const;
@@ -228,63 +229,91 @@ namespace cv
         class CV_EXPORTS WImageOverlay : public Widget2D
         {
         public:
-            WImageOverlay(const Mat &image, const Rect &rect);
-
-            void setImage(const Mat &image);
+            WImageOverlay(InputArray image, const Rect &rect);
+            void setImage(InputArray image);
         };
 
         class CV_EXPORTS WImage3D : public Widget3D
         {
         public:
-            //! Creates 3D image at the origin
-            WImage3D(const Mat &image, const Size &size);
-            //! Creates 3D image at a given position, pointing in the direction of the normal, and having the up_vector orientation
-            WImage3D(const Vec3f &position, const Vec3f &normal, const Vec3f &up_vector, const Mat &image, const Size &size);
+            //! Creates 3D image in a plane centered at the origin with normal orientaion along z-axis,
+            //! image x- and y-axes are oriented along x- and y-axes of 3d world
+            WImage3D(InputArray image, const Size2d &size);
 
-            void setImage(const Mat &image);
+            //! Creates 3D image at a given position, pointing in the direction of the normal, and having the up_vector orientation
+            WImage3D(InputArray image, const Size2d &size, const Vec3d &center, const Vec3d &normal, const Vec3d &up_vector);
+
+            void setImage(InputArray image);
+        };
+
+        /////////////////////////////////////////////////////////////////////////////
+        /// Compond widgets
+
+        class CV_EXPORTS WCoordinateSystem : public Widget3D
+        {
+        public:
+            WCoordinateSystem(double scale = 1.0);
+        };
+
+        class CV_EXPORTS WGrid : public Widget3D
+        {
+        public:
+            //! Creates grid at the origin and normal oriented along z-axis
+            WGrid(const Vec2i &cells = Vec2i::all(10), const Vec2d &cells_spacing = Vec2d::all(1.0), const Color &color = Color::white());
+
+            //! Creates repositioned grid
+            WGrid(const Point3d& center, const Vec3d& normal, const Vec3d& new_yaxis,
+                  const Vec2i &cells = Vec2i::all(10), const Vec2d &cells_spacing = Vec2d::all(1.0), const Color &color = Color::white());
         };
 
         class CV_EXPORTS WCameraPosition : public Widget3D
         {
         public:
             //! Creates camera coordinate frame (axes) at the origin
-            WCameraPosition(float scale = 1.f);
+            WCameraPosition(double scale = 1.0);
             //! Creates frustum based on the intrinsic marix K at the origin
-            WCameraPosition(const Matx33f &K, float scale = 1.f, const Color &color = Color::white());
+            WCameraPosition(const Matx33d &K, double scale = 1.0, const Color &color = Color::white());
             //! Creates frustum based on the field of view at the origin
-            WCameraPosition(const Vec2f &fov, float scale = 1.f, const Color &color = Color::white());
+            WCameraPosition(const Vec2d &fov, double scale = 1.0, const Color &color = Color::white());
             //! Creates frustum and display given image at the far plane
-            WCameraPosition(const Matx33f &K, const Mat &img, float scale = 1.f, const Color &color = Color::white());
+            WCameraPosition(const Matx33d &K, InputArray image, double scale = 1.0, const Color &color = Color::white());
             //! Creates frustum and display given image at the far plane
-            WCameraPosition(const Vec2f &fov, const Mat &img, float scale = 1.f, const Color &color = Color::white());
-
-        private:
-            struct ProjectImage;
+            WCameraPosition(const Vec2d &fov, InputArray image, double scale = 1.0, const Color &color = Color::white());
         };
 
+        /////////////////////////////////////////////////////////////////////////////
+        /// Trajectories
+
         class CV_EXPORTS WTrajectory : public Widget3D
         {
         public:
-            enum {DISPLAY_FRAMES = 1, DISPLAY_PATH = 2};
+            enum {FRAMES = 1, PATH = 2, BOTH = FRAMES + PATH };
 
-            //! Displays trajectory of the given path either by coordinate frames or polyline
-            WTrajectory(const std::vector<Affine3f> &path, int display_mode = WTrajectory::DISPLAY_PATH, const Color &color = Color::white(), float scale = 1.f);
-            //! Displays trajectory of the given path by frustums
-            WTrajectory(const std::vector<Affine3f> &path, const Matx33f &K, float scale = 1.f, const Color &color = Color::white());
-            //! Displays trajectory of the given path by frustums
-            WTrajectory(const std::vector<Affine3f> &path, const Vec2f &fov, float scale = 1.f, const Color &color = Color::white());
-
-        private:
-            struct ApplyPath;
+            //! Takes vector<Affine3<T>> and displays trajectory of the given path either by coordinate frames or polyline
+            WTrajectory(InputArray path, int display_mode = WTrajectory::PATH, double scale = 1.0, const Color &color = Color::white());
         };
 
-        class CV_EXPORTS WSpheresTrajectory: public Widget3D
+        class CV_EXPORTS WTrajectoryFrustums : public Widget3D
         {
         public:
-            WSpheresTrajectory(const std::vector<Affine3f> &path, float line_length = 0.05f, float init_sphere_radius = 0.021f,
-                                    float sphere_radius = 0.007f, const Color &line_color = Color::white(), const Color &sphere_color = Color::white());
+            //! Takes vector<Affine3<T>> and displays trajectory of the given path by frustums
+            WTrajectoryFrustums(InputArray path, const Matx33d &K, double scale = 1., const Color &color = Color::white());
+
+            //! Takes vector<Affine3<T>> and displays trajectory of the given path by frustums
+            WTrajectoryFrustums(InputArray path, const Vec2d &fov, double scale = 1., const Color &color = Color::white());
         };
 
+        class CV_EXPORTS WTrajectorySpheres: public Widget3D
+        {
+        public:
+            //! Takes vector<Affine3<T>> and displays trajectory of the given path
+            WTrajectorySpheres(InputArray path, double line_length = 0.05, double radius = 0.007,
+                               const Color &from = Color::red(), const Color &to = Color::white());
+        };
+
+        /////////////////////////////////////////////////////////////////////////////
+        /// Clouds
+
         class CV_EXPORTS WCloud: public Widget3D
         {
         public:
@@ -292,9 +321,19 @@ namespace cv
             WCloud(InputArray cloud, InputArray colors);
             //! All points in cloud have the same color
             WCloud(InputArray cloud, const Color &color = Color::white());
+        };
 
-        private:
-            struct CreateCloudWidget;
+        class CV_EXPORTS WPaintedCloud: public Widget3D
+        {
+        public:
+            //! Paint cloud with default gradient between cloud bounds points
+            WPaintedCloud(InputArray cloud);
+
+            //! Paint cloud with default gradient between given points
+            WPaintedCloud(InputArray cloud, const Point3d& p1, const Point3d& p2);
+
+            //! Paint cloud with gradient specified by given colors between given points
+            WPaintedCloud(InputArray cloud, const Point3d& p1, const Point3d& p2, const Color& c1, const Color c2);
         };
 
         class CV_EXPORTS WCloudCollection : public Widget3D
@@ -303,32 +342,27 @@ namespace cv
             WCloudCollection();
 
             //! Each point in cloud is mapped to a color in colors
-            void addCloud(InputArray cloud, InputArray colors, const Affine3f &pose = Affine3f::Identity());
+            void addCloud(InputArray cloud, InputArray colors, const Affine3d &pose = Affine3d::Identity());
             //! All points in cloud have the same color
-            void addCloud(InputArray cloud, const Color &color = Color::white(), const Affine3f &pose = Affine3f::Identity());
-
-        private:
-            struct CreateCloudWidget;
+            void addCloud(InputArray cloud, const Color &color = Color::white(), const Affine3d &pose = Affine3d::Identity());
         };
 
         class CV_EXPORTS WCloudNormals : public Widget3D
         {
         public:
-            WCloudNormals(InputArray cloud, InputArray normals, int level = 100, float scale = 0.02f, const Color &color = Color::white());
-
-        private:
-            struct ApplyCloudNormals;
+            WCloudNormals(InputArray cloud, InputArray normals, int level = 64, double scale = 0.1, const Color &color = Color::white());
         };
 
         class CV_EXPORTS WMesh : public Widget3D
         {
         public:
-            WMesh(const Mesh3d &mesh);
-
-        private:
-            struct CopyImpl;
+            WMesh(const Mesh &mesh);
+            WMesh(InputArray cloud, InputArray polygons, InputArray colors = noArray(), InputArray normals = noArray());
         };
 
+        /////////////////////////////////////////////////////////////////////////////
+        /// Utility exports
+
         template<> CV_EXPORTS Widget2D Widget::cast<Widget2D>();
         template<> CV_EXPORTS Widget3D Widget::cast<Widget3D>();
         template<> CV_EXPORTS WLine Widget::cast<WLine>();
@@ -337,6 +371,7 @@ namespace cv
         template<> CV_EXPORTS WCylinder Widget::cast<WCylinder>();
         template<> CV_EXPORTS WArrow Widget::cast<WArrow>();
         template<> CV_EXPORTS WCircle Widget::cast<WCircle>();
+        template<> CV_EXPORTS WCone Widget::cast<WCone>();
         template<> CV_EXPORTS WCube Widget::cast<WCube>();
         template<> CV_EXPORTS WCoordinateSystem Widget::cast<WCoordinateSystem>();
         template<> CV_EXPORTS WPolyLine Widget::cast<WPolyLine>();
@@ -347,8 +382,10 @@ namespace cv
         template<> CV_EXPORTS WImage3D Widget::cast<WImage3D>();
         template<> CV_EXPORTS WCameraPosition Widget::cast<WCameraPosition>();
         template<> CV_EXPORTS WTrajectory Widget::cast<WTrajectory>();
-        template<> CV_EXPORTS WSpheresTrajectory Widget::cast<WSpheresTrajectory>();
+        template<> CV_EXPORTS WTrajectoryFrustums Widget::cast<WTrajectoryFrustums>();
+        template<> CV_EXPORTS WTrajectorySpheres Widget::cast<WTrajectorySpheres>();
         template<> CV_EXPORTS WCloud Widget::cast<WCloud>();
+        template<> CV_EXPORTS WPaintedCloud Widget::cast<WPaintedCloud>();
         template<> CV_EXPORTS WCloudCollection Widget::cast<WCloudCollection>();
         template<> CV_EXPORTS WCloudNormals Widget::cast<WCloudNormals>();
         template<> CV_EXPORTS WMesh Widget::cast<WMesh>();
diff --git a/modules/viz/src/cloud_widgets.cpp b/modules/viz/src/cloud_widgets.cpp
deleted file mode 100644
index 73cc26201..000000000
--- a/modules/viz/src/cloud_widgets.cpp
+++ /dev/null
@@ -1,773 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-// Authors:
-//  * Ozan Tonkal, ozantonkal@gmail.com
-//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
-//
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
-//M*/
-
-#include "precomp.hpp"
-
-namespace cv
-{
-    namespace viz
-    {
-        template<typename _Tp> Vec<_Tp, 3>* vtkpoints_data(vtkSmartPointer<vtkPoints>& points);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// Point Cloud Widget implementation
-
-struct cv::viz::WCloud::CreateCloudWidget
-{
-    static inline vtkSmartPointer<vtkPolyData> create(const Mat &cloud, vtkIdType &nr_points)
-    {
-        vtkSmartPointer<vtkPolyData> polydata = vtkSmartPointer<vtkPolyData>::New();
-        vtkSmartPointer<vtkCellArray> vertices = vtkSmartPointer<vtkCellArray>::New();
-
-        polydata->SetVerts(vertices);
-
-        vtkSmartPointer<vtkPoints> points = polydata->GetPoints();
-        vtkSmartPointer<vtkIdTypeArray> initcells;
-        nr_points = cloud.total();
-
-        if (!points)
-        {
-            points = vtkSmartPointer<vtkPoints>::New();
-            if (cloud.depth() == CV_32F)
-                points->SetDataTypeToFloat();
-            else if (cloud.depth() == CV_64F)
-                points->SetDataTypeToDouble();
-            polydata->SetPoints(points);
-        }
-        points->SetNumberOfPoints(nr_points);
-
-        if (cloud.depth() == CV_32F)
-        {
-            // Get a pointer to the beginning of the data array
-            Vec3f *data_beg = vtkpoints_data<float>(points);
-            Vec3f *data_end = NanFilter::copy(cloud, data_beg, cloud);
-            nr_points = data_end - data_beg;
-        }
-        else if (cloud.depth() == CV_64F)
-        {
-            // Get a pointer to the beginning of the data array
-            Vec3d *data_beg = vtkpoints_data<double>(points);
-            Vec3d *data_end = NanFilter::copy(cloud, data_beg, cloud);
-            nr_points = data_end - data_beg;
-        }
-        points->SetNumberOfPoints(nr_points);
-
-        // Update cells
-        vtkSmartPointer<vtkIdTypeArray> cells = vertices->GetData();
-        // If no init cells and cells has not been initialized...
-        if (!cells)
-            cells = vtkSmartPointer<vtkIdTypeArray>::New();
-
-        // If we have less values then we need to recreate the array
-        if (cells->GetNumberOfTuples() < nr_points)
-        {
-            cells = vtkSmartPointer<vtkIdTypeArray>::New();
-
-            // If init cells is given, and there's enough data in it, use it
-            if (initcells && initcells->GetNumberOfTuples() >= nr_points)
-            {
-                cells->DeepCopy(initcells);
-                cells->SetNumberOfComponents(2);
-                cells->SetNumberOfTuples(nr_points);
-            }
-            else
-            {
-                // If the number of tuples is still too small, we need to recreate the array
-                cells->SetNumberOfComponents(2);
-                cells->SetNumberOfTuples(nr_points);
-                vtkIdType *cell = cells->GetPointer(0);
-                // Fill it with 1s
-                std::fill_n(cell, nr_points * 2, 1);
-                cell++;
-                for (vtkIdType i = 0; i < nr_points; ++i, cell += 2)
-                    *cell = i;
-                // Save the results in initcells
-                initcells = vtkSmartPointer<vtkIdTypeArray>::New();
-                initcells->DeepCopy(cells);
-            }
-        }
-        else
-        {
-            // The assumption here is that the current set of cells has more data than needed
-            cells->SetNumberOfComponents(2);
-            cells->SetNumberOfTuples(nr_points);
-        }
-
-        // Set the cells and the vertices
-        vertices->SetCells(nr_points, cells);
-        return polydata;
-    }
-};
-
-cv::viz::WCloud::WCloud(InputArray _cloud, InputArray _colors)
-{
-    Mat cloud = _cloud.getMat();
-    Mat colors = _colors.getMat();
-    CV_Assert(cloud.type() == CV_32FC3 || cloud.type() == CV_64FC3 || cloud.type() == CV_32FC4 || cloud.type() == CV_64FC4);
-    CV_Assert(colors.type() == CV_8UC3 && cloud.size() == colors.size());
-
-    if (cloud.isContinuous() && colors.isContinuous())
-    {
-        cloud.reshape(cloud.channels(), 1);
-        colors.reshape(colors.channels(), 1);
-    }
-
-    vtkIdType nr_points;
-    vtkSmartPointer<vtkPolyData> polydata = CreateCloudWidget::create(cloud, nr_points);
-
-    // Filter colors
-    Vec3b* colors_data = new Vec3b[nr_points];
-    NanFilter::copyColor(colors, colors_data, cloud);
-
-    vtkSmartPointer<vtkUnsignedCharArray> scalars = vtkSmartPointer<vtkUnsignedCharArray>::New();
-    scalars->SetNumberOfComponents(3);
-    scalars->SetNumberOfTuples(nr_points);
-    scalars->SetArray(colors_data->val, 3 * nr_points, 0);
-
-    // Assign the colors
-    polydata->GetPointData()->SetScalars(scalars);
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-#if VTK_MAJOR_VERSION <= 5
-    mapper->SetInput(polydata);
-#else
-    mapper->SetInputData(polydata);
-#endif
-
-    Vec3d minmax(scalars->GetRange());
-    mapper->SetScalarRange(minmax.val);
-    mapper->SetScalarModeToUsePointData();
-
-    bool interpolation = (polydata && polydata->GetNumberOfCells() != polydata->GetNumberOfVerts());
-
-    mapper->SetInterpolateScalarsBeforeMapping(interpolation);
-    mapper->ScalarVisibilityOn();
-
-    mapper->ImmediateModeRenderingOff();
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetNumberOfCloudPoints(int(std::max<vtkIdType>(1, polydata->GetNumberOfPoints() / 10)));
-    actor->GetProperty()->SetInterpolationToFlat();
-    actor->GetProperty()->BackfaceCullingOn();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-}
-
-cv::viz::WCloud::WCloud(InputArray _cloud, const Color &color)
-{
-    Mat cloud = _cloud.getMat();
-    CV_Assert(cloud.type() == CV_32FC3 || cloud.type() == CV_64FC3 || cloud.type() == CV_32FC4 || cloud.type() == CV_64FC4);
-
-    vtkIdType nr_points;
-    vtkSmartPointer<vtkPolyData> polydata = CreateCloudWidget::create(cloud, nr_points);
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-#if VTK_MAJOR_VERSION <= 5
-    mapper->SetInput(polydata);
-#else
-    mapper->SetInputData(polydata);
-#endif
-
-    bool interpolation = (polydata && polydata->GetNumberOfCells() != polydata->GetNumberOfVerts());
-
-    mapper->SetInterpolateScalarsBeforeMapping(interpolation);
-    mapper->ScalarVisibilityOff();
-
-    mapper->ImmediateModeRenderingOff();
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetNumberOfCloudPoints(int(std::max<vtkIdType>(1, polydata->GetNumberOfPoints() / 10)));
-    actor->GetProperty()->SetInterpolationToFlat();
-    actor->GetProperty()->BackfaceCullingOn();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-template<> cv::viz::WCloud cv::viz::Widget::cast<cv::viz::WCloud>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WCloud&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// Cloud Collection Widget implementation
-
-struct cv::viz::WCloudCollection::CreateCloudWidget
-{
-    static inline vtkSmartPointer<vtkPolyData> create(const Mat &cloud, vtkIdType &nr_points)
-    {
-        vtkSmartPointer<vtkPolyData> polydata = vtkSmartPointer<vtkPolyData>::New();
-        vtkSmartPointer<vtkCellArray> vertices = vtkSmartPointer<vtkCellArray>::New();
-
-        polydata->SetVerts(vertices);
-
-        vtkSmartPointer<vtkPoints> points = polydata->GetPoints();
-        vtkSmartPointer<vtkIdTypeArray> initcells;
-        nr_points = cloud.total();
-
-        if (!points)
-        {
-            points = vtkSmartPointer<vtkPoints>::New();
-            if (cloud.depth() == CV_32F)
-                points->SetDataTypeToFloat();
-            else if (cloud.depth() == CV_64F)
-                points->SetDataTypeToDouble();
-            polydata->SetPoints(points);
-        }
-        points->SetNumberOfPoints(nr_points);
-
-        if (cloud.depth() == CV_32F)
-        {
-            // Get a pointer to the beginning of the data array
-            Vec3f *data_beg = vtkpoints_data<float>(points);
-            Vec3f *data_end = NanFilter::copy(cloud, data_beg, cloud);
-            nr_points = data_end - data_beg;
-        }
-        else if (cloud.depth() == CV_64F)
-        {
-            // Get a pointer to the beginning of the data array
-            Vec3d *data_beg = vtkpoints_data<double>(points);
-            Vec3d *data_end = NanFilter::copy(cloud, data_beg, cloud);
-            nr_points = data_end - data_beg;
-        }
-        points->SetNumberOfPoints(nr_points);
-
-        // Update cells
-        vtkSmartPointer<vtkIdTypeArray> cells = vertices->GetData();
-        // If no init cells and cells has not been initialized...
-        if (!cells)
-            cells = vtkSmartPointer<vtkIdTypeArray>::New();
-
-        // If we have less values then we need to recreate the array
-        if (cells->GetNumberOfTuples() < nr_points)
-        {
-            cells = vtkSmartPointer<vtkIdTypeArray>::New();
-
-            // If init cells is given, and there's enough data in it, use it
-            if (initcells && initcells->GetNumberOfTuples() >= nr_points)
-            {
-                cells->DeepCopy(initcells);
-                cells->SetNumberOfComponents(2);
-                cells->SetNumberOfTuples(nr_points);
-            }
-            else
-            {
-                // If the number of tuples is still too small, we need to recreate the array
-                cells->SetNumberOfComponents(2);
-                cells->SetNumberOfTuples(nr_points);
-                vtkIdType *cell = cells->GetPointer(0);
-                // Fill it with 1s
-                std::fill_n(cell, nr_points * 2, 1);
-                cell++;
-                for (vtkIdType i = 0; i < nr_points; ++i, cell += 2)
-                    *cell = i;
-                // Save the results in initcells
-                initcells = vtkSmartPointer<vtkIdTypeArray>::New();
-                initcells->DeepCopy(cells);
-            }
-        }
-        else
-        {
-            // The assumption here is that the current set of cells has more data than needed
-            cells->SetNumberOfComponents(2);
-            cells->SetNumberOfTuples(nr_points);
-        }
-
-        // Set the cells and the vertices
-        vertices->SetCells(nr_points, cells);
-        return polydata;
-    }
-
-    static void createMapper(vtkSmartPointer<vtkLODActor> actor, vtkSmartPointer<vtkPolyData> poly_data, Vec3d& minmax)
-    {
-        vtkDataSetMapper *mapper = vtkDataSetMapper::SafeDownCast(actor->GetMapper());
-        if (!mapper)
-        {
-            // This is the first cloud
-            vtkSmartPointer<vtkDataSetMapper> mapper_new = vtkSmartPointer<vtkDataSetMapper>::New();
-#if VTK_MAJOR_VERSION <= 5
-            mapper_new->SetInputConnection(poly_data->GetProducerPort());
-#else
-            mapper_new->SetInputData(poly_data);
-#endif
-
-            mapper_new->SetScalarRange(minmax.val);
-            mapper_new->SetScalarModeToUsePointData();
-
-            bool interpolation = (poly_data && poly_data->GetNumberOfCells() != poly_data->GetNumberOfVerts());
-
-            mapper_new->SetInterpolateScalarsBeforeMapping(interpolation);
-            mapper_new->ScalarVisibilityOn();
-            mapper_new->ImmediateModeRenderingOff();
-
-            actor->SetNumberOfCloudPoints(int(std::max<vtkIdType>(1, poly_data->GetNumberOfPoints() / 10)));
-            actor->GetProperty()->SetInterpolationToFlat();
-            actor->GetProperty()->BackfaceCullingOn();
-            actor->SetMapper(mapper_new);
-            return ;
-        }
-
-        vtkPolyData *data = vtkPolyData::SafeDownCast(mapper->GetInput());
-        CV_Assert("Cloud Widget without data" && data);
-
-        vtkSmartPointer<vtkAppendPolyData> appendFilter = vtkSmartPointer<vtkAppendPolyData>::New();
-#if VTK_MAJOR_VERSION <= 5
-        appendFilter->AddInputConnection(mapper->GetInput()->GetProducerPort());
-        appendFilter->AddInputConnection(poly_data->GetProducerPort());
-#else
-        appendFilter->AddInputData(data);
-        appendFilter->AddInputData(poly_data);
-#endif
-        mapper->SetInputConnection(appendFilter->GetOutputPort());
-
-        // Update the number of cloud points
-        vtkIdType old_cloud_points = actor->GetNumberOfCloudPoints();
-        actor->SetNumberOfCloudPoints(int(std::max<vtkIdType>(1, old_cloud_points+poly_data->GetNumberOfPoints() / 10)));
-    }
-};
-
-cv::viz::WCloudCollection::WCloudCollection()
-{
-    // Just create the actor
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    WidgetAccessor::setProp(*this, actor);
-}
-
-void cv::viz::WCloudCollection::addCloud(InputArray _cloud, InputArray _colors, const Affine3f &pose)
-{
-    Mat cloud = _cloud.getMat();
-    Mat colors = _colors.getMat();
-    CV_Assert(cloud.type() == CV_32FC3 || cloud.type() == CV_64FC3 || cloud.type() == CV_32FC4 || cloud.type() == CV_64FC4);
-    CV_Assert(colors.type() == CV_8UC3 && cloud.size() == colors.size());
-
-    if (cloud.isContinuous() && colors.isContinuous())
-    {
-        cloud.reshape(cloud.channels(), 1);
-        colors.reshape(colors.channels(), 1);
-    }
-
-    vtkIdType nr_points;
-    vtkSmartPointer<vtkPolyData> polydata =  CreateCloudWidget::create(cloud, nr_points);
-
-    // Filter colors
-    Vec3b* colors_data = new Vec3b[nr_points];
-    NanFilter::copyColor(colors, colors_data, cloud);
-
-    vtkSmartPointer<vtkUnsignedCharArray> scalars = vtkSmartPointer<vtkUnsignedCharArray>::New();
-    scalars->SetNumberOfComponents(3);
-    scalars->SetNumberOfTuples(nr_points);
-    scalars->SetArray(colors_data->val, 3 * nr_points, 0);
-
-    // Assign the colors
-    polydata->GetPointData()->SetScalars(scalars);
-
-    // Transform the poly data based on the pose
-    vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
-    transform->PreMultiply();
-    transform->SetMatrix(convertToVtkMatrix(pose.matrix));
-
-    vtkSmartPointer<vtkTransformPolyDataFilter> transform_filter = vtkSmartPointer<vtkTransformPolyDataFilter>::New();
-    transform_filter->SetTransform(transform);
-#if VTK_MAJOR_VERSION <= 5
-    transform_filter->SetInputConnection(polydata->GetProducerPort());
-#else
-    transform_filter->SetInputData(polydata);
-#endif
-    transform_filter->Update();
-
-    vtkLODActor *actor = vtkLODActor::SafeDownCast(WidgetAccessor::getProp(*this));
-    CV_Assert("Incompatible widget type." && actor);
-
-    Vec3d minmax(scalars->GetRange());
-    CreateCloudWidget::createMapper(actor, transform_filter->GetOutput(), minmax);
-}
-
-void cv::viz::WCloudCollection::addCloud(InputArray _cloud, const Color &color, const Affine3f &pose)
-{
-    Mat cloud = _cloud.getMat();
-    CV_Assert(cloud.type() == CV_32FC3 || cloud.type() == CV_64FC3 || cloud.type() == CV_32FC4 || cloud.type() == CV_64FC4);
-
-    vtkIdType nr_points;
-    vtkSmartPointer<vtkPolyData> polydata =  CreateCloudWidget::create(cloud, nr_points);
-
-    vtkSmartPointer<vtkUnsignedCharArray> scalars = vtkSmartPointer<vtkUnsignedCharArray>::New();
-    scalars->SetNumberOfComponents(3);
-    scalars->SetNumberOfTuples(nr_points);
-    scalars->FillComponent(0, color[2]);
-    scalars->FillComponent(1, color[1]);
-    scalars->FillComponent(2, color[0]);
-
-    // Assign the colors
-    polydata->GetPointData()->SetScalars(scalars);
-
-    // Transform the poly data based on the pose
-    vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
-    transform->PreMultiply();
-    transform->SetMatrix(convertToVtkMatrix(pose.matrix));
-
-    vtkSmartPointer<vtkTransformPolyDataFilter> transform_filter = vtkSmartPointer<vtkTransformPolyDataFilter>::New();
-    transform_filter->SetTransform(transform);
-#if VTK_MAJOR_VERSION <= 5
-    transform_filter->SetInputConnection(polydata->GetProducerPort());
-#else
-    transform_filter->SetInputData(polydata);
-#endif
-    transform_filter->Update();
-
-    vtkLODActor *actor = vtkLODActor::SafeDownCast(WidgetAccessor::getProp(*this));
-    CV_Assert("Incompatible widget type." && actor);
-
-    Vec3d minmax(scalars->GetRange());
-    CreateCloudWidget::createMapper(actor, transform_filter->GetOutput(), minmax);
-}
-
-template<> cv::viz::WCloudCollection cv::viz::Widget::cast<cv::viz::WCloudCollection>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WCloudCollection&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// Cloud Normals Widget implementation
-
-struct cv::viz::WCloudNormals::ApplyCloudNormals
-{
-    template<typename _Tp>
-    struct Impl
-    {
-        static vtkSmartPointer<vtkCellArray> applyOrganized(const Mat &cloud, const Mat& normals, double level, float scale, _Tp *&pts, vtkIdType &nr_normals)
-        {
-            vtkIdType point_step = static_cast<vtkIdType>(std::sqrt(level));
-            nr_normals = (static_cast<vtkIdType>((cloud.cols - 1) / point_step) + 1) *
-                         (static_cast<vtkIdType>((cloud.rows - 1) / point_step) + 1);
-            vtkSmartPointer<vtkCellArray> lines = vtkSmartPointer<vtkCellArray>::New();
-
-            pts = new _Tp[2 * nr_normals * 3];
-
-            int cch = cloud.channels();
-            vtkIdType cell_count = 0;
-            for (vtkIdType y = 0; y < cloud.rows; y += point_step)
-            {
-                const _Tp *prow = cloud.ptr<_Tp>(y);
-                const _Tp *nrow = normals.ptr<_Tp>(y);
-                for (vtkIdType x = 0; x < cloud.cols; x += point_step * cch)
-                {
-                    pts[2 * cell_count * 3 + 0] = prow[x];
-                    pts[2 * cell_count * 3 + 1] = prow[x+1];
-                    pts[2 * cell_count * 3 + 2] = prow[x+2];
-                    pts[2 * cell_count * 3 + 3] = prow[x] + nrow[x] * scale;
-                    pts[2 * cell_count * 3 + 4] = prow[x+1] + nrow[x+1] * scale;
-                    pts[2 * cell_count * 3 + 5] = prow[x+2] + nrow[x+2] * scale;
-
-                    lines->InsertNextCell(2);
-                    lines->InsertCellPoint(2 * cell_count);
-                    lines->InsertCellPoint(2 * cell_count + 1);
-                    cell_count++;
-                }
-            }
-            return lines;
-        }
-
-        static vtkSmartPointer<vtkCellArray> applyUnorganized(const Mat &cloud, const Mat& normals, int level, float scale, _Tp *&pts, vtkIdType &nr_normals)
-        {
-            vtkSmartPointer<vtkCellArray> lines = vtkSmartPointer<vtkCellArray>::New();
-            nr_normals = (cloud.size().area() - 1) / level + 1 ;
-            pts = new _Tp[2 * nr_normals * 3];
-
-            int cch = cloud.channels();
-            const _Tp *p = cloud.ptr<_Tp>();
-            const _Tp *n = normals.ptr<_Tp>();
-            for (vtkIdType i = 0, j = 0; j < nr_normals; j++, i = j * level * cch)
-            {
-
-                pts[2 * j * 3 + 0] = p[i];
-                pts[2 * j * 3 + 1] = p[i+1];
-                pts[2 * j * 3 + 2] = p[i+2];
-                pts[2 * j * 3 + 3] = p[i] + n[i] * scale;
-                pts[2 * j * 3 + 4] = p[i+1] + n[i+1] * scale;
-                pts[2 * j * 3 + 5] = p[i+2] + n[i+2] * scale;
-
-                lines->InsertNextCell(2);
-                lines->InsertCellPoint(2 * j);
-                lines->InsertCellPoint(2 * j + 1);
-            }
-            return lines;
-        }
-    };
-
-    template<typename _Tp>
-    static inline vtkSmartPointer<vtkCellArray> apply(const Mat &cloud, const Mat& normals, int level, float scale, _Tp *&pts, vtkIdType &nr_normals)
-    {
-        if (cloud.cols > 1 && cloud.rows > 1)
-            return ApplyCloudNormals::Impl<_Tp>::applyOrganized(cloud, normals, level, scale, pts, nr_normals);
-        else
-            return ApplyCloudNormals::Impl<_Tp>::applyUnorganized(cloud, normals, level, scale, pts, nr_normals);
-    }
-};
-
-cv::viz::WCloudNormals::WCloudNormals(InputArray _cloud, InputArray _normals, int level, float scale, const Color &color)
-{
-    Mat cloud = _cloud.getMat();
-    Mat normals = _normals.getMat();
-    CV_Assert(cloud.type() == CV_32FC3 || cloud.type() == CV_64FC3 || cloud.type() == CV_32FC4 || cloud.type() == CV_64FC4);
-    CV_Assert(cloud.size() == normals.size() && cloud.type() == normals.type());
-
-    vtkSmartPointer<vtkPoints> points = vtkSmartPointer<vtkPoints>::New();
-    vtkSmartPointer<vtkCellArray> lines = vtkSmartPointer<vtkCellArray>::New();
-    vtkIdType nr_normals = 0;
-
-    if (cloud.depth() == CV_32F)
-    {
-        points->SetDataTypeToFloat();
-
-        vtkSmartPointer<vtkFloatArray> data = vtkSmartPointer<vtkFloatArray>::New();
-        data->SetNumberOfComponents(3);
-
-        float* pts = 0;
-        lines = ApplyCloudNormals::apply(cloud, normals, level, scale, pts, nr_normals);
-        data->SetArray(&pts[0], 2 * nr_normals * 3, 0);
-        points->SetData(data);
-    }
-    else
-    {
-        points->SetDataTypeToDouble();
-
-        vtkSmartPointer<vtkDoubleArray> data = vtkSmartPointer<vtkDoubleArray>::New();
-        data->SetNumberOfComponents(3);
-
-        double* pts = 0;
-        lines = ApplyCloudNormals::apply(cloud, normals, level, scale, pts, nr_normals);
-        data->SetArray(&pts[0], 2 * nr_normals * 3, 0);
-        points->SetData(data);
-    }
-
-    vtkSmartPointer<vtkPolyData> polyData = vtkSmartPointer<vtkPolyData>::New();
-    polyData->SetPoints(points);
-    polyData->SetLines(lines);
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-#if VTK_MAJOR_VERSION <= 5
-    mapper->SetInput(polyData);
-#else
-    mapper->SetInputData(polyData);
-#endif
-    mapper->SetColorModeToMapScalars();
-    mapper->SetScalarModeToUsePointData();
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetMapper(mapper);
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-template<> cv::viz::WCloudNormals cv::viz::Widget::cast<cv::viz::WCloudNormals>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WCloudNormals&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// Mesh Widget implementation
-
-struct cv::viz::WMesh::CopyImpl
-{
-    template<typename _Tp>
-    static Vec<_Tp, 3> * copy(const Mat &source, Vec<_Tp, 3> *output, int *look_up, const Mat &nan_mask)
-    {
-        CV_Assert(DataDepth<_Tp>::value == source.depth() && source.size() == nan_mask.size());
-        CV_Assert(nan_mask.channels() == 3 || nan_mask.channels() == 4);
-        CV_DbgAssert(DataDepth<_Tp>::value == nan_mask.depth());
-
-        int s_chs = source.channels();
-        int m_chs = nan_mask.channels();
-
-        int index = 0;
-        const _Tp* srow = source.ptr<_Tp>(0);
-        const _Tp* mrow = nan_mask.ptr<_Tp>(0);
-
-        for (int x = 0; x < source.cols; ++x, srow += s_chs, mrow += m_chs)
-        {
-            if (!isNan(mrow[0]) && !isNan(mrow[1]) && !isNan(mrow[2]))
-            {
-                look_up[x] = index;
-                *output++ = Vec<_Tp, 3>(srow);
-                ++index;
-            }
-        }
-        return output;
-    }
-};
-
-cv::viz::WMesh::WMesh(const Mesh3d &mesh)
-{
-    CV_Assert(mesh.cloud.rows == 1 && (mesh.cloud.type() == CV_32FC3 || mesh.cloud.type() == CV_64FC3 || mesh.cloud.type() == CV_32FC4 || mesh.cloud.type() == CV_64FC4));
-    CV_Assert(mesh.colors.empty() || (mesh.colors.type() == CV_8UC3 && mesh.cloud.size() == mesh.colors.size()));
-    CV_Assert(!mesh.polygons.empty() && mesh.polygons.type() == CV_32SC1);
-
-    vtkSmartPointer<vtkPoints> points = vtkSmartPointer<vtkPoints>::New();
-    vtkIdType nr_points = mesh.cloud.total();
-    Mat look_up_mat(1, nr_points, CV_32SC1);
-    int * look_up = look_up_mat.ptr<int>();
-    points->SetNumberOfPoints(nr_points);
-
-    // Copy data from cloud to vtkPoints
-    if (mesh.cloud.depth() == CV_32F)
-    {
-        points->SetDataTypeToFloat();
-        Vec3f *data_beg = vtkpoints_data<float>(points);
-        Vec3f *data_end = CopyImpl::copy(mesh.cloud, data_beg, look_up, mesh.cloud);
-        nr_points = data_end - data_beg;
-    }
-    else
-    {
-        points->SetDataTypeToDouble();
-        Vec3d *data_beg = vtkpoints_data<double>(points);
-        Vec3d *data_end = CopyImpl::copy(mesh.cloud, data_beg, look_up, mesh.cloud);
-        nr_points = data_end - data_beg;
-    }
-
-    vtkSmartPointer<vtkUnsignedCharArray> scalars;
-
-    if (!mesh.colors.empty())
-    {
-        Vec3b * colors_data = 0;
-        colors_data = new Vec3b[nr_points];
-        NanFilter::copyColor(mesh.colors, colors_data, mesh.cloud);
-
-        scalars = vtkSmartPointer<vtkUnsignedCharArray>::New();
-        scalars->SetNumberOfComponents(3);
-        scalars->SetNumberOfTuples(nr_points);
-        scalars->SetArray(colors_data->val, 3 * nr_points, 0);
-    }
-
-    points->SetNumberOfPoints(nr_points);
-
-    vtkSmartPointer<vtkPointSet> data;
-
-    if (mesh.polygons.size().area() > 1)
-    {
-        vtkSmartPointer<vtkCellArray> cell_array = vtkSmartPointer<vtkCellArray>::New();
-        const int * polygons = mesh.polygons.ptr<int>();
-
-        int idx = 0;
-        int poly_size = mesh.polygons.total();
-        for (int i = 0; i < poly_size; ++idx)
-        {
-            int n_points = polygons[i++];
-
-            cell_array->InsertNextCell(n_points);
-            for (int j = 0; j < n_points; ++j, ++idx)
-                cell_array->InsertCellPoint(look_up[polygons[i++]]);
-        }
-        vtkSmartPointer<vtkPolyData> polydata = vtkSmartPointer<vtkPolyData>::New();
-        cell_array->GetData()->SetNumberOfValues(idx);
-        cell_array->Squeeze();
-        polydata->SetStrips(cell_array);
-        polydata->SetPoints(points);
-
-        if (scalars)
-            polydata->GetPointData()->SetScalars(scalars);
-
-        data = polydata;
-    }
-    else
-    {
-        // Only one polygon
-        vtkSmartPointer<vtkPolygon> polygon = vtkSmartPointer<vtkPolygon>::New();
-        const int * polygons = mesh.polygons.ptr<int>();
-        int n_points = polygons[0];
-
-        polygon->GetPointIds()->SetNumberOfIds(n_points);
-
-        for (int j = 1; j < n_points+1; ++j)
-            polygon->GetPointIds()->SetId(j, look_up[polygons[j]]);
-
-        vtkSmartPointer<vtkUnstructuredGrid> poly_grid = vtkSmartPointer<vtkUnstructuredGrid>::New();
-        poly_grid->Allocate(1, 1);
-        poly_grid->InsertNextCell(polygon->GetCellType(), polygon->GetPointIds());
-        poly_grid->SetPoints(points);
-
-        if (scalars)
-            poly_grid->GetPointData()->SetScalars(scalars);
-
-        data = poly_grid;
-    }
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-
-    actor->GetProperty()->SetRepresentationToSurface();
-    actor->GetProperty()->BackfaceCullingOff(); // Backface culling is off for higher efficiency
-    actor->GetProperty()->SetInterpolationToFlat();
-    actor->GetProperty()->EdgeVisibilityOff();
-    actor->GetProperty()->ShadingOff();
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-#if VTK_MAJOR_VERSION <= 5
-    mapper->SetInput(data);
-#else
-    mapper->SetInputData(data);
-#endif
-    mapper->ImmediateModeRenderingOff();
-
-    vtkIdType numberOfCloudPoints = nr_points * 0.1;
-    actor->SetNumberOfCloudPoints(int(numberOfCloudPoints > 1 ? numberOfCloudPoints : 1));
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-}
-
-template<> CV_EXPORTS cv::viz::WMesh cv::viz::Widget::cast<cv::viz::WMesh>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WMesh&>(widget);
-}
diff --git a/modules/viz/src/clouds.cpp b/modules/viz/src/clouds.cpp
new file mode 100644
index 000000000..4b84e8e9e
--- /dev/null
+++ b/modules/viz/src/clouds.cpp
@@ -0,0 +1,441 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Authors:
+//  * Ozan Tonkal, ozantonkal@gmail.com
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
+//M*/
+
+#include "precomp.hpp"
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// Point Cloud Widget implementation
+
+cv::viz::WCloud::WCloud(InputArray cloud, InputArray colors)
+{
+    CV_Assert(!cloud.empty() && !colors.empty());
+
+    vtkSmartPointer<vtkCloudMatSource> cloud_source = vtkSmartPointer<vtkCloudMatSource>::New();
+    cloud_source->SetColorCloud(cloud, colors);
+    cloud_source->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, cloud_source->GetOutput());
+    mapper->SetScalarModeToUsePointData();
+    mapper->ImmediateModeRenderingOff();
+    mapper->SetScalarRange(0, 255);
+    mapper->ScalarVisibilityOn();
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->GetProperty()->SetInterpolationToFlat();
+    actor->GetProperty()->BackfaceCullingOn();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+cv::viz::WCloud::WCloud(InputArray cloud, const Color &color)
+{
+    WCloud cloud_widget(cloud, Mat(cloud.size(), CV_8UC3, color));
+    *this = cloud_widget;
+}
+
+
+template<> cv::viz::WCloud cv::viz::Widget::cast<cv::viz::WCloud>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WCloud&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// Painted Cloud Widget implementation
+
+cv::viz::WPaintedCloud::WPaintedCloud(InputArray cloud)
+{
+    vtkSmartPointer<vtkCloudMatSource> cloud_source = vtkSmartPointer<vtkCloudMatSource>::New();
+    cloud_source->SetCloud(cloud);
+    cloud_source->Update();
+
+    Vec6d bounds(cloud_source->GetOutput()->GetPoints()->GetBounds());
+
+    vtkSmartPointer<vtkElevationFilter> elevation = vtkSmartPointer<vtkElevationFilter>::New();
+    elevation->SetInputConnection(cloud_source->GetOutputPort());
+    elevation->SetLowPoint(bounds[0], bounds[2], bounds[4]);
+    elevation->SetHighPoint(bounds[1], bounds[3], bounds[5]);
+    elevation->SetScalarRange(0.0, 1.0);
+    elevation->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, vtkPolyData::SafeDownCast(elevation->GetOutput()));
+    mapper->ImmediateModeRenderingOff();
+    mapper->ScalarVisibilityOn();
+    mapper->SetColorModeToMapScalars();
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->GetProperty()->SetInterpolationToFlat();
+    actor->GetProperty()->BackfaceCullingOn();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+cv::viz::WPaintedCloud::WPaintedCloud(InputArray cloud, const Point3d& p1, const Point3d& p2)
+{
+    vtkSmartPointer<vtkCloudMatSource> cloud_source = vtkSmartPointer<vtkCloudMatSource>::New();
+    cloud_source->SetCloud(cloud);
+
+    vtkSmartPointer<vtkElevationFilter> elevation = vtkSmartPointer<vtkElevationFilter>::New();
+    elevation->SetInputConnection(cloud_source->GetOutputPort());
+    elevation->SetLowPoint(p1.x, p1.y, p1.z);
+    elevation->SetHighPoint(p2.x, p2.y, p2.z);
+    elevation->SetScalarRange(0.0, 1.0);
+    elevation->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, vtkPolyData::SafeDownCast(elevation->GetOutput()));
+    mapper->ImmediateModeRenderingOff();
+    mapper->ScalarVisibilityOn();
+    mapper->SetColorModeToMapScalars();
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->GetProperty()->SetInterpolationToFlat();
+    actor->GetProperty()->BackfaceCullingOn();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+cv::viz::WPaintedCloud::WPaintedCloud(InputArray cloud, const Point3d& p1, const Point3d& p2, const Color& c1, const Color c2)
+{
+    vtkSmartPointer<vtkCloudMatSource> cloud_source = vtkSmartPointer<vtkCloudMatSource>::New();
+    cloud_source->SetCloud(cloud);
+
+    vtkSmartPointer<vtkElevationFilter> elevation = vtkSmartPointer<vtkElevationFilter>::New();
+    elevation->SetInputConnection(cloud_source->GetOutputPort());
+    elevation->SetLowPoint(p1.x, p1.y, p1.z);
+    elevation->SetHighPoint(p2.x, p2.y, p2.z);
+    elevation->SetScalarRange(0.0, 1.0);
+    elevation->Update();
+
+    Color vc1 = vtkcolor(c1), vc2 = vtkcolor(c2);
+    vtkSmartPointer<vtkColorTransferFunction> color_transfer = vtkSmartPointer<vtkColorTransferFunction>::New();
+    color_transfer->SetColorSpaceToRGB();
+    color_transfer->AddRGBPoint(0.0, vc1[0], vc1[1], vc1[2]);
+    color_transfer->AddRGBPoint(1.0, vc2[0], vc2[1], vc2[2]);
+    color_transfer->SetScaleToLinear();
+    color_transfer->Build();
+
+    //if in future some need to replace color table with real scalars, then this can be done usine next calls:
+    //vtkDataArray *float_scalars = vtkPolyData::SafeDownCast(elevation->GetOutput())->GetPointData()->GetArray("Elevation");
+    //vtkSmartPointer<vtkPolyData> polydata = cloud_source->GetOutput();
+    //polydata->GetPointData()->SetScalars(color_transfer->MapScalars(float_scalars, VTK_COLOR_MODE_DEFAULT, 0));
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, vtkPolyData::SafeDownCast(elevation->GetOutput()));
+    mapper->ImmediateModeRenderingOff();
+    mapper->ScalarVisibilityOn();
+    mapper->SetColorModeToMapScalars();
+    mapper->SetLookupTable(color_transfer);
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->GetProperty()->SetInterpolationToFlat();
+    actor->GetProperty()->BackfaceCullingOn();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+template<> cv::viz::WPaintedCloud cv::viz::Widget::cast<cv::viz::WPaintedCloud>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WPaintedCloud&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// Cloud Collection Widget implementation
+
+cv::viz::WCloudCollection::WCloudCollection()
+{
+    // Just create the actor
+    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
+    WidgetAccessor::setProp(*this, actor);
+}
+
+void cv::viz::WCloudCollection::addCloud(InputArray cloud, InputArray colors, const Affine3d &pose)
+{
+    vtkSmartPointer<vtkCloudMatSource> source = vtkSmartPointer<vtkCloudMatSource>::New();
+    source->SetColorCloud(cloud, colors);
+
+    vtkSmartPointer<vtkPolyData> polydata = VtkUtils::TransformPolydata(source->GetOutputPort(), pose);
+
+    vtkSmartPointer<vtkLODActor> actor = vtkLODActor::SafeDownCast(WidgetAccessor::getProp(*this));
+    CV_Assert("Incompatible widget type." && actor);
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkPolyDataMapper::SafeDownCast(actor->GetMapper());
+    if (!mapper)
+    {
+        // This is the first cloud
+        mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+        mapper->SetScalarRange(0, 255);
+        mapper->SetScalarModeToUsePointData();
+        mapper->ScalarVisibilityOn();
+        mapper->ImmediateModeRenderingOff();
+        VtkUtils::SetInputData(mapper, polydata);
+
+        actor->SetNumberOfCloudPoints(std::max<vtkIdType>(1, polydata->GetNumberOfPoints()/10));
+        actor->GetProperty()->SetInterpolationToFlat();
+        actor->GetProperty()->BackfaceCullingOn();
+        actor->SetMapper(mapper);
+        return;
+    }
+
+    vtkPolyData *currdata = vtkPolyData::SafeDownCast(mapper->GetInput());
+    CV_Assert("Cloud Widget without data" && currdata);
+
+    vtkSmartPointer<vtkAppendPolyData> append_filter = vtkSmartPointer<vtkAppendPolyData>::New();
+    VtkUtils::AddInputData(append_filter, currdata);
+    VtkUtils::AddInputData(append_filter, polydata);
+    append_filter->Update();
+
+    VtkUtils::SetInputData(mapper, append_filter->GetOutput());
+
+    actor->SetNumberOfCloudPoints(std::max<vtkIdType>(1, actor->GetNumberOfCloudPoints() + polydata->GetNumberOfPoints()/10));
+}
+
+void cv::viz::WCloudCollection::addCloud(InputArray cloud, const Color &color, const Affine3d &pose)
+{
+    addCloud(cloud, Mat(cloud.size(), CV_8UC3, color), pose);
+}
+
+template<> cv::viz::WCloudCollection cv::viz::Widget::cast<cv::viz::WCloudCollection>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WCloudCollection&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// Cloud Normals Widget implementation
+
+cv::viz::WCloudNormals::WCloudNormals(InputArray _cloud, InputArray _normals, int level, double scale, const Color &color)
+{
+    Mat cloud = _cloud.getMat();
+    Mat normals = _normals.getMat();
+
+    CV_Assert(cloud.type() == CV_32FC3 || cloud.type() == CV_64FC3 || cloud.type() == CV_32FC4 || cloud.type() == CV_64FC4);
+    CV_Assert(cloud.size() == normals.size() && cloud.type() == normals.type());
+
+    int sqlevel = (int)std::sqrt((double)level);
+    int ystep = (cloud.cols > 1 && cloud.rows > 1) ? sqlevel : 1;
+    int xstep = (cloud.cols > 1 && cloud.rows > 1) ? sqlevel : level;
+
+    vtkSmartPointer<vtkPoints> points = vtkSmartPointer<vtkPoints>::New();
+    points->SetDataType(cloud.depth() == CV_32F ? VTK_FLOAT : VTK_DOUBLE);
+
+    vtkSmartPointer<vtkCellArray> lines = vtkSmartPointer<vtkCellArray>::New();
+
+    int s_chs = cloud.channels();
+    int n_chs = normals.channels();
+    int total = 0;
+
+    for(int y = 0; y < cloud.rows; y += ystep)
+    {
+        if (cloud.depth() == CV_32F)
+        {
+            const float *srow = cloud.ptr<float>(y);
+            const float *send = srow + cloud.cols * s_chs;
+            const float *nrow = normals.ptr<float>(y);
+
+            for (; srow < send; srow += xstep * s_chs, nrow += xstep * n_chs)
+                if (!isNan(srow) && !isNan(nrow))
+                {
+                    Vec3f endp = Vec3f(srow) + Vec3f(nrow) * (float)scale;
+
+                    points->InsertNextPoint(srow);
+                    points->InsertNextPoint(endp.val);
+
+                    lines->InsertNextCell(2);
+                    lines->InsertCellPoint(total++);
+                    lines->InsertCellPoint(total++);
+                }
+        }
+        else
+        {
+            const double *srow = cloud.ptr<double>(y);
+            const double *send = srow + cloud.cols * s_chs;
+            const double *nrow = normals.ptr<double>(y);
+
+            for (; srow < send; srow += xstep * s_chs, nrow += xstep * n_chs)
+                if (!isNan(srow) && !isNan(nrow))
+                {
+                    Vec3d endp = Vec3d(srow) + Vec3d(nrow) * (double)scale;
+
+                    points->InsertNextPoint(srow);
+                    points->InsertNextPoint(endp.val);
+
+                    lines->InsertNextCell(2);
+                    lines->InsertCellPoint(total++);
+                    lines->InsertCellPoint(total++);
+                }
+        }
+    }
+
+    vtkSmartPointer<vtkPolyData> polyData = vtkSmartPointer<vtkPolyData>::New();
+    polyData->SetPoints(points);
+    polyData->SetLines(lines);
+
+    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
+    mapper->SetColorModeToMapScalars();
+    mapper->SetScalarModeToUsePointData();
+    VtkUtils::SetInputData(mapper, polyData);
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+template<> cv::viz::WCloudNormals cv::viz::Widget::cast<cv::viz::WCloudNormals>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WCloudNormals&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// Mesh Widget implementation
+
+cv::viz::WMesh::WMesh(const Mesh &mesh)
+{
+    CV_Assert(mesh.cloud.rows == 1 && mesh.polygons.type() == CV_32SC1);
+
+    vtkSmartPointer<vtkCloudMatSource> source = vtkSmartPointer<vtkCloudMatSource>::New();
+    source->SetColorCloudNormalsTCoords(mesh.cloud, mesh.colors, mesh.normals, mesh.tcoords);
+    source->Update();
+
+    Mat lookup_buffer(1, mesh.cloud.total(), CV_32SC1);
+    int *lookup = lookup_buffer.ptr<int>();
+    for(int y = 0, index = 0; y < mesh.cloud.rows; ++y)
+    {
+        int s_chs = mesh.cloud.channels();
+
+        if (mesh.cloud.depth() == CV_32F)
+        {
+            const float* srow = mesh.cloud.ptr<float>(y);
+            const float* send = srow + mesh.cloud.cols * s_chs;
+
+            for (; srow != send; srow += s_chs, ++lookup)
+                if (!isNan(srow[0]) && !isNan(srow[1]) && !isNan(srow[2]))
+                    *lookup = index++;
+        }
+
+        if (mesh.cloud.depth() == CV_64F)
+        {
+            const double* srow = mesh.cloud.ptr<double>(y);
+            const double* send = srow + mesh.cloud.cols * s_chs;
+
+            for (; srow != send; srow += s_chs, ++lookup)
+                if (!isNan(srow[0]) && !isNan(srow[1]) && !isNan(srow[2]))
+                    *lookup = index++;
+        }
+    }
+    lookup = lookup_buffer.ptr<int>();
+
+    vtkSmartPointer<vtkPolyData> polydata = source->GetOutput();
+    polydata->SetVerts(0);
+
+    const int * polygons = mesh.polygons.ptr<int>();
+    vtkSmartPointer<vtkCellArray> cell_array = vtkSmartPointer<vtkCellArray>::New();
+
+    int idx = 0;
+    size_t polygons_size = mesh.polygons.total();
+    for (size_t i = 0; i < polygons_size; ++idx)
+    {
+        int n_points = polygons[i++];
+
+        cell_array->InsertNextCell(n_points);
+        for (int j = 0; j < n_points; ++j, ++idx)
+            cell_array->InsertCellPoint(lookup[polygons[i++]]);
+    }
+    cell_array->GetData()->SetNumberOfValues(idx);
+    cell_array->Squeeze();
+    polydata->SetStrips(cell_array);
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    mapper->SetScalarModeToUsePointData();
+    mapper->ImmediateModeRenderingOff();
+    VtkUtils::SetInputData(mapper, polydata);
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    //actor->SetNumberOfCloudPoints(std::max(1, polydata->GetNumberOfPoints() / 10));
+    actor->GetProperty()->SetRepresentationToSurface();
+    actor->GetProperty()->BackfaceCullingOff(); // Backface culling is off for higher efficiency
+    actor->GetProperty()->SetInterpolationToFlat();
+    actor->GetProperty()->EdgeVisibilityOff();
+    actor->GetProperty()->ShadingOff();
+    actor->SetMapper(mapper);
+
+    if (!mesh.texture.empty())
+    {
+        vtkSmartPointer<vtkImageMatSource> image_source = vtkSmartPointer<vtkImageMatSource>::New();
+        image_source->SetImage(mesh.texture);
+
+        vtkSmartPointer<vtkTexture> texture = vtkSmartPointer<vtkTexture>::New();
+        texture->SetInputConnection(image_source->GetOutputPort());
+        actor->SetTexture(texture);
+    }
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+cv::viz::WMesh::WMesh(InputArray cloud, InputArray polygons, InputArray colors, InputArray normals)
+{
+    Mesh mesh;
+    mesh.cloud = cloud.getMat();
+    mesh.colors = colors.getMat();
+    mesh.normals = normals.getMat();
+    mesh.polygons = polygons.getMat();
+    *this = WMesh(mesh);
+}
+
+template<> CV_EXPORTS cv::viz::WMesh cv::viz::Widget::cast<cv::viz::WMesh>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WMesh&>(widget);
+}
diff --git a/modules/viz/src/interactor_style.cpp b/modules/viz/src/interactor_style.cpp
index ccf188ded..75003a2b6 100644
--- a/modules/viz/src/interactor_style.cpp
+++ b/modules/viz/src/interactor_style.cpp
@@ -48,20 +48,21 @@
 
 #include "precomp.hpp"
 
+
+namespace cv { namespace viz
+{
+    vtkStandardNewMacro(InteractorStyle)
+}}
+
+
 //////////////////////////////////////////////////////////////////////////////////////////////
 void cv::viz::InteractorStyle::Initialize()
 {
-    modifier_ = cv::viz::InteractorStyle::KB_MOD_ALT;
     // Set windows size (width, height) to unknown (-1)
     win_size_ = Vec2i(-1, -1);
     win_pos_ = Vec2i(0, 0);
     max_win_size_ = Vec2i(-1, -1);
 
-    // Create the image filter and PNG writer objects
-    wif_ = vtkSmartPointer<vtkWindowToImageFilter>::New();
-    snapshot_writer_ = vtkSmartPointer<vtkPNGWriter>::New();
-    snapshot_writer_->SetInputConnection(wif_->GetOutputPort());
-
     init_ = true;
     stereo_anaglyph_mask_default_ = true;
 
@@ -78,11 +79,37 @@ void cv::viz::InteractorStyle::Initialize()
 void cv::viz::InteractorStyle::saveScreenshot(const String &file)
 {
     FindPokedRenderer(Interactor->GetEventPosition()[0], Interactor->GetEventPosition()[1]);
-    wif_->SetInput(Interactor->GetRenderWindow());
-    wif_->Modified(); // Update the WindowToImageFilter
-    snapshot_writer_->Modified();
-    snapshot_writer_->SetFileName(file.c_str());
-    snapshot_writer_->Write();
+
+    vtkSmartPointer<vtkWindowToImageFilter> wif = vtkSmartPointer<vtkWindowToImageFilter>::New();
+    wif->SetInput(Interactor->GetRenderWindow());
+
+    vtkSmartPointer<vtkPNGWriter> snapshot_writer = vtkSmartPointer<vtkPNGWriter>::New();
+    snapshot_writer->SetInputConnection(wif->GetOutputPort());
+    snapshot_writer->SetFileName(file.c_str());
+    snapshot_writer->Write();
+
+    cout << "Screenshot successfully captured (" << file.c_str() << ")" << endl;
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+void cv::viz::InteractorStyle::exportScene(const String &file)
+{
+    vtkSmartPointer<vtkExporter> exporter;
+    if (file.size() > 5 && file.substr(file.size() - 5) == ".vrml")
+    {
+        exporter = vtkSmartPointer<vtkVRMLExporter>::New();
+        vtkVRMLExporter::SafeDownCast(exporter)->SetFileName(file.c_str());
+    }
+    else
+    {
+        exporter = vtkSmartPointer<vtkOBJExporter>::New();
+        vtkOBJExporter::SafeDownCast(exporter)->SetFilePrefix(file.c_str());
+    }
+
+    exporter->SetInput(Interactor->GetRenderWindow());
+    exporter->Write();
+
+    cout << "Scene successfully exported (" << file.c_str() << ")" << endl;
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////////
@@ -121,13 +148,7 @@ void cv::viz::InteractorStyle::OnChar()
     else if (key.find("XF86ZoomOut") != String::npos)
         zoomOut();
 
-    int keymod = false;
-    switch (modifier_)
-    {
-    case KB_MOD_ALT:   keymod = Interactor->GetAltKey(); break;
-    case KB_MOD_CTRL:  keymod = Interactor->GetControlKey(); break;
-    case KB_MOD_SHIFT: keymod = Interactor->GetShiftKey(); break;
-    }
+    int keymod = Interactor->GetAltKey();
 
     switch (Interactor->GetKeyCode())
     {
@@ -180,43 +201,32 @@ void cv::viz::InteractorStyle::registerKeyboardCallback(void (*callback)(const K
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////////
-bool cv::viz::InteractorStyle::getAltKey() { return Interactor->GetAltKey() != 0; }
-bool cv::viz::InteractorStyle::getShiftKey() { return Interactor->GetShiftKey()!= 0; }
-bool cv::viz::InteractorStyle::getControlKey() { return Interactor->GetControlKey()!= 0; }
+int cv::viz::InteractorStyle::getModifiers()
+{
+    int modifiers = KeyboardEvent::NONE;
+
+    if (Interactor->GetAltKey())
+        modifiers |= KeyboardEvent::ALT;
+
+    if (Interactor->GetControlKey())
+        modifiers |= KeyboardEvent::CTRL;
+
+    if (Interactor->GetShiftKey())
+        modifiers |= KeyboardEvent::SHIFT;
+    return modifiers;
+}
 
 //////////////////////////////////////////////////////////////////////////////////////////////
-void
-cv::viz::InteractorStyle::OnKeyDown()
+void cv::viz::InteractorStyle::OnKeyDown()
 {
     CV_Assert("Interactor style not initialized. Please call Initialize() before continuing" && init_);
-    CV_Assert("No renderer given! Use SetRendererCollection() before continuing." && renderer_);
-
     FindPokedRenderer(Interactor->GetEventPosition()[0], Interactor->GetEventPosition()[1]);
 
-    if (wif_->GetInput() == NULL)
-    {
-        wif_->SetInput(Interactor->GetRenderWindow());
-        wif_->Modified();
-        snapshot_writer_->Modified();
-    }
-
     // Save the initial windows width/height
     if (win_size_[0] == -1 || win_size_[1] == -1)
         win_size_ = Vec2i(Interactor->GetRenderWindow()->GetSize());
 
-
-    // Get the status of special keys (Cltr+Alt+Shift)
-    bool shift = getShiftKey();
-    bool ctrl  = getControlKey();
-    bool alt   = getAltKey();
-
-    bool keymod = false;
-    switch (modifier_)
-    {
-    case KB_MOD_ALT:   keymod = alt;   break;
-    case KB_MOD_CTRL:  keymod = ctrl;  break;
-    case KB_MOD_SHIFT: keymod = shift; break;
-    }
+    bool alt = Interactor->GetAltKey() != 0;
 
     std::string key(Interactor->GetKeySym());
     if (key.find("XF86ZoomIn") != std::string::npos)
@@ -235,8 +245,10 @@ cv::viz::InteractorStyle::OnKeyDown()
                      "          s, S   : switch to a surface-based representation (where available)\n"
                      "\n"
                      "          j, J   : take a .PNG snapshot of the current window view\n"
+                     "          k, K   : export scene to Wavefront .obj format\n"
+                     "    ALT + k, K   : export scene to VRML format\n"
                      "          c, C   : display current camera/window parameters\n"
-                     "          f, F   : fly to point mode\n"
+                     "          f, F   : fly to point mode, hold the key and move mouse where to fly\n"
                      "\n"
                      "          e, E   : exit the interactor\n"
                      "          q, Q   : stop and call VTK's TerminateApp\n"
@@ -249,7 +261,7 @@ cv::viz::InteractorStyle::OnKeyDown()
                      "    ALT + s, S   : turn stereo mode on/off\n"
                      "    ALT + f, F   : switch between maximized window mode and original size\n"
                      "\n"
-                     << std::endl;
+                  << std::endl;
         break;
     }
 
@@ -261,66 +273,41 @@ cv::viz::InteractorStyle::OnKeyDown()
         for (ac->InitTraversal(ait); vtkActor* actor = ac->GetNextActor(ait); )
             for (actor->InitPathTraversal(); vtkAssemblyPath* path = actor->GetNextPath(); )
             {
-                vtkActor* apart = reinterpret_cast <vtkActor*>(path->GetLastNode()->GetViewProp());
+                vtkActor* apart = vtkActor::SafeDownCast(path->GetLastNode()->GetViewProp());
                 apart->GetProperty()->SetRepresentationToPoints();
             }
         break;
     }
-        // Save a PNG snapshot with the current screen
+
+        // Save a PNG snapshot
     case 'j': case 'J':
+        saveScreenshot(cv::format("screenshot-%d.png", (unsigned int)time(0))); break;
+
+        // Export scene as in obj or vrml format
+    case 'k': case 'K':
     {
-        unsigned int t = static_cast<unsigned int>(time(0));
-        String png_file = cv::format("screenshot-%d.png", t);
-        String cam_file = cv::format("screenshot-%d.cam", t);
-
-        vtkSmartPointer<vtkCamera> cam = Interactor->GetRenderWindow()->GetRenderers()->GetFirstRenderer()->GetActiveCamera();
-        Vec2d clip;
-        Vec3d focal, pos, view;
-        cam->GetClippingRange(clip.val);
-        cam->GetFocalPoint(focal.val);
-        cam->GetPosition(pos.val);
-        cam->GetViewUp(view.val);
-        Vec2i win_pos(Interactor->GetRenderWindow()->GetPosition());
-        Vec2i win_size(Interactor->GetRenderWindow()->GetSize());
-        double angle = cam->GetViewAngle() / 180.0 * CV_PI;
-
-        String data = cv::format("%f,%f/%f,%f,%f/%f,%f,%f/%f,%f,%f/%f/%d,%d/%d,%d", clip[0],clip[1], focal[0],focal[1],focal[2],
-                 pos[0],pos[1],pos[2], view[0],view[1], view[2], angle , win_size[0],win_size[1], win_pos[0], win_pos[1]);
-
-        saveScreenshot(png_file);
-        ofstream ofs_cam(cam_file.c_str());
-        ofs_cam << data.c_str() << endl;
-        ofs_cam.close();
-
-        cout << "Screenshot (" << png_file.c_str() << ") and camera information (" << cam_file.c_str() << ") successfully captured." << endl;
+        String format = alt ? "scene-%d.vrml" : "scene-%d";
+        exportScene(cv::format(format.c_str(), (unsigned int)time(0)));
         break;
     }
+
         // display current camera settings/parameters
     case 'c': case 'C':
     {
         vtkSmartPointer<vtkCamera> cam = Interactor->GetRenderWindow()->GetRenderers()->GetFirstRenderer()->GetActiveCamera();
 
-        Vec2d clip;
-        Vec3d focal, pose, view;
-        cam->GetClippingRange(clip.val);
-        cam->GetFocalPoint(focal.val);
-        cam->GetPosition(pose.val);
-        cam->GetViewUp(view.val);
+        Vec2d clip(cam->GetClippingRange());
+        Vec3d focal(cam->GetFocalPoint()), pos(cam->GetPosition()), view(cam->GetViewUp());
         Vec2i win_pos(Interactor->GetRenderWindow()->GetPosition());
         Vec2i win_size(Interactor->GetRenderWindow()->GetSize());
+        double angle = cam->GetViewAngle () / 180.0 * CV_PI;
+
+        String data = cv::format("clip(%f,%f) focal(%f,%f,%f) pos(%f,%f,%f) view(%f,%f,%f) angle(%f) winsz(%d,%d) winpos(%d,%d)",
+                                 clip[0], clip[1], focal[0], focal[1], focal[2], pos[0], pos[1], pos[2], view[0], view[1], view[2],
+                                 angle, win_size[0], win_size[1], win_pos[0], win_pos[1]);
+
+        std::cout << data.c_str() << std::endl;
 
-        cv::print(Mat(clip, false).reshape(1, 1));
-        std::cout << "/";
-        cv::print(Mat(focal, false).reshape(1, 1));
-        std::cout << "/";
-        cv::print(Mat(pose, false).reshape(1, 1));
-        std::cout << "/";
-        cv::print(Mat(view, false).reshape(1, 1));
-        std::cout << "/" << cam->GetViewAngle () / 180.0 * CV_PI;
-        cv::print(Mat(win_size, false).reshape(1, 1));
-        std::cout << "/";
-        cv::print(Mat(win_pos, false).reshape(1, 1));
-        std::cout << std::endl;
         break;
     }
     case '=':
@@ -339,7 +326,7 @@ cv::viz::InteractorStyle::OnKeyDown()
             for (ac->InitTraversal(ait); vtkActor* actor = ac->GetNextActor(ait); )
                 for (actor->InitPathTraversal(); vtkAssemblyPath* path = actor->GetNextPath(); )
                 {
-                    vtkActor* apart = reinterpret_cast <vtkActor*>(path->GetLastNode()->GetViewProp());
+                    vtkActor* apart = vtkActor::SafeDownCast(path->GetLastNode()->GetViewProp());
                     float psize = apart->GetProperty()->GetPointSize();
                     if (psize < 63.0f)
                         apart->GetProperty()->SetPointSize(psize + 1.0f);
@@ -358,7 +345,7 @@ cv::viz::InteractorStyle::OnKeyDown()
             for (ac->InitTraversal(ait); vtkActor* actor = ac->GetNextActor(ait); )
                 for (actor->InitPathTraversal(); vtkAssemblyPath* path = actor->GetNextPath(); )
                 {
-                    vtkActor* apart = static_cast<vtkActor*>(path->GetLastNode()->GetViewProp());
+                    vtkActor* apart = vtkActor::SafeDownCast(path->GetLastNode()->GetViewProp());
                     float psize = apart->GetProperty()->GetPointSize();
                     if (psize > 1.0f)
                         apart->GetProperty()->SetPointSize(psize - 1.0f);
@@ -369,7 +356,7 @@ cv::viz::InteractorStyle::OnKeyDown()
         // Switch between maximize and original window size
     case 'f': case 'F':
     {
-        if (keymod)
+        if (alt)
         {
             Vec2i screen_size(Interactor->GetRenderWindow()->GetScreenSize());
             Vec2i win_size(Interactor->GetRenderWindow()->GetSize());
@@ -397,13 +384,11 @@ cv::viz::InteractorStyle::OnKeyDown()
         else
         {
             AnimState = VTKIS_ANIM_ON;
-            vtkAssemblyPath *path = NULL;
             Interactor->GetPicker()->Pick(Interactor->GetEventPosition()[0], Interactor->GetEventPosition()[1], 0.0, CurrentRenderer);
-            vtkAbstractPropPicker *picker;
-            if ((picker = vtkAbstractPropPicker::SafeDownCast(Interactor->GetPicker())))
-                path = picker->GetPath();
-            if (path != NULL)
-                Interactor->FlyTo(CurrentRenderer, picker->GetPickPosition());
+            vtkSmartPointer<vtkAbstractPropPicker> picker = vtkAbstractPropPicker::SafeDownCast(Interactor->GetPicker());
+            if (picker)
+                if (picker->GetPath())
+                    Interactor->FlyTo(CurrentRenderer, picker->GetPickPosition());
             AnimState = VTKIS_ANIM_OFF;
         }
         break;
@@ -411,24 +396,16 @@ cv::viz::InteractorStyle::OnKeyDown()
         // 's'/'S' w/out ALT
     case 's': case 'S':
     {
-        if (keymod)
+        if (alt)
         {
-            int stereo_render = Interactor->GetRenderWindow()->GetStereoRender();
-            if (!stereo_render)
+            vtkSmartPointer<vtkRenderWindow> window = Interactor->GetRenderWindow();
+            if (!window->GetStereoRender())
             {
-                if (stereo_anaglyph_mask_default_)
-                {
-                    Interactor->GetRenderWindow()->SetAnaglyphColorMask(4, 3);
-                    stereo_anaglyph_mask_default_ = false;
-                }
-                else
-                {
-                    Interactor->GetRenderWindow()->SetAnaglyphColorMask(2, 5);
-                    stereo_anaglyph_mask_default_ = true;
-                }
+                static Vec2i red_blue(4, 3), magenta_green(2, 5);
+                window->SetAnaglyphColorMask (stereo_anaglyph_mask_default_ ? red_blue.val : magenta_green.val);
+                stereo_anaglyph_mask_default_ = !stereo_anaglyph_mask_default_;
             }
-            Interactor->GetRenderWindow()->SetStereoRender(!stereo_render);
-            Interactor->GetRenderWindow()->Render();
+            window->SetStereoRender(!window->GetStereoRender());
             Interactor->Render();
         }
         else
@@ -440,43 +417,34 @@ cv::viz::InteractorStyle::OnKeyDown()
     {
         vtkSmartPointer<vtkCamera> cam = CurrentRenderer->GetActiveCamera();
         cam->SetParallelProjection(!cam->GetParallelProjection());
-        CurrentRenderer->SetActiveCamera(cam);
         CurrentRenderer->Render();
         break;
     }
 
-    // Overwrite the camera reset
+        // Overwrite the camera reset
     case 'r': case 'R':
     {
-        if (!keymod)
+        if (!alt)
         {
             Superclass::OnKeyDown();
             break;
         }
 
-        vtkSmartPointer<vtkCamera> cam = CurrentRenderer->GetActiveCamera();
-
-        static WidgetActorMap::iterator it = widget_actor_map_->begin();
+        WidgetActorMap::iterator it = widget_actor_map_->begin();
         // it might be that some actors don't have a valid transformation set -> we skip them to avoid a seg fault.
-        bool found_transformation = false;
-
-        for (size_t idx = 0; idx < widget_actor_map_->size(); ++idx, ++it)
+        for (; it != widget_actor_map_->end();  ++it)
         {
-            if (it == widget_actor_map_->end())
-                it = widget_actor_map_->begin();
-
             vtkProp3D * actor = vtkProp3D::SafeDownCast(it->second);
             if (actor && actor->GetUserMatrix())
-            {
-                found_transformation = true;
                 break;
-            }
         }
 
+        vtkSmartPointer<vtkCamera> cam = CurrentRenderer->GetActiveCamera();
+
         // if a valid transformation was found, use it otherwise fall back to default view point.
-        if (found_transformation)
+        if (it != widget_actor_map_->end())
         {
-            const vtkMatrix4x4* m = vtkProp3D::SafeDownCast(it->second)->GetUserMatrix();
+            vtkMatrix4x4* m = vtkProp3D::SafeDownCast(it->second)->GetUserMatrix();
 
             cam->SetFocalPoint(m->GetElement(0, 3) - m->GetElement(0, 2),
                                m->GetElement(1, 3) - m->GetElement(1, 2),
@@ -516,23 +484,18 @@ cv::viz::InteractorStyle::OnKeyDown()
     }
     }
 
-    KeyboardEvent event(true, Interactor->GetKeySym(), Interactor->GetKeyCode(), getAltKey(), getControlKey(), getShiftKey());
-    // Check if there is a keyboard callback registered
+    KeyboardEvent event(KeyboardEvent::KEY_DOWN, Interactor->GetKeySym(), Interactor->GetKeyCode(), getModifiers());
     if (keyboardCallback_)
-      keyboardCallback_(event, keyboard_callback_cookie_);
-
-    renderer_->Render();
+        keyboardCallback_(event, keyboard_callback_cookie_);
     Interactor->Render();
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////////
 void cv::viz::InteractorStyle::OnKeyUp()
 {
-    KeyboardEvent event(false, Interactor->GetKeySym(), Interactor->GetKeyCode(), getAltKey(), getControlKey(), getShiftKey());
-    // Check if there is a keyboard callback registered
+    KeyboardEvent event(KeyboardEvent::KEY_UP, Interactor->GetKeySym(), Interactor->GetKeyCode(), getModifiers());
     if (keyboardCallback_)
-      keyboardCallback_(event, keyboard_callback_cookie_);
-
+        keyboardCallback_(event, keyboard_callback_cookie_);
     Superclass::OnKeyUp();
 }
 
@@ -540,9 +503,9 @@ void cv::viz::InteractorStyle::OnKeyUp()
 void cv::viz::InteractorStyle::OnMouseMove()
 {
     Vec2i p(Interactor->GetEventPosition());
-    MouseEvent event(MouseEvent::MouseMove, MouseEvent::NoButton, p, getAltKey(), getControlKey(), getShiftKey());
+    MouseEvent event(MouseEvent::MouseMove, MouseEvent::NoButton, p, getModifiers());
     if (mouseCallback_)
-      mouseCallback_(event, mouse_callback_cookie_);
+        mouseCallback_(event, mouse_callback_cookie_);
     Superclass::OnMouseMove();
 }
 
@@ -551,9 +514,9 @@ void cv::viz::InteractorStyle::OnLeftButtonDown()
 {
     Vec2i p(Interactor->GetEventPosition());
     MouseEvent::Type type = (Interactor->GetRepeatCount() == 0) ? MouseEvent::MouseButtonPress : MouseEvent::MouseDblClick;
-    MouseEvent event(type, MouseEvent::LeftButton, p, getAltKey(), getControlKey(), getShiftKey());
+    MouseEvent event(type, MouseEvent::LeftButton, p, getModifiers());
     if (mouseCallback_)
-      mouseCallback_(event, mouse_callback_cookie_);
+        mouseCallback_(event, mouse_callback_cookie_);
     Superclass::OnLeftButtonDown();
 }
 
@@ -561,9 +524,9 @@ void cv::viz::InteractorStyle::OnLeftButtonDown()
 void cv::viz::InteractorStyle::OnLeftButtonUp()
 {
     Vec2i p(Interactor->GetEventPosition());
-    MouseEvent event(MouseEvent::MouseButtonRelease, MouseEvent::LeftButton, p, getAltKey(), getControlKey(), getShiftKey());
+    MouseEvent event(MouseEvent::MouseButtonRelease, MouseEvent::LeftButton, p, getModifiers());
     if (mouseCallback_)
-      mouseCallback_(event, mouse_callback_cookie_);
+        mouseCallback_(event, mouse_callback_cookie_);
     Superclass::OnLeftButtonUp();
 }
 
@@ -571,11 +534,10 @@ void cv::viz::InteractorStyle::OnLeftButtonUp()
 void cv::viz::InteractorStyle::OnMiddleButtonDown()
 {
     Vec2i p(Interactor->GetEventPosition());
-
     MouseEvent::Type type = (Interactor->GetRepeatCount() == 0) ? MouseEvent::MouseButtonPress : MouseEvent::MouseDblClick;
-    MouseEvent event(type, MouseEvent::MiddleButton, p, getAltKey(), getControlKey(), getShiftKey());
+    MouseEvent event(type, MouseEvent::MiddleButton, p, getModifiers());
     if (mouseCallback_)
-      mouseCallback_(event, mouse_callback_cookie_);
+        mouseCallback_(event, mouse_callback_cookie_);
     Superclass::OnMiddleButtonDown();
 }
 
@@ -583,9 +545,9 @@ void cv::viz::InteractorStyle::OnMiddleButtonDown()
 void cv::viz::InteractorStyle::OnMiddleButtonUp()
 {
     Vec2i p(Interactor->GetEventPosition());
-    MouseEvent event(MouseEvent::MouseButtonRelease, MouseEvent::MiddleButton, p, getAltKey(), getControlKey(), getShiftKey());
+    MouseEvent event(MouseEvent::MouseButtonRelease, MouseEvent::MiddleButton, p, getModifiers());
     if (mouseCallback_)
-      mouseCallback_(event, mouse_callback_cookie_);
+        mouseCallback_(event, mouse_callback_cookie_);
     Superclass::OnMiddleButtonUp();
 }
 
@@ -593,11 +555,10 @@ void cv::viz::InteractorStyle::OnMiddleButtonUp()
 void cv::viz::InteractorStyle::OnRightButtonDown()
 {
     Vec2i p(Interactor->GetEventPosition());
-
     MouseEvent::Type type = (Interactor->GetRepeatCount() == 0) ? MouseEvent::MouseButtonPress : MouseEvent::MouseDblClick;
-    MouseEvent event(type, MouseEvent::RightButton, p, getAltKey(), getControlKey(), getShiftKey());
+    MouseEvent event(type, MouseEvent::RightButton, p, getModifiers());
     if (mouseCallback_)
-      mouseCallback_(event, mouse_callback_cookie_);
+        mouseCallback_(event, mouse_callback_cookie_);
     Superclass::OnRightButtonDown();
 }
 
@@ -605,9 +566,9 @@ void cv::viz::InteractorStyle::OnRightButtonDown()
 void cv::viz::InteractorStyle::OnRightButtonUp()
 {
     Vec2i p(Interactor->GetEventPosition());
-    MouseEvent event(MouseEvent::MouseButtonRelease, MouseEvent::RightButton, p, getAltKey(), getControlKey(), getShiftKey());
+    MouseEvent event(MouseEvent::MouseButtonRelease, MouseEvent::RightButton, p, getModifiers());
     if (mouseCallback_)
-      mouseCallback_(event, mouse_callback_cookie_);
+        mouseCallback_(event, mouse_callback_cookie_);
     Superclass::OnRightButtonUp();
 }
 
@@ -615,12 +576,11 @@ void cv::viz::InteractorStyle::OnRightButtonUp()
 void cv::viz::InteractorStyle::OnMouseWheelForward()
 {
     Vec2i p(Interactor->GetEventPosition());
-    MouseEvent event(MouseEvent::MouseScrollUp, MouseEvent::VScroll, p, getAltKey(), getControlKey(), getShiftKey());
-    // If a mouse callback registered, call it!
+    MouseEvent event(MouseEvent::MouseScrollUp, MouseEvent::VScroll, p, getModifiers());
     if (mouseCallback_)
-      mouseCallback_(event, mouse_callback_cookie_);
+        mouseCallback_(event, mouse_callback_cookie_);
     if (Interactor->GetRepeatCount() && mouseCallback_)
-      mouseCallback_(event, mouse_callback_cookie_);
+        mouseCallback_(event, mouse_callback_cookie_);
 
     if (Interactor->GetAltKey())
     {
@@ -632,11 +592,9 @@ void cv::viz::InteractorStyle::OnMouseWheelForward()
 
         cam->SetViewAngle(opening_angle);
         cam->Modified();
-        CurrentRenderer->SetActiveCamera(cam);
         CurrentRenderer->ResetCameraClippingRange();
         CurrentRenderer->Modified();
         CurrentRenderer->Render();
-        renderer_->Render();
         Interactor->Render();
     }
     else
@@ -647,13 +605,12 @@ void cv::viz::InteractorStyle::OnMouseWheelForward()
 void cv::viz::InteractorStyle::OnMouseWheelBackward()
 {
     Vec2i p(Interactor->GetEventPosition());
-    MouseEvent event(MouseEvent::MouseScrollDown, MouseEvent::VScroll, p, getAltKey(), getControlKey(), getShiftKey());
-    // If a mouse callback registered, call it!
+    MouseEvent event(MouseEvent::MouseScrollDown, MouseEvent::VScroll, p, getModifiers());
     if (mouseCallback_)
-      mouseCallback_(event, mouse_callback_cookie_);
+        mouseCallback_(event, mouse_callback_cookie_);
 
     if (Interactor->GetRepeatCount() && mouseCallback_)
-      mouseCallback_(event, mouse_callback_cookie_);
+        mouseCallback_(event, mouse_callback_cookie_);
 
     if (Interactor->GetAltKey())
     {
@@ -665,11 +622,9 @@ void cv::viz::InteractorStyle::OnMouseWheelBackward()
 
         cam->SetViewAngle(opening_angle);
         cam->Modified();
-        CurrentRenderer->SetActiveCamera(cam);
         CurrentRenderer->ResetCameraClippingRange();
         CurrentRenderer->Modified();
         CurrentRenderer->Render();
-        renderer_->Render();
         Interactor->Render();
     }
     else
@@ -680,13 +635,5 @@ void cv::viz::InteractorStyle::OnMouseWheelBackward()
 void cv::viz::InteractorStyle::OnTimer()
 {
     CV_Assert("Interactor style not initialized." && init_);
-    CV_Assert("Renderer has not been set." && renderer_);
-    renderer_->Render();
     Interactor->Render();
 }
-
-namespace cv { namespace viz
-{
-    //Standard VTK macro for *New()
-    vtkStandardNewMacro(InteractorStyle)
-}}
diff --git a/modules/viz/src/interactor_style.hpp b/modules/viz/src/interactor_style.hpp
index 3af13fcc4..8d01697a8 100644
--- a/modules/viz/src/interactor_style.hpp
+++ b/modules/viz/src/interactor_style.hpp
@@ -41,9 +41,6 @@
 //  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
 #ifndef __OPENCV_VIZ_INTERACTOR_STYLE_H__
@@ -56,9 +53,6 @@ namespace cv
         class InteractorStyle : public vtkInteractorStyleTrackballCamera
         {
         public:
-
-            enum KeyboardModifier { KB_MOD_ALT, KB_MOD_CTRL, KB_MOD_SHIFT };
-
             static InteractorStyle *New();
             virtual ~InteractorStyle() {}
 
@@ -69,31 +63,21 @@ namespace cv
             virtual void Initialize();
 
             void setWidgetActorMap(const Ptr<WidgetActorMap>& actors) { widget_actor_map_ = actors; }
-            void setRenderer(vtkSmartPointer<vtkRenderer>& ren) { renderer_ = ren; }
             void registerMouseCallback(void (*callback)(const MouseEvent&, void*), void* cookie = 0);
             void registerKeyboardCallback(void (*callback)(const KeyboardEvent&, void*), void * cookie = 0);
             void saveScreenshot(const String &file);
-
-            /** \brief Change the default keyboard modified from ALT to a different special key.*/
-            inline void setKeyboardModifier(const KeyboardModifier &modifier) { modifier_ = modifier; }
+            void exportScene(const String &file);
 
         private:
             /** \brief Set to true after initialization is complete. */
             bool init_;
 
-            vtkSmartPointer<vtkRenderer> renderer_;
             Ptr<WidgetActorMap> widget_actor_map_;
 
             Vec2i win_size_;
             Vec2i win_pos_;
             Vec2i max_win_size_;
 
-            /** \brief A PNG writer for screenshot captures. */
-            vtkSmartPointer<vtkPNGWriter> snapshot_writer_;
-
-            /** \brief Internal window to image filter. Needed by \a snapshot_writer_. */
-            vtkSmartPointer<vtkWindowToImageFilter> wif_;
-
             /** \brief Interactor style internal method. Gets called whenever a key is pressed. */
             virtual void OnChar();
 
@@ -121,17 +105,13 @@ namespace cv
             /** \brief True if we're using red-blue colors for anaglyphic stereo, false if magenta-green. */
             bool stereo_anaglyph_mask_default_;
 
-            KeyboardModifier modifier_;
-
             void (*keyboardCallback_)(const KeyboardEvent&, void*);
             void *keyboard_callback_cookie_;
 
             void (*mouseCallback_)(const MouseEvent&, void*);
             void *mouse_callback_cookie_;
 
-            bool getAltKey();
-            bool getControlKey();
-            bool getShiftKey();
+            int getModifiers();
         };
     }
 }
diff --git a/modules/viz/src/precomp.hpp b/modules/viz/src/precomp.hpp
index ab673b389..de5346ebf 100644
--- a/modules/viz/src/precomp.hpp
+++ b/modules/viz/src/precomp.hpp
@@ -41,9 +41,6 @@
 //  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
 #ifndef __OPENCV_VIZ_PRECOMP_HPP__
@@ -53,6 +50,8 @@
 #include <ctime>
 #include <list>
 #include <vector>
+#include <iomanip>
+#include <limits>
 
 #include <vtkAppendPolyData.h>
 #include <vtkAssemblyPath.h>
@@ -94,13 +93,13 @@
 #include <vtkInteractorStyleTrackballCamera.h>
 #include <vtkProperty.h>
 #include <vtkCamera.h>
-#include <vtkObjectFactory.h>
 #include <vtkPlanes.h>
 #include <vtkImageFlip.h>
 #include <vtkRenderWindow.h>
 #include <vtkTextProperty.h>
 #include <vtkProperty2D.h>
 #include <vtkLODActor.h>
+#include <vtkActor.h>
 #include <vtkTextActor.h>
 #include <vtkRenderWindowInteractor.h>
 #include <vtkMath.h>
@@ -110,12 +109,50 @@
 #include <vtkPolyDataNormals.h>
 #include <vtkAlgorithmOutput.h>
 #include <vtkImageMapper.h>
+#include <vtkPoints.h>
+#include <vtkInformation.h>
+#include <vtkInformationVector.h>
+#include <vtkObjectFactory.h>
+#include <vtkPolyDataAlgorithm.h>
+#include <vtkMergeFilter.h>
+#include <vtkDataSetWriter.h>
+#include <vtkErrorCode.h>
+#include <vtkPLYWriter.h>
+#include <vtkSTLWriter.h>
+#include <vtkSimplePointsReader.h>
+#include <vtkPLYReader.h>
+#include <vtkOBJReader.h>
+#include <vtkSTLReader.h>
+#include <vtkPNGReader.h>
+#include <vtkOBJExporter.h>
+#include <vtkVRMLExporter.h>
+#include <vtkTensorGlyph.h>
+#include <vtkImageAlgorithm.h>
+#include <vtkTransformFilter.h>
+#include <vtkConeSource.h>
+#include <vtkElevationFilter.h>
+#include <vtkColorTransferFunction.h>
+#include <vtkStreamingDemandDrivenPipeline.h>
+
+#if !defined(_WIN32) || defined(__CYGWIN__)
+# include <unistd.h> /* unlink */
+#else
+# include <io.h> /* unlink */
+#endif
+
+#include <vtk/vtkOBJWriter.h>
+#include <vtk/vtkXYZWriter.h>
+#include <vtk/vtkCloudMatSink.h>
+#include <vtk/vtkCloudMatSource.h>
+#include <vtk/vtkTrajectorySource.h>
+#include <vtk/vtkImageMatSource.h>
 
 #include <opencv2/core.hpp>
 #include <opencv2/viz.hpp>
 #include <opencv2/viz/widget_accessor.hpp>
 #include <opencv2/core/utility.hpp>
 
+
 namespace cv
 {
     namespace viz
@@ -144,11 +181,145 @@ namespace cv
             static VizMap storage;
             friend class Viz3d;
         };
+
+        template<typename _Tp> inline _Tp normalized(const _Tp& v) { return v * 1/norm(v); }
+
+        template<typename _Tp> inline bool isNan(const _Tp* data)
+        {
+            return isNan(data[0]) || isNan(data[1]) || isNan(data[2]);
+        }
+
+        inline vtkSmartPointer<vtkActor> getActor(const Widget3D& widget)
+        {
+            return vtkActor::SafeDownCast(WidgetAccessor::getProp(widget));
+        }
+
+        inline vtkSmartPointer<vtkPolyData> getPolyData(const Widget3D& widget)
+        {
+            vtkSmartPointer<vtkMapper> mapper = getActor(widget)->GetMapper();
+            return vtkPolyData::SafeDownCast(mapper->GetInput());
+        }
+
+        inline vtkSmartPointer<vtkMatrix4x4> vtkmatrix(const cv::Matx44d &matrix)
+        {
+            vtkSmartPointer<vtkMatrix4x4> vtk_matrix = vtkSmartPointer<vtkMatrix4x4>::New();
+            vtk_matrix->DeepCopy(matrix.val);
+            return vtk_matrix;
+        }
+
+        inline Color vtkcolor(const Color& color)
+        {
+            Color scaled_color = color * (1.0/255.0);
+            std::swap(scaled_color[0], scaled_color[2]);
+            return scaled_color;
+        }
+
+        inline Vec3d get_random_vec(double from = -10.0, double to = 10.0)
+        {
+            RNG& rng = theRNG();
+            return Vec3d(rng.uniform(from, to), rng.uniform(from, to), rng.uniform(from, to));
+        }
+
+        struct VtkUtils
+        {
+            template<class Filter>
+            static void SetInputData(vtkSmartPointer<Filter> filter, vtkPolyData* polydata)
+            {
+            #if VTK_MAJOR_VERSION <= 5
+                filter->SetInput(polydata);
+            #else
+                filter->SetInputData(polydata);
+            #endif
+            }
+            template<class Filter>
+            static void SetSourceData(vtkSmartPointer<Filter> filter, vtkPolyData* polydata)
+            {
+            #if VTK_MAJOR_VERSION <= 5
+                filter->SetSource(polydata);
+            #else
+                filter->SetSourceData(polydata);
+            #endif
+            }
+
+            template<class Filter>
+            static void SetInputData(vtkSmartPointer<Filter> filter, vtkImageData* polydata)
+            {
+            #if VTK_MAJOR_VERSION <= 5
+                filter->SetInput(polydata);
+            #else
+                filter->SetInputData(polydata);
+            #endif
+            }
+
+            template<class Filter>
+            static void AddInputData(vtkSmartPointer<Filter> filter, vtkPolyData *polydata)
+            {
+            #if VTK_MAJOR_VERSION <= 5
+                filter->AddInput(polydata);
+            #else
+                filter->AddInputData(polydata);
+            #endif
+            }
+
+            static vtkSmartPointer<vtkUnsignedCharArray> FillScalars(size_t size, const Color& color)
+            {
+                Vec3b rgb = Vec3d(color[2], color[1], color[0]);
+                Vec3b* color_data = new Vec3b[size];
+                std::fill(color_data, color_data + size, rgb);
+
+                vtkSmartPointer<vtkUnsignedCharArray> scalars = vtkSmartPointer<vtkUnsignedCharArray>::New();
+                scalars->SetName("Colors");
+                scalars->SetNumberOfComponents(3);
+                scalars->SetNumberOfTuples(size);
+                scalars->SetArray(color_data->val, size * 3, 0);
+                return scalars;
+            }
+
+            static vtkSmartPointer<vtkPolyData> ComputeNormals(vtkSmartPointer<vtkPolyData> polydata)
+            {
+                vtkSmartPointer<vtkPolyDataNormals> normals_generator = vtkSmartPointer<vtkPolyDataNormals>::New();
+                normals_generator->ComputePointNormalsOn();
+                normals_generator->ComputeCellNormalsOff();
+                normals_generator->SetFeatureAngle(0.1);
+                normals_generator->SetSplitting(0);
+                normals_generator->SetConsistency(1);
+                normals_generator->SetAutoOrientNormals(0);
+                normals_generator->SetFlipNormals(0);
+                normals_generator->SetNonManifoldTraversal(1);
+                VtkUtils::SetInputData(normals_generator, polydata);
+                normals_generator->Update();
+                return normals_generator->GetOutput();
+            }
+
+            static vtkSmartPointer<vtkPolyData> TransformPolydata(vtkSmartPointer<vtkAlgorithmOutput> algorithm_output_port, const Affine3d& pose)
+            {
+                vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
+                transform->SetMatrix(vtkmatrix(pose.matrix));
+
+                vtkSmartPointer<vtkTransformPolyDataFilter> transform_filter = vtkSmartPointer<vtkTransformPolyDataFilter>::New();
+                transform_filter->SetTransform(transform);
+                transform_filter->SetInputConnection(algorithm_output_port);
+                transform_filter->Update();
+                return transform_filter->GetOutput();
+            }
+
+            static vtkSmartPointer<vtkPolyData> TransformPolydata(vtkSmartPointer<vtkPolyData> polydata, const Affine3d& pose)
+            {
+                vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
+                transform->SetMatrix(vtkmatrix(pose.matrix));
+
+                vtkSmartPointer<vtkTransformPolyDataFilter> transform_filter = vtkSmartPointer<vtkTransformPolyDataFilter>::New();
+                VtkUtils::SetInputData(transform_filter, polydata);
+                transform_filter->SetTransform(transform);
+                transform_filter->Update();
+                return transform_filter->GetOutput();
+            }
+        };
     }
 }
 
 #include "interactor_style.hpp"
-#include "viz3d_impl.hpp"
+#include "vizimpl.hpp"
 
 
 #endif
diff --git a/modules/viz/src/shape_widgets.cpp b/modules/viz/src/shape_widgets.cpp
deleted file mode 100644
index 6e4f4c70f..000000000
--- a/modules/viz/src/shape_widgets.cpp
+++ /dev/null
@@ -1,1497 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-// Authors:
-//  * Ozan Tonkal, ozantonkal@gmail.com
-//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
-//
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
-//M*/
-
-#include "precomp.hpp"
-
-namespace cv
-{
-    namespace viz
-    {
-        template<typename _Tp> Vec<_Tp, 3>* vtkpoints_data(vtkSmartPointer<vtkPoints>& points);
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// line widget implementation
-cv::viz::WLine::WLine(const Point3f &pt1, const Point3f &pt2, const Color &color)
-{
-    vtkSmartPointer<vtkLineSource> line = vtkSmartPointer<vtkLineSource>::New();
-    line->SetPoint1(pt1.x, pt1.y, pt1.z);
-    line->SetPoint2(pt2.x, pt2.y, pt2.z);
-    line->Update();
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-    mapper->SetInputConnection(line->GetOutputPort());
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-template<> cv::viz::WLine cv::viz::Widget::cast<cv::viz::WLine>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WLine&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// plane widget implementation
-
-struct cv::viz::WPlane::SetSizeImpl
-{
-    template<typename _Tp>
-    static vtkSmartPointer<vtkTransformPolyDataFilter> setSize(const Vec<_Tp, 3> &center, vtkSmartPointer<vtkAlgorithmOutput> poly_data_port, double size)
-    {
-        vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
-        transform->PreMultiply();
-        transform->Translate(center[0], center[1], center[2]);
-        transform->Scale(size, size, size);
-        transform->Translate(-center[0], -center[1], -center[2]);
-
-        vtkSmartPointer<vtkTransformPolyDataFilter> transform_filter = vtkSmartPointer<vtkTransformPolyDataFilter>::New();
-        transform_filter->SetInputConnection(poly_data_port);
-        transform_filter->SetTransform(transform);
-        transform_filter->Update();
-
-        return transform_filter;
-    }
-};
-
-cv::viz::WPlane::WPlane(const Vec4f& coefs, float size, const Color &color)
-{
-    vtkSmartPointer<vtkPlaneSource> plane = vtkSmartPointer<vtkPlaneSource>::New();
-    plane->SetNormal(coefs[0], coefs[1], coefs[2]);
-    double norm = cv::norm(Vec3f(coefs.val));
-    plane->Push(-coefs[3] / norm);
-
-    Vec3d p_center;
-    plane->GetOrigin(p_center.val);
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-    mapper->SetInputConnection(SetSizeImpl::setSize(p_center, plane->GetOutputPort(), size)->GetOutputPort());
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-cv::viz::WPlane::WPlane(const Vec4f& coefs, const Point3f& pt, float size, const Color &color)
-{
-    vtkSmartPointer<vtkPlaneSource> plane = vtkSmartPointer<vtkPlaneSource>::New();
-    Point3f coefs3(coefs[0], coefs[1], coefs[2]);
-    double norm_sqr = 1.0 / coefs3.dot(coefs3);
-    plane->SetNormal(coefs[0], coefs[1], coefs[2]);
-
-    double t = coefs3.dot(pt) + coefs[3];
-    Vec3f p_center = pt - coefs3 * t * norm_sqr;
-    plane->SetCenter(p_center[0], p_center[1], p_center[2]);
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-    mapper->SetInputConnection(SetSizeImpl::setSize(p_center, plane->GetOutputPort(), size)->GetOutputPort());
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-template<> cv::viz::WPlane cv::viz::Widget::cast<cv::viz::WPlane>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WPlane&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// sphere widget implementation
-
-cv::viz::WSphere::WSphere(const Point3f &center, float radius, int sphere_resolution, const Color &color)
-{
-    vtkSmartPointer<vtkSphereSource> sphere = vtkSmartPointer<vtkSphereSource>::New();
-    sphere->SetRadius(radius);
-    sphere->SetCenter(center.x, center.y, center.z);
-    sphere->SetPhiResolution(sphere_resolution);
-    sphere->SetThetaResolution(sphere_resolution);
-    sphere->LatLongTessellationOff();
-    sphere->Update();
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-    mapper->SetInputConnection(sphere->GetOutputPort());
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-template<> cv::viz::WSphere cv::viz::Widget::cast<cv::viz::WSphere>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WSphere&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// arrow widget implementation
-
-cv::viz::WArrow::WArrow(const Point3f& pt1, const Point3f& pt2, float thickness, const Color &color)
-{
-    vtkSmartPointer<vtkArrowSource> arrowSource = vtkSmartPointer<vtkArrowSource>::New();
-    arrowSource->SetShaftRadius(thickness);
-    // The thickness and radius of the tip are adjusted based on the thickness of the arrow
-    arrowSource->SetTipRadius(thickness * 3.0);
-    arrowSource->SetTipLength(thickness * 10.0);
-
-    float startPoint[3], endPoint[3];
-    startPoint[0] = pt1.x;
-    startPoint[1] = pt1.y;
-    startPoint[2] = pt1.z;
-    endPoint[0] = pt2.x;
-    endPoint[1] = pt2.y;
-    endPoint[2] = pt2.z;
-    float normalizedX[3], normalizedY[3], normalizedZ[3];
-
-    // The X axis is a vector from start to end
-    vtkMath::Subtract(endPoint, startPoint, normalizedX);
-    float length = vtkMath::Norm(normalizedX);
-    vtkMath::Normalize(normalizedX);
-
-    // The Z axis is an arbitrary vecotr cross X
-    float arbitrary[3];
-    arbitrary[0] = vtkMath::Random(-10,10);
-    arbitrary[1] = vtkMath::Random(-10,10);
-    arbitrary[2] = vtkMath::Random(-10,10);
-    vtkMath::Cross(normalizedX, arbitrary, normalizedZ);
-    vtkMath::Normalize(normalizedZ);
-
-    // The Y axis is Z cross X
-    vtkMath::Cross(normalizedZ, normalizedX, normalizedY);
-    vtkSmartPointer<vtkMatrix4x4> matrix = vtkSmartPointer<vtkMatrix4x4>::New();
-
-    // Create the direction cosine matrix
-    matrix->Identity();
-    for (unsigned int i = 0; i < 3; i++)
-    {
-        matrix->SetElement(i, 0, normalizedX[i]);
-        matrix->SetElement(i, 1, normalizedY[i]);
-        matrix->SetElement(i, 2, normalizedZ[i]);
-    }
-
-    // Apply the transforms
-    vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
-    transform->Translate(startPoint);
-    transform->Concatenate(matrix);
-    transform->Scale(length, length, length);
-
-    // Transform the polydata
-    vtkSmartPointer<vtkTransformPolyDataFilter> transformPD = vtkSmartPointer<vtkTransformPolyDataFilter>::New();
-    transformPD->SetTransform(transform);
-    transformPD->SetInputConnection(arrowSource->GetOutputPort());
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-    mapper->SetInputConnection(transformPD->GetOutputPort());
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-template<> cv::viz::WArrow cv::viz::Widget::cast<cv::viz::WArrow>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WArrow&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// circle widget implementation
-
-cv::viz::WCircle::WCircle(const Point3f& pt, float radius, float thickness, const Color& color)
-{
-    vtkSmartPointer<vtkDiskSource> disk = vtkSmartPointer<vtkDiskSource>::New();
-    // Maybe the resolution should be lower e.g. 50 or 25
-    disk->SetCircumferentialResolution(50);
-    disk->SetInnerRadius(radius - thickness);
-    disk->SetOuterRadius(radius + thickness);
-
-    // Set the circle origin
-    vtkSmartPointer<vtkTransform> t = vtkSmartPointer<vtkTransform>::New();
-    t->Identity();
-    t->Translate(pt.x, pt.y, pt.z);
-
-    vtkSmartPointer<vtkTransformPolyDataFilter> tf = vtkSmartPointer<vtkTransformPolyDataFilter>::New();
-    tf->SetTransform(t);
-    tf->SetInputConnection(disk->GetOutputPort());
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-    mapper->SetInputConnection(tf->GetOutputPort());
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-template<> cv::viz::WCircle cv::viz::Widget::cast<cv::viz::WCircle>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WCircle&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// cylinder widget implementation
-
-cv::viz::WCylinder::WCylinder(const Point3f& pt_on_axis, const Point3f& axis_direction, float radius, int numsides, const Color &color)
-{
-    const Point3f pt2 = pt_on_axis + axis_direction;
-    vtkSmartPointer<vtkLineSource> line = vtkSmartPointer<vtkLineSource>::New();
-    line->SetPoint1(pt_on_axis.x, pt_on_axis.y, pt_on_axis.z);
-    line->SetPoint2(pt2.x, pt2.y, pt2.z);
-
-    vtkSmartPointer<vtkTubeFilter> tuber = vtkSmartPointer<vtkTubeFilter>::New();
-    tuber->SetInputConnection(line->GetOutputPort());
-    tuber->SetRadius(radius);
-    tuber->SetNumberOfSides(numsides);
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-    mapper->SetInputConnection(tuber->GetOutputPort());
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-template<> cv::viz::WCylinder cv::viz::Widget::cast<cv::viz::WCylinder>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WCylinder&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// cylinder widget implementation
-
-cv::viz::WCube::WCube(const Point3f& pt_min, const Point3f& pt_max, bool wire_frame, const Color &color)
-{
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-    if (wire_frame)
-    {
-        vtkSmartPointer<vtkOutlineSource> cube = vtkSmartPointer<vtkOutlineSource>::New();
-        cube->SetBounds(pt_min.x, pt_max.x, pt_min.y, pt_max.y, pt_min.z, pt_max.z);
-        mapper->SetInputConnection(cube->GetOutputPort());
-    }
-    else
-    {
-        vtkSmartPointer<vtkCubeSource> cube = vtkSmartPointer<vtkCubeSource>::New();
-        cube->SetBounds(pt_min.x, pt_max.x, pt_min.y, pt_max.y, pt_min.z, pt_max.z);
-        mapper->SetInputConnection(cube->GetOutputPort());
-    }
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-template<> cv::viz::WCube cv::viz::Widget::cast<cv::viz::WCube>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WCube&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// coordinate system widget implementation
-
-cv::viz::WCoordinateSystem::WCoordinateSystem(float scale)
-{
-    vtkSmartPointer<vtkAxes> axes = vtkSmartPointer<vtkAxes>::New();
-    axes->SetOrigin(0, 0, 0);
-    axes->SetScaleFactor(scale);
-
-    vtkSmartPointer<vtkFloatArray> axes_colors = vtkSmartPointer<vtkFloatArray>::New();
-    axes_colors->Allocate(6);
-    axes_colors->InsertNextValue(0.0);
-    axes_colors->InsertNextValue(0.0);
-    axes_colors->InsertNextValue(0.5);
-    axes_colors->InsertNextValue(0.5);
-    axes_colors->InsertNextValue(1.0);
-    axes_colors->InsertNextValue(1.0);
-
-    vtkSmartPointer<vtkPolyData> axes_data = axes->GetOutput();
-#if VTK_MAJOR_VERSION <= 5
-    axes_data->Update();
-#else
-    axes->Update();
-#endif
-    axes_data->GetPointData()->SetScalars(axes_colors);
-
-    vtkSmartPointer<vtkTubeFilter> axes_tubes = vtkSmartPointer<vtkTubeFilter>::New();
-#if VTK_MAJOR_VERSION <= 5
-    axes_tubes->SetInput(axes_data);
-#else
-    axes_tubes->SetInputData(axes_data);
-#endif
-    axes_tubes->SetRadius(axes->GetScaleFactor() / 50.0);
-    axes_tubes->SetNumberOfSides(6);
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-    mapper->SetScalarModeToUsePointData();
-    mapper->SetInputConnection(axes_tubes->GetOutputPort());
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-}
-
-template<> cv::viz::WCoordinateSystem cv::viz::Widget::cast<cv::viz::WCoordinateSystem>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WCoordinateSystem&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// polyline widget implementation
-
-struct cv::viz::WPolyLine::CopyImpl
-{
-    template<typename _Tp>
-    static void copy(const Mat& source, Vec<_Tp, 3> *output, vtkSmartPointer<vtkPolyLine> polyLine)
-    {
-        int s_chs = source.channels();
-
-        for (int y = 0, id = 0; y < source.rows; ++y)
-        {
-            const _Tp* srow = source.ptr<_Tp>(y);
-
-            for (int x = 0; x < source.cols; ++x, srow += s_chs, ++id)
-            {
-                *output++ = Vec<_Tp, 3>(srow);
-                polyLine->GetPointIds()->SetId(id,id);
-            }
-        }
-    }
-};
-
-cv::viz::WPolyLine::WPolyLine(InputArray _pointData, const Color &color)
-{
-    Mat pointData = _pointData.getMat();
-    CV_Assert(pointData.type() == CV_32FC3 || pointData.type() == CV_32FC4 || pointData.type() == CV_64FC3 || pointData.type() == CV_64FC4);
-    vtkIdType nr_points = pointData.total();
-
-    vtkSmartPointer<vtkPoints> points = vtkSmartPointer<vtkPoints>::New();
-    vtkSmartPointer<vtkPolyData> polyData = vtkSmartPointer<vtkPolyData>::New();
-    vtkSmartPointer<vtkPolyLine> polyLine = vtkSmartPointer<vtkPolyLine>::New();
-
-    if (pointData.depth() == CV_32F)
-        points->SetDataTypeToFloat();
-    else
-        points->SetDataTypeToDouble();
-
-    points->SetNumberOfPoints(nr_points);
-    polyLine->GetPointIds()->SetNumberOfIds(nr_points);
-
-    if (pointData.depth() == CV_32F)
-    {
-        // Get a pointer to the beginning of the data array
-        Vec3f *data_beg = vtkpoints_data<float>(points);
-        CopyImpl::copy(pointData, data_beg, polyLine);
-    }
-    else if (pointData.depth() == CV_64F)
-    {
-        // Get a pointer to the beginning of the data array
-        Vec3d *data_beg = vtkpoints_data<double>(points);
-        CopyImpl::copy(pointData, data_beg, polyLine);
-    }
-
-    vtkSmartPointer<vtkCellArray> cells = vtkSmartPointer<vtkCellArray>::New();
-    cells->InsertNextCell(polyLine);
-
-    polyData->SetPoints(points);
-    polyData->SetLines(cells);
-
-    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
-#if VTK_MAJOR_VERSION <= 5
-    mapper->SetInput(polyData);
-#else
-    mapper->SetInputData(polyData);
-#endif
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-template<> cv::viz::WPolyLine cv::viz::Widget::cast<cv::viz::WPolyLine>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WPolyLine&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// grid widget implementation
-
-struct cv::viz::WGrid::GridImpl
-{
-    static vtkSmartPointer<vtkPolyData> createGrid(const Vec2i &dimensions, const Vec2d &spacing)
-    {
-        // Create the grid using image data
-        vtkSmartPointer<vtkImageData> grid = vtkSmartPointer<vtkImageData>::New();
-
-        // Add 1 to dimensions because in ImageData dimensions is the number of lines
-        // - however here it means number of cells
-        grid->SetDimensions(dimensions[0]+1, dimensions[1]+1, 1);
-        grid->SetSpacing(spacing[0], spacing[1], 0.);
-
-        // Set origin of the grid to be the middle of the grid
-        grid->SetOrigin(dimensions[0] * spacing[0] * (-0.5), dimensions[1] * spacing[1] * (-0.5), 0);
-
-        // Extract the edges so we have the grid
-        vtkSmartPointer<vtkExtractEdges> filter = vtkSmartPointer<vtkExtractEdges>::New();
-#if VTK_MAJOR_VERSION <= 5
-        filter->SetInputConnection(grid->GetProducerPort());
-#else
-        filter->SetInputData(grid);
-#endif
-        filter->Update();
-        return filter->GetOutput();
-    }
-};
-
-cv::viz::WGrid::WGrid(const Vec2i &dimensions, const Vec2d &spacing, const Color &color)
-{
-    vtkSmartPointer<vtkPolyData> grid = GridImpl::createGrid(dimensions, spacing);
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-#if VTK_MAJOR_VERSION <= 5
-    mapper->SetInputConnection(grid->GetProducerPort());
-#else
-    mapper->SetInputData(grid);
-#endif
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-cv::viz::WGrid::WGrid(const Vec4f &coefs, const Vec2i &dimensions, const Vec2d &spacing, const Color &color)
-{
-    vtkSmartPointer<vtkPolyData> grid = GridImpl::createGrid(dimensions, spacing);
-
-    // Estimate the transform to set the normal based on the coefficients
-    Vec3f normal(coefs[0], coefs[1], coefs[2]);
-    Vec3f up_vector(0.0f, 1.0f, 0.0f); // Just set as default
-    double push_distance = -coefs[3]/cv::norm(Vec3f(coefs.val));
-    Vec3f u,v,n;
-    n = normalize(normal);
-    u = normalize(up_vector.cross(n));
-    v = n.cross(u);
-
-    vtkSmartPointer<vtkMatrix4x4> mat_trans = vtkSmartPointer<vtkMatrix4x4>::New();
-    mat_trans->SetElement(0,0,u[0]);
-    mat_trans->SetElement(0,1,u[1]);
-    mat_trans->SetElement(0,2,u[2]);
-    mat_trans->SetElement(1,0,v[0]);
-    mat_trans->SetElement(1,1,v[1]);
-    mat_trans->SetElement(1,2,v[2]);
-    mat_trans->SetElement(2,0,n[0]);
-    mat_trans->SetElement(2,1,n[1]);
-    mat_trans->SetElement(2,2,n[2]);
-    // Inverse rotation (orthogonal, so just take transpose)
-    mat_trans->Transpose();
-    mat_trans->SetElement(0,3,n[0] * push_distance);
-    mat_trans->SetElement(1,3,n[1] * push_distance);
-    mat_trans->SetElement(2,3,n[2] * push_distance);
-    mat_trans->SetElement(3,3,1);
-
-    vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
-    transform->PreMultiply();
-    transform->SetMatrix(mat_trans);
-
-    vtkSmartPointer<vtkTransformPolyDataFilter> transform_filter = vtkSmartPointer<vtkTransformPolyDataFilter>::New();
-    transform_filter->SetTransform(transform);
-#if VTK_MAJOR_VERSION <= 5
-    transform_filter->SetInputConnection(grid->GetProducerPort());
-#else
-    transform_filter->SetInputData(grid);
-#endif
-    transform_filter->Update();
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-    mapper->SetInputConnection(transform_filter->GetOutputPort());
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-template<> cv::viz::WGrid cv::viz::Widget::cast<cv::viz::WGrid>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WGrid&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// text3D widget implementation
-
-cv::viz::WText3D::WText3D(const String &text, const Point3f &position, float text_scale, bool face_camera, const Color &color)
-{
-    vtkSmartPointer<vtkVectorText> textSource = vtkSmartPointer<vtkVectorText>::New();
-    textSource->SetText(text.c_str());
-    textSource->Update();
-
-    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
-    mapper->SetInputConnection(textSource->GetOutputPort());
-
-    if (face_camera)
-    {
-        vtkSmartPointer<vtkFollower> actor = vtkSmartPointer<vtkFollower>::New();
-        actor->SetMapper(mapper);
-        actor->SetPosition(position.x, position.y, position.z);
-        actor->SetScale(text_scale);
-        WidgetAccessor::setProp(*this, actor);
-    }
-    else
-    {
-        vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-        actor->SetMapper(mapper);
-        actor->SetPosition(position.x, position.y, position.z);
-        actor->SetScale(text_scale);
-        WidgetAccessor::setProp(*this, actor);
-    }
-
-    setColor(color);
-}
-
-void cv::viz::WText3D::setText(const String &text)
-{
-    vtkFollower *actor = vtkFollower::SafeDownCast(WidgetAccessor::getProp(*this));
-    CV_Assert("This widget does not support text." && actor);
-
-    // Update text source
-    vtkPolyDataMapper *mapper = vtkPolyDataMapper::SafeDownCast(actor->GetMapper());
-    vtkVectorText * textSource = vtkVectorText::SafeDownCast(mapper->GetInputConnection(0,0)->GetProducer());
-    CV_Assert("This widget does not support text." && textSource);
-
-    textSource->SetText(text.c_str());
-    textSource->Update();
-}
-
-cv::String cv::viz::WText3D::getText() const
-{
-    vtkFollower *actor = vtkFollower::SafeDownCast(WidgetAccessor::getProp(*this));
-    CV_Assert("This widget does not support text." && actor);
-
-    vtkPolyDataMapper *mapper = vtkPolyDataMapper::SafeDownCast(actor->GetMapper());
-    vtkVectorText * textSource = vtkVectorText::SafeDownCast(mapper->GetInputConnection(0,0)->GetProducer());
-    CV_Assert("This widget does not support text." && textSource);
-
-    return textSource->GetText();
-}
-
-template<> cv::viz::WText3D cv::viz::Widget::cast<cv::viz::WText3D>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WText3D&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// text widget implementation
-
-cv::viz::WText::WText(const String &text, const Point2i &pos, int font_size, const Color &color)
-{
-    vtkSmartPointer<vtkTextActor> actor = vtkSmartPointer<vtkTextActor>::New();
-    actor->SetPosition(pos.x, pos.y);
-    actor->SetInput(text.c_str());
-
-    vtkSmartPointer<vtkTextProperty> tprop = actor->GetTextProperty();
-    tprop->SetFontSize(font_size);
-    tprop->SetFontFamilyToArial();
-    tprop->SetJustificationToLeft();
-    tprop->BoldOn();
-
-    Color c = vtkcolor(color);
-    tprop->SetColor(c.val);
-
-    WidgetAccessor::setProp(*this, actor);
-}
-
-template<> cv::viz::WText cv::viz::Widget::cast<cv::viz::WText>()
-{
-    Widget2D widget = this->cast<Widget2D>();
-    return static_cast<WText&>(widget);
-}
-
-void cv::viz::WText::setText(const String &text)
-{
-    vtkTextActor *actor = vtkTextActor::SafeDownCast(WidgetAccessor::getProp(*this));
-    CV_Assert("This widget does not support text." && actor);
-    actor->SetInput(text.c_str());
-}
-
-cv::String cv::viz::WText::getText() const
-{
-    vtkTextActor *actor = vtkTextActor::SafeDownCast(WidgetAccessor::getProp(*this));
-    CV_Assert("This widget does not support text." && actor);
-    return actor->GetInput();
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// image overlay widget implementation
-
-cv::viz::WImageOverlay::WImageOverlay(const Mat &image, const Rect &rect)
-{
-    CV_Assert(!image.empty() && image.depth() == CV_8U);
-
-    // Create the vtk image and set its parameters based on input image
-    vtkSmartPointer<vtkImageData> vtk_image = vtkSmartPointer<vtkImageData>::New();
-    ConvertToVtkImage::convert(image, vtk_image);
-
-    // Need to flip the image as the coordinates are different in OpenCV and VTK
-    vtkSmartPointer<vtkImageFlip> flipFilter = vtkSmartPointer<vtkImageFlip>::New();
-    flipFilter->SetFilteredAxis(1); // Vertical flip
-#if VTK_MAJOR_VERSION <= 5
-    flipFilter->SetInputConnection(vtk_image->GetProducerPort());
-#else
-    flipFilter->SetInputData(vtk_image);
-#endif
-    flipFilter->Update();
-
-    // Scale the image based on the Rect
-    vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
-    transform->Scale(double(image.cols)/rect.width,double(image.rows)/rect.height,1.0);
-
-    vtkSmartPointer<vtkImageReslice> image_reslice = vtkSmartPointer<vtkImageReslice>::New();
-    image_reslice->SetResliceTransform(transform);
-    image_reslice->SetInputConnection(flipFilter->GetOutputPort());
-    image_reslice->SetOutputDimensionality(2);
-    image_reslice->InterpolateOn();
-    image_reslice->AutoCropOutputOn();
-
-    vtkSmartPointer<vtkImageMapper> imageMapper = vtkSmartPointer<vtkImageMapper>::New();
-    imageMapper->SetInputConnection(image_reslice->GetOutputPort());
-    imageMapper->SetColorWindow(255); // OpenCV color
-    imageMapper->SetColorLevel(127.5);
-
-    vtkSmartPointer<vtkActor2D> actor = vtkSmartPointer<vtkActor2D>::New();
-    actor->SetMapper(imageMapper);
-    actor->SetPosition(rect.x, rect.y);
-
-    WidgetAccessor::setProp(*this, actor);
-}
-
-void cv::viz::WImageOverlay::setImage(const Mat &image)
-{
-    CV_Assert(!image.empty() && image.depth() == CV_8U);
-
-    vtkActor2D *actor = vtkActor2D::SafeDownCast(WidgetAccessor::getProp(*this));
-    CV_Assert("This widget does not support overlay image." && actor);
-
-    vtkImageMapper *mapper = vtkImageMapper::SafeDownCast(actor->GetMapper());
-    CV_Assert("This widget does not support overlay image." && mapper);
-
-    // Create the vtk image and set its parameters based on input image
-    vtkSmartPointer<vtkImageData> vtk_image = vtkSmartPointer<vtkImageData>::New();
-    ConvertToVtkImage::convert(image, vtk_image);
-
-    // Need to flip the image as the coordinates are different in OpenCV and VTK
-    vtkSmartPointer<vtkImageFlip> flipFilter = vtkSmartPointer<vtkImageFlip>::New();
-    flipFilter->SetFilteredAxis(1); // Vertical flip
-#if VTK_MAJOR_VERSION <= 5
-    flipFilter->SetInputConnection(vtk_image->GetProducerPort());
-#else
-    flipFilter->SetInputData(vtk_image);
-#endif
-    flipFilter->Update();
-
-    mapper->SetInputConnection(flipFilter->GetOutputPort());
-}
-
-template<> cv::viz::WImageOverlay cv::viz::Widget::cast<cv::viz::WImageOverlay>()
-{
-    Widget2D widget = this->cast<Widget2D>();
-    return static_cast<WImageOverlay&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// image 3D widget implementation
-
-cv::viz::WImage3D::WImage3D(const Mat &image, const Size &size)
-{
-    CV_Assert(!image.empty() && image.depth() == CV_8U);
-
-    // Create the vtk image and set its parameters based on input image
-    vtkSmartPointer<vtkImageData> vtk_image = vtkSmartPointer<vtkImageData>::New();
-    ConvertToVtkImage::convert(image, vtk_image);
-
-    // Need to flip the image as the coordinates are different in OpenCV and VTK
-    vtkSmartPointer<vtkImageFlip> flipFilter = vtkSmartPointer<vtkImageFlip>::New();
-    flipFilter->SetFilteredAxis(1); // Vertical flip
-#if VTK_MAJOR_VERSION <= 5
-    flipFilter->SetInputConnection(vtk_image->GetProducerPort());
-#else
-    flipFilter->SetInputData(vtk_image);
-#endif
-    flipFilter->Update();
-
-    Vec3d plane_center(size.width * 0.5, size.height * 0.5, 0.0);
-
-    vtkSmartPointer<vtkPlaneSource> plane = vtkSmartPointer<vtkPlaneSource>::New();
-    plane->SetCenter(plane_center[0], plane_center[1], plane_center[2]);
-    plane->SetNormal(0.0, 0.0, 1.0);
-
-    vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
-    transform->PreMultiply();
-    transform->Translate(plane_center[0], plane_center[1], plane_center[2]);
-    transform->Scale(size.width, size.height, 1.0);
-    transform->Translate(-plane_center[0], -plane_center[1], -plane_center[2]);
-
-    vtkSmartPointer<vtkTransformPolyDataFilter> transform_filter = vtkSmartPointer<vtkTransformPolyDataFilter>::New();
-    transform_filter->SetTransform(transform);
-    transform_filter->SetInputConnection(plane->GetOutputPort());
-    transform_filter->Update();
-
-    // Apply the texture
-    vtkSmartPointer<vtkTexture> texture = vtkSmartPointer<vtkTexture>::New();
-    texture->SetInputConnection(flipFilter->GetOutputPort());
-
-    vtkSmartPointer<vtkTextureMapToPlane> texturePlane = vtkSmartPointer<vtkTextureMapToPlane>::New();
-    texturePlane->SetInputConnection(transform_filter->GetOutputPort());
-
-    vtkSmartPointer<vtkPolyDataMapper> planeMapper = vtkSmartPointer<vtkPolyDataMapper>::New();
-    planeMapper->SetInputConnection(texturePlane->GetOutputPort());
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    actor->SetMapper(planeMapper);
-    actor->SetTexture(texture);
-
-    WidgetAccessor::setProp(*this, actor);
-}
-
-cv::viz::WImage3D::WImage3D(const Vec3f &position, const Vec3f &normal, const Vec3f &up_vector, const Mat &image, const Size &size)
-{
-    CV_Assert(!image.empty() && image.depth() == CV_8U);
-
-    // Create the vtk image and set its parameters based on input image
-    vtkSmartPointer<vtkImageData> vtk_image = vtkSmartPointer<vtkImageData>::New();
-    ConvertToVtkImage::convert(image, vtk_image);
-
-    // Need to flip the image as the coordinates are different in OpenCV and VTK
-    vtkSmartPointer<vtkImageFlip> flipFilter = vtkSmartPointer<vtkImageFlip>::New();
-    flipFilter->SetFilteredAxis(1); // Vertical flip
-#if VTK_MAJOR_VERSION <= 5
-    flipFilter->SetInputConnection(vtk_image->GetProducerPort());
-#else
-    flipFilter->SetInputData(vtk_image);
-#endif
-    flipFilter->Update();
-
-    vtkSmartPointer<vtkPlaneSource> plane = vtkSmartPointer<vtkPlaneSource>::New();
-    plane->SetCenter(0.0, 0.0, 0.0);
-    plane->SetNormal(0.0, 0.0, 1.0);
-
-    // Compute the transformation matrix for drawing the camera frame in a scene
-    Vec3f u,v,n;
-    n = normalize(normal);
-    u = normalize(up_vector.cross(n));
-    v = n.cross(u);
-
-    vtkSmartPointer<vtkMatrix4x4> mat_trans = vtkSmartPointer<vtkMatrix4x4>::New();
-    mat_trans->SetElement(0,0,u[0]);
-    mat_trans->SetElement(0,1,u[1]);
-    mat_trans->SetElement(0,2,u[2]);
-    mat_trans->SetElement(1,0,v[0]);
-    mat_trans->SetElement(1,1,v[1]);
-    mat_trans->SetElement(1,2,v[2]);
-    mat_trans->SetElement(2,0,n[0]);
-    mat_trans->SetElement(2,1,n[1]);
-    mat_trans->SetElement(2,2,n[2]);
-    // Inverse rotation (orthogonal, so just take transpose)
-    mat_trans->Transpose();
-    // Then translate the coordinate frame to camera position
-    mat_trans->SetElement(0,3,position[0]);
-    mat_trans->SetElement(1,3,position[1]);
-    mat_trans->SetElement(2,3,position[2]);
-    mat_trans->SetElement(3,3,1);
-
-    // Apply the texture
-    vtkSmartPointer<vtkTexture> texture = vtkSmartPointer<vtkTexture>::New();
-    texture->SetInputConnection(flipFilter->GetOutputPort());
-
-    vtkSmartPointer<vtkTextureMapToPlane> texturePlane = vtkSmartPointer<vtkTextureMapToPlane>::New();
-    texturePlane->SetInputConnection(plane->GetOutputPort());
-
-    // Apply the transform after texture mapping
-    vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
-    transform->PreMultiply();
-    transform->SetMatrix(mat_trans);
-    transform->Scale(size.width, size.height, 1.0);
-
-    vtkSmartPointer<vtkTransformPolyDataFilter> transform_filter = vtkSmartPointer<vtkTransformPolyDataFilter>::New();
-    transform_filter->SetTransform(transform);
-    transform_filter->SetInputConnection(texturePlane->GetOutputPort());
-    transform_filter->Update();
-
-    vtkSmartPointer<vtkPolyDataMapper> planeMapper = vtkSmartPointer<vtkPolyDataMapper>::New();
-    planeMapper->SetInputConnection(transform_filter->GetOutputPort());
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    actor->SetMapper(planeMapper);
-    actor->SetTexture(texture);
-
-    WidgetAccessor::setProp(*this, actor);
-}
-
-void cv::viz::WImage3D::setImage(const Mat &image)
-{
-    CV_Assert(!image.empty() && image.depth() == CV_8U);
-
-    vtkActor *actor = vtkActor::SafeDownCast(WidgetAccessor::getProp(*this));
-    CV_Assert("This widget does not support 3D image." && actor);
-
-    // Create the vtk image and set its parameters based on input image
-    vtkSmartPointer<vtkImageData> vtk_image = vtkSmartPointer<vtkImageData>::New();
-    ConvertToVtkImage::convert(image, vtk_image);
-
-    // Need to flip the image as the coordinates are different in OpenCV and VTK
-    vtkSmartPointer<vtkImageFlip> flipFilter = vtkSmartPointer<vtkImageFlip>::New();
-    flipFilter->SetFilteredAxis(1); // Vertical flip
-#if VTK_MAJOR_VERSION <= 5
-    flipFilter->SetInputConnection(vtk_image->GetProducerPort());
-#else
-    flipFilter->SetInputData(vtk_image);
-#endif
-    flipFilter->Update();
-
-    // Apply the texture
-    vtkSmartPointer<vtkTexture> texture = vtkSmartPointer<vtkTexture>::New();
-    texture->SetInputConnection(flipFilter->GetOutputPort());
-
-    actor->SetTexture(texture);
-}
-
-template<> cv::viz::WImage3D cv::viz::Widget::cast<cv::viz::WImage3D>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WImage3D&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// camera position widget implementation
-
-struct cv::viz::WCameraPosition::ProjectImage
-{
-    static void projectImage(float fovy, float far_end_height, const Mat &image,
-                             double scale, const Color &color, vtkSmartPointer<vtkActor> actor)
-    {
-        // Create a camera
-        vtkSmartPointer<vtkCamera> camera = vtkSmartPointer<vtkCamera>::New();
-        float aspect_ratio = float(image.cols)/float(image.rows);
-
-        // Create the vtk image
-        vtkSmartPointer<vtkImageData> vtk_image = vtkSmartPointer<vtkImageData>::New();
-        ConvertToVtkImage::convert(image, vtk_image);
-
-        // Adjust a pixel of the vtk_image
-        vtk_image->SetScalarComponentFromDouble(0, image.rows-1, 0, 0, color[2]);
-        vtk_image->SetScalarComponentFromDouble(0, image.rows-1, 0, 1, color[1]);
-        vtk_image->SetScalarComponentFromDouble(0, image.rows-1, 0, 2, color[0]);
-
-        // Need to flip the image as the coordinates are different in OpenCV and VTK
-        vtkSmartPointer<vtkImageFlip> flipFilter = vtkSmartPointer<vtkImageFlip>::New();
-        flipFilter->SetFilteredAxis(1); // Vertical flip
-#if VTK_MAJOR_VERSION <= 5
-        flipFilter->SetInputConnection(vtk_image->GetProducerPort());
-#else
-        flipFilter->SetInputData(vtk_image);
-#endif
-        flipFilter->Update();
-
-        Vec3d plane_center(0.0, 0.0, scale);
-
-        vtkSmartPointer<vtkPlaneSource> plane = vtkSmartPointer<vtkPlaneSource>::New();
-        plane->SetCenter(plane_center[0], plane_center[1], plane_center[2]);
-        plane->SetNormal(0.0, 0.0, 1.0);
-
-        vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
-        transform->PreMultiply();
-        transform->Translate(plane_center[0], plane_center[1], plane_center[2]);
-        transform->Scale(far_end_height*aspect_ratio, far_end_height, 1.0);
-        transform->RotateY(180.0);
-        transform->Translate(-plane_center[0], -plane_center[1], -plane_center[2]);
-
-        // Apply the texture
-        vtkSmartPointer<vtkTexture> texture = vtkSmartPointer<vtkTexture>::New();
-        texture->SetInputConnection(flipFilter->GetOutputPort());
-
-        vtkSmartPointer<vtkTextureMapToPlane> texturePlane = vtkSmartPointer<vtkTextureMapToPlane>::New();
-        texturePlane->SetInputConnection(plane->GetOutputPort());
-
-        vtkSmartPointer<vtkTransformPolyDataFilter> transform_filter = vtkSmartPointer<vtkTransformPolyDataFilter>::New();
-        transform_filter->SetTransform(transform);
-        transform_filter->SetInputConnection(texturePlane->GetOutputPort());
-        transform_filter->Update();
-
-        // Create frustum
-        camera->SetViewAngle(fovy);
-        camera->SetPosition(0.0,0.0,0.0);
-        camera->SetViewUp(0.0,1.0,0.0);
-        camera->SetFocalPoint(0.0,0.0,1.0);
-        camera->SetClippingRange(0.01, scale);
-
-        double planesArray[24];
-        camera->GetFrustumPlanes(aspect_ratio, planesArray);
-
-        vtkSmartPointer<vtkPlanes> planes = vtkSmartPointer<vtkPlanes>::New();
-        planes->SetFrustumPlanes(planesArray);
-
-        vtkSmartPointer<vtkFrustumSource> frustumSource =
-        vtkSmartPointer<vtkFrustumSource>::New();
-        frustumSource->SetPlanes(planes);
-        frustumSource->Update();
-
-        vtkSmartPointer<vtkExtractEdges> filter = vtkSmartPointer<vtkExtractEdges>::New();
-        filter->SetInputConnection(frustumSource->GetOutputPort());
-        filter->Update();
-
-        // Frustum needs to be textured or else it can't be combined with image
-        vtkSmartPointer<vtkTextureMapToPlane> frustum_texture = vtkSmartPointer<vtkTextureMapToPlane>::New();
-        frustum_texture->SetInputConnection(filter->GetOutputPort());
-        // Texture mapping with only one pixel from the image to have constant color
-        frustum_texture->SetSRange(0.0, 0.0);
-        frustum_texture->SetTRange(0.0, 0.0);
-
-        vtkSmartPointer<vtkAppendPolyData> appendFilter = vtkSmartPointer<vtkAppendPolyData>::New();
-        appendFilter->AddInputConnection(frustum_texture->GetOutputPort());
-        appendFilter->AddInputConnection(transform_filter->GetOutputPort());
-
-        vtkSmartPointer<vtkPolyDataMapper> planeMapper = vtkSmartPointer<vtkPolyDataMapper>::New();
-        planeMapper->SetInputConnection(appendFilter->GetOutputPort());
-
-        actor->SetMapper(planeMapper);
-        actor->SetTexture(texture);
-    }
-};
-
-cv::viz::WCameraPosition::WCameraPosition(float scale)
-{
-    vtkSmartPointer<vtkAxes> axes = vtkSmartPointer<vtkAxes>::New();
-    axes->SetOrigin(0, 0, 0);
-    axes->SetScaleFactor(scale);
-
-    vtkSmartPointer<vtkFloatArray> axes_colors = vtkSmartPointer<vtkFloatArray>::New();
-    axes_colors->Allocate(6);
-    axes_colors->InsertNextValue(0.0);
-    axes_colors->InsertNextValue(0.0);
-    axes_colors->InsertNextValue(0.5);
-    axes_colors->InsertNextValue(0.5);
-    axes_colors->InsertNextValue(1.0);
-    axes_colors->InsertNextValue(1.0);
-
-    vtkSmartPointer<vtkPolyData> axes_data = axes->GetOutput();
-#if VTK_MAJOR_VERSION <= 5
-    axes_data->Update();
-#else
-    axes->Update();
-#endif
-    axes_data->GetPointData()->SetScalars(axes_colors);
-
-    vtkSmartPointer<vtkTubeFilter> axes_tubes = vtkSmartPointer<vtkTubeFilter>::New();
-#if VTK_MAJOR_VERSION <= 5
-    axes_tubes->SetInput(axes_data);
-#else
-    axes_tubes->SetInputData(axes_data);
-#endif
-    axes_tubes->SetRadius(axes->GetScaleFactor() / 50.0);
-    axes_tubes->SetNumberOfSides(6);
-
-    vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-    mapper->SetScalarModeToUsePointData();
-    mapper->SetInputConnection(axes_tubes->GetOutputPort());
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-}
-
-cv::viz::WCameraPosition::WCameraPosition(const Matx33f &K, float scale, const Color &color)
-{
-    vtkSmartPointer<vtkCamera> camera = vtkSmartPointer<vtkCamera>::New();
-    float f_x = K(0,0);
-    float f_y = K(1,1);
-    float c_y = K(1,2);
-    float aspect_ratio = f_y / f_x;
-    // Assuming that this is an ideal camera (c_y and c_x are at the center of the image)
-    float fovy = 2.0f * atan2(c_y,f_y) * 180 / CV_PI;
-
-    camera->SetViewAngle(fovy);
-    camera->SetPosition(0.0,0.0,0.0);
-    camera->SetViewUp(0.0,1.0,0.0);
-    camera->SetFocalPoint(0.0,0.0,1.0);
-    camera->SetClippingRange(0.01, scale);
-
-    double planesArray[24];
-    camera->GetFrustumPlanes(aspect_ratio, planesArray);
-
-    vtkSmartPointer<vtkPlanes> planes = vtkSmartPointer<vtkPlanes>::New();
-    planes->SetFrustumPlanes(planesArray);
-
-    vtkSmartPointer<vtkFrustumSource> frustumSource =
-    vtkSmartPointer<vtkFrustumSource>::New();
-    frustumSource->SetPlanes(planes);
-    frustumSource->Update();
-
-    vtkSmartPointer<vtkExtractEdges> filter = vtkSmartPointer<vtkExtractEdges>::New();
-    filter->SetInputConnection(frustumSource->GetOutputPort());
-    filter->Update();
-
-    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
-    mapper->SetInputConnection(filter->GetOutputPort());
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-
-cv::viz::WCameraPosition::WCameraPosition(const Vec2f &fov, float scale, const Color &color)
-{
-    vtkSmartPointer<vtkCamera> camera = vtkSmartPointer<vtkCamera>::New();
-
-    camera->SetViewAngle(fov[1] * 180 / CV_PI); // Vertical field of view
-    camera->SetPosition(0.0,0.0,0.0);
-    camera->SetViewUp(0.0,1.0,0.0);
-    camera->SetFocalPoint(0.0,0.0,1.0);
-    camera->SetClippingRange(0.01, scale);
-
-    double aspect_ratio = tan(fov[0] * 0.5) / tan(fov[1] * 0.5);
-
-    double planesArray[24];
-    camera->GetFrustumPlanes(aspect_ratio, planesArray);
-
-    vtkSmartPointer<vtkPlanes> planes = vtkSmartPointer<vtkPlanes>::New();
-    planes->SetFrustumPlanes(planesArray);
-
-    vtkSmartPointer<vtkFrustumSource> frustumSource =
-    vtkSmartPointer<vtkFrustumSource>::New();
-    frustumSource->SetPlanes(planes);
-    frustumSource->Update();
-
-    // Extract the edges so we have the grid
-    vtkSmartPointer<vtkExtractEdges> filter = vtkSmartPointer<vtkExtractEdges>::New();
-    filter->SetInputConnection(frustumSource->GetOutputPort());
-    filter->Update();
-
-    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
-    mapper->SetInputConnection(filter->GetOutputPort());
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-cv::viz::WCameraPosition::WCameraPosition(const Matx33f &K, const Mat &image, float scale, const Color &color)
-{
-    CV_Assert(!image.empty() && image.depth() == CV_8U);
-    float f_y = K(1,1);
-    float c_y = K(1,2);
-    // Assuming that this is an ideal camera (c_y and c_x are at the center of the image)
-    float fovy = 2.0f * atan2(c_y,f_y) * 180.0f / CV_PI;
-    float far_end_height = 2.0f * c_y * scale / f_y;
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    ProjectImage::projectImage(fovy, far_end_height, image, scale, color, actor);
-    WidgetAccessor::setProp(*this, actor);
-}
-
-cv::viz::WCameraPosition::WCameraPosition(const Vec2f &fov, const Mat &image, float scale, const Color &color)
-{
-    CV_Assert(!image.empty() && image.depth() == CV_8U);
-    float fovy = fov[1] * 180.0f / CV_PI;
-    float far_end_height = 2.0 * scale * tan(fov[1] * 0.5);
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    ProjectImage::projectImage(fovy, far_end_height, image, scale, color, actor);
-    WidgetAccessor::setProp(*this, actor);
-}
-
-template<> cv::viz::WCameraPosition cv::viz::Widget::cast<cv::viz::WCameraPosition>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WCameraPosition&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// trajectory widget implementation
-
-struct cv::viz::WTrajectory::ApplyPath
-{
-    static void applyPath(vtkSmartPointer<vtkPolyData> poly_data, vtkSmartPointer<vtkAppendPolyData> append_filter, const std::vector<Affine3f> &path)
-    {
-        vtkIdType nr_points = path.size();
-
-        for (vtkIdType i = 0; i < nr_points; ++i)
-        {
-            vtkSmartPointer<vtkPolyData> new_data = vtkSmartPointer<vtkPolyData>::New();
-            new_data->DeepCopy(poly_data);
-
-            // Transform the default coordinate frame
-            vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
-            transform->PreMultiply();
-            vtkSmartPointer<vtkMatrix4x4> mat_trans = vtkSmartPointer<vtkMatrix4x4>::New();
-            mat_trans = convertToVtkMatrix(path[i].matrix);
-            transform->SetMatrix(mat_trans);
-
-            vtkSmartPointer<vtkTransformPolyDataFilter> filter = vtkSmartPointer<vtkTransformPolyDataFilter>::New();
-#if VTK_MAJOR_VERSION <= 5
-            filter->SetInput(new_data);
-#else
-            filter->SetInputData(new_data);
-#endif
-            filter->SetTransform(transform);
-            filter->Update();
-
-            append_filter->AddInputConnection(filter->GetOutputPort());
-        }
-    }
-};
-
-cv::viz::WTrajectory::WTrajectory(const std::vector<Affine3f> &path, int display_mode, const Color &color, float scale)
-{
-    vtkSmartPointer<vtkAppendPolyData> appendFilter = vtkSmartPointer<vtkAppendPolyData>::New();
-
-    // Bitwise and with 3 in order to limit the domain to 2 bits
-    if ((~display_mode & 3) ^ WTrajectory::DISPLAY_PATH)
-    {
-        // Create a poly line along the path
-        vtkIdType nr_points = path.size();
-
-        vtkSmartPointer<vtkPoints> points = vtkSmartPointer<vtkPoints>::New();
-        vtkSmartPointer<vtkPolyData> polyData = vtkSmartPointer<vtkPolyData>::New();
-        vtkSmartPointer<vtkPolyLine> polyLine = vtkSmartPointer<vtkPolyLine>::New();
-
-        points->SetDataTypeToFloat();
-        points->SetNumberOfPoints(nr_points);
-        polyLine->GetPointIds()->SetNumberOfIds(nr_points);
-
-        Vec3f *data_beg = vtkpoints_data<float>(points);
-
-        for (vtkIdType i = 0; i < nr_points; ++i)
-        {
-            Vec3f cam_pose = path[i].translation();
-            *data_beg++ = cam_pose;
-            polyLine->GetPointIds()->SetId(i,i);
-        }
-
-        vtkSmartPointer<vtkCellArray> cells = vtkSmartPointer<vtkCellArray>::New();
-        cells->InsertNextCell(polyLine);
-
-        polyData->SetPoints(points);
-        polyData->SetLines(cells);
-
-        // Set the color for polyData
-        vtkSmartPointer<vtkUnsignedCharArray> colors = vtkSmartPointer<vtkUnsignedCharArray>::New();
-        colors->SetNumberOfComponents(3);
-        colors->SetNumberOfTuples(nr_points);
-        colors->FillComponent(0, color[2]);
-        colors->FillComponent(1, color[1]);
-        colors->FillComponent(2, color[0]);
-
-        polyData->GetPointData()->SetScalars(colors);
-#if VTK_MAJOR_VERSION <= 5
-        appendFilter->AddInputConnection(polyData->GetProducerPort());
-#else
-        appendFilter->AddInputData(polyData);
-#endif
-    }
-
-    if ((~display_mode & 3) ^ WTrajectory::DISPLAY_FRAMES)
-    {
-        // Create frames and transform along the path
-        vtkSmartPointer<vtkAxes> axes = vtkSmartPointer<vtkAxes>::New();
-        axes->SetOrigin(0, 0, 0);
-        axes->SetScaleFactor(scale);
-
-        vtkSmartPointer<vtkUnsignedCharArray> axes_colors = vtkSmartPointer<vtkUnsignedCharArray>::New();
-        axes_colors->SetNumberOfComponents(3);
-        axes_colors->InsertNextTuple3(255,0,0);
-        axes_colors->InsertNextTuple3(255,0,0);
-        axes_colors->InsertNextTuple3(0,255,0);
-        axes_colors->InsertNextTuple3(0,255,0);
-        axes_colors->InsertNextTuple3(0,0,255);
-        axes_colors->InsertNextTuple3(0,0,255);
-
-        vtkSmartPointer<vtkPolyData> axes_data = axes->GetOutput();
-#if VTK_MAJOR_VERSION <= 5
-        axes_data->Update();
-#else
-        axes->Update();
-#endif
-        axes_data->GetPointData()->SetScalars(axes_colors);
-
-        vtkSmartPointer<vtkTubeFilter> axes_tubes = vtkSmartPointer<vtkTubeFilter>::New();
-#if VTK_MAJOR_VERSION <= 5
-        axes_tubes->SetInput(axes_data);
-#else
-        axes_tubes->SetInputData(axes_data);
-#endif
-        axes_tubes->SetRadius(axes->GetScaleFactor() / 50.0);
-        axes_tubes->SetNumberOfSides(6);
-        axes_tubes->Update();
-
-        ApplyPath::applyPath(axes_tubes->GetOutput(), appendFilter, path);
-    }
-
-    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
-    mapper->SetScalarModeToUsePointData();
-    mapper->SetInputConnection(appendFilter->GetOutputPort());
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-}
-
-cv::viz::WTrajectory::WTrajectory(const std::vector<Affine3f> &path, const Matx33f &K, float scale, const Color &color)
-{
-    vtkSmartPointer<vtkCamera> camera = vtkSmartPointer<vtkCamera>::New();
-    float f_x = K(0,0);
-    float f_y = K(1,1);
-    float c_y = K(1,2);
-    float aspect_ratio = f_y / f_x;
-    // Assuming that this is an ideal camera (c_y and c_x are at the center of the image)
-    float fovy = 2.0f * atan2(c_y,f_y) * 180 / CV_PI;
-
-    camera->SetViewAngle(fovy);
-    camera->SetPosition(0.0,0.0,0.0);
-    camera->SetViewUp(0.0,1.0,0.0);
-    camera->SetFocalPoint(0.0,0.0,1.0);
-    camera->SetClippingRange(0.01, scale);
-
-    double planesArray[24];
-    camera->GetFrustumPlanes(aspect_ratio, planesArray);
-
-    vtkSmartPointer<vtkPlanes> planes = vtkSmartPointer<vtkPlanes>::New();
-    planes->SetFrustumPlanes(planesArray);
-
-    vtkSmartPointer<vtkFrustumSource> frustumSource = vtkSmartPointer<vtkFrustumSource>::New();
-    frustumSource->SetPlanes(planes);
-    frustumSource->Update();
-
-    // Extract the edges
-    vtkSmartPointer<vtkExtractEdges> filter = vtkSmartPointer<vtkExtractEdges>::New();
-    filter->SetInputConnection(frustumSource->GetOutputPort());
-    filter->Update();
-
-    vtkSmartPointer<vtkAppendPolyData> appendFilter = vtkSmartPointer<vtkAppendPolyData>::New();
-    ApplyPath::applyPath(filter->GetOutput(), appendFilter, path);
-
-    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
-    mapper->SetInputConnection(appendFilter->GetOutputPort());
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-cv::viz::WTrajectory::WTrajectory(const std::vector<Affine3f> &path, const Vec2f &fov, float scale, const Color &color)
-{
-    vtkSmartPointer<vtkCamera> camera = vtkSmartPointer<vtkCamera>::New();
-
-    camera->SetViewAngle(fov[1] * 180 / CV_PI); // Vertical field of view
-    camera->SetPosition(0.0,0.0,0.0);
-    camera->SetViewUp(0.0,1.0,0.0);
-    camera->SetFocalPoint(0.0,0.0,1.0);
-    camera->SetClippingRange(0.01, scale);
-
-    double aspect_ratio = tan(fov[0] * 0.5) / tan(fov[1] * 0.5);
-
-    double planesArray[24];
-    camera->GetFrustumPlanes(aspect_ratio, planesArray);
-
-    vtkSmartPointer<vtkPlanes> planes = vtkSmartPointer<vtkPlanes>::New();
-    planes->SetFrustumPlanes(planesArray);
-
-    vtkSmartPointer<vtkFrustumSource> frustumSource = vtkSmartPointer<vtkFrustumSource>::New();
-    frustumSource->SetPlanes(planes);
-    frustumSource->Update();
-
-    // Extract the edges
-    vtkSmartPointer<vtkExtractEdges> filter = vtkSmartPointer<vtkExtractEdges>::New();
-    filter->SetInputConnection(frustumSource->GetOutputPort());
-    filter->Update();
-
-    vtkSmartPointer<vtkAppendPolyData> appendFilter = vtkSmartPointer<vtkAppendPolyData>::New();
-    ApplyPath::applyPath(filter->GetOutput(), appendFilter, path);
-
-    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
-    mapper->SetInputConnection(appendFilter->GetOutputPort());
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-    setColor(color);
-}
-
-template<> cv::viz::WTrajectory cv::viz::Widget::cast<cv::viz::WTrajectory>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WTrajectory&>(widget);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// spheres trajectory widget implementation
-
-cv::viz::WSpheresTrajectory::WSpheresTrajectory(const std::vector<Affine3f> &path, float line_length, float init_sphere_radius, float sphere_radius,
-                                                          const Color &line_color, const Color &sphere_color)
-{
-    vtkSmartPointer<vtkAppendPolyData> appendFilter = vtkSmartPointer<vtkAppendPolyData>::New();
-    vtkIdType nr_poses = path.size();
-
-    // Create color arrays
-    vtkSmartPointer<vtkUnsignedCharArray> line_scalars = vtkSmartPointer<vtkUnsignedCharArray>::New();
-    line_scalars->SetNumberOfComponents(3);
-    line_scalars->InsertNextTuple3(line_color[2], line_color[1], line_color[0]);
-
-    // Create color array for sphere
-    vtkSphereSource * dummy_sphere = vtkSphereSource::New();
-    // Create the array for big sphere
-    dummy_sphere->SetRadius(init_sphere_radius);
-    dummy_sphere->Update();
-    vtkIdType nr_points = dummy_sphere->GetOutput()->GetNumberOfCells();
-    vtkSmartPointer<vtkUnsignedCharArray> sphere_scalars_init = vtkSmartPointer<vtkUnsignedCharArray>::New();
-    sphere_scalars_init->SetNumberOfComponents(3);
-    sphere_scalars_init->SetNumberOfTuples(nr_points);
-    sphere_scalars_init->FillComponent(0, sphere_color[2]);
-    sphere_scalars_init->FillComponent(1, sphere_color[1]);
-    sphere_scalars_init->FillComponent(2, sphere_color[0]);
-    // Create the array for small sphere
-    dummy_sphere->SetRadius(sphere_radius);
-    dummy_sphere->Update();
-    nr_points = dummy_sphere->GetOutput()->GetNumberOfCells();
-    vtkSmartPointer<vtkUnsignedCharArray> sphere_scalars = vtkSmartPointer<vtkUnsignedCharArray>::New();
-    sphere_scalars->SetNumberOfComponents(3);
-    sphere_scalars->SetNumberOfTuples(nr_points);
-    sphere_scalars->FillComponent(0, sphere_color[2]);
-    sphere_scalars->FillComponent(1, sphere_color[1]);
-    sphere_scalars->FillComponent(2, sphere_color[0]);
-    dummy_sphere->Delete();
-
-    for (vtkIdType i = 0; i < nr_poses; ++i)
-    {
-        Point3f new_pos = path[i].translation();
-
-        vtkSmartPointer<vtkSphereSource> sphere_source = vtkSmartPointer<vtkSphereSource>::New();
-        sphere_source->SetCenter(new_pos.x, new_pos.y, new_pos.z);
-        if (i == 0)
-        {
-            sphere_source->SetRadius(init_sphere_radius);
-            sphere_source->Update();
-            sphere_source->GetOutput()->GetCellData()->SetScalars(sphere_scalars_init);
-            appendFilter->AddInputConnection(sphere_source->GetOutputPort());
-            continue;
-        }
-        else
-        {
-            sphere_source->SetRadius(sphere_radius);
-            sphere_source->Update();
-            sphere_source->GetOutput()->GetCellData()->SetScalars(sphere_scalars);
-            appendFilter->AddInputConnection(sphere_source->GetOutputPort());
-        }
-
-
-        Affine3f relativeAffine = path[i].inv() * path[i-1];
-        Vec3f v = path[i].rotation() * relativeAffine.translation();
-        v = normalize(v) * line_length;
-
-        vtkSmartPointer<vtkLineSource> line_source = vtkSmartPointer<vtkLineSource>::New();
-        line_source->SetPoint1(new_pos.x + v[0], new_pos.y + v[1], new_pos.z + v[2]);
-        line_source->SetPoint2(new_pos.x, new_pos.y, new_pos.z);
-        line_source->Update();
-        line_source->GetOutput()->GetCellData()->SetScalars(line_scalars);
-
-        appendFilter->AddInputConnection(line_source->GetOutputPort());
-    }
-
-    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
-    mapper->SetScalarModeToUseCellData();
-    mapper->SetInputConnection(appendFilter->GetOutputPort());
-
-    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
-    actor->SetMapper(mapper);
-
-    WidgetAccessor::setProp(*this, actor);
-}
-
-template<> cv::viz::WSpheresTrajectory cv::viz::Widget::cast<cv::viz::WSpheresTrajectory>()
-{
-    Widget3D widget = this->cast<Widget3D>();
-    return static_cast<WSpheresTrajectory&>(widget);
-}
diff --git a/modules/viz/src/shapes.cpp b/modules/viz/src/shapes.cpp
new file mode 100644
index 000000000..cc3a51ce4
--- /dev/null
+++ b/modules/viz/src/shapes.cpp
@@ -0,0 +1,1088 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Authors:
+//  * Ozan Tonkal, ozantonkal@gmail.com
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
+//M*/
+
+#include "precomp.hpp"
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// line widget implementation
+cv::viz::WLine::WLine(const Point3d &pt1, const Point3d &pt2, const Color &color)
+{
+    vtkSmartPointer<vtkLineSource> line = vtkSmartPointer<vtkLineSource>::New();
+    line->SetPoint1(pt1.x, pt1.y, pt1.z);
+    line->SetPoint2(pt2.x, pt2.y, pt2.z);
+    line->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, line->GetOutput());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+template<> cv::viz::WLine cv::viz::Widget::cast<cv::viz::WLine>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WLine&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// sphere widget implementation
+
+cv::viz::WSphere::WSphere(const Point3d &center, double radius, int sphere_resolution, const Color &color)
+{
+    vtkSmartPointer<vtkSphereSource> sphere = vtkSmartPointer<vtkSphereSource>::New();
+    sphere->SetRadius(radius);
+    sphere->SetCenter(center.x, center.y, center.z);
+    sphere->SetPhiResolution(sphere_resolution);
+    sphere->SetThetaResolution(sphere_resolution);
+    sphere->LatLongTessellationOff();
+    sphere->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, sphere->GetOutput());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+template<> cv::viz::WSphere cv::viz::Widget::cast<cv::viz::WSphere>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WSphere&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// plane widget implementation
+
+cv::viz::WPlane::WPlane(const Size2d& size, const Color &color)
+{
+    vtkSmartPointer<vtkPlaneSource> plane = vtkSmartPointer<vtkPlaneSource>::New();
+    plane->SetOrigin(-0.5 * size.width, -0.5 * size.height, 0.0);
+    plane->SetPoint1( 0.5 * size.width, -0.5 * size.height, 0.0);
+    plane->SetPoint2(-0.5 * size.width,  0.5 * size.height, 0.0);
+    plane->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, plane->GetOutput());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+    actor->GetProperty()->LightingOff();
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+cv::viz::WPlane::WPlane(const Point3d& center, const Vec3d& normal, const Vec3d& new_yaxis, const Size2d& size, const Color &color)
+{
+    Vec3d zvec = normalize(normal);
+    Vec3d xvec = normalize(new_yaxis.cross(zvec));
+    Vec3d yvec = zvec.cross(xvec);
+
+    WPlane plane(size, color);
+    plane.applyTransform(makeTransformToGlobal(xvec, yvec, zvec, center));
+    *this = plane;
+}
+
+template<> cv::viz::WPlane cv::viz::Widget::cast<cv::viz::WPlane>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WPlane&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// arrow widget implementation
+
+cv::viz::WArrow::WArrow(const Point3d& pt1, const Point3d& pt2, double thickness, const Color &color)
+{
+    vtkSmartPointer<vtkArrowSource> arrow_source = vtkSmartPointer<vtkArrowSource>::New();
+    arrow_source->SetShaftRadius(thickness);
+    arrow_source->SetTipRadius(thickness * 3.0);
+    arrow_source->SetTipLength(thickness * 10.0);
+
+    Vec3d arbitrary = get_random_vec();
+    Vec3d start_point(pt1.x, pt1.y, pt1.z), end_point(pt2.x, pt2.y, pt2.z);
+
+    double length = norm(end_point - start_point);
+
+    Vec3d xvec = normalized(end_point - start_point);
+    Vec3d zvec = normalized(xvec.cross(arbitrary));
+    Vec3d yvec = zvec.cross(xvec);
+
+    Matx33d R = makeTransformToGlobal(xvec, yvec, zvec).rotation();
+    Affine3d transform_with_scale(R * length, start_point);
+
+    vtkSmartPointer<vtkPolyData> polydata = VtkUtils::TransformPolydata(arrow_source->GetOutputPort(), transform_with_scale);
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, polydata);
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+template<> cv::viz::WArrow cv::viz::Widget::cast<cv::viz::WArrow>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WArrow&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// circle widget implementation
+
+cv::viz::WCircle::WCircle(double radius, double thickness, const Color &color)
+{
+    vtkSmartPointer<vtkDiskSource> disk = vtkSmartPointer<vtkDiskSource>::New();
+    disk->SetCircumferentialResolution(30);
+    disk->SetInnerRadius(radius - thickness);
+    disk->SetOuterRadius(radius + thickness);
+    disk->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, disk->GetOutput());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->GetProperty()->LightingOff();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+
+}
+
+cv::viz::WCircle::WCircle(double radius, const Point3d& center, const Vec3d& normal, double thickness, const Color &color)
+{
+    Vec3d arbitrary = get_random_vec();
+    Vec3d zvec = normalized(normal);
+    Vec3d xvec = normalized(zvec.cross(arbitrary));
+    Vec3d yvec = zvec.cross(xvec);
+
+    WCircle circle(radius, thickness, color);
+    circle.applyTransform(makeTransformToGlobal(xvec, yvec, zvec, center));
+    *this = circle;
+}
+
+template<> cv::viz::WCircle cv::viz::Widget::cast<cv::viz::WCircle>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WCircle&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// WCone widget implementation
+
+cv::viz::WCone::WCone(double length, double radius, int resolution, const Color &color)
+{
+    vtkSmartPointer<vtkConeSource> cone_source = vtkSmartPointer<vtkConeSource>::New();
+    cone_source->SetCenter(length*0.5, 0.0, 0.0);
+    cone_source->SetHeight(length);
+    cone_source->SetRadius(radius);
+    cone_source->SetResolution(resolution);
+    cone_source->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, cone_source->GetOutput());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+cv::viz::WCone::WCone(double radius, const Point3d& center, const Point3d& tip, int resolution, const Color &color)
+{
+    Vec3d arbitrary = get_random_vec();
+    Vec3d xvec = normalized(Vec3d(tip - center));
+    Vec3d zvec = normalized(xvec.cross(arbitrary));
+    Vec3d yvec = zvec.cross(xvec);
+
+    WCone circle(norm(tip - center), radius, resolution, color);
+    circle.applyTransform(makeTransformToGlobal(xvec, yvec, zvec, center));
+    *this = circle;
+}
+
+template<> cv::viz::WCone cv::viz::Widget::cast<cv::viz::WCone>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WCone&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// cylinder widget implementation
+
+cv::viz::WCylinder::WCylinder(const Point3d& axis_point1, const Point3d& axis_point2, double radius, int numsides, const Color &color)
+{
+    vtkSmartPointer<vtkLineSource> line = vtkSmartPointer<vtkLineSource>::New();
+    line->SetPoint1(axis_point1.x, axis_point1.y, axis_point1.z);
+    line->SetPoint2(axis_point2.x, axis_point2.y, axis_point2.z);
+
+    vtkSmartPointer<vtkTubeFilter> tuber = vtkSmartPointer<vtkTubeFilter>::New();
+    tuber->SetInputConnection(line->GetOutputPort());
+    tuber->SetNumberOfSides(numsides);
+    tuber->SetRadius(radius);
+    tuber->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, tuber->GetOutput());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+template<> cv::viz::WCylinder cv::viz::Widget::cast<cv::viz::WCylinder>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WCylinder&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// cylinder widget implementation
+
+cv::viz::WCube::WCube(const Point3d& min_point, const Point3d& max_point, bool wire_frame, const Color &color)
+{
+    double bounds[6];
+    bounds[0] = std::min(min_point.x, max_point.x);
+    bounds[1] = std::max(min_point.x, max_point.x);
+    bounds[2] = std::min(min_point.y, max_point.y);
+    bounds[3] = std::max(min_point.y, max_point.y);
+    bounds[4] = std::min(min_point.z, max_point.z);
+    bounds[5] = std::max(min_point.z, max_point.z);
+
+    vtkSmartPointer<vtkPolyDataAlgorithm> cube;
+    if (wire_frame)
+    {
+        cube = vtkSmartPointer<vtkOutlineSource>::New();
+        vtkOutlineSource::SafeDownCast(cube)->SetBounds(bounds);
+    }
+    else
+    {
+        cube = vtkSmartPointer<vtkCubeSource>::New();
+        vtkCubeSource::SafeDownCast(cube)->SetBounds(bounds);
+    }
+    cube->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, cube->GetOutput());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+template<> cv::viz::WCube cv::viz::Widget::cast<cv::viz::WCube>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WCube&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// coordinate system widget implementation
+
+cv::viz::WCoordinateSystem::WCoordinateSystem(double scale)
+{
+    vtkSmartPointer<vtkAxes> axes = vtkSmartPointer<vtkAxes>::New();
+    axes->SetOrigin(0, 0, 0);
+    axes->SetScaleFactor(scale);
+    axes->Update();
+
+    vtkSmartPointer<vtkUnsignedCharArray> colors = vtkSmartPointer<vtkUnsignedCharArray>::New();
+    colors->SetNumberOfComponents(3);
+    colors->InsertNextTuple3(255, 0, 0);
+    colors->InsertNextTuple3(255, 0, 0);
+    colors->InsertNextTuple3(0, 255, 0);
+    colors->InsertNextTuple3(0, 255, 0);
+    colors->InsertNextTuple3(0, 0, 255);
+    colors->InsertNextTuple3(0, 0, 255);
+
+    vtkSmartPointer<vtkPolyData> polydata = axes->GetOutput();
+    polydata->GetPointData()->SetScalars(colors);
+
+    vtkSmartPointer<vtkTubeFilter> tube_filter = vtkSmartPointer<vtkTubeFilter>::New();
+    VtkUtils::SetInputData(tube_filter, polydata);
+    tube_filter->SetRadius(axes->GetScaleFactor() / 50.0);
+    tube_filter->SetNumberOfSides(6);
+    tube_filter->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    mapper->SetScalarModeToUsePointData();
+    VtkUtils::SetInputData(mapper, tube_filter->GetOutput());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+template<> cv::viz::WCoordinateSystem cv::viz::Widget::cast<cv::viz::WCoordinateSystem>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WCoordinateSystem&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// polyline widget implementation
+
+cv::viz::WPolyLine::WPolyLine(InputArray _points, const Color &color)
+{
+    CV_Assert(_points.type() == CV_32FC3 || _points.type() == CV_32FC4 || _points.type() == CV_64FC3 || _points.type() == CV_64FC4);
+
+    const float *fpoints = _points.getMat().ptr<float>();
+    const double *dpoints = _points.getMat().ptr<double>();
+    size_t total = _points.total();
+    int s_chs = _points.channels();
+
+    vtkSmartPointer<vtkPoints> points = vtkSmartPointer<vtkPoints>::New();
+    points->SetDataType(_points.depth() == CV_32F ? VTK_FLOAT : VTK_DOUBLE);
+    points->SetNumberOfPoints(total);
+
+    if (_points.depth() == CV_32F)
+        for(size_t i = 0; i < total; ++i, fpoints += s_chs)
+            points->SetPoint(i, fpoints);
+
+    if (_points.depth() == CV_64F)
+        for(size_t i = 0; i < total; ++i, dpoints += s_chs)
+            points->SetPoint(i, dpoints);
+
+    vtkSmartPointer<vtkCellArray> cell_array = vtkSmartPointer<vtkCellArray>::New();
+    cell_array->Allocate(cell_array->EstimateSize(1, total));
+    cell_array->InsertNextCell(total);
+    for(size_t i = 0; i < total; ++i)
+        cell_array->InsertCellPoint(i);
+
+    vtkSmartPointer<vtkUnsignedCharArray> scalars =  VtkUtils::FillScalars(total, color);
+
+    vtkSmartPointer<vtkPolyData> polydata = vtkSmartPointer<vtkPolyData>::New();
+    polydata->SetPoints(points);
+    polydata->SetLines(cell_array);
+    polydata->GetPointData()->SetScalars(scalars);
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, polydata);
+    mapper->SetScalarRange(0, 255);
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+template<> cv::viz::WPolyLine cv::viz::Widget::cast<cv::viz::WPolyLine>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WPolyLine&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// grid widget implementation
+
+
+cv::viz::WGrid::WGrid(const Vec2i &cells, const Vec2d &cells_spacing, const Color &color)
+{
+    vtkSmartPointer<vtkImageData> grid_data = vtkSmartPointer<vtkImageData>::New();
+
+    // Add 1 to dimensions because in ImageData dimensions is the number of lines
+    // - however here it means number of cells
+    grid_data->SetDimensions(cells[0]+1, cells[1]+1, 1);
+    grid_data->SetSpacing(cells_spacing[0], cells_spacing[1], 0.);
+
+    // Set origin of the grid to be the middle of the grid
+    grid_data->SetOrigin(cells[0] * cells_spacing[0] * (-0.5), cells[1] * cells_spacing[1] * (-0.5), 0);
+
+    // Extract the edges so we have the grid
+    vtkSmartPointer<vtkExtractEdges> extract_edges = vtkSmartPointer<vtkExtractEdges>::New();
+    VtkUtils::SetInputData(extract_edges, grid_data);
+    extract_edges->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, extract_edges->GetOutput());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+cv::viz::WGrid::WGrid(const Point3d& center, const Vec3d& normal, const Vec3d& new_yaxis, const Vec2i &cells, const Vec2d &cells_spacing, const Color &color)
+{
+    Vec3d zvec = normalize(normal);
+    Vec3d xvec = normalize(new_yaxis.cross(zvec));
+    Vec3d yvec = zvec.cross(xvec);
+
+    WGrid grid(cells, cells_spacing, color);
+    grid.applyTransform(makeTransformToGlobal(xvec, yvec, zvec, center));
+    *this = grid;
+}
+
+template<> cv::viz::WGrid cv::viz::Widget::cast<cv::viz::WGrid>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WGrid&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// text3D widget implementation
+
+cv::viz::WText3D::WText3D(const String &text, const Point3d &position, double text_scale, bool face_camera, const Color &color)
+{
+    vtkSmartPointer<vtkVectorText> textSource = vtkSmartPointer<vtkVectorText>::New();
+    textSource->SetText(text.c_str());
+    textSource->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    mapper->SetInputConnection(textSource->GetOutputPort());
+
+    if (face_camera)
+    {
+        vtkSmartPointer<vtkFollower> actor = vtkSmartPointer<vtkFollower>::New();
+        actor->SetMapper(mapper);
+        actor->SetPosition(position.x, position.y, position.z);
+        actor->SetScale(text_scale);
+        WidgetAccessor::setProp(*this, actor);
+    }
+    else
+    {
+        vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+        actor->SetMapper(mapper);
+        actor->SetPosition(position.x, position.y, position.z);
+        actor->SetScale(text_scale);
+        actor->GetProperty()->LightingOff();
+        WidgetAccessor::setProp(*this, actor);
+    }
+
+    setColor(color);
+}
+
+void cv::viz::WText3D::setText(const String &text)
+{
+    vtkActor *actor = vtkActor::SafeDownCast(WidgetAccessor::getProp(*this));
+    CV_Assert("This widget does not support text." && actor);
+
+    // Update text source
+    vtkPolyDataMapper *mapper = vtkPolyDataMapper::SafeDownCast(actor->GetMapper());
+    vtkVectorText * textSource = vtkVectorText::SafeDownCast(mapper->GetInputConnection(0,0)->GetProducer());
+    CV_Assert("This widget does not support text." && textSource);
+
+    textSource->SetText(text.c_str());
+    textSource->Modified();
+    textSource->Update();
+}
+
+cv::String cv::viz::WText3D::getText() const
+{
+    vtkFollower *actor = vtkFollower::SafeDownCast(WidgetAccessor::getProp(*this));
+    CV_Assert("This widget does not support text." && actor);
+
+    vtkPolyDataMapper *mapper = vtkPolyDataMapper::SafeDownCast(actor->GetMapper());
+    vtkVectorText * textSource = vtkVectorText::SafeDownCast(mapper->GetInputConnection(0,0)->GetProducer());
+    CV_Assert("This widget does not support text." && textSource);
+
+    return textSource->GetText();
+}
+
+template<> cv::viz::WText3D cv::viz::Widget::cast<cv::viz::WText3D>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WText3D&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// text widget implementation
+
+cv::viz::WText::WText(const String &text, const Point &pos, int font_size, const Color &color)
+{
+    vtkSmartPointer<vtkTextActor> actor = vtkSmartPointer<vtkTextActor>::New();
+    actor->SetDisplayPosition(pos.x, pos.y);
+    actor->SetInput(text.c_str());
+
+    actor->GetProperty()->SetDisplayLocationToForeground();
+
+    vtkSmartPointer<vtkTextProperty> tprop = actor->GetTextProperty();
+    tprop->SetFontSize(font_size);
+    tprop->SetFontFamilyToCourier();
+    tprop->SetJustificationToLeft();
+    tprop->BoldOn();
+
+    Color c = vtkcolor(color);
+    tprop->SetColor(c.val);
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+template<> cv::viz::WText cv::viz::Widget::cast<cv::viz::WText>()
+{
+    Widget2D widget = this->cast<Widget2D>();
+    return static_cast<WText&>(widget);
+}
+
+void cv::viz::WText::setText(const String &text)
+{
+    vtkTextActor *actor = vtkTextActor::SafeDownCast(WidgetAccessor::getProp(*this));
+    CV_Assert("This widget does not support text." && actor);
+    actor->SetInput(text.c_str());
+}
+
+cv::String cv::viz::WText::getText() const
+{
+    vtkTextActor *actor = vtkTextActor::SafeDownCast(WidgetAccessor::getProp(*this));
+    CV_Assert("This widget does not support text." && actor);
+    return actor->GetInput();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// image overlay widget implementation
+
+cv::viz::WImageOverlay::WImageOverlay(InputArray image, const Rect &rect)
+{
+    CV_Assert(!image.empty() && image.depth() == CV_8U);
+    vtkSmartPointer<vtkImageMatSource> source = vtkSmartPointer<vtkImageMatSource>::New();
+    source->SetImage(image);
+    Size sz = image.size();
+
+    // Scale the image based on the Rect, and flip to match y-ais orientation
+    vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
+    transform->Scale(sz.width/(double)rect.width, sz.height/(double)rect.height, 1.0);
+    transform->RotateX(180);
+
+    vtkSmartPointer<vtkImageReslice> image_reslice = vtkSmartPointer<vtkImageReslice>::New();
+    image_reslice->SetResliceTransform(transform);
+    image_reslice->SetInputConnection(source->GetOutputPort());
+    image_reslice->SetOutputDimensionality(2);
+    image_reslice->InterpolateOn();
+    image_reslice->AutoCropOutputOn();
+    image_reslice->Update();
+
+    vtkSmartPointer<vtkImageMapper> image_mapper = vtkSmartPointer<vtkImageMapper>::New();
+    image_mapper->SetInputConnection(image_reslice->GetOutputPort());
+    image_mapper->SetColorWindow(255); // OpenCV color
+    image_mapper->SetColorLevel(127.5);
+
+    vtkSmartPointer<vtkActor2D> actor = vtkSmartPointer<vtkActor2D>::New();
+    actor->SetMapper(image_mapper);
+    actor->SetPosition(rect.x, rect.y);
+    actor->GetProperty()->SetDisplayLocationToForeground();
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+void cv::viz::WImageOverlay::setImage(InputArray image)
+{
+    CV_Assert(!image.empty() && image.depth() == CV_8U);
+
+    vtkActor2D *actor = vtkActor2D::SafeDownCast(WidgetAccessor::getProp(*this));
+    CV_Assert("This widget does not support overlay image." && actor);
+
+    vtkImageMapper *mapper = vtkImageMapper::SafeDownCast(actor->GetMapper());
+    CV_Assert("This widget does not support overlay image." && mapper);
+    \
+    Vec6i extent;
+    mapper->GetInput()->GetExtent(extent.val);
+    Size size(extent[1], extent[3]);
+
+    // Create the vtk image and set its parameters based on input image
+    vtkSmartPointer<vtkImageMatSource> source = vtkSmartPointer<vtkImageMatSource>::New();
+    source->SetImage(image);
+    Size sz = image.size();
+
+    // Scale the image based on the Rect, and flip to match y-ais orientation
+    vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
+    transform->Scale(sz.width/(double)size.width, sz.height/(double)size.height, 1.0);
+    transform->RotateX(180);
+
+    vtkSmartPointer<vtkImageReslice> image_reslice = vtkSmartPointer<vtkImageReslice>::New();
+    image_reslice->SetResliceTransform(transform);
+    image_reslice->SetInputConnection(source->GetOutputPort());
+    image_reslice->SetOutputDimensionality(2);
+    image_reslice->InterpolateOn();
+    image_reslice->AutoCropOutputOn();
+    image_reslice->Update();
+
+    mapper->SetInputConnection(image_reslice->GetOutputPort());
+}
+
+template<> cv::viz::WImageOverlay cv::viz::Widget::cast<cv::viz::WImageOverlay>()
+{
+    Widget2D widget = this->cast<Widget2D>();
+    return static_cast<WImageOverlay&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// image 3D widget implementation
+
+cv::viz::WImage3D::WImage3D(InputArray image, const Size2d &size)
+{
+    CV_Assert(!image.empty() && image.depth() == CV_8U);
+
+    vtkSmartPointer<vtkImageMatSource> source = vtkSmartPointer<vtkImageMatSource>::New();
+    source->SetImage(image);
+
+    vtkSmartPointer<vtkTexture> texture = vtkSmartPointer<vtkTexture>::New();
+    texture->SetInputConnection(source->GetOutputPort());
+
+    vtkSmartPointer<vtkPlaneSource> plane = vtkSmartPointer<vtkPlaneSource>::New();
+    plane->SetOrigin(-0.5 * size.width, -0.5 * size.height, 0.0);
+    plane->SetPoint1( 0.5 * size.width, -0.5 * size.height, 0.0);
+    plane->SetPoint2(-0.5 * size.width,  0.5 * size.height, 0.0);
+
+    vtkSmartPointer<vtkTextureMapToPlane> textured_plane = vtkSmartPointer<vtkTextureMapToPlane>::New();
+    textured_plane->SetInputConnection(plane->GetOutputPort());
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    mapper->SetInputConnection(textured_plane->GetOutputPort());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+    actor->SetTexture(texture);
+    actor->GetProperty()->ShadingOff();
+    actor->GetProperty()->LightingOff();
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+cv::viz::WImage3D::WImage3D(InputArray image, const Size2d &size, const Vec3d &center, const Vec3d &normal, const Vec3d &up_vector)
+{
+    CV_Assert(!image.empty() && image.depth() == CV_8U);
+
+    // Compute the transformation matrix for drawing the camera frame in a scene
+    Vec3d n = normalize(normal);
+    Vec3d u = normalize(up_vector.cross(n));
+    Vec3d v = n.cross(u);
+    Affine3d pose = makeTransformToGlobal(u, v, n, center);
+
+    WImage3D image3d(image, size);
+    image3d.applyTransform(pose);
+    *this = image3d;
+}
+
+void cv::viz::WImage3D::setImage(InputArray image)
+{
+    CV_Assert(!image.empty() && image.depth() == CV_8U);
+
+    vtkActor *actor = vtkActor::SafeDownCast(WidgetAccessor::getProp(*this));
+    CV_Assert("This widget does not support 3D image." && actor);
+
+    vtkSmartPointer<vtkImageMatSource> source = vtkSmartPointer<vtkImageMatSource>::New();
+    source->SetImage(image);
+
+    vtkSmartPointer<vtkTexture> texture = vtkSmartPointer<vtkTexture>::New();
+    texture->SetInputConnection(source->GetOutputPort());
+
+    actor->SetTexture(texture);
+}
+
+template<> cv::viz::WImage3D cv::viz::Widget::cast<cv::viz::WImage3D>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WImage3D&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// camera position widget implementation
+
+namespace  cv  { namespace viz { namespace
+{
+    struct CameraPositionUtils
+    {
+        static vtkSmartPointer<vtkPolyData> createFrustum(double aspect_ratio, double fovy, double scale)
+        {
+            vtkSmartPointer<vtkCamera> camera = vtkSmartPointer<vtkCamera>::New();
+            camera->SetViewAngle(fovy);
+            camera->SetPosition(0.0, 0.0, 0.0);
+            camera->SetViewUp(0.0, 1.0, 0.0);
+            camera->SetFocalPoint(0.0, 0.0, 1.0);
+            camera->SetClippingRange(1e-9, scale);
+
+            double planes_array[24];
+            camera->GetFrustumPlanes(aspect_ratio, planes_array);
+
+            vtkSmartPointer<vtkPlanes> planes = vtkSmartPointer<vtkPlanes>::New();
+            planes->SetFrustumPlanes(planes_array);
+
+            vtkSmartPointer<vtkFrustumSource> frustumSource = vtkSmartPointer<vtkFrustumSource>::New();
+            frustumSource->SetPlanes(planes);
+
+            vtkSmartPointer<vtkExtractEdges> extract_edges = vtkSmartPointer<vtkExtractEdges>::New();
+            extract_edges->SetInputConnection(frustumSource->GetOutputPort());
+            extract_edges->Update();
+
+            return extract_edges->GetOutput();
+        }
+
+        static Mat ensureColorImage(InputArray image)
+        {
+            Mat color(image.size(), CV_8UC3);
+            if (image.channels() == 1)
+            {
+                Vec3b *drow = color.ptr<Vec3b>();
+                for(int y = 0; y < color.rows; ++y)
+                {
+                    const unsigned char *srow = image.getMat().ptr<unsigned char>(y);
+                    const unsigned char *send = srow + color.cols;
+                    for(;srow < send;)
+                        *drow++ = Vec3b::all(*srow++);
+                }
+            }
+            else
+                image.copyTo(color);
+            return color;
+        }
+    };
+}}}
+
+cv::viz::WCameraPosition::WCameraPosition(double scale)
+{
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, getPolyData(WCoordinateSystem(scale)));
+    mapper->SetScalarModeToUsePointData();
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+cv::viz::WCameraPosition::WCameraPosition(const Matx33d &K, double scale, const Color &color)
+{
+    double f_x = K(0,0), f_y = K(1,1), c_y = K(1,2);
+
+    // Assuming that this is an ideal camera (c_y and c_x are at the center of the image)
+    double fovy = 2.0 * atan2(c_y, f_y) * 180 / CV_PI;
+    double aspect_ratio = f_y / f_x;
+
+    vtkSmartPointer<vtkPolyData> polydata = CameraPositionUtils::createFrustum(aspect_ratio, fovy, scale);
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, polydata);
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+cv::viz::WCameraPosition::WCameraPosition(const Vec2d &fov, double scale, const Color &color)
+{
+    double aspect_ratio = tan(fov[0] * 0.5) / tan(fov[1] * 0.5);
+    double fovy = fov[1] * 180 / CV_PI;
+
+    vtkSmartPointer<vtkPolyData> polydata = CameraPositionUtils::createFrustum(aspect_ratio, fovy, scale);
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, polydata);
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+cv::viz::WCameraPosition::WCameraPosition(const Matx33d &K, InputArray _image, double scale, const Color &color)
+{
+    CV_Assert(!_image.empty() && _image.depth() == CV_8U);
+    Mat image = CameraPositionUtils::ensureColorImage(_image);
+    image.at<Vec3b>(0, 0) = Vec3d(color.val); //workaround of VTK limitation
+
+    double f_y = K(1,1), c_y = K(1,2);
+    // Assuming that this is an ideal camera (c_y and c_x are at the center of the image)
+    double fovy = 2.0 * atan2(c_y, f_y) * 180.0 / CV_PI;
+    double far_end_height = 2.00 * c_y * scale / f_y;
+    double aspect_ratio = image.cols/(double)image.rows;
+    double image_scale = far_end_height/image.rows;
+
+    WImage3D image_widget(image, Size2d(image.size()) * image_scale);
+    image_widget.applyTransform(Affine3d().translate(Vec3d(0, 0, scale)));
+    vtkSmartPointer<vtkPolyData> plane = getPolyData(image_widget);
+
+    vtkSmartPointer<vtkPolyData> frustum = CameraPositionUtils::createFrustum(aspect_ratio, fovy, scale);
+
+    // Frustum needs to be textured or else it can't be combined with image
+    vtkSmartPointer<vtkTextureMapToPlane> frustum_texture = vtkSmartPointer<vtkTextureMapToPlane>::New();
+    VtkUtils::SetInputData(frustum_texture, frustum);
+    frustum_texture->SetSRange(0.0, 0.0); // Texture mapping with only one pixel
+    frustum_texture->SetTRange(0.0, 0.0); // from the image to have constant color
+
+    vtkSmartPointer<vtkAppendPolyData> append_filter = vtkSmartPointer<vtkAppendPolyData>::New();
+    append_filter->AddInputConnection(frustum_texture->GetOutputPort());
+    VtkUtils::AddInputData(append_filter, plane);
+
+    vtkSmartPointer<vtkActor> actor = getActor(image_widget);
+    actor->GetMapper()->SetInputConnection(append_filter->GetOutputPort());
+    WidgetAccessor::setProp(*this, actor);
+}
+
+cv::viz::WCameraPosition::WCameraPosition(const Vec2d &fov, InputArray _image, double scale, const Color &color)
+{
+    CV_Assert(!_image.empty() && _image.depth() == CV_8U);
+    Mat image = CameraPositionUtils::ensureColorImage(_image);
+    image.at<Vec3b>(0, 0) = Vec3d(color.val); //workaround of VTK limitation
+
+    double fovy = fov[1] * 180.0 / CV_PI;
+    double far_end_height = 2.0 * scale * tan(fov[1] * 0.5);
+    double aspect_ratio = image.cols/(double)image.rows;
+    double image_scale = far_end_height/image.rows;
+
+    WImage3D image_widget(image, Size2d(image.size()) * image_scale);
+    image_widget.applyTransform(Affine3d().translate(Vec3d(0, 0, scale)));
+    vtkSmartPointer<vtkPolyData> plane = getPolyData(image_widget);
+
+    vtkSmartPointer<vtkPolyData> frustum = CameraPositionUtils::createFrustum(aspect_ratio, fovy, scale);
+
+    // Frustum needs to be textured or else it can't be combined with image
+    vtkSmartPointer<vtkTextureMapToPlane> frustum_texture = vtkSmartPointer<vtkTextureMapToPlane>::New();
+    VtkUtils::SetInputData(frustum_texture, frustum);
+    frustum_texture->SetSRange(0.0, 0.0); // Texture mapping with only one pixel
+    frustum_texture->SetTRange(0.0, 0.0); // from the image to have constant color
+
+    vtkSmartPointer<vtkAppendPolyData> append_filter = vtkSmartPointer<vtkAppendPolyData>::New();
+    append_filter->AddInputConnection(frustum_texture->GetOutputPort());
+    VtkUtils::AddInputData(append_filter, plane);
+
+    vtkSmartPointer<vtkActor> actor = getActor(image_widget);
+    actor->GetMapper()->SetInputConnection(append_filter->GetOutputPort());
+    WidgetAccessor::setProp(*this, actor);
+}
+
+template<> cv::viz::WCameraPosition cv::viz::Widget::cast<cv::viz::WCameraPosition>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WCameraPosition&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// trajectory widget implementation
+
+cv::viz::WTrajectory::WTrajectory(InputArray _path, int display_mode, double scale, const Color &color)
+{
+    vtkSmartPointer<vtkAppendPolyData> append_filter = vtkSmartPointer<vtkAppendPolyData>::New();
+
+    // Bitwise and with 3 in order to limit the domain to 2 bits
+    if (display_mode & WTrajectory::PATH)
+    {
+        Mat points = vtkTrajectorySource::ExtractPoints(_path);
+        vtkSmartPointer<vtkPolyData> polydata = getPolyData(WPolyLine(points, color));
+        VtkUtils::AddInputData(append_filter, polydata);
+    }
+
+    if (display_mode & WTrajectory::FRAMES)
+    {
+        vtkSmartPointer<vtkTrajectorySource> source = vtkSmartPointer<vtkTrajectorySource>::New();
+        source->SetTrajectory(_path);
+
+        vtkSmartPointer<vtkPolyData> glyph = getPolyData(WCoordinateSystem(scale));
+
+        vtkSmartPointer<vtkTensorGlyph> tensor_glyph = vtkSmartPointer<vtkTensorGlyph>::New();
+        tensor_glyph->SetInputConnection(source->GetOutputPort());
+        VtkUtils::SetSourceData(tensor_glyph, glyph);
+        tensor_glyph->ExtractEigenvaluesOff();  // Treat as a rotation matrix, not as something with eigenvalues
+        tensor_glyph->ThreeGlyphsOff();
+        tensor_glyph->SymmetricOff();
+        tensor_glyph->ColorGlyphsOff();
+
+        append_filter->AddInputConnection(tensor_glyph->GetOutputPort());
+    }
+    append_filter->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, append_filter->GetOutput());
+    mapper->SetScalarModeToUsePointData();
+    mapper->SetScalarRange(0, 255);
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+template<> cv::viz::WTrajectory cv::viz::Widget::cast<cv::viz::WTrajectory>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WTrajectory&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// WTrajectoryFrustums widget implementation
+
+cv::viz::WTrajectoryFrustums::WTrajectoryFrustums(InputArray _path, const Matx33d &K, double scale, const Color &color)
+{
+    vtkSmartPointer<vtkTrajectorySource> source = vtkSmartPointer<vtkTrajectorySource>::New();
+    source->SetTrajectory(_path);
+
+    vtkSmartPointer<vtkPolyData> glyph = getPolyData(WCameraPosition(K, scale));
+
+    vtkSmartPointer<vtkTensorGlyph> tensor_glyph = vtkSmartPointer<vtkTensorGlyph>::New();
+    tensor_glyph->SetInputConnection(source->GetOutputPort());
+    VtkUtils::SetSourceData(tensor_glyph, glyph);
+    tensor_glyph->ExtractEigenvaluesOff();  // Treat as a rotation matrix, not as something with eigenvalues
+    tensor_glyph->ThreeGlyphsOff();
+    tensor_glyph->SymmetricOff();
+    tensor_glyph->ColorGlyphsOff();
+    tensor_glyph->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, tensor_glyph->GetOutput());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+cv::viz::WTrajectoryFrustums::WTrajectoryFrustums(InputArray _path, const Vec2d &fov, double scale, const Color &color)
+{
+    vtkSmartPointer<vtkTrajectorySource> source = vtkSmartPointer<vtkTrajectorySource>::New();
+    source->SetTrajectory(_path);
+
+    vtkSmartPointer<vtkPolyData> glyph = getPolyData(WCameraPosition(fov, scale));
+
+    vtkSmartPointer<vtkTensorGlyph> tensor_glyph = vtkSmartPointer<vtkTensorGlyph>::New();
+    tensor_glyph->SetInputConnection(source->GetOutputPort());
+    VtkUtils::SetSourceData(tensor_glyph, glyph);
+    tensor_glyph->ExtractEigenvaluesOff();  // Treat as a rotation matrix, not as something with eigenvalues
+    tensor_glyph->ThreeGlyphsOff();
+    tensor_glyph->SymmetricOff();
+    tensor_glyph->ColorGlyphsOff();
+    tensor_glyph->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    VtkUtils::SetInputData(mapper, tensor_glyph->GetOutput());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+    setColor(color);
+}
+
+template<> cv::viz::WTrajectoryFrustums cv::viz::Widget::cast<cv::viz::WTrajectoryFrustums>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WTrajectoryFrustums&>(widget);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// WTrajectorySpheres widget implementation
+
+cv::viz::WTrajectorySpheres::WTrajectorySpheres(InputArray _path, double line_length, double radius, const Color &from, const Color &to)
+{
+    CV_Assert(_path.kind() == _InputArray::STD_VECTOR || _path.kind() == _InputArray::MAT);
+    CV_Assert(_path.type() == CV_32FC(16) || _path.type() == CV_64FC(16));
+
+    Mat path64;
+    _path.getMat().convertTo(path64, CV_64F);
+    Affine3d *traj = path64.ptr<Affine3d>();
+    size_t total = path64.total();
+
+    vtkSmartPointer<vtkAppendPolyData> append_filter = vtkSmartPointer<vtkAppendPolyData>::New();
+
+    for(size_t i = 0; i < total; ++i)
+    {
+        Vec3d curr = traj[i].translation();
+
+        vtkSmartPointer<vtkSphereSource> sphere_source = vtkSmartPointer<vtkSphereSource>::New();
+        sphere_source->SetCenter(curr.val);
+        sphere_source->SetRadius( (i == 0) ? 2 * radius : radius );
+        sphere_source->Update();
+
+        double alpha = static_cast<double>(i)/total;
+        Color c = from * (1 - alpha) + to * alpha;
+
+        vtkSmartPointer<vtkPolyData> polydata = sphere_source->GetOutput();
+        polydata->GetCellData()->SetScalars(VtkUtils::FillScalars(polydata->GetNumberOfCells(), c));
+        VtkUtils::AddInputData(append_filter, polydata);
+
+        if (i > 0)
+        {
+            Vec3d prev = traj[i-1].translation();
+            Vec3d lvec = prev - curr;
+
+            if(norm(lvec) > line_length)
+                lvec = normalize(lvec) * line_length;
+
+            Vec3d lend = curr + lvec;
+
+            vtkSmartPointer<vtkLineSource> line_source = vtkSmartPointer<vtkLineSource>::New();
+            line_source->SetPoint1(curr.val);
+            line_source->SetPoint2(lend.val);
+            line_source->Update();
+            vtkSmartPointer<vtkPolyData> polydata_ = line_source->GetOutput();
+            polydata_->GetCellData()->SetScalars(VtkUtils::FillScalars(polydata_->GetNumberOfCells(), c));
+            VtkUtils::AddInputData(append_filter, polydata_);
+        }
+    }
+    append_filter->Update();
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkSmartPointer<vtkPolyDataMapper>::New();
+    mapper->SetScalarModeToUseCellData();
+    VtkUtils::SetInputData(mapper, append_filter->GetOutput());
+
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
+    actor->SetMapper(mapper);
+
+    WidgetAccessor::setProp(*this, actor);
+}
+
+template<> cv::viz::WTrajectorySpheres cv::viz::Widget::cast<cv::viz::WTrajectorySpheres>()
+{
+    Widget3D widget = this->cast<Widget3D>();
+    return static_cast<WTrajectorySpheres&>(widget);
+}
diff --git a/modules/viz/src/types.cpp b/modules/viz/src/types.cpp
index 861a78318..2e32a6327 100644
--- a/modules/viz/src/types.cpp
+++ b/modules/viz/src/types.cpp
@@ -41,138 +41,63 @@
 //  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
 #include "precomp.hpp"
 
 ////////////////////////////////////////////////////////////////////
-/// cv::viz::KeyboardEvent
+/// Events
 
-cv::viz::KeyboardEvent::KeyboardEvent(bool _action, const String& _key_sym, unsigned char key, bool alt, bool ctrl, bool shift)
-  : action_(_action), modifiers_(0), key_code_(key), key_sym_(_key_sym)
-{
-  if (alt)
-    modifiers_ = Alt;
+cv::viz::KeyboardEvent::KeyboardEvent(Action _action, const String& _symbol, unsigned char _code, int _modifiers)
+  : action(_action), symbol(_symbol), code(_code), modifiers(_modifiers) {}
 
-  if (ctrl)
-    modifiers_ |= Ctrl;
-
-  if (shift)
-    modifiers_ |= Shift;
-}
-
-bool cv::viz::KeyboardEvent::isAltPressed() const { return (modifiers_ & Alt) != 0; }
-bool cv::viz::KeyboardEvent::isCtrlPressed() const { return (modifiers_ & Ctrl) != 0; }
-bool cv::viz::KeyboardEvent::isShiftPressed() const { return (modifiers_ & Shift) != 0; }
-unsigned char cv::viz::KeyboardEvent::getKeyCode() const { return key_code_; }
-const cv::String& cv::viz::KeyboardEvent::getKeySym() const { return key_sym_; }
-bool cv::viz::KeyboardEvent::keyDown() const { return action_; }
-bool cv::viz::KeyboardEvent::keyUp() const { return !action_; }
-
-////////////////////////////////////////////////////////////////////
-/// cv::viz::MouseEvent
-
-cv::viz::MouseEvent::MouseEvent(const Type& _type, const MouseButton& _button, const Point& _p,  bool alt, bool ctrl, bool shift)
-    : type(_type), button(_button), pointer(_p), key_state(0)
-{
-    if (alt)
-        key_state = KeyboardEvent::Alt;
-
-    if (ctrl)
-        key_state |= KeyboardEvent::Ctrl;
-
-    if (shift)
-        key_state |= KeyboardEvent::Shift;
-}
+cv::viz::MouseEvent::MouseEvent(const Type& _type, const MouseButton& _button, const Point& _pointer, int _modifiers)
+    : type(_type), button(_button), pointer(_pointer), modifiers(_modifiers) {}
 
 ////////////////////////////////////////////////////////////////////
 /// cv::viz::Mesh3d
 
-struct cv::viz::Mesh3d::loadMeshImpl
+cv::viz::Mesh cv::viz::Mesh::load(const String& file)
 {
-    static cv::viz::Mesh3d loadMesh(const String &file)
+    vtkSmartPointer<vtkPLYReader> reader = vtkSmartPointer<vtkPLYReader>::New();
+    reader->SetFileName(file.c_str());
+    reader->Update();
+
+    vtkSmartPointer<vtkPolyData> polydata = reader->GetOutput();
+    CV_Assert("File does not exist or file format is not supported." && polydata);
+
+    Mesh mesh;
+    vtkSmartPointer<vtkCloudMatSink> sink = vtkSmartPointer<vtkCloudMatSink>::New();
+    sink->SetOutput(mesh.cloud, mesh.colors, mesh.normals, mesh.tcoords);
+    sink->SetInputConnection(reader->GetOutputPort());
+    sink->Write();
+
+    // Now handle the polygons
+    vtkSmartPointer<vtkCellArray> polygons = polydata->GetPolys();
+    mesh.polygons.create(1, polygons->GetSize(), CV_32SC1);
+    int* poly_ptr = mesh.polygons.ptr<int>();
+
+    polygons->InitTraversal();
+    vtkIdType nr_cell_points, *cell_points;
+    while (polygons->GetNextCell(nr_cell_points, cell_points))
     {
-        Mesh3d mesh;
-
-        vtkSmartPointer<vtkPLYReader> reader = vtkSmartPointer<vtkPLYReader>::New();
-        reader->SetFileName(file.c_str());
-        reader->Update();
-
-        vtkSmartPointer<vtkPolyData> poly_data = reader->GetOutput();
-        CV_Assert("File does not exist or file format is not supported." && poly_data);
-
-        vtkSmartPointer<vtkPoints> mesh_points = poly_data->GetPoints();
-        vtkIdType nr_points = mesh_points->GetNumberOfPoints();
-
-        mesh.cloud.create(1, nr_points, CV_32FC3);
-
-        Vec3f *mesh_cloud = mesh.cloud.ptr<Vec3f>();
-        for (vtkIdType i = 0; i < mesh_points->GetNumberOfPoints(); i++)
-        {
-            Vec3d point;
-            mesh_points->GetPoint(i, point.val);
-            mesh_cloud[i] = point;
-        }
-
-        // Then the color information, if any
-        vtkUnsignedCharArray* poly_colors = 0;
-        if (poly_data->GetPointData())
-            poly_colors = vtkUnsignedCharArray::SafeDownCast(poly_data->GetPointData()->GetScalars());
-
-        if (poly_colors && (poly_colors->GetNumberOfComponents() == 3))
-        {
-            mesh.colors.create(1, nr_points, CV_8UC3);
-            Vec3b *mesh_colors = mesh.colors.ptr<cv::Vec3b>();
-
-            for (vtkIdType i = 0; i < mesh_points->GetNumberOfPoints(); i++)
-            {
-                Vec3b point_color;
-                poly_colors->GetTupleValue(i, point_color.val);
-
-                std::swap(point_color[0], point_color[2]); // RGB -> BGR
-                mesh_colors[i] = point_color;
-            }
-        }
-        else
-            mesh.colors.release();
-
-        // Now handle the polygons
-        vtkIdType* cell_points;
-        vtkIdType nr_cell_points;
-        vtkCellArray * mesh_polygons = poly_data->GetPolys();
-        mesh_polygons->InitTraversal();
-
-        mesh.polygons.create(1, mesh_polygons->GetSize(), CV_32SC1);
-
-        int* polygons = mesh.polygons.ptr<int>();
-        while (mesh_polygons->GetNextCell(nr_cell_points, cell_points))
-        {
-            *polygons++ = nr_cell_points;
-            for (int i = 0; i < nr_cell_points; ++i)
-                *polygons++ = static_cast<int>(cell_points[i]);
-        }
-
-        return mesh;
+        *poly_ptr++ = nr_cell_points;
+        for (vtkIdType i = 0; i < nr_cell_points; ++i)
+            *poly_ptr++ = (int)cell_points[i];
     }
-};
 
-cv::viz::Mesh3d cv::viz::Mesh3d::loadMesh(const String& file)
-{
-    return loadMeshImpl::loadMesh(file);
+    return mesh;
 }
 
 ////////////////////////////////////////////////////////////////////
 /// Camera implementation
 
-cv::viz::Camera::Camera(float f_x, float f_y, float c_x, float c_y, const Size &window_size)
+cv::viz::Camera::Camera(double fx, double fy, double cx, double cy, const Size &window_size)
 {
-    init(f_x, f_y, c_x, c_y, window_size);
+    init(fx, fy, cx, cy, window_size);
 }
 
-cv::viz::Camera::Camera(const Vec2f &fov, const Size &window_size)
+cv::viz::Camera::Camera(const Vec2d &fov, const Size &window_size)
 {
     CV_Assert(window_size.width > 0 && window_size.height > 0);
     setClip(Vec2d(0.01, 1000.01)); // Default clipping
@@ -183,16 +108,16 @@ cv::viz::Camera::Camera(const Vec2f &fov, const Size &window_size)
     focal_ = Vec2f(principal_point_[0] / tan(fov_[0]*0.5f), principal_point_[1] / tan(fov_[1]*0.5f));
 }
 
-cv::viz::Camera::Camera(const cv::Matx33f & K, const Size &window_size)
+cv::viz::Camera::Camera(const cv::Matx33d & K, const Size &window_size)
 {
-    float f_x = K(0,0);
-    float f_y = K(1,1);
-    float c_x = K(0,2);
-    float c_y = K(1,2);
+    double f_x = K(0,0);
+    double f_y = K(1,1);
+    double c_x = K(0,2);
+    double c_y = K(1,2);
     init(f_x, f_y, c_x, c_y, window_size);
 }
 
-cv::viz::Camera::Camera(const Matx44f &proj, const Size &window_size)
+cv::viz::Camera::Camera(const Matx44d &proj, const Size &window_size)
 {
     CV_Assert(window_size.width > 0 && window_size.height > 0);
 
@@ -205,34 +130,32 @@ cv::viz::Camera::Camera(const Matx44f &proj, const Size &window_size)
 
     double epsilon = 2.2204460492503131e-16;
 
-    if (fabs(left-right) < epsilon) principal_point_[0] = static_cast<float>(window_size.width) * 0.5f;
-    else principal_point_[0] = (left * static_cast<float>(window_size.width)) / (left - right);
-    focal_[0] = -near * principal_point_[0] / left;
+    principal_point_[0] = fabs(left-right) < epsilon ? window_size.width  * 0.5 : (left * window_size.width) / (left - right);
+    principal_point_[1] = fabs(top-bottom) < epsilon ? window_size.height * 0.5 : (top * window_size.height) / (top - bottom);
 
-    if (fabs(top-bottom) < epsilon) principal_point_[1] = static_cast<float>(window_size.height) * 0.5f;
-    else principal_point_[1] = (top * static_cast<float>(window_size.height)) / (top - bottom);
-    focal_[1] = near * principal_point_[1] / top;
+    focal_[0] = -near * principal_point_[0] / left;
+    focal_[1] =  near * principal_point_[1] / top;
 
     setClip(Vec2d(near, far));
-    fov_[0] = (atan2(principal_point_[0],focal_[0]) + atan2(window_size.width-principal_point_[0],focal_[0]));
-    fov_[1] = (atan2(principal_point_[1],focal_[1]) + atan2(window_size.height-principal_point_[1],focal_[1]));
+    fov_[0] = atan2(principal_point_[0], focal_[0]) + atan2(window_size.width-principal_point_[0],  focal_[0]);
+    fov_[1] = atan2(principal_point_[1], focal_[1]) + atan2(window_size.height-principal_point_[1], focal_[1]);
 
     window_size_ = window_size;
 }
 
-void cv::viz::Camera::init(float f_x, float f_y, float c_x, float c_y, const Size &window_size)
+void cv::viz::Camera::init(double fx, double fy, double cx, double cy, const Size &window_size)
 {
     CV_Assert(window_size.width > 0 && window_size.height > 0);
     setClip(Vec2d(0.01, 1000.01));// Default clipping
 
-    fov_[0] = (atan2(c_x,f_x) + atan2(window_size.width-c_x,f_x));
-    fov_[1] = (atan2(c_y,f_y) + atan2(window_size.height-c_y,f_y));
+    fov_[0] = atan2(cx, fx) + atan2(window_size.width  - cx, fx);
+    fov_[1] = atan2(cy, fy) + atan2(window_size.height - cy, fy);
 
-    principal_point_[0] = c_x;
-    principal_point_[1] = c_y;
+    principal_point_[0] = cx;
+    principal_point_[1] = cy;
 
-    focal_[0] = f_x;
-    focal_[1] = f_y;
+    focal_[0] = fx;
+    focal_[1] = fy;
 
     window_size_ = window_size;
 }
@@ -254,7 +177,7 @@ void cv::viz::Camera::setWindowSize(const Size &window_size)
     window_size_ = window_size;
 }
 
-void cv::viz::Camera::computeProjectionMatrix(Matx44f &proj) const
+void cv::viz::Camera::computeProjectionMatrix(Matx44d &proj) const
 {
     double top = clip_[0] * principal_point_[1] / focal_[1];
     double left = -clip_[0] * principal_point_[0] / focal_[0];
@@ -278,13 +201,6 @@ void cv::viz::Camera::computeProjectionMatrix(Matx44f &proj) const
 
 cv::viz::Camera cv::viz::Camera::KinectCamera(const Size &window_size)
 {
-    // Without distortion, RGB Camera
-    // Received from http://nicolas.burrus.name/index.php/Research/KinectCalibration
-    Matx33f K = Matx33f::zeros();
-    K(0,0) = 5.2921508098293293e+02;
-    K(0,2) = 3.2894272028759258e+02;
-    K(1,1) = 5.2556393630057437e+02;
-    K(1,2) = 2.6748068171871557e+02;
-    K(2,2) = 1.0f;
+    Matx33d K(525.0, 0.0, 320.0, 0.0, 525.0, 240.0, 0.0, 0.0, 1.0);
     return Camera(K, window_size);
 }
diff --git a/modules/viz/src/viz.cpp b/modules/viz/src/viz.cpp
deleted file mode 100644
index 6a08dfa34..000000000
--- a/modules/viz/src/viz.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-// Authors:
-//  * Ozan Tonkal, ozantonkal@gmail.com
-//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
-//
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
-//M*/
-
-#include "precomp.hpp"
-
-cv::Affine3f cv::viz::makeTransformToGlobal(const Vec3f& axis_x, const Vec3f& axis_y, const Vec3f& axis_z, const Vec3f& origin)
-{
-    Affine3f::Mat3 R(axis_x[0], axis_y[0], axis_z[0],
-                     axis_x[1], axis_y[1], axis_z[1],
-                     axis_x[2], axis_y[2], axis_z[2]);
-
-    return Affine3f(R, origin);
-}
-
-cv::Affine3f cv::viz::makeCameraPose(const Vec3f& position, const Vec3f& focal_point, const Vec3f& y_dir)
-{
-    // Compute the transformation matrix for drawing the camera frame in a scene
-    Vec3f n = normalize(focal_point - position);
-    Vec3f u = normalize(y_dir.cross(n));
-    Vec3f v = n.cross(u);
-
-    return makeTransformToGlobal(u, v, n, position);
-}
-
-vtkSmartPointer<vtkMatrix4x4> cv::viz::convertToVtkMatrix(const cv::Matx44f &m)
-{
-    vtkSmartPointer<vtkMatrix4x4> vtk_matrix = vtkSmartPointer<vtkMatrix4x4>::New();
-    for (int i = 0; i < 4; i++)
-        for (int k = 0; k < 4; k++)
-            vtk_matrix->SetElement(i, k, m(i, k));
-    return vtk_matrix;
-}
-
-cv::Matx44f cv::viz::convertToMatx(const vtkSmartPointer<vtkMatrix4x4>& vtk_matrix)
-{
-    cv::Matx44f m;
-    for (int i = 0; i < 4; i++)
-        for (int k = 0; k < 4; k++)
-            m(i, k) = vtk_matrix->GetElement(i, k);
-    return m;
-}
-
-namespace cv { namespace viz
-{
-    template<typename _Tp> Vec<_Tp, 3>* vtkpoints_data(vtkSmartPointer<vtkPoints>& points);
-
-    template<> Vec3f* vtkpoints_data<float>(vtkSmartPointer<vtkPoints>& points)
-    {
-        CV_Assert(points->GetDataType() == VTK_FLOAT);
-        vtkDataArray *data = points->GetData();
-        float *pointer = static_cast<vtkFloatArray*>(data)->GetPointer(0);
-        return reinterpret_cast<Vec3f*>(pointer);
-    }
-
-    template<> Vec3d* vtkpoints_data<double>(vtkSmartPointer<vtkPoints>& points)
-    {
-        CV_Assert(points->GetDataType() == VTK_DOUBLE);
-        vtkDataArray *data = points->GetData();
-        double *pointer = static_cast<vtkDoubleArray*>(data)->GetPointer(0);
-        return reinterpret_cast<Vec3d*>(pointer);
-    }
-}}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-/// VizStorage implementation
-
-cv::viz::VizMap cv::viz::VizStorage::storage;
-void cv::viz::VizStorage::unregisterAll() { storage.clear(); }
-
-cv::viz::Viz3d& cv::viz::VizStorage::get(const String &window_name)
-{
-    String name = generateWindowName(window_name);
-    VizMap::iterator vm_itr = storage.find(name);
-    CV_Assert(vm_itr != storage.end());
-    return vm_itr->second;
-}
-
-void cv::viz::VizStorage::add(const Viz3d& window)
-{
-    String window_name = window.getWindowName();
-    VizMap::iterator vm_itr = storage.find(window_name);
-    CV_Assert(vm_itr == storage.end());
-    storage.insert(std::make_pair(window_name, window));
-}
-
-bool cv::viz::VizStorage::windowExists(const String &window_name)
-{
-    String name = generateWindowName(window_name);
-    return storage.find(name) != storage.end();
-}
-
-void cv::viz::VizStorage::removeUnreferenced()
-{
-    for(VizMap::iterator pos = storage.begin(); pos != storage.end();)
-        if(pos->second.impl_->ref_counter == 1)
-            storage.erase(pos++);
-        else
-            ++pos;
-}
-
-cv::String cv::viz::VizStorage::generateWindowName(const String &window_name)
-{
-    String output = "Viz";
-    // Already is Viz
-    if (window_name == output)
-        return output;
-
-    String prefixed = output + " - ";
-    if (window_name.substr(0, prefixed.length()) == prefixed)
-        output = window_name; // Already has "Viz - "
-    else if (window_name.substr(0, output.length()) == output)
-        output = prefixed + window_name; // Doesn't have prefix
-    else
-        output = (window_name == "" ? output : prefixed + window_name);
-
-    return output;
-}
-
-cv::viz::Viz3d cv::viz::get(const String &window_name) { return Viz3d (window_name); }
-void cv::viz::unregisterAllWindows() { VizStorage::unregisterAll(); }
diff --git a/modules/viz/src/viz3d.cpp b/modules/viz/src/viz3d.cpp
index 08cb880de..56f978c0e 100644
--- a/modules/viz/src/viz3d.cpp
+++ b/modules/viz/src/viz3d.cpp
@@ -41,9 +41,6 @@
 //  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
 #include "precomp.hpp"
@@ -104,6 +101,7 @@ void cv::viz::Viz3d::release()
 void cv::viz::Viz3d::spin() { impl_->spin(); }
 void cv::viz::Viz3d::spinOnce(int time, bool force_redraw) { impl_->spinOnce(time, force_redraw); }
 bool cv::viz::Viz3d::wasStopped() const { return impl_->wasStopped(); }
+void cv::viz::Viz3d::close() { impl_->close(); }
 
 void cv::viz::Viz3d::registerKeyboardCallback(KeyboardCallback callback, void* cookie)
 { impl_->registerKeyboardCallback(callback, cookie); }
@@ -111,18 +109,21 @@ void cv::viz::Viz3d::registerKeyboardCallback(KeyboardCallback callback, void* c
 void cv::viz::Viz3d::registerMouseCallback(MouseCallback callback, void* cookie)
 { impl_->registerMouseCallback(callback, cookie); }
 
-void cv::viz::Viz3d::showWidget(const String &id, const Widget &widget, const Affine3f &pose) { impl_->showWidget(id, widget, pose); }
+void cv::viz::Viz3d::showWidget(const String &id, const Widget &widget, const Affine3d &pose) { impl_->showWidget(id, widget, pose); }
 void cv::viz::Viz3d::removeWidget(const String &id) { impl_->removeWidget(id); }
 cv::viz::Widget cv::viz::Viz3d::getWidget(const String &id) const { return impl_->getWidget(id); }
 void cv::viz::Viz3d::removeAllWidgets() { impl_->removeAllWidgets(); }
-void cv::viz::Viz3d::setWidgetPose(const String &id, const Affine3f &pose) { impl_->setWidgetPose(id, pose); }
-void cv::viz::Viz3d::updateWidgetPose(const String &id, const Affine3f &pose) { impl_->updateWidgetPose(id, pose); }
-cv::Affine3f cv::viz::Viz3d::getWidgetPose(const String &id) const { return impl_->getWidgetPose(id); }
+
+void cv::viz::Viz3d::showImage(InputArray image, const Size& window_size) { impl_->showImage(image, window_size); }
+
+void cv::viz::Viz3d::setWidgetPose(const String &id, const Affine3d &pose) { impl_->setWidgetPose(id, pose); }
+void cv::viz::Viz3d::updateWidgetPose(const String &id, const Affine3d &pose) { impl_->updateWidgetPose(id, pose); }
+cv::Affine3d cv::viz::Viz3d::getWidgetPose(const String &id) const { return impl_->getWidgetPose(id); }
 
 void cv::viz::Viz3d::setCamera(const Camera &camera) { impl_->setCamera(camera); }
 cv::viz::Camera cv::viz::Viz3d::getCamera() const { return impl_->getCamera(); }
-void cv::viz::Viz3d::setViewerPose(const Affine3f &pose) { impl_->setViewerPose(pose); }
-cv::Affine3f cv::viz::Viz3d::getViewerPose() { return impl_->getViewerPose(); }
+void cv::viz::Viz3d::setViewerPose(const Affine3d &pose) { impl_->setViewerPose(pose); }
+cv::Affine3d cv::viz::Viz3d::getViewerPose() { return impl_->getViewerPose(); }
 
 void cv::viz::Viz3d::resetCameraViewpoint(const String &id) { impl_->resetCameraViewpoint(id); }
 void cv::viz::Viz3d::resetCamera() { impl_->resetCamera(); }
@@ -131,17 +132,17 @@ void cv::viz::Viz3d::convertToWindowCoordinates(const Point3d &pt, Point3d &wind
 void cv::viz::Viz3d::converTo3DRay(const Point3d &window_coord, Point3d &origin, Vec3d &direction) { impl_->converTo3DRay(window_coord, origin, direction); }
 
 cv::Size cv::viz::Viz3d::getWindowSize() const { return impl_->getWindowSize(); }
-void cv::viz::Viz3d::setWindowSize(const Size &window_size) { impl_->setWindowSize(window_size.width, window_size.height); }
+void cv::viz::Viz3d::setWindowSize(const Size &window_size) { impl_->setWindowSize(window_size); }
 cv::String cv::viz::Viz3d::getWindowName() const { return impl_->getWindowName(); }
 void cv::viz::Viz3d::saveScreenshot(const String &file) { impl_->saveScreenshot(file); }
-void cv::viz::Viz3d::setWindowPosition(int x, int y) { impl_->setWindowPosition(x,y); }
+void cv::viz::Viz3d::setWindowPosition(const Point& window_position) { impl_->setWindowPosition(window_position); }
 void cv::viz::Viz3d::setFullScreen(bool mode) { impl_->setFullScreen(mode); }
-void cv::viz::Viz3d::setBackgroundColor(const Color& color) { impl_->setBackgroundColor(color); }
+void cv::viz::Viz3d::setBackgroundColor(const Color& color, const Color& color2) { impl_->setBackgroundColor(color, color2); }
+
+void cv::viz::Viz3d::setBackgroundTexture(InputArray image) { impl_->setBackgroundTexture(image); }
+void cv::viz::Viz3d::setBackgroundMeshLab() {impl_->setBackgroundMeshLab(); }
 
 void cv::viz::Viz3d::setRenderingProperty(const String &id, int property, double value) { getWidget(id).setRenderingProperty(property, value); }
 double cv::viz::Viz3d::getRenderingProperty(const String &id, int property) { return getWidget(id).getRenderingProperty(property); }
 
-void cv::viz::Viz3d::setDesiredUpdateRate(double rate) { impl_->setDesiredUpdateRate(rate); }
-double cv::viz::Viz3d::getDesiredUpdateRate() { return impl_->getDesiredUpdateRate(); }
-
 void cv::viz::Viz3d::setRepresentation(int representation) { impl_->setRepresentation(representation); }
diff --git a/modules/viz/src/viz3d_impl.hpp b/modules/viz/src/viz3d_impl.hpp
deleted file mode 100644
index 59f45a8fa..000000000
--- a/modules/viz/src/viz3d_impl.hpp
+++ /dev/null
@@ -1,368 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-// Authors:
-//  * Ozan Tonkal, ozantonkal@gmail.com
-//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
-//
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
-//M*/
-
-#ifndef __OPENCV_VIZ_VIZ3D_IMPL_HPP__
-#define __OPENCV_VIZ_VIZ3D_IMPL_HPP__
-
-struct cv::viz::Viz3d::VizImpl
-{
-public:
-    typedef Viz3d::KeyboardCallback KeyboardCallback;
-    typedef Viz3d::MouseCallback MouseCallback;
-
-    int ref_counter;
-
-    VizImpl(const String &name);
-    virtual ~VizImpl();
-
-    void showWidget(const String &id, const Widget &widget, const Affine3f &pose = Affine3f::Identity());
-    void removeWidget(const String &id);
-    Widget getWidget(const String &id) const;
-    void removeAllWidgets();
-
-    void setWidgetPose(const String &id, const Affine3f &pose);
-    void updateWidgetPose(const String &id, const Affine3f &pose);
-    Affine3f getWidgetPose(const String &id) const;
-
-    void setDesiredUpdateRate(double rate);
-    double getDesiredUpdateRate();
-
-    /** \brief Returns true when the user tried to close the window */
-    bool wasStopped() const { if (interactor_ != NULL) return (stopped_); else return true; }
-
-    /** \brief Set the stopped flag back to false */
-    void resetStoppedFlag() { if (interactor_ != NULL) stopped_ = false; }
-
-    /** \brief Stop the interaction and close the visualizaton window. */
-    void close()
-    {
-        stopped_ = true;
-        if (interactor_)
-        {
-            interactor_->GetRenderWindow()->Finalize();
-            interactor_->TerminateApp(); // This tends to close the window...
-        }
-    }
-
-    void setRepresentation(int representation);
-
-    void setCamera(const Camera &camera);
-    Camera getCamera() const;
-
-    /** \brief Reset the camera to a given widget */
-    void resetCameraViewpoint(const String& id);
-    void resetCamera();
-
-    void setViewerPose(const Affine3f &pose);
-    Affine3f getViewerPose();
-
-    void convertToWindowCoordinates(const Point3d &pt, Point3d &window_coord);
-    void converTo3DRay(const Point3d &window_coord, Point3d &origin, Vec3d &direction);
-
-    void saveScreenshot(const String &file);
-    void setWindowPosition(int x, int y);
-    Size getWindowSize() const;
-    void setWindowSize(int xw, int yw);
-    void setFullScreen(bool mode);
-    String getWindowName() const;
-    void setBackgroundColor(const Color& color);
-
-    void spin();
-    void spinOnce(int time = 1, bool force_redraw = false);
-
-    void registerKeyboardCallback(KeyboardCallback callback, void* cookie = 0);
-    void registerMouseCallback(MouseCallback callback, void* cookie = 0);
-
-private:
-    vtkSmartPointer<vtkRenderWindowInteractor> interactor_;
-
-    struct ExitMainLoopTimerCallback : public vtkCommand
-    {
-        static ExitMainLoopTimerCallback* New() { return new ExitMainLoopTimerCallback; }
-        virtual void Execute(vtkObject* vtkNotUsed(caller), unsigned long event_id, void* call_data)
-        {
-            if (event_id != vtkCommand::TimerEvent)
-                return;
-
-            int timer_id = *reinterpret_cast<int*>(call_data);
-            if (timer_id != right_timer_id)
-                return;
-
-            // Stop vtk loop and send notification to app to wake it up
-            viz_->interactor_->TerminateApp();
-        }
-        int right_timer_id;
-        VizImpl* viz_;
-    };
-
-    struct ExitCallback : public vtkCommand
-    {
-        static ExitCallback* New() { return new ExitCallback; }
-        virtual void Execute(vtkObject*, unsigned long event_id, void*)
-        {
-            if (event_id == vtkCommand::ExitEvent)
-            {
-                viz_->stopped_ = true;
-                viz_->interactor_->GetRenderWindow()->Finalize();
-                viz_->interactor_->TerminateApp();
-            }
-        }
-        VizImpl* viz_;
-    };
-
-    /** \brief Set to false if the interaction loop is running. */
-    bool stopped_;
-
-    double s_lastDone_;
-
-    /** \brief Global timer ID. Used in destructor only. */
-    int timer_id_;
-
-    /** \brief Callback object enabling us to leave the main loop, when a timer fires. */
-    vtkSmartPointer<ExitMainLoopTimerCallback> exit_main_loop_timer_callback_;
-    vtkSmartPointer<ExitCallback> exit_callback_;
-
-    vtkSmartPointer<vtkRenderer> renderer_;
-    vtkSmartPointer<vtkRenderWindow> window_;
-
-    /** \brief The render window interactor style. */
-    vtkSmartPointer<InteractorStyle> style_;
-
-    /** \brief Internal list with actor pointers and name IDs for all widget actors */
-    cv::Ptr<WidgetActorMap> widget_actor_map_;
-
-    /** \brief Boolean that holds whether or not the camera parameters were manually initialized*/
-    bool camera_set_;
-
-    bool removeActorFromRenderer(const vtkSmartPointer<vtkProp> &actor);
-};
-
-
-
-namespace cv
-{
-    namespace viz
-    {
-        vtkSmartPointer<vtkMatrix4x4> convertToVtkMatrix(const cv::Matx44f &m);
-        cv::Matx44f convertToMatx(const vtkSmartPointer<vtkMatrix4x4>& vtk_matrix);
-
-        struct NanFilter
-        {
-            template<typename _Tp, typename _Msk>
-            struct Impl
-            {
-                typedef Vec<_Tp, 3> _Out;
-
-                static _Out* copy(const Mat& source, _Out* output, const Mat& nan_mask)
-                {
-                    CV_Assert(DataDepth<_Tp>::value == source.depth() && source.size() == nan_mask.size());
-                    CV_Assert(nan_mask.channels() == 3 || nan_mask.channels() == 4);
-                    CV_DbgAssert(DataDepth<_Msk>::value == nan_mask.depth());
-
-                    int s_chs = source.channels();
-                    int m_chs = nan_mask.channels();
-
-                    for (int y = 0; y < source.rows; ++y)
-                    {
-                        const _Tp* srow = source.ptr<_Tp>(y);
-                        const _Msk* mrow = nan_mask.ptr<_Msk>(y);
-
-                        for (int x = 0; x < source.cols; ++x, srow += s_chs, mrow += m_chs)
-                            if (!isNan(mrow[0]) && !isNan(mrow[1]) && !isNan(mrow[2]))
-                                *output++ = _Out(srow);
-                    }
-                    return output;
-                }
-
-                static _Out* copyColor(const Mat& source, _Out* output, const Mat& nan_mask)
-                {
-                    CV_Assert(DataDepth<_Tp>::value == source.depth() && source.size() == nan_mask.size());
-                    CV_Assert(nan_mask.channels() == 3 || nan_mask.channels() == 4);
-                    CV_DbgAssert(DataDepth<_Msk>::value == nan_mask.depth());
-
-                    int s_chs = source.channels();
-                    int m_chs = nan_mask.channels();
-
-                    for (int y = 0; y < source.rows; ++y)
-                    {
-                        const _Tp* srow = source.ptr<_Tp>(y);
-                        const _Msk* mrow = nan_mask.ptr<_Msk>(y);
-
-                        for (int x = 0; x < source.cols; ++x, srow += s_chs, mrow += m_chs)
-                            if (!isNan(mrow[0]) && !isNan(mrow[1]) && !isNan(mrow[2]))
-                            {
-                                *output = _Out(srow);
-                                std::swap((*output)[0], (*output)[2]); // BGR -> RGB
-                                ++output;
-                            }
-                    }
-                    return output;
-                }
-            };
-
-            template<typename _Tp>
-            static inline Vec<_Tp, 3>* copy(const Mat& source, Vec<_Tp, 3>* output, const Mat& nan_mask)
-            {
-                CV_Assert(nan_mask.depth() == CV_32F || nan_mask.depth() == CV_64F);
-
-                typedef Vec<_Tp, 3>* (*copy_func)(const Mat&, Vec<_Tp, 3>*, const Mat&);
-                const static copy_func table[2] = { &NanFilter::Impl<_Tp, float>::copy, &NanFilter::Impl<_Tp, double>::copy };
-
-                return table[nan_mask.depth() - 5](source, output, nan_mask);
-            }
-
-            template<typename _Tp>
-            static inline Vec<_Tp, 3>* copyColor(const Mat& source, Vec<_Tp, 3>* output, const Mat& nan_mask)
-            {
-                CV_Assert(nan_mask.depth() == CV_32F || nan_mask.depth() == CV_64F);
-
-                typedef Vec<_Tp, 3>* (*copy_func)(const Mat&, Vec<_Tp, 3>*, const Mat&);
-                const static copy_func table[2] = { &NanFilter::Impl<_Tp, float>::copyColor, &NanFilter::Impl<_Tp, double>::copyColor };
-
-                return table[nan_mask.depth() - 5](source, output, nan_mask);
-            }
-        };
-
-        struct ApplyAffine
-        {
-            const Affine3f& affine_;
-            ApplyAffine(const Affine3f& affine) : affine_(affine) {}
-
-            template<typename _Tp> Point3_<_Tp> operator()(const Point3_<_Tp>& p) const { return affine_ * p; }
-
-            template<typename _Tp> Vec<_Tp, 3> operator()(const Vec<_Tp, 3>& v) const
-            {
-                const float* m = affine_.matrix.val;
-
-                Vec<_Tp, 3> result;
-                result[0] = (_Tp)(m[0] * v[0] + m[1] * v[1] + m[ 2] * v[2] + m[ 3]);
-                result[1] = (_Tp)(m[4] * v[0] + m[5] * v[1] + m[ 6] * v[2] + m[ 7]);
-                result[2] = (_Tp)(m[8] * v[0] + m[9] * v[1] + m[10] * v[2] + m[11]);
-                return result;
-            }
-
-        private:
-            ApplyAffine(const ApplyAffine&);
-            ApplyAffine& operator=(const ApplyAffine&);
-        };
-
-
-        inline Color vtkcolor(const Color& color)
-        {
-            Color scaled_color = color * (1.0/255.0);
-            std::swap(scaled_color[0], scaled_color[2]);
-            return scaled_color;
-        }
-
-        inline Vec3d vtkpoint(const Point3f& point) { return Vec3d(point.x, point.y, point.z); }
-        template<typename _Tp> inline _Tp normalized(const _Tp& v) { return v * 1/cv::norm(v); }
-
-        struct ConvertToVtkImage
-        {
-            struct Impl
-            {
-                static void copyImageMultiChannel(const Mat &image, vtkSmartPointer<vtkImageData> output)
-                {
-                    int i_chs = image.channels();
-
-                    for (int i = 0; i < image.rows; ++i)
-                    {
-                        const unsigned char * irows = image.ptr<unsigned char>(i);
-                        for (int j = 0; j < image.cols; ++j, irows += i_chs)
-                        {
-                            unsigned char * vrows = static_cast<unsigned char *>(output->GetScalarPointer(j,i,0));
-                            memcpy(vrows, irows, i_chs);
-                            std::swap(vrows[0], vrows[2]); // BGR -> RGB
-                        }
-                    }
-                    output->Modified();
-                }
-
-                static void copyImageSingleChannel(const Mat &image, vtkSmartPointer<vtkImageData> output)
-                {
-                    for (int i = 0; i < image.rows; ++i)
-                    {
-                        const unsigned char * irows = image.ptr<unsigned char>(i);
-                        for (int j = 0; j < image.cols; ++j, ++irows)
-                        {
-                            unsigned char * vrows = static_cast<unsigned char *>(output->GetScalarPointer(j,i,0));
-                            *vrows = *irows;
-                        }
-                    }
-                    output->Modified();
-                }
-            };
-
-            static void convert(const Mat &image, vtkSmartPointer<vtkImageData> output)
-            {
-                // Create the vtk image
-                output->SetDimensions(image.cols, image.rows, 1);
-#if VTK_MAJOR_VERSION <= 5
-                output->SetNumberOfScalarComponents(image.channels());
-                output->SetScalarTypeToUnsignedChar();
-                output->AllocateScalars();
-#else
-                output->AllocateScalars(VTK_UNSIGNED_CHAR, image.channels());
-#endif
-
-                int i_chs = image.channels();
-                if (i_chs > 1)
-                {
-                    // Multi channel images are handled differently because of BGR <-> RGB
-                    Impl::copyImageMultiChannel(image, output);
-                }
-                else
-                {
-                    Impl::copyImageSingleChannel(image, output);
-                }
-            }
-        };
-    }
-}
-
-#endif
diff --git a/modules/viz/src/vizcore.cpp b/modules/viz/src/vizcore.cpp
new file mode 100644
index 000000000..29d4b4688
--- /dev/null
+++ b/modules/viz/src/vizcore.cpp
@@ -0,0 +1,312 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Authors:
+//  * Ozan Tonkal, ozantonkal@gmail.com
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
+//M*/
+
+#include "precomp.hpp"
+
+cv::Affine3d cv::viz::makeTransformToGlobal(const Vec3d& axis_x, const Vec3d& axis_y, const Vec3d& axis_z, const Vec3d& origin)
+{
+    Affine3d::Mat3 R(axis_x[0], axis_y[0], axis_z[0],
+                     axis_x[1], axis_y[1], axis_z[1],
+                     axis_x[2], axis_y[2], axis_z[2]);
+
+    return Affine3d(R, origin);
+}
+
+cv::Affine3d cv::viz::makeCameraPose(const Vec3d& position, const Vec3d& focal_point, const Vec3d& y_dir)
+{
+    // Compute the transformation matrix for drawing the camera frame in a scene
+    Vec3d n = normalize(focal_point - position);
+    Vec3d u = normalize(y_dir.cross(n));
+    Vec3d v = n.cross(u);
+
+    return makeTransformToGlobal(u, v, n, position);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// VizStorage implementation
+
+cv::viz::VizMap cv::viz::VizStorage::storage;
+void cv::viz::VizStorage::unregisterAll() { storage.clear(); }
+
+cv::viz::Viz3d& cv::viz::VizStorage::get(const String &window_name)
+{
+    String name = generateWindowName(window_name);
+    VizMap::iterator vm_itr = storage.find(name);
+    CV_Assert(vm_itr != storage.end());
+    return vm_itr->second;
+}
+
+void cv::viz::VizStorage::add(const Viz3d& window)
+{
+    String window_name = window.getWindowName();
+    VizMap::iterator vm_itr = storage.find(window_name);
+    CV_Assert(vm_itr == storage.end());
+    storage.insert(std::make_pair(window_name, window));
+}
+
+bool cv::viz::VizStorage::windowExists(const String &window_name)
+{
+    String name = generateWindowName(window_name);
+    return storage.find(name) != storage.end();
+}
+
+void cv::viz::VizStorage::removeUnreferenced()
+{
+    for(VizMap::iterator pos = storage.begin(); pos != storage.end();)
+        if(pos->second.impl_->ref_counter == 1)
+            storage.erase(pos++);
+        else
+            ++pos;
+}
+
+cv::String cv::viz::VizStorage::generateWindowName(const String &window_name)
+{
+    String output = "Viz";
+    // Already is Viz
+    if (window_name == output)
+        return output;
+
+    String prefixed = output + " - ";
+    if (window_name.substr(0, prefixed.length()) == prefixed)
+        output = window_name; // Already has "Viz - "
+    else if (window_name.substr(0, output.length()) == output)
+        output = prefixed + window_name; // Doesn't have prefix
+    else
+        output = (window_name == "" ? output : prefixed + window_name);
+
+    return output;
+}
+
+cv::viz::Viz3d cv::viz::getWindowByName(const String &window_name) { return Viz3d (window_name); }
+void cv::viz::unregisterAllWindows() { VizStorage::unregisterAll(); }
+
+cv::viz::Viz3d cv::viz::imshow(const String& window_name, InputArray image, const Size& window_size)
+{
+    Viz3d viz = getWindowByName(window_name);
+    viz.showImage(image, window_size);
+    return viz;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// Read/write clouds. Supported formats: ply, stl, xyz, obj
+
+void cv::viz::writeCloud(const String& file, InputArray cloud, InputArray colors, InputArray normals, bool binary)
+{
+    CV_Assert(file.size() > 4 && "Extention is required");
+    String extention = file.substr(file.size()-4);
+
+    vtkSmartPointer<vtkCloudMatSource> source = vtkSmartPointer<vtkCloudMatSource>::New();
+    source->SetColorCloudNormals(cloud, colors, normals);
+
+    vtkSmartPointer<vtkWriter> writer;
+    if (extention == ".xyz")
+    {
+        writer = vtkSmartPointer<vtkXYZWriter>::New();
+        vtkXYZWriter::SafeDownCast(writer)->SetFileName(file.c_str());
+    }
+    else if (extention == ".ply")
+    {
+        writer = vtkSmartPointer<vtkPLYWriter>::New();
+        vtkPLYWriter::SafeDownCast(writer)->SetFileName(file.c_str());
+        vtkPLYWriter::SafeDownCast(writer)->SetFileType(binary ? VTK_BINARY : VTK_ASCII);
+        vtkPLYWriter::SafeDownCast(writer)->SetArrayName("Colors");
+    }
+    else if (extention == ".obj")
+    {
+        writer = vtkSmartPointer<vtkOBJWriter>::New();
+        vtkOBJWriter::SafeDownCast(writer)->SetFileName(file.c_str());
+    }
+    else
+        CV_Assert(!"Unsupported format");
+
+    writer->SetInputConnection(source->GetOutputPort());
+    writer->Write();
+}
+
+cv::Mat cv::viz::readCloud(const String& file, OutputArray colors, OutputArray normals)
+{
+    CV_Assert(file.size() > 4 && "Extention is required");
+    String extention = file.substr(file.size()-4);
+
+    vtkSmartPointer<vtkPolyDataAlgorithm> reader;
+    if (extention == ".xyz")
+    {
+        reader = vtkSmartPointer<vtkSimplePointsReader>::New();
+        vtkSimplePointsReader::SafeDownCast(reader)->SetFileName(file.c_str());
+    }
+    else if (extention == ".ply")
+    {
+        reader = vtkSmartPointer<vtkPLYReader>::New();
+        CV_Assert(vtkPLYReader::CanReadFile(file.c_str()));
+        vtkPLYReader::SafeDownCast(reader)->SetFileName(file.c_str());
+    }
+    else if (extention == ".obj")
+    {
+        reader = vtkSmartPointer<vtkOBJReader>::New();
+        vtkOBJReader::SafeDownCast(reader)->SetFileName(file.c_str());
+    }
+    else if (extention == ".stl")
+    {
+        reader = vtkSmartPointer<vtkSTLReader>::New();
+        vtkSTLReader::SafeDownCast(reader)->SetFileName(file.c_str());
+    }
+    else
+        CV_Assert(!"Unsupported format");
+
+    cv::Mat cloud;
+
+    vtkSmartPointer<vtkCloudMatSink> sink = vtkSmartPointer<vtkCloudMatSink>::New();
+    sink->SetInputConnection(reader->GetOutputPort());
+    sink->SetOutput(cloud, colors, normals);
+    sink->Write();
+
+    return cloud;
+}
+
+cv::viz::Mesh cv::viz::readMesh(const String& file) { return Mesh::load(file); }
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// Read/write poses and trajectories
+
+bool cv::viz::readPose(const String& file, Affine3d& pose, const String& tag)
+{
+    FileStorage fs(file, FileStorage::READ);
+    if (!fs.isOpened())
+        return false;
+
+    Mat hdr(pose.matrix, false);
+    fs[tag] >> hdr;
+    if (hdr.empty() || hdr.cols != pose.matrix.cols || hdr.rows != pose.matrix.rows)
+        return false;
+
+    hdr.convertTo(pose.matrix, CV_64F);
+    return true;
+}
+
+void cv::viz::writePose(const String& file, const Affine3d& pose, const String& tag)
+{
+    FileStorage fs(file, FileStorage::WRITE);
+    fs << tag << Mat(pose.matrix, false);
+}
+
+void cv::viz::readTrajectory(OutputArray _traj, const String& files_format, int start, int end, const String& tag)
+{
+    CV_Assert(_traj.kind() == _InputArray::STD_VECTOR || _traj.kind() == _InputArray::MAT);
+
+    start = max(0, std::min(start, end));
+    end = std::max(start, end);
+
+    std::vector<Affine3d> traj;
+
+    for(int i = start; i < end; ++i)
+    {
+        Affine3d affine;
+        bool ok = readPose(cv::format(files_format.c_str(), i), affine, tag);
+        if (!ok)
+            break;
+
+        traj.push_back(affine);
+    }
+
+    Mat(traj).convertTo(_traj, _traj.depth());
+}
+
+void cv::viz::writeTrajectory(InputArray _traj, const String& files_format, int start, const String& tag)
+{
+    if (_traj.kind() == _InputArray::STD_VECTOR_MAT)
+    {
+        std::vector<Mat>& v = *(std::vector<Mat>*)_traj.getObj();
+
+        for(size_t i = 0, index = max(0, start); i < v.size(); ++i, ++index)
+        {
+            Affine3d affine;
+            Mat pose = v[i];
+            CV_Assert(pose.type() == CV_32FC(16) || pose.type() == CV_64FC(16));
+            pose.copyTo(affine.matrix);
+            writePose(cv::format(files_format.c_str(), index), affine, tag);
+        }
+        return;
+    }
+
+    if (_traj.kind() == _InputArray::STD_VECTOR || _traj.kind() == _InputArray::MAT)
+    {
+        CV_Assert(_traj.type() == CV_32FC(16) || _traj.type() == CV_64FC(16));
+
+        Mat traj = _traj.getMat();
+
+        if (traj.depth() == CV_32F)
+            for(size_t i = 0, index = max(0, start); i < traj.total(); ++i, ++index)
+                writePose(cv::format(files_format.c_str(), index), traj.at<Affine3f>(i), tag);
+
+        if (traj.depth() == CV_64F)
+            for(size_t i = 0, index = max(0, start); i < traj.total(); ++i, ++index)
+                writePose(cv::format(files_format.c_str(), index), traj.at<Affine3d>(i), tag);
+    }
+
+    CV_Assert(!"Unsupported array kind");
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/// Computing normals for mesh
+
+void cv::viz::computeNormals(const Mesh& mesh, OutputArray _normals)
+{
+    vtkSmartPointer<vtkPolyData> polydata = getPolyData(WMesh(mesh));
+    vtkSmartPointer<vtkPolyData> with_normals = VtkUtils::ComputeNormals(polydata);
+
+    vtkSmartPointer<vtkDataArray> generic_normals = with_normals->GetPointData()->GetNormals();
+    if(generic_normals)
+    {
+        Mat normals(1, generic_normals->GetNumberOfTuples(), CV_64FC3);
+        Vec3d *optr = normals.ptr<Vec3d>();
+
+        for(int i = 0; i < generic_normals->GetNumberOfTuples(); ++i, ++optr)
+            generic_normals->GetTuple(i, optr->val);
+
+        normals.convertTo(_normals, mesh.cloud.type());
+    }
+    else
+        _normals.release();
+}
diff --git a/modules/viz/src/viz3d_impl.cpp b/modules/viz/src/vizimpl.cpp
similarity index 65%
rename from modules/viz/src/viz3d_impl.cpp
rename to modules/viz/src/vizimpl.cpp
index b1173f645..5fa49e2f9 100644
--- a/modules/viz/src/viz3d_impl.cpp
+++ b/modules/viz/src/vizimpl.cpp
@@ -41,94 +41,143 @@
 //  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
 #include "precomp.hpp"
 
-vtkRenderWindowInteractor* vtkRenderWindowInteractorFixNew();
-
-#if 1 || !defined __APPLE__
-vtkRenderWindowInteractor* vtkRenderWindowInteractorFixNew()
-{
-  return vtkRenderWindowInteractor::New();
-}
-#endif
 
 /////////////////////////////////////////////////////////////////////////////////////////////
-cv::viz::Viz3d::VizImpl::VizImpl(const String &name)
-    :  s_lastDone_(0.0), style_(vtkSmartPointer<cv::viz::InteractorStyle>::New()), widget_actor_map_(new WidgetActorMap)
+cv::viz::Viz3d::VizImpl::VizImpl(const String &name) : spin_once_state_(false),
+    window_position_(Vec2i(std::numeric_limits<int>::min())), widget_actor_map_(new WidgetActorMap)
 {
     renderer_ = vtkSmartPointer<vtkRenderer>::New();
+    window_name_ = VizStorage::generateWindowName(name);
 
-    // Create a RendererWindow
+    // Create render window
     window_ = vtkSmartPointer<vtkRenderWindow>::New();
-
-    // Set the window size as 1/2 of the screen size
     cv::Vec2i window_size = cv::Vec2i(window_->GetScreenSize()) / 2;
     window_->SetSize(window_size.val);
-
     window_->AddRenderer(renderer_);
 
     // Create the interactor style
-    style_->Initialize();
-    style_->setRenderer(renderer_);
+    style_ = vtkSmartPointer<InteractorStyle>::New();
     style_->setWidgetActorMap(widget_actor_map_);
     style_->UseTimersOn();
+    style_->Initialize();
 
-    /////////////////////////////////////////////////
-    interactor_ = vtkSmartPointer<vtkRenderWindowInteractor>::Take(vtkRenderWindowInteractorFixNew());
+    timer_callback_ = vtkSmartPointer<TimerCallback>::New();
+    exit_callback_ = vtkSmartPointer<ExitCallback>::New();
+    exit_callback_->viz = this;
+}
 
+/////////////////////////////////////////////////////////////////////////////////////////////
+void cv::viz::Viz3d::VizImpl::TimerCallback::Execute(vtkObject* caller, unsigned long event_id, void* cookie)
+{
+    if (event_id == vtkCommand::TimerEvent && timer_id == *reinterpret_cast<int*>(cookie))
+    {
+        vtkSmartPointer<vtkRenderWindowInteractor> interactor = vtkRenderWindowInteractor::SafeDownCast(caller);
+        interactor->TerminateApp();
+    }
+}
+
+void cv::viz::Viz3d::VizImpl::ExitCallback::Execute(vtkObject*, unsigned long event_id, void*)
+{
+    if (event_id == vtkCommand::ExitEvent)
+    {
+        viz->interactor_->TerminateApp();
+        viz->interactor_ = 0;
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+bool cv::viz::Viz3d::VizImpl::wasStopped() const
+{
+    bool stopped = spin_once_state_ ? interactor_ == 0 : false;
+    spin_once_state_ &= !stopped;
+    return stopped;
+}
+
+void cv::viz::Viz3d::VizImpl::close()
+{
+    if (!interactor_)
+        return;
+    interactor_->GetRenderWindow()->Finalize();
+    interactor_->TerminateApp(); // This tends to close the window...
+    interactor_ = 0;
+}
+
+void cv::viz::Viz3d::VizImpl::recreateRenderWindow()
+{
+#if !defined _MSC_VER
+    //recreating is workaround for Ubuntu -- a crash in x-server
+    Vec2i window_size(window_->GetSize());
+    int fullscreen = window_->GetFullScreen();
+
+    window_ = vtkSmartPointer<vtkRenderWindow>::New();
+    if (window_position_[0] != std::numeric_limits<int>::min()) //also workaround
+        window_->SetPosition(window_position_.val);
+
+    window_->SetSize(window_size.val);
+    window_->SetFullScreen(fullscreen);
+    window_->AddRenderer(renderer_);
+#endif
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+void cv::viz::Viz3d::VizImpl::spin()
+{
+    recreateRenderWindow();
+    interactor_ = vtkSmartPointer<vtkRenderWindowInteractor>::New();
+    interactor_->SetRenderWindow(window_);
+    interactor_->SetInteractorStyle(style_);
     window_->AlphaBitPlanesOff();
     window_->PointSmoothingOff();
     window_->LineSmoothingOff();
     window_->PolygonSmoothingOff();
     window_->SwapBuffersOn();
     window_->SetStereoTypeToAnaglyph();
-
-    interactor_->SetRenderWindow(window_);
-    interactor_->SetInteractorStyle(style_);
-    interactor_->SetDesiredUpdateRate(30.0);
-
-    // Initialize and create timer, also create window
-    interactor_->Initialize();
-    timer_id_ = interactor_->CreateRepeatingTimer(5000L);
-
-    // Set a simple PointPicker
-    //vtkSmartPointer<vtkPointPicker> pp = vtkSmartPointer<vtkPointPicker>::New();
-    //pp->SetTolerance(pp->GetTolerance() * 2);
-    //interactor_->SetPicker(pp);
-
-    exit_main_loop_timer_callback_ = vtkSmartPointer<ExitMainLoopTimerCallback>::New();
-    exit_main_loop_timer_callback_->viz_ = this;
-    exit_main_loop_timer_callback_->right_timer_id = -1;
-    interactor_->AddObserver(vtkCommand::TimerEvent, exit_main_loop_timer_callback_);
-
-    exit_callback_ = vtkSmartPointer<ExitCallback>::New();
-    exit_callback_->viz_ = this;
-    interactor_->AddObserver(vtkCommand::ExitEvent, exit_callback_);
-
-    resetStoppedFlag();
-
-
-    //////////////////////////////
-    String window_name = VizStorage::generateWindowName(name);
-    window_->SetWindowName(window_name.c_str());
+    window_->Render();
+    window_->SetWindowName(window_name_.c_str());
+    interactor_->Start();
+    interactor_ = 0;
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////
-cv::viz::Viz3d::VizImpl::~VizImpl()
+void cv::viz::Viz3d::VizImpl::spinOnce(int time, bool force_redraw)
 {
-    if (interactor_)
-        interactor_->DestroyTimer(timer_id_);
-    if (renderer_)
-        renderer_->Clear();
+    if (interactor_ == 0)
+    {
+        spin_once_state_ = true;
+        recreateRenderWindow();
+        interactor_ = vtkSmartPointer<vtkRenderWindowInteractor>::New();
+        interactor_->SetRenderWindow(window_);
+        interactor_->SetInteractorStyle(style_);
+        interactor_->AddObserver(vtkCommand::TimerEvent, timer_callback_);
+        interactor_->AddObserver(vtkCommand::ExitEvent, exit_callback_);
+        window_->AlphaBitPlanesOff();
+        window_->PointSmoothingOff();
+        window_->LineSmoothingOff();
+        window_->PolygonSmoothingOff();
+        window_->SwapBuffersOn();
+        window_->SetStereoTypeToAnaglyph();
+        window_->Render();
+        window_->SetWindowName(window_name_.c_str());
+    }
+
+    vtkSmartPointer<vtkRenderWindowInteractor> local = interactor_;
+
+    if (force_redraw)
+        local->Render();
+
+    timer_callback_->timer_id = local->CreateRepeatingTimer(std::max(1, time));
+    local->Start();
+    local->DestroyTimer(timer_callback_->timer_id);
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////
-void cv::viz::Viz3d::VizImpl::showWidget(const String &id, const Widget &widget, const Affine3f &pose)
+void cv::viz::Viz3d::VizImpl::showWidget(const String &id, const Widget &widget, const Affine3d &pose)
 {
     WidgetActorMap::iterator wam_itr = widget_actor_map_->find(id);
     bool exists = wam_itr != widget_actor_map_->end();
@@ -142,7 +191,7 @@ void cv::viz::Viz3d::VizImpl::showWidget(const String &id, const Widget &widget,
     if (actor)
     {
         // If the actor is 3D, apply pose
-        vtkSmartPointer<vtkMatrix4x4> matrix = convertToVtkMatrix(pose.matrix);
+        vtkSmartPointer<vtkMatrix4x4> matrix = vtkmatrix(pose.matrix);
         actor->SetUserMatrix(matrix);
         actor->Modified();
     }
@@ -180,7 +229,7 @@ cv::viz::Widget cv::viz::Viz3d::VizImpl::getWidget(const String &id) const
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////
-void cv::viz::Viz3d::VizImpl::setWidgetPose(const String &id, const Affine3f &pose)
+void cv::viz::Viz3d::VizImpl::setWidgetPose(const String &id, const Affine3d &pose)
 {
     WidgetActorMap::iterator wam_itr = widget_actor_map_->find(id);
     bool exists = wam_itr != widget_actor_map_->end();
@@ -189,13 +238,13 @@ void cv::viz::Viz3d::VizImpl::setWidgetPose(const String &id, const Affine3f &po
     vtkProp3D *actor = vtkProp3D::SafeDownCast(wam_itr->second);
     CV_Assert("Widget is not 3D." && actor);
 
-    vtkSmartPointer<vtkMatrix4x4> matrix = convertToVtkMatrix(pose.matrix);
+    vtkSmartPointer<vtkMatrix4x4> matrix = vtkmatrix(pose.matrix);
     actor->SetUserMatrix(matrix);
     actor->Modified();
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////
-void cv::viz::Viz3d::VizImpl::updateWidgetPose(const String &id, const Affine3f &pose)
+void cv::viz::Viz3d::VizImpl::updateWidgetPose(const String &id, const Affine3d &pose)
 {
     WidgetActorMap::iterator wam_itr = widget_actor_map_->find(id);
     bool exists = wam_itr != widget_actor_map_->end();
@@ -210,16 +259,15 @@ void cv::viz::Viz3d::VizImpl::updateWidgetPose(const String &id, const Affine3f
         setWidgetPose(id, pose);
         return ;
     }
-    Matx44f matrix_cv = convertToMatx(matrix);
-    Affine3f updated_pose = pose * Affine3f(matrix_cv);
-    matrix = convertToVtkMatrix(updated_pose.matrix);
+    Affine3d updated_pose = pose * Affine3d(*matrix->Element);
+    matrix = vtkmatrix(updated_pose.matrix);
 
     actor->SetUserMatrix(matrix);
     actor->Modified();
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////
-cv::Affine3f cv::viz::Viz3d::VizImpl::getWidgetPose(const String &id) const
+cv::Affine3d cv::viz::Viz3d::VizImpl::getWidgetPose(const String &id) const
 {
     WidgetActorMap::const_iterator wam_itr = widget_actor_map_->find(id);
     bool exists = wam_itr != widget_actor_map_->end();
@@ -228,24 +276,7 @@ cv::Affine3f cv::viz::Viz3d::VizImpl::getWidgetPose(const String &id) const
     vtkProp3D *actor = vtkProp3D::SafeDownCast(wam_itr->second);
     CV_Assert("Widget is not 3D." && actor);
 
-    vtkSmartPointer<vtkMatrix4x4> matrix = actor->GetUserMatrix();
-    Matx44f matrix_cv = convertToMatx(matrix);
-    return Affine3f(matrix_cv);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////
-void cv::viz::Viz3d::VizImpl::setDesiredUpdateRate(double rate)
-{
-    if (interactor_)
-        interactor_->SetDesiredUpdateRate(rate);
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////
-double cv::viz::Viz3d::VizImpl::getDesiredUpdateRate()
-{
-    if (interactor_)
-        return interactor_->GetDesiredUpdateRate();
-    return 0.0;
+    return Affine3d(*actor->GetUserMatrix()->Element);
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -258,37 +289,6 @@ void cv::viz::Viz3d::VizImpl::registerMouseCallback(MouseCallback callback, void
 void cv::viz::Viz3d::VizImpl::registerKeyboardCallback(KeyboardCallback callback, void* cookie)
 { style_->registerKeyboardCallback(callback, cookie); }
 
-/////////////////////////////////////////////////////////////////////////////////////////////
-void cv::viz::Viz3d::VizImpl::spin()
-{
-    resetStoppedFlag();
-    window_->Render();
-    interactor_->Start();
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////
-void cv::viz::Viz3d::VizImpl::spinOnce(int time, bool force_redraw)
-{
-    resetStoppedFlag();
-
-    if (time <= 0)
-        time = 1;
-
-    if (force_redraw)
-        interactor_->Render();
-
-    double s_now_ = cv::getTickCount() / cv::getTickFrequency();
-    if (s_lastDone_ > s_now_)
-      s_lastDone_ = s_now_;
-
-    if ((s_now_ - s_lastDone_) > (1.0 / interactor_->GetDesiredUpdateRate()))
-    {
-        exit_main_loop_timer_callback_->right_timer_id = interactor_->CreateRepeatingTimer(time);
-        interactor_->Start();
-        interactor_->DestroyTimer(exit_main_loop_timer_callback_->right_timer_id);
-        s_lastDone_ = s_now_;
-    }
-}
 
 //////////////////////////////////////////////////////////////////////////////////////////
 void cv::viz::Viz3d::VizImpl::removeAllWidgets()
@@ -296,50 +296,98 @@ void cv::viz::Viz3d::VizImpl::removeAllWidgets()
     widget_actor_map_->clear();
     renderer_->RemoveAllViewProps();
 }
+/////////////////////////////////////////////////////////////////////////////////////////////
+void cv::viz::Viz3d::VizImpl::showImage(InputArray image, const Size& window_size)
+{
+    removeAllWidgets();
+    if (window_size.width > 0 && window_size.height > 0)
+        setWindowSize(window_size);
+
+    showWidget("showImage", WImageOverlay(image, Rect(Point(0,0), getWindowSize())));
+}
 
 /////////////////////////////////////////////////////////////////////////////////////////////
-bool cv::viz::Viz3d::VizImpl::removeActorFromRenderer(const vtkSmartPointer<vtkProp> &actor)
+bool cv::viz::Viz3d::VizImpl::removeActorFromRenderer(vtkSmartPointer<vtkProp> actor)
 {
-    vtkProp* actor_to_remove = vtkProp::SafeDownCast(actor);
-
     vtkPropCollection* actors = renderer_->GetViewProps();
     actors->InitTraversal();
     vtkProp* current_actor = NULL;
     while ((current_actor = actors->GetNextProp()) != NULL)
-    {
-        if (current_actor != actor_to_remove)
-            continue;
-        renderer_->RemoveActor(actor);
-        return true;
-    }
+        if (current_actor == actor)
+        {
+            renderer_->RemoveActor(actor);
+            return true;
+        }
     return false;
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////////
-void cv::viz::Viz3d::VizImpl::setBackgroundColor(const Color& color)
+void cv::viz::Viz3d::VizImpl::setBackgroundColor(const Color& color, const Color& color2)
 {
-    Color c = vtkcolor(color);
-    renderer_->SetBackground(c.val);
+    Color c = vtkcolor(color), c2 = vtkcolor(color2);
+    bool gradient = color2[0] >= 0 && color2[1] >= 0 && color2[2] >= 0;
+
+    if (gradient)
+    {
+        renderer_->SetBackground(c2.val);
+        renderer_->SetBackground2(c.val);
+        renderer_->GradientBackgroundOn();
+    }
+    else
+    {
+        renderer_->SetBackground(c.val);
+        renderer_->GradientBackgroundOff();
+    }
+}
+
+void cv::viz::Viz3d::VizImpl::setBackgroundMeshLab()
+{ setBackgroundColor(Color(2, 1, 1), Color(240, 120, 120)); }
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+void cv::viz::Viz3d::VizImpl::setBackgroundTexture(InputArray image)
+{
+    if (image.empty())
+    {
+        renderer_->SetBackgroundTexture(0);
+        renderer_->TexturedBackgroundOff();
+        return;
+    }
+
+    vtkSmartPointer<vtkImageMatSource> source = vtkSmartPointer<vtkImageMatSource>::New();
+    source->SetImage(image);
+
+    vtkSmartPointer<vtkImageFlip> image_flip = vtkSmartPointer<vtkImageFlip>::New();
+    image_flip->SetFilteredAxis(1); // Vertical flip
+    image_flip->SetInputConnection(source->GetOutputPort());
+
+    vtkSmartPointer<vtkTexture> texture = vtkSmartPointer<vtkTexture>::New();
+    texture->SetInputConnection(image_flip->GetOutputPort());
+    //texture->Update();
+
+    renderer_->SetBackgroundTexture(texture);
+    renderer_->TexturedBackgroundOn();
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////
 void cv::viz::Viz3d::VizImpl::setCamera(const Camera &camera)
 {
-    vtkCamera& active_camera = *renderer_->GetActiveCamera();
+    vtkSmartPointer<vtkCamera> active_camera = renderer_->GetActiveCamera();
 
     // Set the intrinsic parameters of the camera
     window_->SetSize(camera.getWindowSize().width, camera.getWindowSize().height);
     double aspect_ratio = static_cast<double>(camera.getWindowSize().width)/static_cast<double>(camera.getWindowSize().height);
 
-    Matx44f proj_mat;
+    Matx44d proj_mat;
     camera.computeProjectionMatrix(proj_mat);
+
     // Use the intrinsic parameters of the camera to simulate more realistically
-    Matx44f old_proj_mat = convertToMatx(active_camera.GetProjectionTransformMatrix(aspect_ratio, -1.0, 1.0));
-    vtkTransform *transform = vtkTransform::New();
+    vtkSmartPointer<vtkMatrix4x4> vtk_matrix = active_camera->GetProjectionTransformMatrix(aspect_ratio, -1.0, 1.0);
+    Matx44d old_proj_mat(*vtk_matrix->Element);
+
     // This is a hack around not being able to set Projection Matrix
-    transform->SetMatrix(convertToVtkMatrix(proj_mat * old_proj_mat.inv()));
-    active_camera.SetUserTransform(transform);
-    transform->Delete();
+    vtkSmartPointer<vtkTransform> transform = vtkSmartPointer<vtkTransform>::New();
+    transform->SetMatrix(vtkmatrix(proj_mat * old_proj_mat.inv()));
+    active_camera->SetUserTransform(transform);
 
     renderer_->ResetCameraClippingRange();
     renderer_->Render();
@@ -348,44 +396,42 @@ void cv::viz::Viz3d::VizImpl::setCamera(const Camera &camera)
 /////////////////////////////////////////////////////////////////////////////////////////////
 cv::viz::Camera cv::viz::Viz3d::VizImpl::getCamera() const
 {
-    vtkCamera& active_camera = *renderer_->GetActiveCamera();
+    vtkSmartPointer<vtkCamera> active_camera = renderer_->GetActiveCamera();
 
     Size window_size(renderer_->GetRenderWindow()->GetSize()[0],
                      renderer_->GetRenderWindow()->GetSize()[1]);
     double aspect_ratio = window_size.width / (double)window_size.height;
 
-    Matx44f proj_matrix = convertToMatx(active_camera.GetProjectionTransformMatrix(aspect_ratio, -1.0f, 1.0f));
-    Camera camera(proj_matrix, window_size);
-    return camera;
+    vtkSmartPointer<vtkMatrix4x4> proj_matrix = active_camera->GetProjectionTransformMatrix(aspect_ratio, -1.0f, 1.0f);
+    return Camera(Matx44d(*proj_matrix->Element), window_size);
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////
-void cv::viz::Viz3d::VizImpl::setViewerPose(const Affine3f &pose)
+void cv::viz::Viz3d::VizImpl::setViewerPose(const Affine3d &pose)
 {
     vtkCamera& camera = *renderer_->GetActiveCamera();
 
     // Position = extrinsic translation
-    cv::Vec3f pos_vec = pose.translation();
+    cv::Vec3d pos_vec = pose.translation();
 
     // Rotate the view vector
-    cv::Matx33f rotation = pose.rotation();
-    cv::Vec3f y_axis(0.f, 1.f, 0.f);
-    cv::Vec3f up_vec(rotation * y_axis);
+    cv::Matx33d rotation = pose.rotation();
+    cv::Vec3d y_axis(0.0, 1.0, 0.0);
+    cv::Vec3d up_vec(rotation * y_axis);
 
     // Compute the new focal point
-    cv::Vec3f z_axis(0.f, 0.f, 1.f);
-    cv::Vec3f focal_vec = pos_vec + rotation * z_axis;
+    cv::Vec3d z_axis(0.0, 0.0, 1.0);
+    cv::Vec3d focal_vec = pos_vec + rotation * z_axis;
 
-    camera.SetPosition(pos_vec[0], pos_vec[1], pos_vec[2]);
-    camera.SetFocalPoint(focal_vec[0], focal_vec[1], focal_vec[2]);
-    camera.SetViewUp(up_vec[0], up_vec[1], up_vec[2]);
+    camera.SetPosition(pos_vec.val);
+    camera.SetFocalPoint(focal_vec.val);
+    camera.SetViewUp(up_vec.val);
 
     renderer_->ResetCameraClippingRange();
-    renderer_->Render();
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////
-cv::Affine3f cv::viz::Viz3d::VizImpl::getViewerPose()
+cv::Affine3d cv::viz::Viz3d::VizImpl::getViewerPose()
 {
     vtkCamera& camera = *renderer_->GetActiveCamera();
 
@@ -397,20 +443,7 @@ cv::Affine3f cv::viz::Viz3d::VizImpl::getViewerPose()
     Vec3d z_axis = normalized(focal - pos);
     Vec3d x_axis = normalized(y_axis.cross(z_axis));
 
-    cv::Matx33d R;
-    R(0, 0) = x_axis[0];
-    R(0, 1) = y_axis[0];
-    R(0, 2) = z_axis[0];
-
-    R(1, 0) = x_axis[1];
-    R(1, 1) = y_axis[1];
-    R(1, 2) = z_axis[1];
-
-    R(2, 0) = x_axis[2];
-    R(2, 1) = y_axis[2];
-    R(2, 2) = z_axis[2];
-
-    return cv::Affine3f(R, pos);
+    return makeTransformToGlobal(x_axis, y_axis, z_axis, pos);
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -426,10 +459,7 @@ void cv::viz::Viz3d::VizImpl::converTo3DRay(const Point3d &window_coord, Point3d
 {
     Vec4d world_pt;
     vtkInteractorObserver::ComputeDisplayToWorld(renderer_, window_coord.x, window_coord.y, window_coord.z, world_pt.val);
-
-    vtkCamera &active_camera = *renderer_->GetActiveCamera();
-    Vec3d cam_pos;
-    active_camera.GetPosition(cam_pos.val);
+    Vec3d cam_pos(renderer_->GetActiveCamera()->GetPosition());
     origin = cam_pos;
     direction = normalize(Vec3d(world_pt.val) - cam_pos);
 }
@@ -504,21 +534,9 @@ void cv::viz::Viz3d::VizImpl::setRepresentation(int representation)
     }
 }
 
-
 //////////////////////////////////////////////////////////////////////////////////////////////
-void cv::viz::Viz3d::VizImpl::setFullScreen(bool mode)
-{
-    if (window_)
-        window_->SetFullScreen(mode);
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////////
-cv::String cv::viz::Viz3d::VizImpl::getWindowName() const
-{
-    return (window_ ? window_->GetWindowName() : "");
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////////
-void cv::viz::Viz3d::VizImpl::setWindowPosition(int x, int y) { window_->SetPosition(x, y); }
-void cv::viz::Viz3d::VizImpl::setWindowSize(int xw, int yw) { window_->SetSize(xw, yw); }
-cv::Size cv::viz::Viz3d::VizImpl::getWindowSize() const { return Size(window_->GetSize()[0], window_->GetSize()[1]); }
+cv::String cv::viz::Viz3d::VizImpl::getWindowName() const { return window_name_; }
+void cv::viz::Viz3d::VizImpl::setFullScreen(bool mode) { window_->SetFullScreen(mode); }
+void cv::viz::Viz3d::VizImpl::setWindowPosition(const Point& position) { window_position_ = position; window_->SetPosition(position.x, position.y); }
+void cv::viz::Viz3d::VizImpl::setWindowSize(const Size& window_size) { window_->SetSize(window_size.width, window_size.height); }
+cv::Size cv::viz::Viz3d::VizImpl::getWindowSize() const { return Size(Point(Vec2i(window_->GetSize()))); }
diff --git a/modules/viz/src/vizimpl.hpp b/modules/viz/src/vizimpl.hpp
new file mode 100644
index 000000000..02675e0a5
--- /dev/null
+++ b/modules/viz/src/vizimpl.hpp
@@ -0,0 +1,138 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Authors:
+//  * Ozan Tonkal, ozantonkal@gmail.com
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
+//M*/
+
+#ifndef __OPENCV_VIZ_VIZ3D_IMPL_HPP__
+#define __OPENCV_VIZ_VIZ3D_IMPL_HPP__
+
+struct cv::viz::Viz3d::VizImpl
+{
+public:
+    typedef Viz3d::KeyboardCallback KeyboardCallback;
+    typedef Viz3d::MouseCallback MouseCallback;
+
+    int ref_counter;
+
+    VizImpl(const String &name);
+    virtual ~VizImpl() {}
+
+    bool wasStopped() const;
+    void close();
+
+    void spin();
+    void spinOnce(int time = 1, bool force_redraw = false);
+
+    void showWidget(const String &id, const Widget &widget, const Affine3d &pose = Affine3d::Identity());
+    void removeWidget(const String &id);
+    Widget getWidget(const String &id) const;
+    void removeAllWidgets();
+
+    void showImage(InputArray image, const Size& window_size);
+
+    void setWidgetPose(const String &id, const Affine3d &pose);
+    void updateWidgetPose(const String &id, const Affine3d &pose);
+    Affine3d getWidgetPose(const String &id) const;
+
+    void setRepresentation(int representation);
+
+    void setCamera(const Camera &camera);
+    Camera getCamera() const;
+
+    /** \brief Reset the camera to a given widget */
+    void resetCameraViewpoint(const String& id);
+    void resetCamera();
+
+    void setViewerPose(const Affine3d &pose);
+    Affine3d getViewerPose();
+
+    void convertToWindowCoordinates(const Point3d &pt, Point3d &window_coord);
+    void converTo3DRay(const Point3d &window_coord, Point3d &origin, Vec3d &direction);
+
+    void saveScreenshot(const String &file);
+    void setWindowPosition(const Point& position);
+    Size getWindowSize() const;
+    void setWindowSize(const Size& window_size);
+    void setFullScreen(bool mode);
+    String getWindowName() const;
+    void setBackgroundColor(const Color& color, const Color& color2);
+    void setBackgroundTexture(InputArray image);
+    void setBackgroundMeshLab();
+
+    void registerKeyboardCallback(KeyboardCallback callback, void* cookie = 0);
+    void registerMouseCallback(MouseCallback callback, void* cookie = 0);
+
+private:
+    struct TimerCallback : public vtkCommand
+    {
+        static TimerCallback* New() { return new TimerCallback; }
+        virtual void Execute(vtkObject* caller, unsigned long event_id, void* cookie);
+        int timer_id;
+    };
+
+    struct ExitCallback : public vtkCommand
+    {
+        static ExitCallback* New() { return new ExitCallback; }
+        virtual void Execute(vtkObject*, unsigned long event_id, void*);
+        VizImpl* viz;
+    };
+
+    mutable bool spin_once_state_;
+    vtkSmartPointer<vtkRenderWindowInteractor> interactor_;
+
+    vtkSmartPointer<vtkRenderWindow> window_;
+    String window_name_;
+    Vec2i window_position_;
+
+    vtkSmartPointer<TimerCallback> timer_callback_;
+    vtkSmartPointer<ExitCallback> exit_callback_;
+
+    vtkSmartPointer<vtkRenderer> renderer_;
+    vtkSmartPointer<InteractorStyle> style_;
+    Ptr<WidgetActorMap> widget_actor_map_;
+
+    bool removeActorFromRenderer(vtkSmartPointer<vtkProp> actor);
+    void recreateRenderWindow();
+};
+
+#endif
diff --git a/modules/viz/src/vtk/vtkCloudMatSink.cpp b/modules/viz/src/vtk/vtkCloudMatSink.cpp
new file mode 100644
index 000000000..09ef0cca9
--- /dev/null
+++ b/modules/viz/src/vtk/vtkCloudMatSink.cpp
@@ -0,0 +1,158 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Authors:
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
+//M*/
+
+#include "precomp.hpp"
+
+namespace cv { namespace viz
+{
+    vtkStandardNewMacro(vtkCloudMatSink);
+}}
+
+cv::viz::vtkCloudMatSink::vtkCloudMatSink() {}
+cv::viz::vtkCloudMatSink::~vtkCloudMatSink() {}
+
+void cv::viz::vtkCloudMatSink::SetOutput(OutputArray _cloud, OutputArray _colors, OutputArray _normals, OutputArray _tcoords)
+{
+    cloud = _cloud;
+    colors = _colors;
+    normals = _normals;
+    tcoords = _tcoords;
+}
+
+void cv::viz::vtkCloudMatSink::WriteData()
+{
+    vtkPolyData *input = this->GetInput();
+    if (!input)
+        return;
+
+    vtkSmartPointer<vtkPoints> points_Data = input->GetPoints();
+
+    if (cloud.needed() && points_Data)
+    {
+        int vtktype = points_Data->GetDataType();
+        CV_Assert(vtktype == VTK_FLOAT || vtktype == VTK_DOUBLE);
+
+        cloud.create(1, points_Data->GetNumberOfPoints(), vtktype == VTK_FLOAT ? CV_32FC3 : CV_64FC3);
+        Vec3d *ddata = cloud.getMat().ptr<Vec3d>();
+        Vec3f *fdata = cloud.getMat().ptr<Vec3f>();
+
+        if (cloud.depth() == CV_32F)
+            for(size_t i = 0; i < cloud.total(); ++i)
+                *fdata++ = Vec3d(points_Data->GetPoint(i));
+
+        if (cloud.depth() == CV_64F)
+            for(size_t i = 0; i < cloud.total(); ++i)
+                *ddata++ = Vec3d(points_Data->GetPoint(i));
+    }
+    else
+        cloud.release();
+
+    vtkSmartPointer<vtkDataArray> scalars_data = input->GetPointData() ? input->GetPointData()->GetScalars() : 0;
+
+    if (colors.needed() && scalars_data)
+    {
+        int channels = scalars_data->GetNumberOfComponents();
+        int vtktype = scalars_data->GetDataType();
+
+        CV_Assert((channels == 3 || channels == 4) && "Only 3- or 4-channel color data support is implemented");
+        CV_Assert(cloud.total() == (size_t)scalars_data->GetNumberOfTuples());
+
+        Mat buffer(cloud.size(), CV_64FC(channels));
+        Vec3d *cptr = buffer.ptr<Vec3d>();
+        for(size_t i = 0; i < buffer.total(); ++i)
+            *cptr++ = Vec3d(scalars_data->GetTuple(i));
+
+        buffer.convertTo(colors, CV_8U, vtktype == VTK_FLOAT || VTK_FLOAT == VTK_DOUBLE ?  255.0 : 1.0);
+    }
+    else
+        colors.release();
+
+    vtkSmartPointer<vtkDataArray> normals_data = input->GetPointData() ? input->GetPointData()->GetNormals() : 0;
+
+    if (normals.needed() && normals_data)
+    {
+        int channels = normals_data->GetNumberOfComponents();
+        int vtktype = normals_data->GetDataType();
+
+        CV_Assert((vtktype == VTK_FLOAT || VTK_FLOAT == VTK_DOUBLE) && (channels == 3 || channels == 4));
+        CV_Assert(cloud.total() == (size_t)normals_data->GetNumberOfTuples());
+
+        Mat buffer(cloud.size(), CV_64FC(channels));
+        Vec3d *cptr = buffer.ptr<Vec3d>();
+        for(size_t i = 0; i < buffer.total(); ++i)
+            *cptr++ = Vec3d(normals_data->GetTuple(i));
+
+        buffer.convertTo(normals, vtktype == VTK_FLOAT ? CV_32F : CV_64F);
+    }
+    else
+        normals.release();
+
+    vtkSmartPointer<vtkDataArray> coords_data = input->GetPointData() ? input->GetPointData()->GetTCoords() : 0;
+
+    if (tcoords.needed() && coords_data)
+    {
+        int vtktype = coords_data->GetDataType();
+
+        CV_Assert(vtktype == VTK_FLOAT || VTK_FLOAT == VTK_DOUBLE);
+        CV_Assert(cloud.total() == (size_t)coords_data->GetNumberOfTuples());
+
+        Mat buffer(cloud.size(), CV_64FC2);
+        Vec2d *cptr = buffer.ptr<Vec2d>();
+        for(size_t i = 0; i < buffer.total(); ++i)
+            *cptr++ = Vec2d(coords_data->GetTuple(i));
+
+        buffer.convertTo(tcoords, vtktype == VTK_FLOAT ? CV_32F : CV_64F);
+
+    }
+    else
+        tcoords.release();
+}
+
+void cv::viz::vtkCloudMatSink::PrintSelf(ostream& os, vtkIndent indent)
+{
+  Superclass::PrintSelf(os, indent);
+  os << indent << "Cloud: " << cloud.needed() << "\n";
+  os << indent << "Colors: " << colors.needed() << "\n";
+  os << indent << "Normals: " << normals.needed() << "\n";
+}
diff --git a/modules/ocl/src/safe_call.hpp b/modules/viz/src/vtk/vtkCloudMatSink.h
similarity index 67%
rename from modules/ocl/src/safe_call.hpp
rename to modules/viz/src/vtk/vtkCloudMatSink.h
index 14cbb6df0..3af9e6544 100644
--- a/modules/ocl/src/safe_call.hpp
+++ b/modules/viz/src/vtk/vtkCloudMatSink.h
@@ -10,13 +10,9 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
-// @Authors
-//   Long Guoping , longguoping@gmail.com
-//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -41,29 +37,43 @@
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
+// Authors:
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
 //M*/
 
-#ifndef __OPENCV_OPENCL_SAFE_CALL_HPP__
-#define __OPENCV_OPENCL_SAFE_CALL_HPP__
-
-#include "opencv2/core/opencl/runtime/opencl_core.hpp"
-
-#define openCLSafeCall(expr)  ___openCLSafeCall(expr, __FILE__, __LINE__, CV_Func)
-#define openCLVerifyCall(res) ___openCLSafeCall(res, __FILE__, __LINE__, CV_Func)
+#ifndef __vtkCloudMatSink_h
+#define __vtkCloudMatSink_h
 
+#include <opencv2/core.hpp>
+#include <vtkPolyDataWriter.h>
 
 namespace cv
 {
-    namespace ocl
+    namespace viz
     {
-        const char *getOpenCLErrorString( int err );
-
-        static inline void ___openCLSafeCall(int err, const char *file, const int line, const char *func = "")
+        class vtkCloudMatSink : public vtkPolyDataWriter
         {
-            if (CL_SUCCESS != err)
-                cv::error(Error::OpenCLApiCallError, getOpenCLErrorString(err), func, file, line);
-        }
+        public:
+          static vtkCloudMatSink *New();
+          vtkTypeMacro(vtkCloudMatSink,vtkPolyDataWriter)
+          void PrintSelf(ostream& os, vtkIndent indent);
+
+          void SetOutput(OutputArray cloud, OutputArray colors = noArray(), OutputArray normals = noArray(), OutputArray tcoords = noArray());
+
+        protected:
+          vtkCloudMatSink();
+          ~vtkCloudMatSink();
+
+          void WriteData();
+
+          _OutputArray cloud, colors, normals, tcoords;
+
+        private:
+          vtkCloudMatSink(const vtkCloudMatSink&);  // Not implemented.
+          void operator=(const vtkCloudMatSink&);  // Not implemented.
+        };
     }
 }
 
-#endif /* __OPENCV_OPENCL_SAFE_CALL_HPP__ */
+#endif
diff --git a/modules/viz/src/vtk/vtkCloudMatSource.cpp b/modules/viz/src/vtk/vtkCloudMatSource.cpp
new file mode 100644
index 000000000..74d01bbd0
--- /dev/null
+++ b/modules/viz/src/vtk/vtkCloudMatSource.cpp
@@ -0,0 +1,286 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Authors:
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
+//M*/
+
+#include "precomp.hpp"
+
+namespace cv { namespace viz
+{
+    vtkStandardNewMacro(vtkCloudMatSource);
+
+    template<typename _Tp> struct VtkDepthTraits;
+
+    template<> struct VtkDepthTraits<float>
+    {
+        const static int data_type = VTK_FLOAT;
+        typedef vtkFloatArray array_type;
+    };
+
+    template<> struct VtkDepthTraits<double>
+    {
+        const static int data_type = VTK_DOUBLE;
+        typedef vtkDoubleArray array_type;
+    };
+}}
+
+cv::viz::vtkCloudMatSource::vtkCloudMatSource() { SetNumberOfInputPorts(0); }
+cv::viz::vtkCloudMatSource::~vtkCloudMatSource() {}
+
+int cv::viz::vtkCloudMatSource::SetCloud(InputArray _cloud)
+{
+    CV_Assert(_cloud.depth() == CV_32F || _cloud.depth() == CV_64F);
+    CV_Assert(_cloud.channels() == 3 || _cloud.channels() == 4);
+
+    Mat cloud = _cloud.getMat();
+
+    int total = _cloud.depth() == CV_32F ? filterNanCopy<float>(cloud) : filterNanCopy<double>(cloud);
+
+    vertices = vtkSmartPointer<vtkCellArray>::New();
+    vertices->Allocate(vertices->EstimateSize(1, total));
+    vertices->InsertNextCell(total);
+    for(int i = 0; i < total; ++i)
+        vertices->InsertCellPoint(i);
+
+    return total;
+}
+
+int cv::viz::vtkCloudMatSource::SetColorCloud(InputArray _cloud, InputArray _colors)
+{
+    int total = SetCloud(_cloud);
+
+    if (_colors.empty())
+        return total;
+
+    CV_Assert(_colors.depth() == CV_8U && _colors.channels() <= 4 && _colors.channels() != 2);
+    CV_Assert(_colors.size() == _cloud.size());
+
+    Mat cloud = _cloud.getMat();
+    Mat colors = _colors.getMat();
+
+    if (cloud.depth() == CV_32F)
+        filterNanColorsCopy<float>(colors, cloud, total);
+    else if (cloud.depth() == CV_64F)
+        filterNanColorsCopy<double>(colors, cloud, total);
+
+    return total;
+}
+
+int cv::viz::vtkCloudMatSource::SetColorCloudNormals(InputArray _cloud, InputArray _colors, InputArray _normals)
+{
+    int total = SetColorCloud(_cloud, _colors);
+
+    if (_normals.empty())
+        return total;
+
+    CV_Assert(_normals.depth() == CV_32F || _normals.depth() == CV_64F);
+    CV_Assert(_normals.channels() == 3 || _normals.channels() == 4);
+    CV_Assert(_normals.size() == _cloud.size());
+
+    Mat c = _cloud.getMat();
+    Mat n = _normals.getMat();
+
+    if (n.depth() == CV_32F && c.depth() == CV_32F)
+        filterNanNormalsCopy<float, float>(n, c, total);
+    else if (n.depth() == CV_32F && c.depth() == CV_64F)
+        filterNanNormalsCopy<float, double>(n, c, total);
+    else if (n.depth() == CV_64F && c.depth() == CV_32F)
+        filterNanNormalsCopy<double, float>(n, c, total);
+    else if (n.depth() == CV_64F && c.depth() == CV_64F)
+        filterNanNormalsCopy<double, double>(n, c, total);
+    else
+        CV_Assert(!"Unsupported normals/cloud type");
+
+    return total;
+}
+
+int cv::viz::vtkCloudMatSource::SetColorCloudNormalsTCoords(InputArray _cloud, InputArray _colors, InputArray _normals, InputArray _tcoords)
+{
+    int total = SetColorCloudNormals(_cloud, _colors, _normals);
+
+    if (_tcoords.empty())
+        return total;
+
+    CV_Assert(_tcoords.depth() == CV_32F || _tcoords.depth() == CV_64F);
+    CV_Assert(_tcoords.channels() == 2 && _tcoords.size() == _cloud.size());
+
+    Mat cl = _cloud.getMat();
+    Mat tc = _tcoords.getMat();
+
+    if (tc.depth() == CV_32F && cl.depth() == CV_32F)
+        filterNanTCoordsCopy<float, float>(tc, cl, total);
+    else if (tc.depth() == CV_32F && cl.depth() == CV_64F)
+        filterNanTCoordsCopy<float, double>(tc, cl, total);
+    else if (tc.depth() == CV_64F && cl.depth() == CV_32F)
+        filterNanTCoordsCopy<double, float>(tc, cl, total);
+    else if (tc.depth() == CV_64F && cl.depth() == CV_64F)
+        filterNanTCoordsCopy<double, double>(tc, cl, total);
+    else
+        CV_Assert(!"Unsupported tcoords/cloud type");
+
+    return total;
+}
+
+int cv::viz::vtkCloudMatSource::RequestData(vtkInformation *vtkNotUsed(request), vtkInformationVector **vtkNotUsed(inputVector), vtkInformationVector *outputVector)
+{
+    vtkInformation *outInfo = outputVector->GetInformationObject(0);
+    vtkPolyData *output = vtkPolyData::SafeDownCast(outInfo->Get(vtkDataObject::DATA_OBJECT()));
+
+    output->SetPoints(points);
+    output->SetVerts(vertices);
+    if (scalars)
+        output->GetPointData()->SetScalars(scalars);
+
+    if (normals)
+        output->GetPointData()->SetNormals(normals);
+
+    if (tcoords)
+        output->GetPointData()->SetTCoords(tcoords);
+
+    return 1;
+}
+
+template<typename _Tp>
+int cv::viz::vtkCloudMatSource::filterNanCopy(const Mat& cloud)
+{
+    CV_DbgAssert(DataType<_Tp>::depth == cloud.depth());
+    points = vtkSmartPointer<vtkPoints>::New();
+    points->SetDataType(VtkDepthTraits<_Tp>::data_type);
+    points->Allocate(cloud.total());
+    points->SetNumberOfPoints(cloud.total());
+
+    int s_chs = cloud.channels();
+    int total = 0;
+    for (int y = 0; y < cloud.rows; ++y)
+    {
+        const _Tp* srow = cloud.ptr<_Tp>(y);
+        const _Tp* send = srow + cloud.cols * s_chs;
+
+        for (; srow != send; srow += s_chs)
+            if (!isNan(srow))
+                points->SetPoint(total++, srow);
+    }
+    points->SetNumberOfPoints(total);
+    points->Squeeze();
+    return total;
+}
+
+template<typename _Msk>
+void cv::viz::vtkCloudMatSource::filterNanColorsCopy(const Mat& cloud_colors, const Mat& mask, int total)
+{
+    Vec3b* array = new Vec3b[total];
+    Vec3b* pos = array;
+
+    int s_chs = cloud_colors.channels();
+    int m_chs = mask.channels();
+    for (int y = 0; y < cloud_colors.rows; ++y)
+    {
+        const unsigned char* srow = cloud_colors.ptr<unsigned char>(y);
+        const unsigned char* send = srow + cloud_colors.cols * s_chs;
+        const _Msk* mrow = mask.ptr<_Msk>(y);
+
+        if (cloud_colors.channels() == 1)
+        {
+            for (; srow != send; srow += s_chs, mrow += m_chs)
+                if (!isNan(mrow))
+                    *pos++ = Vec3b(srow[0], srow[0], srow[0]);
+        }
+        else
+            for (; srow != send; srow += s_chs, mrow += m_chs)
+                if (!isNan(mrow))
+                    *pos++ = Vec3b(srow[2], srow[1], srow[0]);
+
+    }
+
+    scalars = vtkSmartPointer<vtkUnsignedCharArray>::New();
+    scalars->SetName("Colors");
+    scalars->SetNumberOfComponents(3);
+    scalars->SetNumberOfTuples(total);
+    scalars->SetArray(array->val, total * 3, 0);
+}
+
+template<typename _Tn, typename _Msk>
+void cv::viz::vtkCloudMatSource::filterNanNormalsCopy(const Mat& cloud_normals, const Mat& mask, int total)
+{
+    normals = vtkSmartPointer< typename VtkDepthTraits<_Tn>::array_type >::New();
+    normals->SetName("Normals");
+    normals->SetNumberOfComponents(3);
+    normals->SetNumberOfTuples(total);
+
+    int s_chs = cloud_normals.channels();
+    int m_chs = mask.channels();
+
+    int pos = 0;
+    for (int y = 0; y < cloud_normals.rows; ++y)
+    {
+        const _Tn* srow = cloud_normals.ptr<_Tn>(y);
+        const _Tn* send = srow + cloud_normals.cols * s_chs;
+
+        const _Msk* mrow = mask.ptr<_Msk>(y);
+
+        for (; srow != send; srow += s_chs, mrow += m_chs)
+            if (!isNan(mrow))
+                normals->SetTuple(pos++, srow);
+    }
+}
+
+template<typename _Tn, typename _Msk>
+void cv::viz::vtkCloudMatSource::filterNanTCoordsCopy(const Mat& _tcoords, const Mat& mask, int total)
+{
+    typedef Vec<_Tn, 2> Vec2;
+    tcoords = vtkSmartPointer< typename VtkDepthTraits<_Tn>::array_type >::New();
+    tcoords->SetName("TextureCoordinates");
+    tcoords->SetNumberOfComponents(2);
+    tcoords->SetNumberOfTuples(total);
+
+    int pos = 0;
+    for (int y = 0; y < mask.rows; ++y)
+    {
+        const Vec2* srow = _tcoords.ptr<Vec2>(y);
+        const Vec2* send = srow + _tcoords.cols;
+        const _Msk* mrow = mask.ptr<_Msk>(y);
+
+        for (; srow != send; ++srow, mrow += mask.channels())
+            if (!isNan(mrow))
+                tcoords->SetTuple(pos++, srow->val);
+    }
+}
diff --git a/modules/ocl/perf/perf_fast.cpp b/modules/viz/src/vtk/vtkCloudMatSource.h
similarity index 52%
rename from modules/ocl/perf/perf_fast.cpp
rename to modules/viz/src/vtk/vtkCloudMatSource.h
index e5ac84894..56bd93e06 100644
--- a/modules/ocl/perf/perf_fast.cpp
+++ b/modules/viz/src/vtk/vtkCloudMatSource.h
@@ -38,56 +38,59 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 // Authors:
-//  * Peter Andreas Entschev, peter@entschev.com
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
 //M*/
 
-#include "perf_precomp.hpp"
+#ifndef __vtkCloudMatSource_h
+#define __vtkCloudMatSource_h
 
-using namespace perf;
+#include <opencv2/core.hpp>
+#include <vtkPolyDataAlgorithm.h>
+#include <vtkSmartPointer.h>
+#include <vtkPoints.h>
+#include <vtkCellArray.h>
 
-///////////// FAST ////////////////////////
-
-typedef std::tr1::tuple<std::string, int, bool> Image_Threshold_NonmaxSupression_t;
-typedef perf::TestBaseWithParam<Image_Threshold_NonmaxSupression_t> Image_Threshold_NonmaxSupression;
-
-PERF_TEST_P(Image_Threshold_NonmaxSupression, FAST,
-            testing::Combine(testing::Values<string>("gpu/perf/aloe.png"),
-                    testing::Values(20),
-                    testing::Bool()))
+namespace cv
 {
-    const Image_Threshold_NonmaxSupression_t params = GetParam();
-    const std::string imgFile = std::tr1::get<0>(params);
-    const int threshold = std::tr1::get<1>(params);
-    const bool nonmaxSupression = std::tr1::get<2>(params);
-
-    const cv::Mat img = imread(getDataPath(imgFile), cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    if (RUN_OCL_IMPL)
+    namespace viz
     {
-        cv::ocl::FAST_OCL fast(threshold, nonmaxSupression, 0.5);
+        class vtkCloudMatSource : public vtkPolyDataAlgorithm
+        {
+        public:
+            static vtkCloudMatSource *New();
+            vtkTypeMacro(vtkCloudMatSource,vtkPolyDataAlgorithm)
 
-        cv::ocl::oclMat d_img(img);
-        cv::ocl::oclMat d_keypoints;
+            virtual int SetCloud(InputArray cloud);
+            virtual int SetColorCloud(InputArray cloud, InputArray colors);
+            virtual int SetColorCloudNormals(InputArray cloud, InputArray colors, InputArray normals);
+            virtual int SetColorCloudNormalsTCoords(InputArray cloud, InputArray colors, InputArray normals, InputArray tcoords);
 
-        OCL_TEST_CYCLE() fast(d_img, cv::ocl::oclMat(), d_keypoints);
+        protected:
+            vtkCloudMatSource();
+            ~vtkCloudMatSource();
 
-        std::vector<cv::KeyPoint> ocl_keypoints;
-        fast.downloadKeypoints(d_keypoints, ocl_keypoints);
+            int RequestData(vtkInformation *, vtkInformationVector **, vtkInformationVector *);
 
-        sortKeyPoints(ocl_keypoints);
+            vtkSmartPointer<vtkPoints> points;
+            vtkSmartPointer<vtkCellArray> vertices;
+            vtkSmartPointer<vtkUnsignedCharArray> scalars;
+            vtkSmartPointer<vtkDataArray> normals;
+            vtkSmartPointer<vtkDataArray> tcoords;
+        private:
+            vtkCloudMatSource(const vtkCloudMatSource&);  // Not implemented.
+            void operator=(const vtkCloudMatSource&);  // Not implemented.
 
-        SANITY_CHECK_KEYPOINTS(ocl_keypoints);
+            template<typename _Tp> int filterNanCopy(const Mat& cloud);
+            template<typename _Msk> void filterNanColorsCopy(const Mat& cloud_colors, const Mat& mask, int total);
+
+            template<typename _Tn, typename _Msk>
+            void filterNanNormalsCopy(const Mat& cloud_normals, const Mat& mask, int total);
+
+            template<typename _Tn, typename _Msk>
+            void filterNanTCoordsCopy(const Mat& tcoords, const Mat& mask, int total);
+        };
     }
-    else if (RUN_PLAIN_IMPL)
-    {
-        std::vector<cv::KeyPoint> cpu_keypoints;
-
-        TEST_CYCLE() cv::FAST(img, cpu_keypoints, threshold, nonmaxSupression);
-
-        SANITY_CHECK_KEYPOINTS(cpu_keypoints);
-    }
-    else
-        OCL_PERF_ELSE;
 }
+
+#endif
diff --git a/modules/viz/src/vtk/vtkImageMatSource.cpp b/modules/viz/src/vtk/vtkImageMatSource.cpp
new file mode 100644
index 000000000..58a5642d4
--- /dev/null
+++ b/modules/viz/src/vtk/vtkImageMatSource.cpp
@@ -0,0 +1,143 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Authors:
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
+//M*/
+
+#include "precomp.hpp"
+
+namespace cv { namespace viz
+{
+    vtkStandardNewMacro(vtkImageMatSource);
+}}
+
+cv::viz::vtkImageMatSource::vtkImageMatSource()
+{
+    this->SetNumberOfInputPorts(0);
+    this->ImageData = vtkImageData::New();
+}
+
+int cv::viz::vtkImageMatSource::RequestInformation(vtkInformation *, vtkInformationVector**, vtkInformationVector *outputVector)
+{
+    vtkInformation* outInfo = outputVector->GetInformationObject(0);
+
+    outInfo->Set(vtkStreamingDemandDrivenPipeline::WHOLE_EXTENT(), this->ImageData->GetExtent(), 6);
+    outInfo->Set(vtkDataObject::SPACING(), 1.0, 1.0, 1.0);
+    outInfo->Set(vtkDataObject::ORIGIN(),  0.0, 0.0, 0.0);
+
+    vtkDataObject::SetPointDataActiveScalarInfo(outInfo, this->ImageData->GetScalarType(), this->ImageData->GetNumberOfScalarComponents());
+    return 1;
+}
+
+int cv::viz::vtkImageMatSource::RequestData(vtkInformation*, vtkInformationVector**, vtkInformationVector *outputVector)
+{
+     vtkInformation *outInfo = outputVector->GetInformationObject(0);
+
+     vtkImageData *output = vtkImageData::SafeDownCast(outInfo->Get(vtkDataObject::DATA_OBJECT()) );
+     output->ShallowCopy(this->ImageData);
+     return 1;
+}
+
+void cv::viz::vtkImageMatSource::SetImage(InputArray _image)
+{
+    CV_Assert(_image.depth() == CV_8U && (_image.channels() == 1 || _image.channels() == 3 || _image.channels() == 4));
+
+    Mat image = _image.getMat();
+
+    this->ImageData->SetDimensions(image.cols, image.rows, 1);
+#if VTK_MAJOR_VERSION <= 5
+    this->ImageData->SetNumberOfScalarComponents(image.channels());
+    this->ImageData->SetScalarTypeToUnsignedChar();
+    this->ImageData->AllocateScalars();
+#else
+    this->ImageData->AllocateScalars(VTK_UNSIGNED_CHAR, image.channels());
+#endif
+
+    switch(image.channels())
+    {
+    case 1: copyGrayImage(image, this->ImageData); break;
+    case 3: copyRGBImage (image, this->ImageData); break;
+    case 4: copyRGBAImage(image, this->ImageData); break;
+    }
+    this->ImageData->Modified();
+}
+
+void cv::viz::vtkImageMatSource::copyGrayImage(const Mat &source, vtkSmartPointer<vtkImageData> output)
+{
+    unsigned char* dptr = reinterpret_cast<unsigned char*>(output->GetScalarPointer());
+    size_t elem_step = output->GetIncrements()[1]/sizeof(unsigned char);
+
+    for (int y = 0; y < source.rows; ++y)
+    {
+        unsigned char* drow = dptr + elem_step * y;
+        const unsigned char *srow = source.ptr<unsigned char>(y);
+        for (int x = 0; x < source.cols; ++x)
+            drow[x] = *srow++;
+    }
+}
+
+void cv::viz::vtkImageMatSource::copyRGBImage(const Mat &source, vtkSmartPointer<vtkImageData> output)
+{
+    Vec3b* dptr = reinterpret_cast<Vec3b*>(output->GetScalarPointer());
+    size_t elem_step = output->GetIncrements()[1]/sizeof(Vec3b);
+
+    for (int y = 0; y < source.rows; ++y)
+    {
+        Vec3b* drow = dptr + elem_step * y;
+        const unsigned char *srow = source.ptr<unsigned char>(y);
+        for (int x = 0; x < source.cols; ++x, srow += source.channels())
+            drow[x] = Vec3b(srow[2], srow[1], srow[0]);
+    }
+}
+
+void cv::viz::vtkImageMatSource::copyRGBAImage(const Mat &source, vtkSmartPointer<vtkImageData> output)
+{
+    Vec4b* dptr = reinterpret_cast<Vec4b*>(output->GetScalarPointer());
+    size_t elem_step = output->GetIncrements()[1]/sizeof(Vec4b);
+
+    for (int y = 0; y < source.rows; ++y)
+    {
+        Vec4b* drow = dptr + elem_step * y;
+        const unsigned char *srow = source.ptr<unsigned char>(y);
+        for (int x = 0; x < source.cols; ++x, srow += source.channels())
+            drow[x] = Vec4b(srow[2], srow[1], srow[0], srow[3]);
+    }
+}
diff --git a/modules/ocl/test/test_fast.cpp b/modules/viz/src/vtk/vtkImageMatSource.h
similarity index 62%
rename from modules/ocl/test/test_fast.cpp
rename to modules/viz/src/vtk/vtkImageMatSource.h
index 19ff68e12..db0c093ed 100644
--- a/modules/ocl/test/test_fast.cpp
+++ b/modules/viz/src/vtk/vtkImageMatSource.h
@@ -38,56 +38,45 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 // Authors:
-//  * Peter Andreas Entschev, peter@entschev.com
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
 //M*/
 
-#include "test_precomp.hpp"
+#include "precomp.hpp"
 
-#ifdef HAVE_OPENCL
+#ifndef __vtkImageMatSource_h
+#define __vtkImageMatSource_h
 
-////////////////////////////////////////////////////////
-// FAST
-
-namespace
+namespace cv
 {
-    IMPLEMENT_PARAM_CLASS(FAST_Threshold, int)
-    IMPLEMENT_PARAM_CLASS(FAST_NonmaxSupression, bool)
-}
-
-PARAM_TEST_CASE(FAST, FAST_Threshold, FAST_NonmaxSupression)
-{
-    int threshold;
-    bool nonmaxSupression;
-
-    virtual void SetUp()
+    namespace viz
     {
-        threshold = GET_PARAM(0);
-        nonmaxSupression = GET_PARAM(1);
+        class vtkImageMatSource : public vtkImageAlgorithm
+        {
+        public:
+            static vtkImageMatSource *New();
+            vtkTypeMacro(vtkImageMatSource,vtkImageAlgorithm);
+
+            void SetImage(InputArray image);
+
+        protected:
+            vtkImageMatSource();
+            ~vtkImageMatSource() {}
+
+            vtkSmartPointer<vtkImageData> ImageData;
+
+            int RequestInformation(vtkInformation*, vtkInformationVector**, vtkInformationVector*);
+            int RequestData (vtkInformation*, vtkInformationVector**, vtkInformationVector*);
+        private:
+            vtkImageMatSource(const vtkImageMatSource&);  // Not implemented.
+            void operator=(const vtkImageMatSource&);  // Not implemented.
+
+            static void copyGrayImage(const Mat &source, vtkSmartPointer<vtkImageData> output);
+            static void copyRGBImage (const Mat &source, vtkSmartPointer<vtkImageData> output);
+            static void copyRGBAImage(const Mat &source, vtkSmartPointer<vtkImageData> output);
+        };
     }
-};
-
-OCL_TEST_P(FAST, Accuracy)
-{
-    cv::Mat image = readImage("gpu/perf/aloe.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(image.empty());
-
-    cv::ocl::FAST_OCL fast(threshold);
-    fast.nonmaxSupression = nonmaxSupression;
-
-    cv::ocl::oclMat ocl_image = cv::ocl::oclMat(image);
-
-    std::vector<cv::KeyPoint> keypoints;
-    fast(ocl_image, cv::ocl::oclMat(), keypoints);
-
-    std::vector<cv::KeyPoint> keypoints_gold;
-    cv::FAST(image, keypoints_gold, threshold, nonmaxSupression);
-
-    ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints);
 }
 
-INSTANTIATE_TEST_CASE_P(OCL_Features2D, FAST, testing::Combine(
-                        testing::Values(FAST_Threshold(25), FAST_Threshold(50)),
-                        testing::Values(FAST_NonmaxSupression(false), FAST_NonmaxSupression(true))));
 
 #endif
diff --git a/modules/viz/src/vtk/vtkOBJWriter.cpp b/modules/viz/src/vtk/vtkOBJWriter.cpp
new file mode 100644
index 000000000..452ad19a7
--- /dev/null
+++ b/modules/viz/src/vtk/vtkOBJWriter.cpp
@@ -0,0 +1,241 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Authors:
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
+//M*/
+
+#include "precomp.hpp"
+
+namespace cv { namespace viz
+{
+    vtkStandardNewMacro(vtkOBJWriter);
+}}
+
+cv::viz::vtkOBJWriter::vtkOBJWriter()
+{
+    std::ofstream fout; // only used to extract the default precision
+    this->DecimalPrecision = fout.precision();
+    this->FileName = NULL;
+    this->FileType = VTK_ASCII;
+}
+
+cv::viz::vtkOBJWriter::~vtkOBJWriter(){}
+
+void cv::viz::vtkOBJWriter::WriteData()
+{
+    vtkPolyData *input = this->GetInput();
+    if (!input)
+        return;
+
+    std::ostream *outfilep = this->OpenVTKFile();
+    if (!outfilep)
+        return;
+
+    std::ostream& outfile = *outfilep;
+
+    //write header
+    outfile << "# wavefront obj file written by the visualization toolkit" << std::endl << std::endl;
+    outfile << "mtllib NONE" << std::endl << std::endl;
+
+    // write out the points
+    for (int i = 0; i < input->GetNumberOfPoints(); i++)
+    {
+        Vec3d p;
+        input->GetPoint(i, p.val);
+        outfile << std::setprecision(this->DecimalPrecision) << "v " << p[0] << " " << p[1] << " " << p[2] << std::endl;
+    }
+
+    const int idStart = 1;
+
+    // write out the point data
+    vtkSmartPointer<vtkDataArray> normals = input->GetPointData()->GetNormals();
+    if(normals)
+    {
+        for (int i = 0; i < normals->GetNumberOfTuples(); i++)
+        {
+            Vec3d p;
+            normals->GetTuple(i, p.val);
+            outfile << std::setprecision(this->DecimalPrecision) << "vn " << p[0] << " " << p[1] << " " << p[2] << std::endl;
+        }
+    }
+
+    vtkSmartPointer<vtkDataArray> tcoords = input->GetPointData()->GetTCoords();
+    if (tcoords)
+    {
+        for (int i = 0; i < tcoords->GetNumberOfTuples(); i++)
+        {
+            Vec2d p;
+            tcoords->GetTuple(i, p.val);
+            outfile << std::setprecision(this->DecimalPrecision) << "vt " << p[0] << " " << p[1] << std::endl;
+        }
+    }
+
+    // write out a group name and material
+    outfile << std::endl << "g grp" << idStart << std::endl;
+    outfile << "usemtl mtlNONE" << std::endl;
+
+    // write out verts if any
+    if (input->GetNumberOfVerts() > 0)
+    {
+        vtkIdType npts = 0, *index = 0;
+        vtkCellArray *cells = input->GetVerts();
+        for (cells->InitTraversal(); cells->GetNextCell(npts, index); )
+        {
+            outfile << "p ";
+            for (int i = 0; i < npts; i++)
+                outfile << index[i] + idStart << " ";
+            outfile << std::endl;
+        }
+    }
+
+    // write out lines if any
+    if (input->GetNumberOfLines() > 0)
+    {
+        vtkIdType npts = 0, *index = 0;
+        vtkCellArray *cells = input->GetLines();
+        for (cells->InitTraversal(); cells->GetNextCell(npts, index); )
+        {
+            outfile << "l ";
+            if (tcoords)
+            {
+                for (int i = 0; i < npts; i++)
+                    outfile << index[i] + idStart << "/" << index[i] + idStart << " ";
+            }
+            else
+                for (int i = 0; i < npts; i++)
+                    outfile << index[i] + idStart << " ";
+
+            outfile << std::endl;
+        }
+    }
+
+    // write out polys if any
+    if (input->GetNumberOfPolys() > 0)
+    {
+        vtkIdType npts = 0, *index = 0;
+        vtkCellArray *cells = input->GetPolys();
+        for (cells->InitTraversal(); cells->GetNextCell(npts, index); )
+        {
+            outfile << "f ";
+            for (int i = 0; i < npts; i++)
+            {
+                if (normals)
+                {
+                    if (tcoords)
+                        outfile << index[i] + idStart << "/"  << index[i] + idStart << "/" << index[i] + idStart << " ";
+                    else
+                        outfile << index[i] + idStart << "//" << index[i] + idStart << " ";
+                }
+                else
+                {
+                    if (tcoords)
+                        outfile << index[i] + idStart << " " << index[i] + idStart << " ";
+                    else
+                        outfile << index[i] + idStart << " ";
+                }
+            }
+            outfile << std::endl;
+        }
+    }
+
+    // write out tstrips if any
+    if (input->GetNumberOfStrips() > 0)
+    {
+        vtkIdType npts = 0, *index = 0;
+        vtkCellArray *cells = input->GetStrips();
+        for (cells->InitTraversal(); cells->GetNextCell(npts, index); )
+        {
+            for (int i = 2, i1, i2; i < npts; ++i)
+            {
+                if (i % 2)
+                {
+                    i1 = i - 1;
+                    i2 = i - 2;
+                }
+                else
+                {
+                    i1 = i - 1;
+                    i2 = i - 2;
+                }
+
+                if(normals)
+                {
+                    if (tcoords)
+                    {
+                        outfile << "f " << index[i1] + idStart << "/" << index[i1] + idStart << "/" << index[i1] + idStart << " "
+                            << index[i2]+ idStart << "/" << index[i2] + idStart << "/" << index[i2] + idStart << " "
+                            << index[i] + idStart << "/" << index[i]  + idStart << "/" << index[i]  + idStart << std::endl;
+                    }
+                    else
+                    {
+                        outfile << "f " << index[i1] + idStart << "//" << index[i1] + idStart << " " << index[i2] + idStart
+                                << "//" << index[i2] + idStart << " "  << index[i]  + idStart << "//" << index[i] + idStart << std::endl;
+                    }
+                }
+                else
+                {
+                    if (tcoords)
+                    {
+                        outfile << "f " << index[i1] + idStart << "/" << index[i1] + idStart << " " << index[i2] + idStart
+                                << "/" << index[i2] + idStart << " "  << index[i]  + idStart << "/" << index[i]  + idStart << std::endl;
+                    }
+                    else
+                        outfile << "f " << index[i1] + idStart << " " << index[i2] + idStart << " " << index[i] + idStart << std::endl;
+                }
+            } /* for (int i = 2; i < npts; ++i) */
+        }
+    } /* if (input->GetNumberOfStrips() > 0) */
+
+    this->CloseVTKFile(outfilep);
+
+    // Delete the file if an error occurred
+    if (this->ErrorCode == vtkErrorCode::OutOfDiskSpaceError)
+    {
+        vtkErrorMacro("Ran out of disk space; deleting file: " << this->FileName);
+        unlink(this->FileName);
+    }
+}
+
+void cv::viz::vtkOBJWriter::PrintSelf(ostream& os, vtkIndent indent)
+{
+    Superclass::PrintSelf(os, indent);
+    os << indent << "DecimalPrecision: " << DecimalPrecision << "\n";
+}
diff --git a/modules/viz/src/precomp.cpp b/modules/viz/src/vtk/vtkOBJWriter.h
similarity index 74%
rename from modules/viz/src/precomp.cpp
rename to modules/viz/src/vtk/vtkOBJWriter.h
index 834648577..f8889884d 100644
--- a/modules/viz/src/precomp.cpp
+++ b/modules/viz/src/vtk/vtkOBJWriter.h
@@ -38,12 +38,42 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 // Authors:
-//  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
-#include "precomp.hpp"
+#ifndef __vtkOBJWriter_h
+#define __vtkOBJWriter_h
+
+#include <vtkPolyDataWriter.h>
+
+namespace cv
+{
+    namespace viz
+    {
+        class vtkOBJWriter : public vtkPolyDataWriter
+        {
+        public:
+          static vtkOBJWriter *New();
+          vtkTypeMacro(vtkOBJWriter,vtkPolyDataWriter)
+          void PrintSelf(ostream& os, vtkIndent indent);
+
+          vtkGetMacro(DecimalPrecision, int);
+          vtkSetMacro(DecimalPrecision, int);
+
+        protected:
+          vtkOBJWriter();
+          ~vtkOBJWriter();
+
+          void WriteData();
+
+          int DecimalPrecision;
+
+        private:
+          vtkOBJWriter(const vtkOBJWriter&);  // Not implemented.
+          void operator=(const vtkOBJWriter&);  // Not implemented.
+        };
+    }
+}
+
+#endif
diff --git a/modules/viz/src/vtk/vtkTrajectorySource.cpp b/modules/viz/src/vtk/vtkTrajectorySource.cpp
new file mode 100644
index 000000000..e098a1d55
--- /dev/null
+++ b/modules/viz/src/vtk/vtkTrajectorySource.cpp
@@ -0,0 +1,110 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+// Authors:
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
+//M*/
+
+#include "precomp.hpp"
+
+namespace cv { namespace viz
+{
+    vtkStandardNewMacro(vtkTrajectorySource);
+}}
+
+cv::viz::vtkTrajectorySource::vtkTrajectorySource() { SetNumberOfInputPorts(0); }
+cv::viz::vtkTrajectorySource::~vtkTrajectorySource() {}
+
+void cv::viz::vtkTrajectorySource::SetTrajectory(InputArray _traj)
+{
+    CV_Assert(_traj.kind() == _InputArray::STD_VECTOR || _traj.kind() == _InputArray::MAT);
+    CV_Assert(_traj.type() == CV_32FC(16) || _traj.type() == CV_64FC(16));
+
+    Mat traj;
+    _traj.getMat().convertTo(traj, CV_64F);
+    const Affine3d* dpath = traj.ptr<Affine3d>();
+    size_t total = traj.total();
+
+    points = vtkSmartPointer<vtkPoints>::New();
+    points->SetDataType(VTK_DOUBLE);
+    points->SetNumberOfPoints(total);
+
+    tensors = vtkSmartPointer<vtkDoubleArray>::New();
+    tensors->SetNumberOfComponents(9);
+    tensors->SetNumberOfTuples(total);
+
+    for(size_t i = 0; i < total; ++i, ++dpath)
+    {
+        Matx33d R = dpath->rotation().t();  // transposed because of
+        tensors->SetTuple(i, R.val);        // column major order
+
+        Vec3d p = dpath->translation();
+        points->SetPoint(i, p.val);
+    }
+}
+
+cv::Mat cv::viz::vtkTrajectorySource::ExtractPoints(InputArray _traj)
+{
+    CV_Assert(_traj.kind() == _InputArray::STD_VECTOR || _traj.kind() == _InputArray::MAT);
+    CV_Assert(_traj.type() == CV_32FC(16) || _traj.type() == CV_64FC(16));
+
+    Mat points(1, _traj.total(), CV_MAKETYPE(_traj.depth(), 3));
+    const Affine3d* dpath = _traj.getMat().ptr<Affine3d>();
+    const Affine3f* fpath = _traj.getMat().ptr<Affine3f>();
+
+    if (_traj.depth() == CV_32F)
+        for(int i = 0; i < points.cols; ++i)
+            points.at<Vec3f>(i) = fpath[i].translation();
+
+    if (_traj.depth() == CV_64F)
+        for(int i = 0; i < points.cols; ++i)
+            points.at<Vec3d>(i) = dpath[i].translation();
+
+    return points;
+}
+
+int cv::viz::vtkTrajectorySource::RequestData(vtkInformation *vtkNotUsed(request), vtkInformationVector **vtkNotUsed(inputVector), vtkInformationVector *outputVector)
+{
+    vtkInformation *outInfo = outputVector->GetInformationObject(0);
+    vtkPolyData *output = vtkPolyData::SafeDownCast(outInfo->Get(vtkDataObject::DATA_OBJECT()));
+    output->SetPoints(points);
+    output->GetPointData()->SetTensors(tensors);
+    return 1;
+}
diff --git a/modules/bioinspired/include/opencv2/bioinspired/bioinspired.hpp b/modules/viz/src/vtk/vtkTrajectorySource.h
similarity index 63%
rename from modules/bioinspired/include/opencv2/bioinspired/bioinspired.hpp
rename to modules/viz/src/vtk/vtkTrajectorySource.h
index 40be2854e..f6c9c77b9 100644
--- a/modules/bioinspired/include/opencv2/bioinspired/bioinspired.hpp
+++ b/modules/viz/src/vtk/vtkTrajectorySource.h
@@ -7,11 +7,9 @@
 //  copy or use the software.
 //
 //
-//                          License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
@@ -39,10 +37,48 @@
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
+// Authors:
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
 //M*/
 
-#ifdef __OPENCV_BUILD
-#error this is a compatibility header which should not be used inside the OpenCV library
-#endif
+#ifndef __vtkTrajectorySource_h
+#define __vtkTrajectorySource_h
 
-#include "opencv2/bioinspired.hpp"
+#include <opencv2/core/mat.hpp>
+#include <vtkPolyDataAlgorithm.h>
+#include <vtkSmartPointer.h>
+#include <vtkPoints.h>
+#include <vtkCellArray.h>
+
+namespace cv
+{
+    namespace viz
+    {
+        class vtkTrajectorySource : public vtkPolyDataAlgorithm
+        {
+        public:
+            static vtkTrajectorySource *New();
+            vtkTypeMacro(vtkTrajectorySource,vtkPolyDataAlgorithm)
+
+            virtual void SetTrajectory(InputArray trajectory);
+
+            static Mat ExtractPoints(InputArray trajectory);
+
+        protected:
+            vtkTrajectorySource();
+            ~vtkTrajectorySource();
+
+            vtkSmartPointer<vtkPoints> points;
+            vtkSmartPointer<vtkDoubleArray> tensors;
+
+            int RequestData(vtkInformation *, vtkInformationVector **, vtkInformationVector *);
+        private:
+            vtkTrajectorySource(const vtkTrajectorySource&);  // Not implemented.
+            void operator=(const vtkTrajectorySource&);  // Not implemented.
+
+        };
+    }
+}
+
+#endif
diff --git a/modules/ocl/src/cl_programcache.hpp b/modules/viz/src/vtk/vtkXYZWriter.cpp
similarity index 59%
rename from modules/ocl/src/cl_programcache.hpp
rename to modules/viz/src/vtk/vtkXYZWriter.cpp
index ebf3e7676..4518a0103 100644
--- a/modules/ocl/src/cl_programcache.hpp
+++ b/modules/viz/src/vtk/vtkXYZWriter.cpp
@@ -10,12 +10,9 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
-// @Authors
-//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -40,46 +37,57 @@
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
+// Authors:
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
 //M*/
 
 #include "precomp.hpp"
 
-namespace cv {
-namespace ocl {
-
-class ProgramCache
+namespace cv { namespace viz
 {
-protected:
-    ProgramCache();
-    ~ProgramCache();
-public:
-    static ProgramCache *getProgramCache();
+    vtkStandardNewMacro(vtkXYZWriter);
+}}
 
-    cl_program getProgram(const Context *ctx, const cv::ocl::ProgramEntry* source,
-                          const char *build_options);
+cv::viz::vtkXYZWriter::vtkXYZWriter()
+{
+    std::ofstream fout; // only used to extract the default precision
+    this->DecimalPrecision = fout.precision();
+}
 
-    void releaseProgram();
-protected:
-    //lookup the binary given the file name
-    // (with acquired mutexCache)
-    cl_program progLookup(const String& srcsign);
+void cv::viz::vtkXYZWriter::WriteData()
+{
+    vtkPolyData *input = this->GetInput();
+    if (!input)
+        return;
 
-    //add program to the cache
-    // (with acquired mutexCache)
-    void addProgram(const String& srcsign, cl_program program);
+    // OpenVTKFile() will report any errors that happen
+    ostream *outfilep = this->OpenVTKFile();
+    if (!outfilep)
+        return;
 
-    std::map <String, cl_program> codeCache;
-    unsigned int cacheSize;
+    ostream &outfile = *outfilep;
 
-    //The presumed watermark for the cache volume (256MB). Is it enough?
-    //We may need more delicate algorithms when necessary later.
-    //Right now, let's just leave it along.
-    static const unsigned MAX_PROG_CACHE_SIZE = 1024;
+    for(vtkIdType i = 0; i < input->GetNumberOfPoints(); ++i)
+    {
+        Vec3d p;
+        input->GetPoint(i, p.val);
+        outfile << std::setprecision(this->DecimalPrecision) << p[0] << " " << p[1] << " " << p[2] << std::endl;
+    }
 
-    // acquire both mutexes in this order: 1) mutexFiles 2) mutexCache
-    static cv::Mutex mutexFiles;
-    static cv::Mutex mutexCache;
-};
+    // Close the file
+    this->CloseVTKFile(outfilep);
 
-}//namespace ocl
-}//namespace cv
+    // Delete the file if an error occurred
+    if (this->ErrorCode == vtkErrorCode::OutOfDiskSpaceError)
+    {
+        vtkErrorMacro("Ran out of disk space; deleting file: " << this->FileName);
+        unlink(this->FileName);
+    }
+}
+
+void cv::viz::vtkXYZWriter::PrintSelf(ostream& os, vtkIndent indent)
+{
+    this->Superclass::PrintSelf(os,indent);
+    os << indent << "DecimalPrecision: " << this->DecimalPrecision << "\n";
+}
diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/viz/src/vtk/vtkXYZWriter.h
similarity index 71%
rename from modules/ocl/include/opencv2/ocl/ocl.hpp
rename to modules/viz/src/vtk/vtkXYZWriter.h
index 3dd46545a..3db18b793 100644
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/viz/src/vtk/vtkXYZWriter.h
@@ -10,8 +10,6 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
@@ -39,10 +37,42 @@
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
+// Authors:
+//  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
+//
 //M*/
 
-#ifdef __OPENCV_BUILD
-#error this is a compatibility header which should not be used inside the OpenCV library
-#endif
+#ifndef __vtkXYZWriter_h
+#define __vtkXYZWriter_h
 
-#include "opencv2/ocl.hpp"
+#include "vtkPolyDataWriter.h"
+
+namespace cv
+{
+    namespace viz
+    {
+        class vtkXYZWriter : public vtkPolyDataWriter
+        {
+        public:
+            static vtkXYZWriter *New();
+            vtkTypeMacro(vtkXYZWriter,vtkPolyDataWriter)
+            void PrintSelf(ostream& os, vtkIndent indent);
+
+            vtkGetMacro(DecimalPrecision, int)
+            vtkSetMacro(DecimalPrecision, int)
+
+        protected:
+            vtkXYZWriter();
+            ~vtkXYZWriter(){}
+
+            void WriteData();
+
+            int DecimalPrecision;
+
+        private:
+            vtkXYZWriter(const vtkXYZWriter&);  // Not implemented.
+            void operator=(const vtkXYZWriter&);  // Not implemented.
+        };
+    }
+}
+#endif
diff --git a/modules/viz/src/widget.cpp b/modules/viz/src/widget.cpp
index 04abdcf7a..33b467ebc 100644
--- a/modules/viz/src/widget.cpp
+++ b/modules/viz/src/widget.cpp
@@ -41,9 +41,6 @@
 //  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
 #include "precomp.hpp"
@@ -55,7 +52,6 @@ class cv::viz::Widget::Impl
 {
 public:
     vtkSmartPointer<vtkProp> prop;
-
     Impl() : prop(0) {}
 };
 
@@ -63,13 +59,17 @@ cv::viz::Widget::Widget() : impl_( new Impl() ) { }
 
 cv::viz::Widget::Widget(const Widget& other) : impl_( new Impl() )
 {
-    if (other.impl_ && other.impl_->prop) impl_->prop = other.impl_->prop;
+    if (other.impl_ && other.impl_->prop)
+        impl_->prop = other.impl_->prop;
 }
 
 cv::viz::Widget& cv::viz::Widget::operator=(const Widget& other)
 {
-    if (!impl_) impl_ = new Impl();
-    if (other.impl_) impl_->prop = other.impl_->prop;
+    if (!impl_)
+        impl_ = new Impl();
+
+    if (other.impl_)
+        impl_->prop = other.impl_->prop;
     return *this;
 }
 
@@ -84,45 +84,22 @@ cv::viz::Widget::~Widget()
 
 cv::viz::Widget cv::viz::Widget::fromPlyFile(const String &file_name)
 {
+    CV_Assert(vtkPLYReader::CanReadFile(file_name.c_str()));
+
     vtkSmartPointer<vtkPLYReader> reader = vtkSmartPointer<vtkPLYReader>::New();
     reader->SetFileName(file_name.c_str());
 
-    vtkSmartPointer<vtkDataSet> data = reader->GetOutput();
-    CV_Assert("File does not exist or file format is not supported." && data);
-
-    vtkSmartPointer<vtkLODActor> actor = vtkSmartPointer<vtkLODActor>::New();
-
     vtkSmartPointer<vtkDataSetMapper> mapper = vtkSmartPointer<vtkDataSetMapper>::New();
-#if VTK_MAJOR_VERSION <= 5
-    mapper->SetInput(data);
-#else
-    mapper->SetInputData(data);
-#endif
-
-    vtkSmartPointer<vtkDataArray> scalars = data->GetPointData()->GetScalars();
-    if (scalars)
-    {
-        cv::Vec3d minmax(scalars->GetRange());
-        mapper->SetScalarRange(minmax.val);
-        mapper->SetScalarModeToUsePointData();
-
-        // interpolation OFF, if data is a vtkPolyData that contains only vertices, ON for anything else.
-        vtkPolyData* polyData = vtkPolyData::SafeDownCast(data);
-        bool interpolation = (polyData && polyData->GetNumberOfCells() != polyData->GetNumberOfVerts());
-
-        mapper->SetInterpolateScalarsBeforeMapping(interpolation);
-        mapper->ScalarVisibilityOn();
-    }
+    mapper->SetInputConnection( reader->GetOutputPort() );
     mapper->ImmediateModeRenderingOff();
 
-    actor->SetNumberOfCloudPoints(int(std::max<vtkIdType>(1, data->GetNumberOfPoints() / 10)));
+    vtkSmartPointer<vtkActor> actor = vtkSmartPointer<vtkActor>::New();
     actor->GetProperty()->SetInterpolationToFlat();
     actor->GetProperty()->BackfaceCullingOn();
-
     actor->SetMapper(mapper);
 
     Widget widget;
-    widget.impl_->prop = actor;
+    WidgetAccessor::setProp(widget, actor);
     return widget;
 }
 
@@ -133,37 +110,15 @@ void cv::viz::Widget::setRenderingProperty(int property, double value)
 
     switch (property)
     {
-        case POINT_SIZE:
-        {
-            actor->GetProperty()->SetPointSize(float(value));
-            actor->Modified();
-            break;
-        }
-        case OPACITY:
-        {
-            actor->GetProperty()->SetOpacity(value);
-            actor->Modified();
-            break;
-        }
-        case IMMEDIATE_RENDERING:
-        {
-            actor->GetMapper()->SetImmediateModeRendering(int(value));
-            actor->Modified();
-            break;
-        }
-        case LINE_WIDTH:
-        {
-            actor->GetProperty()->SetLineWidth(float(value));
-            actor->Modified();
-            break;
-        }
+        case POINT_SIZE:          actor->GetProperty()->SetPointSize(float(value)); break;
+        case OPACITY:             actor->GetProperty()->SetOpacity(value);          break;
+        case LINE_WIDTH:          actor->GetProperty()->SetLineWidth(float(value)); break;
+        case IMMEDIATE_RENDERING: actor->GetMapper()->SetImmediateModeRendering(int(value)); break;
         case FONT_SIZE:
         {
             vtkTextActor* text_actor = vtkTextActor::SafeDownCast(actor);
             CV_Assert("Widget does not have text content." && text_actor);
-            vtkSmartPointer<vtkTextProperty> tprop = text_actor->GetTextProperty();
-            tprop->SetFontSize(int(value));
-            text_actor->Modified();
+            text_actor->GetTextProperty()->SetFontSize(int(value));
             break;
         }
         case REPRESENTATION:
@@ -174,7 +129,6 @@ void cv::viz::Widget::setRenderingProperty(int property, double value)
                 case REPRESENTATION_WIREFRAME: actor->GetProperty()->SetRepresentationToWireframe(); break;
                 case REPRESENTATION_SURFACE:   actor->GetProperty()->SetRepresentationToSurface();  break;
             }
-            actor->Modified();
             break;
         }
         case SHADING:
@@ -186,14 +140,11 @@ void cv::viz::Widget::setRenderingProperty(int property, double value)
                 {
                     if (!actor->GetMapper()->GetInput()->GetPointData()->GetNormals())
                     {
-                        vtkSmartPointer<vtkPolyDataNormals> normals = vtkSmartPointer<vtkPolyDataNormals>::New();
-#if VTK_MAJOR_VERSION <= 5
-                        normals->SetInput(actor->GetMapper()->GetInput());
-#else
-                        normals->SetInputData(actor->GetMapper()->GetInput());
-#endif
-                        normals->Update();
-                        vtkDataSetMapper::SafeDownCast(actor->GetMapper())->SetInputConnection(normals->GetOutputPort());
+                        vtkSmartPointer<vtkPolyDataMapper> mapper = vtkPolyDataMapper::SafeDownCast(actor->GetMapper());
+                        CV_Assert("Can't set shading property for such type of widget" && mapper);
+
+                        vtkSmartPointer<vtkPolyData> with_normals = VtkUtils::ComputeNormals(mapper->GetInput());
+                        VtkUtils::SetInputData(mapper, with_normals);
                     }
                     actor->GetProperty()->SetInterpolationToGouraud();
                     break;
@@ -202,27 +153,22 @@ void cv::viz::Widget::setRenderingProperty(int property, double value)
                 {
                     if (!actor->GetMapper()->GetInput()->GetPointData()->GetNormals())
                     {
-                        vtkSmartPointer<vtkPolyDataNormals> normals = vtkSmartPointer<vtkPolyDataNormals>::New();
-#if VTK_MAJOR_VERSION <= 5
-                        normals->SetInput(actor->GetMapper()->GetInput());
-#else
-                        normals->SetInputData(actor->GetMapper()->GetInput());
-#endif
-                        normals->Update();
-                        vtkDataSetMapper::SafeDownCast(actor->GetMapper())->SetInputConnection(normals->GetOutputPort());
+                        vtkSmartPointer<vtkPolyDataMapper> mapper = vtkPolyDataMapper::SafeDownCast(actor->GetMapper());
+                        CV_Assert("Can't set shading property for such type of widget" && mapper);
+
+                        vtkSmartPointer<vtkPolyData> with_normals = VtkUtils::ComputeNormals(mapper->GetInput());
+                        VtkUtils::SetInputData(mapper, with_normals);
                     }
                     actor->GetProperty()->SetInterpolationToPhong();
                     break;
                 }
             }
-            actor->Modified();
             break;
         }
-
-
         default:
             CV_Assert("setPointCloudRenderingProperties: Unknown property");
     }
+    actor->Modified();
 }
 
 double cv::viz::Widget::getRenderingProperty(int property) const
@@ -233,32 +179,16 @@ double cv::viz::Widget::getRenderingProperty(int property) const
     double value = 0.0;
     switch (property)
     {
-        case POINT_SIZE:
-        {
-            value = actor->GetProperty()->GetPointSize();
-            break;
-        }
-        case OPACITY:
-        {
-            value = actor->GetProperty()->GetOpacity();
-            break;
-        }
-        case IMMEDIATE_RENDERING:
-        {
-            value = actor->GetMapper()->GetImmediateModeRendering();
-            break;
-        }
-        case LINE_WIDTH:
-        {
-            value = actor->GetProperty()->GetLineWidth();
-            break;
-        }
+        case POINT_SIZE: value = actor->GetProperty()->GetPointSize(); break;
+        case OPACITY:    value = actor->GetProperty()->GetOpacity();   break;
+        case LINE_WIDTH: value = actor->GetProperty()->GetLineWidth(); break;
+        case IMMEDIATE_RENDERING:  value = actor->GetMapper()->GetImmediateModeRendering();  break;
+
         case FONT_SIZE:
         {
             vtkTextActor* text_actor = vtkTextActor::SafeDownCast(actor);
             CV_Assert("Widget does not have text content." && text_actor);
-            vtkSmartPointer<vtkTextProperty> tprop = text_actor->GetTextProperty();
-            value = tprop->GetFontSize();
+            value = text_actor->GetTextProperty()->GetFontSize();;
             break;
         }
         case REPRESENTATION:
@@ -303,38 +233,17 @@ void cv::viz::WidgetAccessor::setProp(Widget& widget, vtkSmartPointer<vtkProp> p
 ///////////////////////////////////////////////////////////////////////////////////////////////
 /// widget3D implementation
 
-struct cv::viz::Widget3D::MatrixConverter
-{
-    static Matx44f convertToMatx(const vtkSmartPointer<vtkMatrix4x4>& vtk_matrix)
-    {
-        Matx44f m;
-        for (int i = 0; i < 4; i++)
-            for (int k = 0; k < 4; k++)
-                m(i, k) = vtk_matrix->GetElement(i, k);
-        return m;
-    }
-
-    static vtkSmartPointer<vtkMatrix4x4> convertToVtkMatrix(const Matx44f& m)
-    {
-        vtkSmartPointer<vtkMatrix4x4> vtk_matrix = vtkSmartPointer<vtkMatrix4x4>::New();
-        for (int i = 0; i < 4; i++)
-            for (int k = 0; k < 4; k++)
-                vtk_matrix->SetElement(i, k, m(i, k));
-        return vtk_matrix;
-    }
-};
-
-void cv::viz::Widget3D::setPose(const Affine3f &pose)
+void cv::viz::Widget3D::setPose(const Affine3d &pose)
 {
     vtkProp3D *actor = vtkProp3D::SafeDownCast(WidgetAccessor::getProp(*this));
     CV_Assert("Widget is not 3D." && actor);
 
-    vtkSmartPointer<vtkMatrix4x4> matrix = convertToVtkMatrix(pose.matrix);
+    vtkSmartPointer<vtkMatrix4x4> matrix = vtkmatrix(pose.matrix);
     actor->SetUserMatrix(matrix);
     actor->Modified();
 }
 
-void cv::viz::Widget3D::updatePose(const Affine3f &pose)
+void cv::viz::Widget3D::updatePose(const Affine3d &pose)
 {
     vtkProp3D *actor = vtkProp3D::SafeDownCast(WidgetAccessor::getProp(*this));
     CV_Assert("Widget is not 3D." && actor);
@@ -343,25 +252,33 @@ void cv::viz::Widget3D::updatePose(const Affine3f &pose)
     if (!matrix)
     {
         setPose(pose);
-        return ;
+        return;
     }
-    Matx44f matrix_cv = MatrixConverter::convertToMatx(matrix);
 
-    Affine3f updated_pose = pose * Affine3f(matrix_cv);
-    matrix = MatrixConverter::convertToVtkMatrix(updated_pose.matrix);
+    Affine3d updated_pose = pose * Affine3d(*matrix->Element);
+    matrix = vtkmatrix(updated_pose.matrix);
 
     actor->SetUserMatrix(matrix);
     actor->Modified();
 }
 
-cv::Affine3f cv::viz::Widget3D::getPose() const
+cv::Affine3d cv::viz::Widget3D::getPose() const
 {
     vtkProp3D *actor = vtkProp3D::SafeDownCast(WidgetAccessor::getProp(*this));
     CV_Assert("Widget is not 3D." && actor);
+    return Affine3d(*actor->GetUserMatrix()->Element);
+}
 
-    vtkSmartPointer<vtkMatrix4x4> matrix = actor->GetUserMatrix();
-    Matx44f matrix_cv = MatrixConverter::convertToMatx(matrix);
-    return Affine3f(matrix_cv);
+void cv::viz::Widget3D::applyTransform(const Affine3d &transform)
+{
+    vtkActor *actor = vtkActor::SafeDownCast(WidgetAccessor::getProp(*this));
+    CV_Assert("Widget is not 3D actor." && actor);
+
+    vtkSmartPointer<vtkPolyDataMapper> mapper = vtkPolyDataMapper::SafeDownCast(actor->GetMapper());
+    CV_Assert("Widget doesn't have a polydata mapper" && mapper);
+    mapper->Update();
+
+    VtkUtils::SetInputData(mapper, VtkUtils::TransformPolydata(mapper->GetInput(), transform));
 }
 
 void cv::viz::Widget3D::setColor(const Color &color)
diff --git a/modules/viz/test/test_main.cpp b/modules/viz/test/test_main.cpp
index 6b2499344..e737d2db3 100644
--- a/modules/viz/test/test_main.cpp
+++ b/modules/viz/test/test_main.cpp
@@ -1,3 +1,3 @@
 #include "test_precomp.hpp"
 
-CV_TEST_MAIN("cv")
+CV_TEST_MAIN("viz")
diff --git a/modules/viz/test/test_precomp.cpp b/modules/viz/test/test_precomp.cpp
index 5956e13e3..c2673fee6 100644
--- a/modules/viz/test/test_precomp.cpp
+++ b/modules/viz/test/test_precomp.cpp
@@ -1 +1,24 @@
 #include "test_precomp.hpp"
+
+cv::String cv::Path::combine(const String& item1, const String& item2)
+{
+    if (item1.empty())
+        return item2;
+
+    if (item2.empty())
+        return item1;
+
+    char last = item1[item1.size()-1];
+
+    bool need_append = last != '/' && last != '\\';
+    return item1 + (need_append ? "/" : "") + item2;
+}
+
+cv::String cv::Path::combine(const String& item1, const String& item2, const String& item3)
+{ return combine(combine(item1, item2), item3); }
+
+cv::String cv::Path::change_extension(const String& file, const String& ext)
+{
+    String::size_type pos = file.find_last_of('.');
+    return pos == String::npos ? file : file.substr(0, pos+1) + ext;
+}
diff --git a/modules/viz/test/test_precomp.hpp b/modules/viz/test/test_precomp.hpp
index 9b98f206b..1a5c4fe1c 100644
--- a/modules/viz/test/test_precomp.hpp
+++ b/modules/viz/test/test_precomp.hpp
@@ -41,9 +41,6 @@
 //  * Ozan Tonkal, ozantonkal@gmail.com
 //  * Anatoly Baksheev, Itseez Inc.  myname.mysurname <> mycompany.com
 //
-//  OpenCV Viz module is complete rewrite of
-//  PCL visualization module (www.pointclouds.org)
-//
 //M*/
 
 #ifdef __GNUC__
@@ -66,5 +63,42 @@
 #include <iostream>
 #include <fstream>
 #include <string>
+#include <limits>
+
+namespace cv
+{
+    struct Path
+    {
+        static String combine(const String& item1, const String& item2);
+        static String combine(const String& item1, const String& item2, const String& item3);
+        static String change_extension(const String& file, const String& ext);
+    };
+
+    inline cv::String get_dragon_ply_file_path()
+    {
+        return Path::combine(cvtest::TS::ptr()->get_data_path(), "dragon.ply");
+    }
+
+    template<typename _Tp>
+    inline std::vector< Affine3<_Tp> > generate_test_trajectory()
+    {
+        std::vector< Affine3<_Tp> > result;
+
+        for (int i = 0, j = 0; i <= 270; i += 3, j += 10)
+        {
+            double x = 2 * cos(i * 3 * CV_PI/180.0) * (1.0 + 0.5 * cos(1.2 + i * 1.2 * CV_PI/180.0));
+            double y = 0.25 + i/270.0 + sin(j * CV_PI/180.0) * 0.2 * sin(0.6 + j * 1.5 * CV_PI/180.0);
+            double z = 2 * sin(i * 3 * CV_PI/180.0) * (1.0 + 0.5 * cos(1.2 + i * CV_PI/180.0));
+            result.push_back(viz::makeCameraPose(Vec3d(x, y, z), Vec3d::all(0.0), Vec3d(0.0, 1.0, 0.0)));
+        }
+        return result;
+    }
+
+    inline Mat make_gray(const Mat& image)
+    {
+        Mat chs[3]; split(image, chs);
+        return 0.114 * chs[0] + 0.58 * chs[1] + 0.3 * chs[2];
+    }
+}
 
 #endif
diff --git a/modules/viz/test/test_tutorial2.cpp b/modules/viz/test/test_tutorial2.cpp
index 6e9189b8f..a901adc2c 100644
--- a/modules/viz/test/test_tutorial2.cpp
+++ b/modules/viz/test/test_tutorial2.cpp
@@ -12,34 +12,34 @@ void tutorial2()
     myWindow.showWidget("Coordinate Widget", viz::WCoordinateSystem());
 
     /// Add line to represent (1,1,1) axis
-    viz::WLine axis(Point3f(-1.0f,-1.0f,-1.0f), Point3f(1.0f,1.0f,1.0f));
+    viz::WLine axis(Point3f(-1.0, -1.0, -1.0), Point3d(1.0, 1.0, 1.0));
     axis.setRenderingProperty(viz::LINE_WIDTH, 4.0);
     myWindow.showWidget("Line Widget", axis);
 
     /// Construct a cube widget
-    viz::WCube cube_widget(Point3f(0.5,0.5,0.0), Point3f(0.0,0.0,-0.5), true, viz::Color::blue());
+    viz::WCube cube_widget(Point3d(0.5, 0.5, 0.0), Point3d(0.0, 0.0, -0.5), true, viz::Color::blue());
     cube_widget.setRenderingProperty(viz::LINE_WIDTH, 4.0);
 
     /// Display widget (update if already displayed)
     myWindow.showWidget("Cube Widget", cube_widget);
 
     /// Rodrigues vector
-    Mat rot_vec = Mat::zeros(1,3,CV_32F);
-    float translation_phase = 0.0, translation = 0.0;
+    Vec3d rot_vec = Vec3d::all(0);
+    double translation_phase = 0.0, translation = 0.0;
     while(!myWindow.wasStopped())
     {
         /* Rotation using rodrigues */
         /// Rotate around (1,1,1)
-        rot_vec.at<float>(0,0) += CV_PI * 0.01f;
-        rot_vec.at<float>(0,1) += CV_PI * 0.01f;
-        rot_vec.at<float>(0,2) += CV_PI * 0.01f;
+        rot_vec[0] += CV_PI * 0.01;
+        rot_vec[1] += CV_PI * 0.01;
+        rot_vec[2] += CV_PI * 0.01;
 
         /// Shift on (1,1,1)
-        translation_phase += CV_PI * 0.01f;
+        translation_phase += CV_PI * 0.01;
         translation = sin(translation_phase);
 
         /// Construct pose
-        Affine3f pose(rot_vec, Vec3f(translation, translation, translation));
+        Affine3d pose(rot_vec, Vec3d(translation, translation, translation));
 
         myWindow.setWidgetPose("Cube Widget", pose);
 
@@ -48,7 +48,7 @@ void tutorial2()
 }
 
 
-TEST(Viz_viz3d, DISABLED_tutorial2_pose_of_widget)
+TEST(Viz, DISABLED_tutorial2_pose_of_widget)
 {
     tutorial2();
 }
diff --git a/modules/viz/test/test_tutorial3.cpp b/modules/viz/test/test_tutorial3.cpp
index 6c63763df..590e29ebf 100644
--- a/modules/viz/test/test_tutorial3.cpp
+++ b/modules/viz/test/test_tutorial3.cpp
@@ -3,28 +3,6 @@
 using namespace cv;
 using namespace std;
 
-/**
- * @function cvcloud_load
- * @brief load bunny.ply
- */
-Mat cvcloud_load()
-{
-    Mat cloud(1, 20000, CV_32FC3);
-    ifstream ifs("d:/cloud_dragon.ply");
-
-    string str;
-    for(size_t i = 0; i < 12; ++i)
-        getline(ifs, str);
-
-    Point3f* data = cloud.ptr<cv::Point3f>();
-    //float dummy1, dummy2;
-    for(size_t i = 0; i < 20000; ++i)
-        ifs >> data[i].x >> data[i].y >> data[i].z;// >> dummy1 >> dummy2;
-
-    //cloud *= 5.0f;
-    return cloud;
-}
-
 /**
  * @function main
  */
@@ -37,29 +15,29 @@ void tutorial3(bool camera_pov)
     myWindow.showWidget("Coordinate Widget", viz::WCoordinateSystem());
 
     /// Let's assume camera has the following properties
-    Point3f cam_pos(3.0f,3.0f,3.0f), cam_focal_point(3.0f,3.0f,2.0f), cam_y_dir(-1.0f,0.0f,0.0f);
+    Point3d cam_pos(3.0, 3.0, 3.0), cam_focal_point(3.0, 3.0, 2.0), cam_y_dir(-1.0, 0.0, 0.0);
 
     /// We can get the pose of the cam using makeCameraPose
-    Affine3f cam_pose = viz::makeCameraPose(cam_pos, cam_focal_point, cam_y_dir);
+    Affine3d cam_pose = viz::makeCameraPose(cam_pos, cam_focal_point, cam_y_dir);
 
     /// We can get the transformation matrix from camera coordinate system to global using
     /// - makeTransformToGlobal. We need the axes of the camera
-    Affine3f transform = viz::makeTransformToGlobal(Vec3f(0.0f,-1.0f,0.0f), Vec3f(-1.0f,0.0f,0.0f), Vec3f(0.0f,0.0f,-1.0f), cam_pos);
+    Affine3d transform = viz::makeTransformToGlobal(Vec3d(0.0, -1.0, 0.0), Vec3d(-1.0, 0.0, 0.0), Vec3d(0.0, 0.0, -1.0), cam_pos);
 
     /// Create a cloud widget.
-    Mat bunny_cloud = cvcloud_load();
-    viz::WCloud cloud_widget(bunny_cloud, viz::Color::green());
+    Mat dragon_cloud = viz::readCloud(get_dragon_ply_file_path());
+    viz::WCloud cloud_widget(dragon_cloud, viz::Color::green());
 
     /// Pose of the widget in camera frame
-    Affine3f cloud_pose = Affine3f().translate(Vec3f(0.0f,0.0f,3.0f));
+    Affine3d cloud_pose = Affine3d().translate(Vec3d(0.0, 0.0, 3.0));
     /// Pose of the widget in global frame
-    Affine3f cloud_pose_global = transform * cloud_pose;
+    Affine3d cloud_pose_global = transform * cloud_pose;
 
     /// Visualize camera frame
     if (!camera_pov)
     {
         viz::WCameraPosition cpw(0.5); // Coordinate axes
-        viz::WCameraPosition cpw_frustum(Vec2f(0.889484, 0.523599)); // Camera frustum
+        viz::WCameraPosition cpw_frustum(Vec2f(0.889484f, 0.523599f)); // Camera frustum
         myWindow.showWidget("CPW", cpw, cam_pose);
         myWindow.showWidget("CPW_FRUSTUM", cpw_frustum, cam_pose);
     }
@@ -75,12 +53,12 @@ void tutorial3(bool camera_pov)
     myWindow.spin();
 }
 
-TEST(Viz_viz3d, DISABLED_tutorial3_global_view)
+TEST(Viz, DISABLED_tutorial3_global_view)
 {
     tutorial3(false);
 }
 
-TEST(Viz_viz3d, DISABLED_tutorial3_camera_view)
+TEST(Viz, DISABLED_tutorial3_camera_view)
 {
     tutorial3(true);
 }
diff --git a/modules/viz/test/test_viz3d.cpp b/modules/viz/test/test_viz3d.cpp
index 98811165a..45d3cdc3c 100644
--- a/modules/viz/test/test_viz3d.cpp
+++ b/modules/viz/test/test_viz3d.cpp
@@ -41,141 +41,24 @@
  //M*/
 #include "test_precomp.hpp"
 
-
 using namespace cv;
 
-static cv::Mat cvcloud_load()
+TEST(Viz_viz3d, DISABLED_develop)
 {
-    cv::Mat cloud(1, 20000, CV_32FC3);
-        std::ifstream ifs("/Users/nerei/cloud_dragon.ply");
+    cv::Mat cloud = cv::viz::readCloud(get_dragon_ply_file_path());
 
-    std::string str;
-    for(size_t i = 0; i < 11; ++i)
-        std::getline(ifs, str);
-
-    cv::Point3f* data = cloud.ptr<cv::Point3f>();
-    for(size_t i = 0; i < 20000; ++i)
-        ifs >> data[i].x >> data[i].y >> data[i].z;
-
-    return cloud;
-}
-
-bool constant_cam = true;
-cv::viz::Widget cam_1, cam_coordinates;
-
-void keyboard_callback(const viz::KeyboardEvent & event, void * cookie)
-{
-    if (event.keyDown())
-    {
-        if (event.getKeySym() == "space")
-        {
-            viz::Viz3d &viz = *((viz::Viz3d *) cookie);
-            constant_cam = !constant_cam;
-            if (constant_cam)
-            {
-                viz.showWidget("cam_1", cam_1);
-                viz.showWidget("cam_coordinate", cam_coordinates);
-                viz.showWidget("cam_text", viz::WText("Global View", Point2i(5,5), 28));
-                viz.resetCamera();
-            }
-            else
-            {
-                viz.showWidget("cam_text", viz::WText("Cam View", Point2i(5,5), 28));
-                viz.removeWidget("cam_1");
-                viz.removeWidget("cam_coordinate");
-            }
-        }
-    }
-}
-
-TEST(Viz_viz3d, develop)
-{
     cv::viz::Viz3d viz("abc");
+    viz.setBackgroundMeshLab();
+    viz.showWidget("coo", cv::viz::WCoordinateSystem(1));
+    viz.showWidget("cloud", cv::viz::WPaintedCloud(cloud));
 
-    cv::viz::Mesh3d bunny_mesh = cv::viz::Mesh3d::loadMesh("bunny.ply");
-    cv::viz::WMesh bunny_widget(bunny_mesh);
-    bunny_widget.setColor(cv::viz::Color::cyan());
+    //---->>>>> <to_test_in_future>
+    //std::vector<cv::Affine3d> gt, es;
+    //cv::viz::readTrajectory(gt, "d:/Datasets/trajs/gt%05d.xml");
+    //cv::viz::readTrajectory(es, "d:/Datasets/trajs/es%05d.xml");
+    //cv::Mat cloud = cv::viz::readCloud(get_dragon_ply_file_path());
+    //---->>>>> </to_test_in_future>
 
-    cam_1 = cv::viz::WCameraPosition(cv::Vec2f(0.6f, 0.4f), 0.2, cv::viz::Color::green());
-    cam_coordinates = cv::viz::WCameraPosition(0.2);
 
-    viz.showWidget("bunny", bunny_widget);
-    viz.showWidget("cam_1", cam_1, viz::makeCameraPose(Point3f(1.f,0.f,0.f), Point3f(0.f,0.f,0.f), Point3f(0.f,1.f,0.f)));
-    viz.showWidget("cam_coordinate", cam_coordinates, viz::makeCameraPose(Point3f(1.f,0.f,0.f), Point3f(0.f,0.f,0.f), Point3f(0.f,1.f,0.f)));
-
-    std::vector<Affine3f> cam_path;
-
-    for (int i = 0, j = 0; i <= 360; ++i, j+=5)
-    {
-        cam_path.push_back(viz::makeCameraPose(Vec3d(0.5*cos(i*CV_PI/180.0), 0.5*sin(j*CV_PI/180.0), 0.5*sin(i*CV_PI/180.0)), Vec3f(0.f, 0.f, 0.f), Vec3f(0.f, 1.f, 0.f)));
-    }
-
-    int path_counter = 0;
-    int cam_path_size = cam_path.size();
-
-    // OTHER WIDGETS
-    cv::Mat img = imread("opencv.png");
-
-    int downSample = 4;
-
-    int row_max = img.rows/downSample;
-    int col_max = img.cols/downSample;
-
-    cv::Mat *clouds = new cv::Mat[img.cols/downSample];
-    cv::Mat *colors = new cv::Mat[img.cols/downSample];
-
-    for (int col = 0; col < col_max; ++col)
-    {
-        clouds[col] = Mat::zeros(img.rows/downSample, 1, CV_32FC3);
-        colors[col] = Mat::zeros(img.rows/downSample, 1, CV_8UC3);
-        for (int row = 0; row < row_max; ++row)
-        {
-            clouds[col].at<Vec3f>(row) = Vec3f(downSample * float(col) / img.cols, 1.f-(downSample * float(row) / img.rows), 0.f);
-            colors[col].at<Vec3b>(row) = img.at<Vec3b>(row*downSample,col*downSample);
-        }
-    }
-
-    for (int col = 0; col < col_max; ++col)
-    {
-        std::stringstream strstrm;
-        strstrm << "cloud_" << col;
-        viz.showWidget(strstrm.str(), viz::WCloud(clouds[col], colors[col]));
-        viz.getWidget(strstrm.str()).setRenderingProperty(viz::POINT_SIZE, 3.0);
-        viz.getWidget(strstrm.str()).setRenderingProperty(viz::OPACITY, 0.45);
-    }
-
-    viz.showWidget("trajectory", viz::WTrajectory(cam_path, viz::WTrajectory::DISPLAY_PATH, viz::Color::yellow()));
-    viz.showWidget("cam_text", viz::WText("Global View", Point2i(5,5), 28));
-    viz.registerKeyboardCallback(keyboard_callback, (void *) &viz);
-
-    int angle = 0;
-
-    while(!viz.wasStopped())
-    {
-        if (path_counter == cam_path_size)
-        {
-            path_counter = 0;
-        }
-
-        if (!constant_cam)
-        {
-            viz.setViewerPose(cam_path[path_counter]);
-        }
-
-        if (angle == 360) angle = 0;
-
-        cam_1.cast<viz::WCameraPosition>().setPose(cam_path[path_counter]);
-        cam_coordinates.cast<viz::WCameraPosition>().setPose(cam_path[path_counter++]);
-
-        for (int i = 0; i < col_max; ++i)
-        {
-            std::stringstream strstrm;
-            strstrm << "cloud_" << i;
-            viz.setWidgetPose(strstrm.str(), Affine3f().translate(Vec3f(-0.5f, 0.f, (float)(-0.7 + 0.2*sin((angle+i*10)*CV_PI / 180.0)))));
-        }
-        angle += 10;
-        viz.spinOnce(42, true);
-    }
-
-    volatile void* a = (void*)&cvcloud_load; (void)a; //fixing warnings
+    viz.spin();
 }
diff --git a/modules/viz/test/tests_simple.cpp b/modules/viz/test/tests_simple.cpp
new file mode 100644
index 000000000..aae468ed9
--- /dev/null
+++ b/modules/viz/test/tests_simple.cpp
@@ -0,0 +1,407 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+ //
+ //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+ //
+ //  By downloading, copying, installing or using the software you agree to this license.
+ //  If you do not agree to this license, do not download, install,
+ //  copy or use the software.
+ //
+ //
+ //                           License Agreement
+ //                For Open Source Computer Vision Library
+ //
+ // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+ // Copyright (C) 2008-2013, Willow Garage Inc., all rights reserved.
+ // Third party copyrights are property of their respective owners.
+ //
+ // Redistribution and use in source and binary forms, with or without modification,
+ // are permitted provided that the following conditions are met:
+ //
+ //   * Redistribution's of source code must retain the above copyright notice,
+ //     this list of conditions and the following disclaimer.
+ //
+ //   * Redistribution's in binary form must reproduce the above copyright notice,
+ //     this list of conditions and the following disclaimer in the documentation
+ //     and / or other materials provided with the distribution.
+ //
+ //   * The name of the copyright holders may not be used to endorse or promote products
+ //     derived from this software without specific prior written permission.
+ //
+ // This software is provided by the copyright holders and contributors "as is" and
+ // any express or implied warranties, including, but not limited to, the implied
+ // warranties of merchantability and fitness for a particular purpose are disclaimed.
+ // In no event shall the Intel Corporation or contributors be liable for any direct,
+ // indirect, incidental, special, exemplary, or consequential damages
+ // (including, but not limited to, procurement of substitute goods or services;
+ // loss of use, data, or profits; or business interruption) however caused
+ // and on any theory of liability, whether in contract, strict liability,
+ // or tort (including negligence or otherwise) arising in any way out of
+ // the use of this software, even if advised of the possibility of such damage.
+ //
+ //M*/
+
+#include "test_precomp.hpp"
+
+using namespace cv;
+using namespace cv::viz;
+
+TEST(Viz, show_cloud_bluberry)
+{
+    Mat dragon_cloud = readCloud(get_dragon_ply_file_path());
+
+    Affine3d pose = Affine3d().rotate(Vec3d(0, 0.8, 0));
+
+    Viz3d viz("show_cloud_bluberry");
+    viz.showWidget("coosys", WCoordinateSystem());
+    viz.showWidget("dragon", WCloud(dragon_cloud, Color::bluberry()), pose);
+
+    viz.showWidget("text2d", WText("Bluberry cloud", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_cloud_random_color)
+{
+    Mat dragon_cloud = readCloud(get_dragon_ply_file_path());
+
+    Mat colors(dragon_cloud.size(), CV_8UC3);
+    theRNG().fill(colors, RNG::UNIFORM, 0, 255);
+
+    Affine3d pose = Affine3d().rotate(Vec3d(0, 0.8, 0));
+
+    Viz3d viz("show_cloud_random_color");
+    viz.setBackgroundMeshLab();
+    viz.showWidget("coosys", WCoordinateSystem());
+    viz.showWidget("dragon", WCloud(dragon_cloud, colors), pose);
+    viz.showWidget("text2d", WText("Random color cloud", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_cloud_masked)
+{
+    Mat dragon_cloud = readCloud(get_dragon_ply_file_path());
+
+    Vec3f qnan = Vec3f::all(std::numeric_limits<float>::quiet_NaN());
+    for(size_t i = 0; i < dragon_cloud.total(); ++i)
+        if (i % 15 != 0)
+            dragon_cloud.at<Vec3f>(i) = qnan;
+
+    Affine3d pose = Affine3d().rotate(Vec3d(0, 0.8, 0));
+
+    Viz3d viz("show_cloud_masked");
+    viz.showWidget("coosys", WCoordinateSystem());
+    viz.showWidget("dragon", WCloud(dragon_cloud), pose);
+    viz.showWidget("text2d", WText("Nan masked cloud", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_cloud_collection)
+{
+    Mat cloud = readCloud(get_dragon_ply_file_path());
+
+    WCloudCollection ccol;
+    ccol.addCloud(cloud, Color::white(), Affine3d().translate(Vec3d(0, 0, 0)).rotate(Vec3d(CV_PI/2, 0, 0)));
+    ccol.addCloud(cloud, Color::blue(),  Affine3d().translate(Vec3d(1, 0, 0)));
+    ccol.addCloud(cloud, Color::red(),   Affine3d().translate(Vec3d(2, 0, 0)));
+
+    Viz3d viz("show_cloud_collection");
+    viz.setBackgroundColor(Color::mlab());
+    viz.showWidget("coosys", WCoordinateSystem());
+    viz.showWidget("ccol", ccol);
+    viz.showWidget("text2d", WText("Cloud collection", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_painted_clouds)
+{
+    Mat cloud = readCloud(get_dragon_ply_file_path());
+
+    Viz3d viz("show_painted_clouds");
+    viz.setBackgroundMeshLab();
+    viz.showWidget("coosys", WCoordinateSystem());
+    viz.showWidget("cloud1", WPaintedCloud(cloud), Affine3d(Vec3d(0.0, -CV_PI/2, 0.0), Vec3d(-1.5, 0.0, 0.0)));
+    viz.showWidget("cloud2", WPaintedCloud(cloud, Vec3d(0.0, -0.75, -1.0), Vec3d(0.0, 0.75, 0.0)), Affine3d(Vec3d(0.0, CV_PI/2, 0.0), Vec3d(1.5, 0.0, 0.0)));
+    viz.showWidget("cloud3", WPaintedCloud(cloud, Vec3d(0.0, 0.0, -1.0), Vec3d(0.0, 0.0, 1.0), Color::blue(), Color::red()));
+    viz.showWidget("arrow", WArrow(Vec3d(0.0, 1.0, -1.0), Vec3d(0.0, 1.0, 1.0), 0.009, Color::raspberry()));
+    viz.showWidget("text2d", WText("Painted clouds", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_mesh)
+{
+    Mesh mesh = Mesh::load(get_dragon_ply_file_path());
+
+    Affine3d pose = Affine3d().rotate(Vec3d(0, 0.8, 0));
+
+    Viz3d viz("show_mesh");
+    viz.showWidget("coosys", WCoordinateSystem());
+    viz.showWidget("mesh", WMesh(mesh), pose);
+    viz.showWidget("text2d", WText("Just mesh", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_mesh_random_colors)
+{
+    Mesh mesh = Mesh::load(get_dragon_ply_file_path());
+    theRNG().fill(mesh.colors, RNG::UNIFORM, 0, 255);
+
+    Affine3d pose = Affine3d().rotate(Vec3d(0, 0.8, 0));
+
+    Viz3d viz("show_mesh_random_color");
+    viz.showWidget("coosys", WCoordinateSystem());
+    viz.showWidget("mesh", WMesh(mesh), pose);
+    viz.setRenderingProperty("mesh", SHADING, SHADING_PHONG);
+    viz.showWidget("text2d", WText("Random color mesh", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_textured_mesh)
+{
+    Mat lena = imread(Path::combine(cvtest::TS::ptr()->get_data_path(), "lena.png"));
+
+    std::vector<Vec3d> points;
+    std::vector<Vec2d> tcoords;
+    std::vector<int> polygons;
+    for(size_t i = 0; i < 64; ++i)
+    {
+        double angle = CV_PI/2 * i/64.0;
+        points.push_back(Vec3d(0.00, cos(angle), sin(angle))*0.75);
+        points.push_back(Vec3d(1.57, cos(angle), sin(angle))*0.75);
+        tcoords.push_back(Vec2d(0.0, i/64.0));
+        tcoords.push_back(Vec2d(1.0, i/64.0));
+    }
+
+    for(size_t i = 0; i < points.size()/2-1; ++i)
+    {
+        int polys[] = {3, 2*i, 2*i+1, 2*i+2, 3, 2*i+1, 2*i+2, 2*i+3};
+        polygons.insert(polygons.end(), polys, polys + sizeof(polys)/sizeof(polys[0]));
+    }
+
+    cv::viz::Mesh mesh;
+    mesh.cloud = Mat(points, true).reshape(3, 1);
+    mesh.tcoords = Mat(tcoords, true).reshape(2, 1);
+    mesh.polygons = Mat(polygons, true).reshape(1, 1);
+    mesh.texture = lena;
+
+    Viz3d viz("show_textured_mesh");
+    viz.setBackgroundMeshLab();
+    viz.showWidget("coosys", WCoordinateSystem());
+    viz.showWidget("mesh", WMesh(mesh));
+    viz.setRenderingProperty("mesh", SHADING, SHADING_PHONG);
+    viz.showWidget("text2d", WText("Textured mesh", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_polyline)
+{
+    Mat polyline(1, 32, CV_64FC3);
+    for(size_t i = 0; i < polyline.total(); ++i)
+        polyline.at<Vec3d>(i) = Vec3d(i/16.0, cos(i * CV_PI/6), sin(i * CV_PI/6));
+
+    Viz3d viz("show_polyline");
+    viz.showWidget("polyline", WPolyLine(Mat(polyline), Color::apricot()));
+    viz.showWidget("coosys", WCoordinateSystem());
+    viz.showWidget("text2d", WText("Polyline", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_sampled_normals)
+{
+    Mesh mesh = Mesh::load(get_dragon_ply_file_path());
+    computeNormals(mesh, mesh.normals);
+
+    Affine3d pose = Affine3d().rotate(Vec3d(0, 0.8, 0));
+
+    Viz3d viz("show_sampled_normals");
+    viz.showWidget("mesh", WMesh(mesh), pose);
+    viz.showWidget("normals", WCloudNormals(mesh.cloud, mesh.normals, 30, 0.1f, Color::green()), pose);
+    viz.setRenderingProperty("normals", LINE_WIDTH, 2.0);
+    viz.showWidget("text2d", WText("Cloud or mesh normals", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_trajectories)
+{
+    std::vector<Affine3d> path = generate_test_trajectory<double>(), sub0, sub1, sub2, sub3, sub4, sub5;
+
+    Mat(path).rowRange(0, path.size()/10+1).copyTo(sub0);
+    Mat(path).rowRange(path.size()/10, path.size()/5+1).copyTo(sub1);
+    Mat(path).rowRange(path.size()/5, 11*path.size()/12).copyTo(sub2);
+    Mat(path).rowRange(11*path.size()/12, path.size()).copyTo(sub3);
+    Mat(path).rowRange(3*path.size()/4, 33*path.size()/40).copyTo(sub4);
+    Mat(path).rowRange(33*path.size()/40, 9*path.size()/10).copyTo(sub5);
+    Matx33d K(1024.0, 0.0, 320.0, 0.0, 1024.0, 240.0, 0.0, 0.0, 1.0);
+
+    Viz3d viz("show_trajectories");
+    viz.showWidget("coos", WCoordinateSystem());
+    viz.showWidget("sub0", WTrajectorySpheres(sub0, 0.25, 0.07));
+    viz.showWidget("sub1", WTrajectory(sub1, WTrajectory::PATH, 0.2, Color::brown()));
+    viz.showWidget("sub2", WTrajectory(sub2, WTrajectory::FRAMES, 0.2));
+    viz.showWidget("sub3", WTrajectory(sub3, WTrajectory::BOTH, 0.2, Color::green()));
+    viz.showWidget("sub4", WTrajectoryFrustums(sub4, K, 0.3, Color::yellow()));
+    viz.showWidget("sub5", WTrajectoryFrustums(sub5, Vec2d(0.78, 0.78), 0.15));
+    viz.showWidget("text2d", WText("Different kinds of supported trajectories", Point(20, 20), 20, Color::green()));
+
+    int i = 0;
+    while(!viz.wasStopped())
+    {
+        double a = --i % 360;
+        Vec3d pose(sin(a * CV_PI/180), 0.7, cos(a * CV_PI/180));
+        viz.setViewerPose(makeCameraPose(pose * 7.5, Vec3d(0.0, 0.5, 0.0), Vec3d(0.0, 0.1, 0.0)));
+        viz.spinOnce(20, true);
+    }
+    viz.resetCamera();
+    viz.spin();
+}
+
+TEST(Viz, show_trajectory_reposition)
+{
+    std::vector<Affine3f> path = generate_test_trajectory<float>();
+
+    Viz3d viz("show_trajectory_reposition_to_origin");
+    viz.showWidget("coos", WCoordinateSystem());
+    viz.showWidget("sub3", WTrajectory(Mat(path).rowRange(0, path.size()/3), WTrajectory::BOTH, 0.2, Color::brown()), path.front().inv());
+    viz.showWidget("text2d", WText("Trajectory resposition to origin", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_camera_positions)
+{
+    Matx33d K(1024.0, 0.0, 320.0, 0.0, 1024.0, 240.0, 0.0, 0.0, 1.0);
+    Mat lena = imread(Path::combine(cvtest::TS::ptr()->get_data_path(), "lena.png"));
+    Mat gray = make_gray(lena);
+
+    Affine3d poses[2];
+    for(int i = 0; i < 2; ++i)
+    {
+        Vec3d pose = 5 * Vec3d(sin(3.14 + 2.7 + i*60 * CV_PI/180), 0.4 - i*0.3, cos(3.14 + 2.7 + i*60 * CV_PI/180));
+        poses[i] = makeCameraPose(pose, Vec3d(0.0, 0.0, 0.0), Vec3d(0.0, -0.1, 0.0));
+    }
+
+    Viz3d viz("show_camera_positions");
+    viz.showWidget("sphe", WSphere(Point3d(0,0,0), 1.0, 10, Color::orange_red()));
+    viz.showWidget("coos", WCoordinateSystem(1.5));
+    viz.showWidget("pos1", WCameraPosition(0.75), poses[0]);
+    viz.showWidget("pos2", WCameraPosition(Vec2d(0.78, 0.78), lena, 2.2, Color::green()), poses[0]);
+    viz.showWidget("pos3", WCameraPosition(0.75), poses[1]);
+    viz.showWidget("pos4", WCameraPosition(K, gray, 3, Color::indigo()), poses[1]);
+    viz.showWidget("text2d", WText("Camera positions with images", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_overlay_image)
+{
+    Mat lena = imread(Path::combine(cvtest::TS::ptr()->get_data_path(), "lena.png"));
+    Mat gray = make_gray(lena);
+
+    Size2d half_lsize = Size2d(lena.size()) * 0.5;
+
+    Viz3d viz("show_overlay_image");
+    viz.setBackgroundMeshLab();
+    Size vsz = viz.getWindowSize();
+
+    viz.showWidget("coos", WCoordinateSystem());
+    viz.showWidget("cube", WCube());
+    viz.showWidget("img1", WImageOverlay(lena, Rect(Point(10, 10), half_lsize)));
+    viz.showWidget("img2", WImageOverlay(gray, Rect(Point(vsz.width-10-lena.cols/2, 10), half_lsize)));
+    viz.showWidget("img3", WImageOverlay(gray, Rect(Point(10, vsz.height-10-lena.rows/2), half_lsize)));
+    viz.showWidget("img5", WImageOverlay(lena, Rect(Point(vsz.width-10-lena.cols/2, vsz.height-10-lena.rows/2), half_lsize)));
+    viz.showWidget("text2d", WText("Overlay images", Point(20, 20), 20, Color::green()));
+
+    int i = 0;
+    while(!viz.wasStopped())
+    {
+        double a = ++i % 360;
+        Vec3d pose(sin(a * CV_PI/180), 0.7, cos(a * CV_PI/180));
+        viz.setViewerPose(makeCameraPose(pose * 3, Vec3d(0.0, 0.5, 0.0), Vec3d(0.0, 0.1, 0.0)));
+        viz.getWidget("img1").cast<WImageOverlay>().setImage(lena * pow(sin(i*10*CV_PI/180) * 0.5 + 0.5, 1.0));
+        viz.spinOnce(1, true);
+    }
+    viz.showWidget("text2d", WText("Overlay images (stopped)", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+
+TEST(Viz, show_image_method)
+{
+    Mat lena = imread(Path::combine(cvtest::TS::ptr()->get_data_path(), "lena.png"));
+
+    Viz3d viz("show_image_method");
+    viz.showImage(lena);
+    viz.spinOnce(1500, true);
+    viz.showImage(lena, lena.size());
+    viz.spinOnce(1500, true);
+
+    cv::viz::imshow("show_image_method", make_gray(lena)).spin();
+}
+
+TEST(Viz, show_image_3d)
+{
+    Mat lena = imread(Path::combine(cvtest::TS::ptr()->get_data_path(), "lena.png"));
+    Mat gray = make_gray(lena);
+
+    Viz3d viz("show_image_3d");
+    viz.setBackgroundMeshLab();
+    viz.showWidget("coos", WCoordinateSystem());
+    viz.showWidget("cube", WCube());
+    viz.showWidget("arr0", WArrow(Vec3d(0.5, 0.0, 0.0), Vec3d(1.5, 0.0, 0.0), 0.009, Color::raspberry()));
+    viz.showWidget("img0", WImage3D(lena, Size2d(1.0, 1.0)), Affine3d(Vec3d(0.0, CV_PI/2, 0.0), Vec3d(.5, 0.0, 0.0)));
+    viz.showWidget("arr1", WArrow(Vec3d(-0.5, -0.5, 0.0), Vec3d(0.2, 0.2, 0.0), 0.009, Color::raspberry()));
+    viz.showWidget("img1", WImage3D(gray, Size2d(1.0, 1.0), Vec3d(-0.5, -0.5, 0.0), Vec3d(1.0, 1.0, 0.0), Vec3d(0.0, 1.0, 0.0)));
+
+    viz.showWidget("arr3", WArrow(Vec3d::all(-0.5), Vec3d::all(0.5), 0.009, Color::raspberry()));
+
+    viz.showWidget("text2d", WText("Images in 3D", Point(20, 20), 20, Color::green()));
+
+    int i = 0;
+    while(!viz.wasStopped())
+    {
+        viz.getWidget("img0").cast<WImage3D>().setImage(lena * pow(sin(i++*7.5*CV_PI/180) * 0.5 + 0.5, 1.0));
+        viz.spinOnce(1, true);
+    }
+    viz.showWidget("text2d", WText("Images in 3D (stopped)", Point(20, 20), 20, Color::green()));
+    viz.spin();
+}
+
+TEST(Viz, show_simple_widgets)
+{
+    Viz3d viz("show_simple_widgets");
+    viz.setBackgroundMeshLab();
+
+    viz.showWidget("coos", WCoordinateSystem());
+    viz.showWidget("cube", WCube());
+    viz.showWidget("cub0", WCube(Vec3d::all(-1.0), Vec3d::all(-0.5), false, Color::indigo()));
+    viz.showWidget("arro", WArrow(Vec3d::all(-0.5), Vec3d::all(0.5), 0.009, Color::raspberry()));
+    viz.showWidget("cir1", WCircle(0.5, 0.01, Color::bluberry()));
+    viz.showWidget("cir2", WCircle(0.5, Point3d(0.5, 0.0, 0.0), Vec3d(1.0, 0.0, 0.0), 0.01, Color::apricot()));
+
+    viz.showWidget("cyl0", WCylinder(Vec3d(-0.5, 0.5, -0.5), Vec3d(0.5, 0.5, -0.5), 0.125, 30, Color::brown()));
+    viz.showWidget("con0", WCone(0.25, 0.125, 6, Color::azure()));
+    viz.showWidget("con1", WCone(0.125, Point3d(0.5, -0.5, 0.5), Point3d(0.5, -1.0, 0.5), 6, Color::turquoise()));
+
+    viz.showWidget("text2d", WText("Different simple widgets", Point(20, 20), 20, Color::green()));
+    viz.showWidget("text3d", WText3D("Simple 3D text", Point3d( 0.5,  0.5, 0.5), 0.125, false, Color::green()));
+
+    viz.showWidget("plane1", WPlane(Size2d(0.25, 0.75)));
+    viz.showWidget("plane2", WPlane(Vec3d(0.5, -0.5, -0.5), Vec3d(0.0, 1.0, 1.0), Vec3d(1.0, 1.0, 0.0), Size2d(1.0, 0.5), Color::gold()));
+
+    viz.showWidget("grid1", WGrid(Vec2i(7,7), Vec2d::all(0.75), Color::gray()), Affine3d().translate(Vec3d(0.0, 0.0, -1.0)));
+
+    viz.spin();
+    viz.getWidget("text2d").cast<WText>().setText("Different simple widgets (updated)");
+    viz.getWidget("text3d").cast<WText3D>().setText("Updated text 3D");
+    viz.spin();
+}
+
+TEST(Viz, show_follower)
+{
+    Viz3d viz("show_follower");
+
+    viz.showWidget("coos", WCoordinateSystem());
+    viz.showWidget("cube", WCube());
+    viz.showWidget("t3d_2", WText3D("Simple 3D follower", Point3d(-0.5, -0.5, 0.5), 0.125, true,  Color::green()));
+    viz.showWidget("text2d", WText("Follower: text always facing camera", Point(20, 20), 20, Color::green()));
+    viz.setBackgroundMeshLab();
+    viz.spin();
+    viz.getWidget("t3d_2").cast<WText3D>().setText("Updated follower 3D");
+    viz.spin();
+}
diff --git a/platforms/android/android.toolchain.cmake b/platforms/android/android.toolchain.cmake
index 68b256fbd..457164a1e 100644
--- a/platforms/android/android.toolchain.cmake
+++ b/platforms/android/android.toolchain.cmake
@@ -1,5 +1,5 @@
 # Copyright (c) 2010-2011, Ethan Rublee
-# Copyright (c) 2011-2013, Andrey Kamaev
+# Copyright (c) 2011-2014, Andrey Kamaev
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -12,9 +12,9 @@
 #     this list of conditions and the following disclaimer in the documentation
 #     and/or other materials provided with the distribution.
 #
-# 3.  The name of the copyright holders may be used to endorse or promote
-#     products derived from this software without specific prior written
-#     permission.
+# 3.  Neither the name of the copyright holder nor the names of its
+#     contributors may be used to endorse or promote products derived from this
+#     software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -29,12 +29,12 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 # ------------------------------------------------------------------------------
-#  Android CMake toolchain file, for use with the Android NDK r5-r8
+#  Android CMake toolchain file, for use with the Android NDK r5-r9
 #  Requires cmake 2.6.3 or newer (2.8.5 or newer is recommended).
 #  See home page: https://github.com/taka-no-me/android-cmake
 #
 #  The file is mantained by the OpenCV project. The latest version can be get at
-#  https://github.com/Itseez/opencv/tree/master/platforms/android/android.toolchain.cmake
+#  http://code.opencv.org/projects/opencv/repository/revisions/master/changes/android/android.toolchain.cmake
 #
 #  Usage Linux:
 #   $ export ANDROID_NDK=/absolute/path/to/the/android-ndk
@@ -87,8 +87,7 @@
 #        "armeabi-v6 with VFP" - tuned for ARMv6 processors having VFP.
 #        "x86" - matches to the NDK ABI with the same name.
 #            See ${ANDROID_NDK}/docs/CPU-ARCH-ABIS.html for the documentation.
-#        "mips" - matches to the NDK ABI with the same name
-#            (It is not tested on real devices by the authos of this toolchain)
+#        "mips" - matches to the NDK ABI with the same name.
 #            See ${ANDROID_NDK}/docs/CPU-ARCH-ABIS.html for the documentation.
 #
 #    ANDROID_NATIVE_API_LEVEL=android-8 - level of Android API compile for.
@@ -292,6 +291,16 @@
 #   - April 2013
 #     [+] support non-release NDK layouts (from Linaro git and Android git)
 #     [~] automatically detect if explicit link to crtbegin_*.o is needed
+#   - June 2013
+#     [~] fixed stl include path for standalone toolchain made by NDK >= r8c
+#   - July 2013
+#     [+] updated for NDK r9
+#   - November 2013
+#     [+] updated for NDK r9b
+#   - December 2013
+#     [+] updated for NDK r9c
+#   - January 2014
+#     [~] fix copying of shared STL
 # ------------------------------------------------------------------------------
 
 cmake_minimum_required( VERSION 2.6.3 )
@@ -318,7 +327,7 @@ set( CMAKE_SYSTEM_VERSION 1 )
 # rpath makes low sence for Android
 set( CMAKE_SKIP_RPATH TRUE CACHE BOOL "If set, runtime paths are not added when using shared libraries." )
 
-set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r9b -r9 -r8e -r8d -r8c -r8b -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" )
+set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r9c -r9b -r9 -r8e -r8d -r8c -r8b -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" )
 if(NOT DEFINED ANDROID_NDK_SEARCH_PATHS)
  if( CMAKE_HOST_WIN32 )
   file( TO_CMAKE_PATH "$ENV{PROGRAMFILES}" ANDROID_NDK_SEARCH_PATHS )
@@ -464,7 +473,7 @@ endif()
 
 
 # detect current host platform
-if( NOT DEFINED ANDROID_NDK_HOST_X64 AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64")
+if( NOT DEFINED ANDROID_NDK_HOST_X64 AND (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64|x86_64|AMD64" OR CMAKE_HOST_APPLE) )
  set( ANDROID_NDK_HOST_X64 1 CACHE BOOL "Try to use 64-bit compiler toolchain" )
  mark_as_advanced( ANDROID_NDK_HOST_X64 )
 endif()
@@ -484,9 +493,7 @@ else()
  message( FATAL_ERROR "Cross-compilation on your platform is not supported by this cmake toolchain" )
 endif()
 
-# CMAKE_HOST_SYSTEM_PROCESSOR on MacOS X always says i386 on Intel platform
-# So we do not trust ANDROID_NDK_HOST_X64 on Apple hosts
-if( NOT ANDROID_NDK_HOST_X64 AND NOT CMAKE_HOST_APPLE)
+if( NOT ANDROID_NDK_HOST_X64 )
  set( ANDROID_NDK_HOST_SYSTEM_NAME ${ANDROID_NDK_HOST_SYSTEM_NAME2} )
 endif()
 
@@ -634,30 +641,27 @@ endif()
 
 macro( __GLOB_NDK_TOOLCHAINS __availableToolchainsVar __availableToolchainsLst __toolchain_subpath )
  foreach( __toolchain ${${__availableToolchainsLst}} )
-  # Skip renderscript folder. It's not C++ toolchain
-  if (NOT ${__toolchain} STREQUAL "renderscript")
-   if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${__toolchain}${__toolchain_subpath}" )
-     string( REGEX REPLACE "-clang3[.][0-9]$" "-4.6" __gcc_toolchain "${__toolchain}" )
-   else()
-     set( __gcc_toolchain "${__toolchain}" )
-   endif()
-   __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK_TOOLCHAINS_PATH}/${__gcc_toolchain}${__toolchain_subpath}" )
-   if( __machine )
-    string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9x]+)?$" __version "${__gcc_toolchain}" )
-    if( __machine MATCHES i686 )
-     set( __arch "x86" )
-    elseif( __machine MATCHES arm )
-     set( __arch "arm" )
-    elseif( __machine MATCHES mipsel )
-     set( __arch "mipsel" )
-    endif()
-    list( APPEND __availableToolchainMachines "${__machine}" )
-    list( APPEND __availableToolchainArchs "${__arch}" )
-    list( APPEND __availableToolchainCompilerVersions "${__version}" )
-    list( APPEND ${__availableToolchainsVar} "${__toolchain}" )
-   endif()
-   unset( __gcc_toolchain )
+  if( "${__toolchain}" MATCHES "-clang3[.][0-9]$" AND NOT EXISTS "${ANDROID_NDK_TOOLCHAINS_PATH}/${__toolchain}${__toolchain_subpath}" )
+   string( REGEX REPLACE "-clang3[.][0-9]$" "-4.6" __gcc_toolchain "${__toolchain}" )
+  else()
+   set( __gcc_toolchain "${__toolchain}" )
   endif()
+  __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK_TOOLCHAINS_PATH}/${__gcc_toolchain}${__toolchain_subpath}" )
+  if( __machine )
+   string( REGEX MATCH "[0-9]+[.][0-9]+([.][0-9x]+)?$" __version "${__gcc_toolchain}" )
+   if( __machine MATCHES i686 )
+    set( __arch "x86" )
+   elseif( __machine MATCHES arm )
+    set( __arch "arm" )
+   elseif( __machine MATCHES mipsel )
+    set( __arch "mipsel" )
+   endif()
+   list( APPEND __availableToolchainMachines "${__machine}" )
+   list( APPEND __availableToolchainArchs "${__arch}" )
+   list( APPEND __availableToolchainCompilerVersions "${__version}" )
+   list( APPEND ${__availableToolchainsVar} "${__toolchain}" )
+  endif()
+  unset( __gcc_toolchain )
  endforeach()
 endmacro()
 
@@ -687,6 +691,7 @@ if( BUILD_WITH_ANDROID_NDK )
   endif()
   __LIST_FILTER( __availableToolchainsLst "^[.]" )
   __LIST_FILTER( __availableToolchainsLst "llvm" )
+  __LIST_FILTER( __availableToolchainsLst "renderscript" )
   __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH}" )
   if( NOT __availableToolchains AND NOT ANDROID_NDK_TOOLCHAINS_SUBPATH STREQUAL ANDROID_NDK_TOOLCHAINS_SUBPATH2 )
    __GLOB_NDK_TOOLCHAINS( __availableToolchains __availableToolchainsLst "${ANDROID_NDK_TOOLCHAINS_SUBPATH2}" )
@@ -975,7 +980,11 @@ if( BUILD_WITH_STANDALONE_TOOLCHAIN )
  set( ANDROID_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot" )
 
  if( NOT ANDROID_STL STREQUAL "none" )
-  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/include/c++/${ANDROID_COMPILER_VERSION}" )
+  set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_STANDALONE_TOOLCHAIN}/include/c++/${ANDROID_COMPILER_VERSION}" )
+  if( NOT EXISTS "${ANDROID_STL_INCLUDE_DIRS}" )
+   # old location ( pre r8c )
+   set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_STANDALONE_TOOLCHAIN}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/include/c++/${ANDROID_COMPILER_VERSION}" )
+  endif()
   if( ARMEABI_V7A AND EXISTS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/${CMAKE_SYSTEM_PROCESSOR}/bits" )
    list( APPEND ANDROID_STL_INCLUDE_DIRS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/${CMAKE_SYSTEM_PROCESSOR}" )
   elseif( ARMEABI AND NOT ANDROID_FORCE_ARM_BUILD AND EXISTS "${ANDROID_STL_INCLUDE_DIRS}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/thumb/bits" )
@@ -1130,15 +1139,7 @@ endif()
 # case of shared STL linkage
 if( ANDROID_STL MATCHES "shared" AND DEFINED __libstl )
  string( REPLACE "_static.a" "_shared.so" __libstl "${__libstl}" )
- if( NOT _CMAKE_IN_TRY_COMPILE AND __libstl MATCHES "[.]so$" )
-  get_filename_component( __libstlname "${__libstl}" NAME )
-  execute_process( COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${__libstl}" "${LIBRARY_OUTPUT_PATH}/${__libstlname}" RESULT_VARIABLE __fileCopyProcess )
-  if( NOT __fileCopyProcess EQUAL 0 OR NOT EXISTS "${LIBRARY_OUTPUT_PATH}/${__libstlname}")
-   message( SEND_ERROR "Failed copying of ${__libstl} to the ${LIBRARY_OUTPUT_PATH}/${__libstlname}" )
-  endif()
-  unset( __fileCopyProcess )
-  unset( __libstlname )
- endif()
+ # TODO: check if .so file exists before the renaming
 endif()
 
 
@@ -1503,7 +1504,8 @@ endif()
 
 # global includes and link directories
 include_directories( SYSTEM "${ANDROID_SYSROOT}/usr/include" ${ANDROID_STL_INCLUDE_DIRS} )
-link_directories( "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )
+get_filename_component(__android_install_path "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" ABSOLUTE) # avoid CMP0015 policy warning
+link_directories( "${__android_install_path}" )
 
 # detect if need link crtbegin_so.o explicitly
 if( NOT DEFINED ANDROID_EXPLICIT_CRT_LINK )
@@ -1555,6 +1557,18 @@ if(NOT _CMAKE_IN_TRY_COMPILE)
  set( LIBRARY_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/libs/${ANDROID_NDK_ABI_NAME}" CACHE PATH "path for android libs" )
 endif()
 
+# copy shaed stl library to build directory
+if( NOT _CMAKE_IN_TRY_COMPILE AND __libstl MATCHES "[.]so$" )
+ get_filename_component( __libstlname "${__libstl}" NAME )
+ execute_process( COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${__libstl}" "${LIBRARY_OUTPUT_PATH}/${__libstlname}" RESULT_VARIABLE __fileCopyProcess )
+ if( NOT __fileCopyProcess EQUAL 0 OR NOT EXISTS "${LIBRARY_OUTPUT_PATH}/${__libstlname}")
+  message( SEND_ERROR "Failed copying of ${__libstl} to the ${LIBRARY_OUTPUT_PATH}/${__libstlname}" )
+ endif()
+ unset( __fileCopyProcess )
+ unset( __libstlname )
+endif()
+
+
 # set these global flags for cmake client scripts to change behavior
 set( ANDROID True )
 set( BUILD_ANDROID True )
@@ -1663,6 +1677,19 @@ if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
 endif()
 
 
+# force cmake to produce / instead of \ in build commands for Ninja generator
+if( CMAKE_GENERATOR MATCHES "Ninja" AND CMAKE_HOST_WIN32 )
+ # it is a bad hack after all
+ # CMake generates Ninja makefiles with UNIX paths only if it thinks that we are going to build with MinGW
+ set( CMAKE_COMPILER_IS_MINGW TRUE ) # tell CMake that we are MinGW
+ set( CMAKE_CROSSCOMPILING TRUE )    # stop recursion
+ enable_language( C )
+ enable_language( CXX )
+ # unset( CMAKE_COMPILER_IS_MINGW ) # can't unset because CMake does not convert back-slashes in response files without it
+ unset( MINGW )
+endif()
+
+
 # set some obsolete variables for backward compatibility
 set( ANDROID_SET_OBSOLETE_VARIABLES ON CACHE BOOL "Define obsolete Andrid-specific cmake variables" )
 mark_as_advanced( ANDROID_SET_OBSOLETE_VARIABLES )
@@ -1717,7 +1744,7 @@ endif()
 #   BUILD_WITH_STANDALONE_TOOLCHAIN : TRUE if standalone toolchain is used
 #   ANDROID_NDK_HOST_SYSTEM_NAME : "windows", "linux-x86" or "darwin-x86" depending on host platform
 #   ANDROID_NDK_ABI_NAME : "armeabi", "armeabi-v7a", "x86" or "mips" depending on ANDROID_ABI
-#   ANDROID_NDK_RELEASE : one of r5, r5b, r5c, r6, r6b, r7, r7b, r7c, r8, r8b, r8c, r8d, r8e; set only for NDK
+#   ANDROID_NDK_RELEASE : one of r5, r5b, r5c, r6, r6b, r7, r7b, r7c, r8, r8b, r8c, r8d, r8e, r9, r9b, r9c; set only for NDK
 #   ANDROID_ARCH_NAME : "arm" or "x86" or "mips" depending on ANDROID_ABI
 #   ANDROID_SYSROOT : path to the compiler sysroot
 #   TOOL_OS_SUFFIX : "" or ".exe" depending on host platform
diff --git a/platforms/android/libinfo/CMakeLists.txt b/platforms/android/libinfo/CMakeLists.txt
index 028413ec6..55dd27859 100644
--- a/platforms/android/libinfo/CMakeLists.txt
+++ b/platforms/android/libinfo/CMakeLists.txt
@@ -36,4 +36,4 @@ set_target_properties(${the_module} PROPERTIES
   )
 
 get_filename_component(lib_name "libopencv_info.so" NAME)
-install(FILES "${LIBRARY_OUTPUT_PATH}/${lib_name}" DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main)
+install(FILES "${LIBRARY_OUTPUT_PATH}/${lib_name}" DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT libs)
diff --git a/platforms/android/package/CMakeLists.txt b/platforms/android/package/CMakeLists.txt
index 1382a078c..b48a55a6a 100644
--- a/platforms/android/package/CMakeLists.txt
+++ b/platforms/android/package/CMakeLists.txt
@@ -89,6 +89,6 @@ add_custom_command(
          DEPENDS "${OpenCV_BINARY_DIR}/bin/classes.jar.dephelper" "${PACKAGE_DIR}/res/values/strings.xml" "${PACKAGE_DIR}/res/drawable/icon.png" ${camera_wrappers} opencv_java
        )
 
-install(FILES "${APK_NAME}" DESTINATION "apk/" COMPONENT main)
+install(FILES "${APK_NAME}" DESTINATION "apk/" COMPONENT libs)
 add_custom_target(android_package ALL SOURCES "${APK_NAME}" )
 add_dependencies(android_package opencv_java)
diff --git a/platforms/android/service/CMakeLists.txt b/platforms/android/service/CMakeLists.txt
index dde145513..c99b71392 100644
--- a/platforms/android/service/CMakeLists.txt
+++ b/platforms/android/service/CMakeLists.txt
@@ -3,4 +3,4 @@ if(BUILD_ANDROID_SERVICE)
   #add_subdirectory(engine_test)
 endif()
 
-install(FILES "readme.txt" DESTINATION "apk/" COMPONENT main)
+install(FILES "readme.txt" DESTINATION "apk/" COMPONENT libs)
diff --git a/platforms/android/service/doc/JavaHelper.rst b/platforms/android/service/doc/JavaHelper.rst
index 5c1e1c325..05576a1b2 100644
--- a/platforms/android/service/doc/JavaHelper.rst
+++ b/platforms/android/service/doc/JavaHelper.rst
@@ -63,3 +63,7 @@ OpenCV version constants
 .. data:: OPENCV_VERSION_2_4_7
 
     OpenCV Library version 2.4.7
+
+.. data:: OPENCV_VERSION_2_4_8
+
+    OpenCV Library version 2.4.8
diff --git a/platforms/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp b/platforms/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp
index dbd192b79..359906406 100644
--- a/platforms/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp
+++ b/platforms/android/service/engine/jni/BinderComponent/OpenCVEngine.cpp
@@ -15,7 +15,7 @@ using namespace android;
 
 const int OpenCVEngine::Platform = DetectKnownPlatforms();
 const int OpenCVEngine::CpuID = GetCpuID();
-const int OpenCVEngine::KnownVersions[] = {2040000, 2040100, 2040200, 2040300, 2040301, 2040302, 2040400, 2040500, 2040600, 2040700};
+const int OpenCVEngine::KnownVersions[] = {2040000, 2040100, 2040200, 2040300, 2040301, 2040302, 2040400, 2040500, 2040600, 2040700, 2040701, 2040800};
 
 bool OpenCVEngine::ValidateVersion(int version)
 {
diff --git a/platforms/android/service/engine/jni/BinderComponent/StringUtils.cpp b/platforms/android/service/engine/jni/BinderComponent/StringUtils.cpp
index 2e6b35a7b..a404a450f 100644
--- a/platforms/android/service/engine/jni/BinderComponent/StringUtils.cpp
+++ b/platforms/android/service/engine/jni/BinderComponent/StringUtils.cpp
@@ -34,13 +34,13 @@ bool ParseString(const string& src, string& key, string& value)
     if (src.empty())
         return false;
 
-    // find seporator ":"
-    size_t seporator_pos = src.find(":");
-    if (string::npos != seporator_pos)
+    // find separator ":"
+    size_t separator_pos = src.find(":");
+    if (string::npos != separator_pos)
     {
-        key = src.substr(0, seporator_pos);
+        key = src.substr(0, separator_pos);
         StripString(key);
-        value = src.substr(seporator_pos+1);
+        value = src.substr(separator_pos+1);
         StripString(value);
         return true;
     }
@@ -50,42 +50,42 @@ bool ParseString(const string& src, string& key, string& value)
     }
 }
 
-set<string> SplitString(const string& src, const char seporator)
+set<string> SplitString(const string& src, const char separator)
 {
     set<string> result;
 
     if (!src.empty())
     {
-        size_t seporator_pos;
+        size_t separator_pos;
         size_t prev_pos = 0;
         do
         {
-            seporator_pos = src.find(seporator, prev_pos);
-            result.insert(src.substr(prev_pos, seporator_pos - prev_pos));
-            prev_pos = seporator_pos + 1;
+            separator_pos = src.find(separator, prev_pos);
+            result.insert(src.substr(prev_pos, separator_pos - prev_pos));
+            prev_pos = separator_pos + 1;
         }
-        while (string::npos != seporator_pos);
+        while (string::npos != separator_pos);
     }
 
     return result;
 }
 
-vector<string> SplitStringVector(const string& src, const char seporator)
+vector<string> SplitStringVector(const string& src, const char separator)
 {
     vector<string> result;
 
     if (!src.empty())
     {
-        size_t seporator_pos;
+        size_t separator_pos;
         size_t prev_pos = 0;
         do
         {
-            seporator_pos = src.find(seporator, prev_pos);
-            string tmp = src.substr(prev_pos, seporator_pos - prev_pos);
+            separator_pos = src.find(separator, prev_pos);
+            string tmp = src.substr(prev_pos, separator_pos - prev_pos);
             result.push_back(tmp);
-            prev_pos = seporator_pos + 1;
+            prev_pos = separator_pos + 1;
         }
-        while (string::npos != seporator_pos);
+        while (string::npos != separator_pos);
     }
 
     return result;
diff --git a/platforms/android/service/engine/jni/BinderComponent/StringUtils.h b/platforms/android/service/engine/jni/BinderComponent/StringUtils.h
index e36bfcc7c..6ef9eed4d 100644
--- a/platforms/android/service/engine/jni/BinderComponent/StringUtils.h
+++ b/platforms/android/service/engine/jni/BinderComponent/StringUtils.h
@@ -6,8 +6,8 @@
 #include <vector>
 
 bool StripString(std::string& src);
-std::set<std::string> SplitString(const std::string& src, const char seporator);
+std::set<std::string> SplitString(const std::string& src, const char separator);
 bool ParseString(const std::string& src, std::string& key, std::string& value);
-std::vector<std::string> SplitStringVector(const std::string& src, const char seporator);
+std::vector<std::string> SplitStringVector(const std::string& src, const char separator);
 
 #endif
diff --git a/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp b/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp
index 98ea82874..ca364b444 100644
--- a/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp
+++ b/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp
@@ -203,7 +203,7 @@ inline int SplitPlatform(const vector<string>& features)
 }
 
 /* Package naming convention
- * All parts of package name seporated by "_" symbol
+ * All parts of package name separated by "_" symbol
  * First part is base namespace.
  * Second part is version. Version starts from "v" symbol. After "v" symbol version nomber without dot symbol added.
  * If platform is known third part is platform name
diff --git a/platforms/android/service/engine/jni/Tests/HardwareDetectionTest.cpp b/platforms/android/service/engine/jni/Tests/HardwareDetectionTest.cpp
index 83dd9c27e..8e7dfab00 100644
--- a/platforms/android/service/engine/jni/Tests/HardwareDetectionTest.cpp
+++ b/platforms/android/service/engine/jni/Tests/HardwareDetectionTest.cpp
@@ -55,7 +55,7 @@ TEST(Parse, ParseEmptyString)
     EXPECT_FALSE(ParseString(a, key, value));
 }
 
-TEST(Parse, ParseStringWithoutSeporator)
+TEST(Parse, ParseStringWithoutSeparator)
 {
     string a = "qqqwww";
     string key;
diff --git a/platforms/android/service/engine/jni/Tests/PackageManagmentTest.cpp b/platforms/android/service/engine/jni/Tests/PackageManagmentTest.cpp
index 952af6280..14295ecbc 100644
--- a/platforms/android/service/engine/jni/Tests/PackageManagmentTest.cpp
+++ b/platforms/android/service/engine/jni/Tests/PackageManagmentTest.cpp
@@ -144,7 +144,7 @@ TEST(PackageManager, GetPackagePathForMips)
 }
 #endif
 
-// TODO: Enable tests if seporate package will be exists
+// TODO: Enable tests if separate package will be exists
 // TEST(PackageManager, GetPackagePathForTegra2)
 // {
 //     PackageManagerStub pm;
diff --git a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl b/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl
index a6cf193e3..13e0f7f84 100644
--- a/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl
+++ b/platforms/android/service/engine/src/org/opencv/engine/OpenCVEngineInterface.aidl
@@ -25,9 +25,9 @@ interface OpenCVEngineInterface
     boolean installVersion(String version);
 
     /**
-    * Return list of libraries in loading order seporated by ";" symbol
+    * Return list of libraries in loading order separated by ";" symbol
     * @param OpenCV version
-    * @return Returns OpenCV libraries names seporated by symbol ";" in loading order
+    * @return Returns OpenCV libraries names separated by symbol ";" in loading order
     */
     String getLibraryList(String version);
 }
diff --git a/platforms/android/service/readme.txt b/platforms/android/service/readme.txt
index a280b506f..65678093d 100644
--- a/platforms/android/service/readme.txt
+++ b/platforms/android/service/readme.txt
@@ -14,20 +14,20 @@ manually using adb tool:
 
 .. code-block:: sh
 
-    adb install OpenCV-2.4.7.1-android-sdk/apk/OpenCV_2.4.7.1_Manager_2.15_<platform>.apk
+    adb install OpenCV-2.4.8-android-sdk/apk/OpenCV_2.4.8_Manager_2.16_<platform>.apk
 
 Use the table below to determine proper OpenCV Manager package for your device:
 
-+------------------------------+--------------+------------------------------------------------------+
-| Hardware Platform            | Android ver. | Package name                                         |
-+==============================+==============+======================================================+
-| armeabi-v7a (ARMv7-A + NEON) |    >= 2.3    | OpenCV_2.4.7.1_Manager_2.15_armv7a-neon.apk          |
-+------------------------------+--------------+------------------------------------------------------+
-| armeabi-v7a (ARMv7-A + NEON) |     = 2.2    | OpenCV_2.4.7.1_Manager_2.15_armv7a-neon-android8.apk |
-+------------------------------+--------------+------------------------------------------------------+
-| armeabi (ARMv5, ARMv6)       |    >= 2.3    | OpenCV_2.4.7.1_Manager_2.15_armeabi.apk              |
-+------------------------------+--------------+------------------------------------------------------+
-| Intel x86                    |    >= 2.3    | OpenCV_2.4.7.1_Manager_2.15_x86.apk                  |
-+------------------------------+--------------+------------------------------------------------------+
-| MIPS                         |    >= 2.3    | OpenCV_2.4.7.1_Manager_2.15_mips.apk                 |
-+------------------------------+--------------+------------------------------------------------------+
++------------------------------+--------------+----------------------------------------------------+
+| Hardware Platform            | Android ver. | Package name                                       |
++==============================+==============+====================================================+
+| armeabi-v7a (ARMv7-A + NEON) |    >= 2.3    | OpenCV_2.4.8_Manager_2.16_armv7a-neon.apk          |
++------------------------------+--------------+----------------------------------------------------+
+| armeabi-v7a (ARMv7-A + NEON) |     = 2.2    | OpenCV_2.4.8_Manager_2.16_armv7a-neon-android8.apk |
++------------------------------+--------------+----------------------------------------------------+
+| armeabi (ARMv5, ARMv6)       |    >= 2.3    | OpenCV_2.4.8_Manager_2.16_armeabi.apk              |
++------------------------------+--------------+----------------------------------------------------+
+| Intel x86                    |    >= 2.3    | OpenCV_2.4.8_Manager_2.16_x86.apk                  |
++------------------------------+--------------+----------------------------------------------------+
+| MIPS                         |    >= 2.3    | OpenCV_2.4.8_Manager_2.16_mips.apk                 |
++------------------------------+--------------+----------------------------------------------------+
diff --git a/platforms/linux/arm-gnueabi.toolchain.cmake b/platforms/linux/arm-gnueabi.toolchain.cmake
index c6b0469ad..2c5b7406d 100644
--- a/platforms/linux/arm-gnueabi.toolchain.cmake
+++ b/platforms/linux/arm-gnueabi.toolchain.cmake
@@ -28,14 +28,11 @@ set(CMAKE_MODULE_LINKER_FLAGS "-Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,--gc-s
 set(CMAKE_EXE_LINKER_FLAGS    "-Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,--gc-sections -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now ${CMAKE_EXE_LINKER_FLAGS}")
 
 if(USE_NEON)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon")
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon")
+  message(WARNING "You use obsolete variable USE_NEON to enable NEON instruction set. Use -DENABLE_NEON=ON instead." )
+  set(ENABLE_NEON TRUE)
 elseif(USE_VFPV3)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=vfpv3")
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=vfpv3")
-else()
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=vfpv3-d16")
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=vfpv3-d16")
+  message(WARNING "You use obsolete variable USE_VFPV3 to enable VFPV3 instruction set. Use -DENABLE_VFPV3=ON instead." )
+  set(ENABLE_VFPV3 TRUE)
 endif()
 
 set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${ARM_LINUX_SYSROOT})
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 9dd3df0b6..36b68d2d1 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -13,7 +13,7 @@ if(NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_LIST_DIR)
 add_subdirectory(c)
 add_subdirectory(cpp)
 add_subdirectory(gpu)
-add_subdirectory(ocl)
+add_subdirectory(tapi)
 
 if(WIN32 AND HAVE_DIRECTX)
   add_subdirectory(directx)
@@ -23,7 +23,6 @@ if(ANDROID AND BUILD_ANDROID_EXAMPLES)
   add_subdirectory(android)
 endif()
 
-
 #
 # END OF BUILD CASE 1: Build samples with library sources
 #
@@ -63,7 +62,6 @@ endif()
 
 add_subdirectory(c)
 add_subdirectory(cpp)
-add_subdirectory(ocl)
 # FIXIT: can't use cvconfig.h in samples: add_subdirectory(gpu)
 
 if(WIN32)
@@ -73,4 +71,4 @@ endif()
 #
 # END OF BUILD CASE 2: Build samples with library binaries
 #
-endif()
\ No newline at end of file
+endif()
diff --git a/samples/android/native-activity/src/org/opencv/samples/NativeActivity/CvNativeActivity.java b/samples/android/native-activity/src/org/opencv/samples/NativeActivity/CvNativeActivity.java
index 04da9a949..b9db22de1 100644
--- a/samples/android/native-activity/src/org/opencv/samples/NativeActivity/CvNativeActivity.java
+++ b/samples/android/native-activity/src/org/opencv/samples/NativeActivity/CvNativeActivity.java
@@ -21,6 +21,7 @@ public class CvNativeActivity extends Activity {
                     System.loadLibrary("native_activity");
                     Intent intent = new Intent(CvNativeActivity.this, android.app.NativeActivity.class);
                     CvNativeActivity.this.startActivity(intent);
+                    CvNativeActivity.this.finish();
                 } break;
                 default:
                 {
@@ -34,7 +35,7 @@ public class CvNativeActivity extends Activity {
         Log.i(TAG, "Instantiated new " + this.getClass());
     }
 
-   @Override
+    @Override
     public void onResume()
     {
         super.onResume();
diff --git a/samples/c/CMakeLists.txt b/samples/c/CMakeLists.txt
index 77a42949d..260f6f60c 100644
--- a/samples/c/CMakeLists.txt
+++ b/samples/c/CMakeLists.txt
@@ -39,7 +39,7 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
         set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
       endif()
       install(TARGETS ${the_target}
-              RUNTIME DESTINATION "${OPENCV_SAMPLES_BIN_INSTALL_PATH}/c" COMPONENT main)
+              RUNTIME DESTINATION "${OPENCV_SAMPLES_BIN_INSTALL_PATH}/c" COMPONENT samples)
     endif()
   ENDMACRO()
 
@@ -51,9 +51,9 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
   endforeach()
 endif()
 
-if (INSTALL_C_EXAMPLES AND NOT WIN32)
+if(INSTALL_C_EXAMPLES AND NOT WIN32)
   file(GLOB C_SAMPLES *.c *.cpp *.jpg *.png *.data makefile.* build_all.sh *.dsp *.cmd )
   install(FILES ${C_SAMPLES}
-          DESTINATION share/OpenCV/samples/c
-          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ)
+          DESTINATION ${OPENCV_SAMPLES_SRC_INSTALL_PATH}/c
+          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ COMPONENT samples)
 endif ()
diff --git a/samples/c/adaptiveskindetector.cpp b/samples/c/adaptiveskindetector.cpp
index 21c9ffe9b..ac9057df2 100644
--- a/samples/c/adaptiveskindetector.cpp
+++ b/samples/c/adaptiveskindetector.cpp
@@ -126,12 +126,12 @@ ASDFrameHolder::ASDFrameHolder( )
 {
     image = NULL;
     timeStamp = 0;
-};
+}
 
 ASDFrameHolder::~ASDFrameHolder( )
 {
     cvReleaseImage(&image);
-};
+}
 
 void ASDFrameHolder::assignFrame(IplImage *sourceImage, double frameTime)
 {
@@ -143,22 +143,22 @@ void ASDFrameHolder::assignFrame(IplImage *sourceImage, double frameTime)
 
     image = cvCloneImage(sourceImage);
     timeStamp = frameTime;
-};
+}
 
 IplImage *ASDFrameHolder::getImage()
 {
     return image;
-};
+}
 
 double ASDFrameHolder::getTimeStamp()
 {
     return timeStamp;
-};
+}
 
 void ASDFrameHolder::setImage(IplImage *sourceImage)
 {
     image = sourceImage;
-};
+}
 
 
 //-------------------- ASDFrameSequencer -----------------------//
@@ -166,26 +166,26 @@ void ASDFrameHolder::setImage(IplImage *sourceImage)
 ASDFrameSequencer::~ASDFrameSequencer()
 {
     close();
-};
+}
 
 IplImage *ASDFrameSequencer::getNextImage()
 {
     return NULL;
-};
+}
 
 void ASDFrameSequencer::close()
 {
 
-};
+}
 
 bool ASDFrameSequencer::isOpen()
 {
     return false;
-};
+}
 
 void ASDFrameSequencer::getFrameCaption(char* /*caption*/) {
     return;
-};
+}
 
 IplImage* ASDCVFrameSequencer::getNextImage()
 {
@@ -201,7 +201,7 @@ IplImage* ASDCVFrameSequencer::getNextImage()
     {
         return NULL;
     }
-};
+}
 
 void ASDCVFrameSequencer::close()
 {
@@ -209,12 +209,12 @@ void ASDCVFrameSequencer::close()
     {
         cvReleaseCapture(&capture);
     }
-};
+}
 
 bool ASDCVFrameSequencer::isOpen()
 {
     return (capture != NULL);
-};
+}
 
 
 //-------------------- ASDFrameSequencerWebCam -----------------------//
@@ -233,7 +233,7 @@ bool ASDFrameSequencerWebCam::open(int cameraIndex)
     {
         return true;
     }
-};
+}
 
 
 //-------------------- ASDFrameSequencerVideoFile -----------------------//
@@ -251,7 +251,7 @@ bool ASDFrameSequencerVideoFile::open(const char *fileName)
     {
         return true;
     }
-};
+}
 
 
 //-------------------- ASDFrameSequencerImageFile -----------------------//
@@ -263,11 +263,11 @@ void ASDFrameSequencerImageFile::open(const char *fileNameMask, int startIndex,
     nEndIndex = endIndex;
 
     std::sprintf(sFileNameMask, "%s", fileNameMask);
-};
+}
 
 void ASDFrameSequencerImageFile::getFrameCaption(char *caption) {
     std::sprintf(caption, sFileNameMask, nCurrentIndex);
-};
+}
 
 IplImage* ASDFrameSequencerImageFile::getNextImage()
 {
@@ -283,23 +283,23 @@ IplImage* ASDFrameSequencerImageFile::getNextImage()
     IplImage* img = cvLoadImage(fileName);
 
     return img;
-};
+}
 
 void ASDFrameSequencerImageFile::close()
 {
     nCurrentIndex = nEndIndex+1;
-};
+}
 
 bool ASDFrameSequencerImageFile::isOpen()
 {
     return (nCurrentIndex <= nEndIndex);
-};
+}
 
 static void putTextWithShadow(IplImage *img, const char *str, CvPoint point, CvFont *font, CvScalar color = CV_RGB(255, 255, 128))
 {
     cvPutText(img, str, cvPoint(point.x-1,point.y-1), font, CV_RGB(0, 0, 0));
     cvPutText(img, str, point, font, color);
-};
+}
 
 #define ASD_RGB_SET_PIXEL(pointer, r, g, b) { (*pointer) = (unsigned char)b; (*(pointer+1)) = (unsigned char)g; (*(pointer+2)) = (unsigned char)r; }
 
@@ -336,7 +336,7 @@ static void displayBuffer(IplImage *rgbDestImage, IplImage *buffer, int rValue,
         destY = 0;
         destX += dx;
     }
-};
+}
 
 int main(int argc, char** argv )
 {
diff --git a/samples/cpp/3calibration.cpp b/samples/cpp/3calibration.cpp
index a6ee45d85..55644e9db 100644
--- a/samples/cpp/3calibration.cpp
+++ b/samples/cpp/3calibration.cpp
@@ -137,8 +137,8 @@ static bool run3Calibration( vector<vector<Point2f> > imagePoints1,
         double err = stereoCalibrate(objpt, imgpt, imgpt_right, cameraMatrix1, distCoeffs1,
                                      cameraMatrix, distCoeffs,
                                      imageSize, R, T, E, F,
-                                     TermCriteria(TermCriteria::COUNT, 30, 0),
-                                     CALIB_FIX_INTRINSIC);
+                                     CALIB_FIX_INTRINSIC,
+                                     TermCriteria(TermCriteria::COUNT, 30, 0));
         printf("Pair (1,%d) calibration reprojection error = %g\n", c, sqrt(err/(N*2)));
         if( c == 2 )
         {
diff --git a/samples/cpp/CMakeLists.txt b/samples/cpp/CMakeLists.txt
index 4b0bf011d..6ef95a8d3 100644
--- a/samples/cpp/CMakeLists.txt
+++ b/samples/cpp/CMakeLists.txt
@@ -5,7 +5,7 @@
 
 SET(OPENCV_CPP_SAMPLES_REQUIRED_DEPS opencv_core opencv_flann opencv_imgproc
     opencv_highgui opencv_ml opencv_video opencv_objdetect opencv_photo opencv_nonfree opencv_softcascade
-    opencv_features2d opencv_calib3d opencv_legacy opencv_contrib opencv_stitching opencv_videostab opencv_bioinspired opencv_shape)
+    opencv_features2d opencv_calib3d opencv_legacy opencv_contrib opencv_stitching opencv_videostab opencv_shape)
 
 ocv_check_dependencies(${OPENCV_CPP_SAMPLES_REQUIRED_DEPS})
 
@@ -77,7 +77,7 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
         set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
       endif()
       install(TARGETS ${the_target}
-              RUNTIME DESTINATION "${OPENCV_SAMPLES_BIN_INSTALL_PATH}/${sample_subfolder}" COMPONENT main)
+              RUNTIME DESTINATION "${OPENCV_SAMPLES_BIN_INSTALL_PATH}/${sample_subfolder}" COMPONENT samples)
     endif()
   ENDMACRO()
 
@@ -99,9 +99,9 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
   endforeach()
 endif()
 
-if (INSTALL_C_EXAMPLES AND NOT WIN32)
+if(INSTALL_C_EXAMPLES AND NOT WIN32)
   file(GLOB C_SAMPLES *.c *.cpp *.jpg *.png *.data makefile.* build_all.sh *.dsp *.cmd )
   install(FILES ${C_SAMPLES}
-          DESTINATION share/OpenCV/samples/cpp
-          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ)
+          DESTINATION ${OPENCV_SAMPLES_SRC_INSTALL_PATH}/cpp
+          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ COMPONENT samples)
 endif()
diff --git a/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp b/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp
deleted file mode 100644
index 66cae7719..000000000
--- a/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-
-//============================================================================
-// Name        : OpenEXRimages_HDR_Retina_toneMapping.cpp
-// Author      : Alexandre Benoit (benoit.alexandre.vision@gmail.com)
-// Version     : 0.1
-// Copyright   : Alexandre Benoit, LISTIC Lab, july 2011
-// Description : HighDynamicRange retina tone mapping with the help of the Gipsa/Listic's retina in C++, Ansi-style
-//============================================================================
-
-#include <iostream>
-#include <cstring>
-
-#include "opencv2/bioinspired.hpp" // retina based algorithms
-#include "opencv2/imgproc.hpp" // cvCvtcolor function
-#include "opencv2/highgui.hpp" // display
-
-static void help(std::string errorMessage)
-{
-    std::cout<<"Program init error : "<<errorMessage<<std::endl;
-    std::cout<<"\nProgram call procedure : ./OpenEXRimages_HDR_Retina_toneMapping [OpenEXR image to process]"<<std::endl;
-    std::cout<<"\t[OpenEXR image to process] : the input HDR image to process, must be an OpenEXR format, see http://www.openexr.com/ to get some samples or create your own using camera bracketing and Photoshop or equivalent software for OpenEXR image synthesis"<<std::endl;
-    std::cout<<"\nExamples:"<<std::endl;
-    std::cout<<"\t-Image processing : ./OpenEXRimages_HDR_Retina_toneMapping memorial.exr"<<std::endl;
-}
-
-// simple procedure for 1D curve tracing
-static void drawPlot(const cv::Mat curve, const std::string figureTitle, const int lowerLimit, const int upperLimit)
-{
-    //std::cout<<"curve size(h,w) = "<<curve.size().height<<", "<<curve.size().width<<std::endl;
-    cv::Mat displayedCurveImage = cv::Mat::ones(200, curve.size().height, CV_8U);
-
-    cv::Mat windowNormalizedCurve;
-    normalize(curve, windowNormalizedCurve, 0, 200, cv::NORM_MINMAX, CV_32F);
-
-    displayedCurveImage = cv::Scalar::all(255); // set a white background
-    int binW = cvRound((double)displayedCurveImage.cols/curve.size().height);
-
-    for( int i = 0; i < curve.size().height; i++ )
-        rectangle( displayedCurveImage, cv::Point(i*binW, displayedCurveImage.rows),
-                cv::Point((i+1)*binW, displayedCurveImage.rows - cvRound(windowNormalizedCurve.at<float>(i))),
-                cv::Scalar::all(0), -1, 8, 0 );
-    rectangle( displayedCurveImage, cv::Point(0, 0),
-            cv::Point((lowerLimit)*binW, 200),
-            cv::Scalar::all(128), -1, 8, 0 );
-    rectangle( displayedCurveImage, cv::Point(displayedCurveImage.cols, 0),
-            cv::Point((upperLimit)*binW, 200),
-            cv::Scalar::all(128), -1, 8, 0 );
-
-    cv::imshow(figureTitle, displayedCurveImage);
-}
-/*
- * objective : get the gray level map of the input image and rescale it to the range [0-255]
- */
- static void rescaleGrayLevelMat(const cv::Mat &inputMat, cv::Mat &outputMat, const float histogramClippingLimit)
- {
-
-     // adjust output matrix wrt the input size but single channel
-     std::cout<<"Input image rescaling with histogram edges cutting (in order to eliminate bad pixels created during the HDR image creation) :"<<std::endl;
-     //std::cout<<"=> image size (h,w,channels) = "<<inputMat.size().height<<", "<<inputMat.size().width<<", "<<inputMat.channels()<<std::endl;
-     //std::cout<<"=> pixel coding (nbchannel, bytes per channel) = "<<inputMat.elemSize()/inputMat.elemSize1()<<", "<<inputMat.elemSize1()<<std::endl;
-
-     // rescale between 0-255, keeping floating point values
-     cv::normalize(inputMat, outputMat, 0.0, 255.0, cv::NORM_MINMAX);
-
-     // extract a 8bit image that will be used for histogram edge cut
-     cv::Mat intGrayImage;
-     if (inputMat.channels()==1)
-     {
-         outputMat.convertTo(intGrayImage, CV_8U);
-     }else
-     {
-         cv::Mat rgbIntImg;
-         outputMat.convertTo(rgbIntImg, CV_8UC3);
-         cvtColor(rgbIntImg, intGrayImage, cv::COLOR_BGR2GRAY);
-     }
-
-     // get histogram density probability in order to cut values under above edges limits (here 5-95%)... usefull for HDR pixel errors cancellation
-     cv::Mat dst, hist;
-     int histSize = 256;
-     calcHist(&intGrayImage, 1, 0, cv::Mat(), hist, 1, &histSize, 0);
-     cv::Mat normalizedHist;
-     normalize(hist, normalizedHist, 1, 0, cv::NORM_L1, CV_32F); // normalize histogram so that its sum equals 1
-
-     double min_val, max_val;
-     minMaxLoc(normalizedHist, &min_val, &max_val);
-     //std::cout<<"Hist max,min = "<<max_val<<", "<<min_val<<std::endl;
-
-     // compute density probability
-     cv::Mat denseProb=cv::Mat::zeros(normalizedHist.size(), CV_32F);
-     denseProb.at<float>(0)=normalizedHist.at<float>(0);
-     int histLowerLimit=0, histUpperLimit=0;
-     for (int i=1;i<normalizedHist.size().height;++i)
-     {
-         denseProb.at<float>(i)=denseProb.at<float>(i-1)+normalizedHist.at<float>(i);
-         //std::cout<<normalizedHist.at<float>(i)<<", "<<denseProb.at<float>(i)<<std::endl;
-         if ( denseProb.at<float>(i)<histogramClippingLimit)
-             histLowerLimit=i;
-         if ( denseProb.at<float>(i)<1-histogramClippingLimit)
-             histUpperLimit=i;
-     }
-     // deduce min and max admitted gray levels
-     float minInputValue = (float)histLowerLimit/histSize*255;
-     float maxInputValue = (float)histUpperLimit/histSize*255;
-
-     std::cout<<"=> Histogram limits "
-             <<"\n\t"<<histogramClippingLimit*100<<"% index = "<<histLowerLimit<<" => normalizedHist value = "<<denseProb.at<float>(histLowerLimit)<<" => input gray level = "<<minInputValue
-             <<"\n\t"<<(1-histogramClippingLimit)*100<<"% index = "<<histUpperLimit<<" => normalizedHist value = "<<denseProb.at<float>(histUpperLimit)<<" => input gray level = "<<maxInputValue
-             <<std::endl;
-     //drawPlot(denseProb, "input histogram density probability", histLowerLimit, histUpperLimit);
-     drawPlot(normalizedHist, "input histogram", histLowerLimit, histUpperLimit);
-
-     // rescale image range [minInputValue-maxInputValue] to [0-255]
-     outputMat-=minInputValue;
-     outputMat*=255.0/(maxInputValue-minInputValue);
-     // cut original histogram and back project to original image
-     cv::threshold( outputMat, outputMat, 255.0, 255.0, 2 ); //THRESH_TRUNC, clips values above 255
-     cv::threshold( outputMat, outputMat, 0.0, 0.0, 3 ); //THRESH_TOZERO, clips values under 0
-
- }
- // basic callback method for interface management
- cv::Mat inputImage;
- cv::Mat imageInputRescaled;
- int histogramClippingValue;
- static void callBack_rescaleGrayLevelMat(int, void*)
- {
-     std::cout<<"Histogram clipping value changed, current value = "<<histogramClippingValue<<std::endl;
-     rescaleGrayLevelMat(inputImage, imageInputRescaled, (float)(histogramClippingValue/100.0));
-     normalize(imageInputRescaled, imageInputRescaled, 0.0, 255.0, cv::NORM_MINMAX);
- }
-
- cv::Ptr<cv::bioinspired::Retina> retina;
- int retinaHcellsGain;
- int localAdaptation_photoreceptors, localAdaptation_Gcells;
- static void callBack_updateRetinaParams(int, void*)
- {
-     retina->setupOPLandIPLParvoChannel(true, true, (float)(localAdaptation_photoreceptors/200.0), 0.5f, 0.43f, (float)retinaHcellsGain, 1.f, 7.f, (float)(localAdaptation_Gcells/200.0));
- }
-
- int colorSaturationFactor;
- static void callback_saturateColors(int, void*)
- {
-     retina->setColorSaturation(true, (float)colorSaturationFactor);
- }
-
- int main(int argc, char* argv[]) {
-     // welcome message
-     std::cout<<"*********************************************************************************"<<std::endl;
-     std::cout<<"* Retina demonstration for High Dynamic Range compression (tone-mapping) : demonstrates the use of a wrapper class of the Gipsa/Listic Labs retina model."<<std::endl;
-     std::cout<<"* This retina model allows spatio-temporal image processing (applied on still images, video sequences)."<<std::endl;
-     std::cout<<"* This demo focuses demonstration of the dynamic compression capabilities of the model"<<std::endl;
-     std::cout<<"* => the main application is tone mapping of HDR images (i.e. see on a 8bit display a more than 8bits coded (up to 16bits) image with details in high and low luminance ranges"<<std::endl;
-     std::cout<<"* The retina model still have the following properties:"<<std::endl;
-     std::cout<<"* => It applies a spectral whithening (mid-frequency details enhancement)"<<std::endl;
-     std::cout<<"* => high frequency spatio-temporal noise reduction"<<std::endl;
-     std::cout<<"* => low frequency luminance to be reduced (luminance range compression)"<<std::endl;
-     std::cout<<"* => local logarithmic luminance compression allows details to be enhanced in low light conditions\n"<<std::endl;
-     std::cout<<"* for more information, reer to the following papers :"<<std::endl;
-     std::cout<<"* Benoit A., Caplier A., Durette B., Herault, J., \"USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING\", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011"<<std::endl;
-     std::cout<<"* Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891."<<std::endl;
-     std::cout<<"* => reports comments/remarks at benoit.alexandre.vision@gmail.com"<<std::endl;
-     std::cout<<"* => more informations and papers at : http://sites.google.com/site/benoitalexandrevision/"<<std::endl;
-     std::cout<<"*********************************************************************************"<<std::endl;
-     std::cout<<"** WARNING : this sample requires OpenCV to be configured with OpenEXR support **"<<std::endl;
-     std::cout<<"*********************************************************************************"<<std::endl;
-     std::cout<<"*** You can use free tools to generate OpenEXR images from images sets   :    ***"<<std::endl;
-     std::cout<<"*** =>  1. take a set of photos from the same viewpoint using bracketing      ***"<<std::endl;
-     std::cout<<"*** =>  2. generate an OpenEXR image with tools like qtpfsgui.sourceforge.net ***"<<std::endl;
-     std::cout<<"*** =>  3. apply tone mapping with this program                               ***"<<std::endl;
-     std::cout<<"*********************************************************************************"<<std::endl;
-
-     // basic input arguments checking
-     if (argc<2)
-     {
-         help("bad number of parameter");
-         return -1;
-     }
-
-     bool useLogSampling = !strcmp(argv[argc-1], "log"); // check if user wants retina log sampling processing
-     int chosenMethod=0;
-     if (!strcmp(argv[argc-1], "fast"))
-     {
-         chosenMethod=1;
-         std::cout<<"Using fast method (no spectral whithning), adaptation of Meylan&al 2008 method"<<std::endl;
-     }
-
-     std::string inputImageName=argv[1];
-
-     //////////////////////////////////////////////////////////////////////////////
-     // checking input media type (still image, video file, live video acquisition)
-     std::cout<<"RetinaDemo: processing image "<<inputImageName<<std::endl;
-     // image processing case
-     // declare the retina input buffer... that will be fed differently in regard of the input media
-     inputImage = cv::imread(inputImageName, -1); // load image in RGB mode
-     std::cout<<"=> image size (h,w) = "<<inputImage.size().height<<", "<<inputImage.size().width<<std::endl;
-     if (!inputImage.total())
-     {
-        help("could not load image, program end");
-            return -1;
-         }
-     // rescale between 0 and 1
-     normalize(inputImage, inputImage, 0.0, 1.0, cv::NORM_MINMAX);
-     cv::Mat gammaTransformedImage;
-     cv::pow(inputImage, 1./5, gammaTransformedImage); // apply gamma curve: img = img ** (1./5)
-     imshow("EXR image original image, 16bits=>8bits linear rescaling ", inputImage);
-     imshow("EXR image with basic processing : 16bits=>8bits with gamma correction", gammaTransformedImage);
-     if (inputImage.empty())
-     {
-         help("Input image could not be loaded, aborting");
-         return -1;
-     }
-
-     //////////////////////////////////////////////////////////////////////////////
-     // Program start in a try/catch safety context (Retina may throw errors)
-     try
-     {
-         /* create a retina instance with default parameters setup, uncomment the initialisation you wanna test
-          * -> if the last parameter is 'log', then activate log sampling (favour foveal vision and subsamples peripheral vision)
-          */
-         if (useLogSampling)
-         {
-             retina = cv::bioinspired::createRetina(inputImage.size(),true, cv::bioinspired::RETINA_COLOR_BAYER, true, 2.0, 10.0);
-                 }
-         else// -> else allocate "classical" retina :
-             retina = cv::bioinspired::createRetina(inputImage.size());
-
-         // create a fast retina tone mapper (Meyla&al algorithm)
-         std::cout<<"Allocating fast tone mapper..."<<std::endl;
-         //cv::Ptr<cv::RetinaFastToneMapping> fastToneMapper=createRetinaFastToneMapping(inputImage.size());
-         std::cout<<"Fast tone mapper allocated"<<std::endl;
-
-         // save default retina parameters file in order to let you see this and maybe modify it and reload using method "setup"
-         retina->write("RetinaDefaultParameters.xml");
-
-         // desactivate Magnocellular pathway processing (motion information extraction) since it is not usefull here
-         retina->activateMovingContoursProcessing(false);
-
-         // declare retina output buffers
-         cv::Mat retinaOutput_parvo;
-
-         /////////////////////////////////////////////
-         // prepare displays and interactions
-         histogramClippingValue=0; // default value... updated with interface slider
-         //inputRescaleMat = inputImage;
-         //outputRescaleMat = imageInputRescaled;
-         cv::namedWindow("Processing configuration",1);
-         cv::createTrackbar("histogram edges clipping limit", "Processing configuration",&histogramClippingValue,50,callBack_rescaleGrayLevelMat);
-
-         colorSaturationFactor=3;
-         cv::createTrackbar("Color saturation", "Processing configuration", &colorSaturationFactor,5,callback_saturateColors);
-
-         retinaHcellsGain=40;
-         cv::createTrackbar("Hcells gain", "Processing configuration",&retinaHcellsGain,100,callBack_updateRetinaParams);
-
-         localAdaptation_photoreceptors=197;
-         localAdaptation_Gcells=190;
-         cv::createTrackbar("Ph sensitivity", "Processing configuration", &localAdaptation_photoreceptors,199,callBack_updateRetinaParams);
-         cv::createTrackbar("Gcells sensitivity", "Processing configuration", &localAdaptation_Gcells,199,callBack_updateRetinaParams);
-
-
-         /////////////////////////////////////////////
-         // apply default parameters of user interaction variables
-         rescaleGrayLevelMat(inputImage, imageInputRescaled, (float)histogramClippingValue/100);
-         retina->setColorSaturation(true,(float)colorSaturationFactor);
-         callBack_updateRetinaParams(1,NULL); // first call for default parameters setup
-
-         // processing loop with stop condition
-         bool continueProcessing=true;
-         while(continueProcessing)
-         {
-             // run retina filter
-             if (!chosenMethod)
-             {
-                 retina->run(imageInputRescaled);
-                 // Retrieve and display retina output
-                 retina->getParvo(retinaOutput_parvo);
-                 cv::imshow("Retina input image (with cut edges histogram for basic pixels error avoidance)", imageInputRescaled/255.0);
-                 cv::imshow("Retina Parvocellular pathway output : 16bit=>8bit image retina tonemapping", retinaOutput_parvo);
-                 cv::imwrite("HDRinput.jpg",imageInputRescaled/255.0);
-                 cv::imwrite("RetinaToneMapping.jpg",retinaOutput_parvo);
-             }
-             else
-             {
-                 // apply the simplified hdr tone mapping method
-                 cv::Mat fastToneMappingOutput;
-                 retina->applyFastToneMapping(imageInputRescaled, fastToneMappingOutput);
-                 cv::imshow("Retina fast tone mapping output : 16bit=>8bit image retina tonemapping", fastToneMappingOutput);
-             }
-             /*cv::Mat fastToneMappingOutput_specificObject;
-             fastToneMapper->setup(3.f, 1.5f, 1.f);
-             fastToneMapper->applyFastToneMapping(imageInputRescaled, fastToneMappingOutput_specificObject);
-             cv::imshow("### Retina fast tone mapping output : 16bit=>8bit image retina tonemapping", fastToneMappingOutput_specificObject);
-*/
-             cv::waitKey(10);
-         }
-     }catch(cv::Exception e)
-     {
-         std::cerr<<"Error using Retina : "<<e.what()<<std::endl;
-     }
-
-     // Program end message
-     std::cout<<"Retina demo end"<<std::endl;
-
-     return 0;
- }
diff --git a/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping_video.cpp b/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping_video.cpp
deleted file mode 100644
index 1388c55c0..000000000
--- a/samples/cpp/OpenEXRimages_HDR_Retina_toneMapping_video.cpp
+++ /dev/null
@@ -1,365 +0,0 @@
-
-//============================================================================
-// Name        : OpenEXRimages_HDR_Retina_toneMapping_video.cpp
-// Author      : Alexandre Benoit (benoit.alexandre.vision@gmail.com)
-// Version     : 0.2
-// Copyright   : Alexandre Benoit, LISTIC Lab, december 2011
-// Description : HighDynamicRange retina tone mapping for image sequences with the help of the Gipsa/Listic's retina in C++, Ansi-style
-// Known issues: the input OpenEXR sequences can have bad computed pixels that should be removed
-//               => a simple method consists of cutting histogram edges (a slider for this on the UI is provided)
-//               => however, in image sequences, this histogramm cut must be done in an elegant way from frame to frame... still not done...
-//============================================================================
-
-#include <iostream>
-#include <stdio.h>
-#include <cstring>
-
-#include "opencv2/bioinspired.hpp" // retina based algorithms
-#include "opencv2/imgproc.hpp" // cvCvtcolor function
-#include "opencv2/highgui.hpp" // display
-
-#ifndef _CRT_SECURE_NO_WARNINGS
-# define _CRT_SECURE_NO_WARNINGS
-#endif
-
-static void help(std::string errorMessage)
-{
-    std::cout<<"Program init error : "<<errorMessage<<std::endl;
-    std::cout<<"\nProgram call procedure : ./OpenEXRimages_HDR_Retina_toneMapping [OpenEXR image sequence to process] [OPTIONNAL start frame] [OPTIONNAL end frame]"<<std::endl;
-    std::cout<<"\t[OpenEXR image sequence to process] : std::sprintf style ready prototype filename of the input HDR images to process, must be an OpenEXR format, see http://www.openexr.com/ to get some samples or create your own using camera bracketing and Photoshop or equivalent software for OpenEXR image synthesis"<<std::endl;
-    std::cout<<"\t\t => WARNING : image index number of digits cannot exceed 10"<<std::endl;
-    std::cout<<"\t[start frame] : the starting frame tat should be considered"<<std::endl;
-    std::cout<<"\t[end frame] : the ending frame tat should be considered"<<std::endl;
-    std::cout<<"\nExamples:"<<std::endl;
-    std::cout<<"\t-Image processing : ./OpenEXRimages_HDR_Retina_toneMapping_video memorial%3d.exr 20 45"<<std::endl;
-    std::cout<<"\t-Image processing : ./OpenEXRimages_HDR_Retina_toneMapping_video memorial%3d.exr 20 45 log"<<std::endl;
-    std::cout<<"\t ==> to process images from memorial020d.exr to memorial045d.exr"<<std::endl;
-
-}
-
-// simple procedure for 1D curve tracing
-static void drawPlot(const cv::Mat curve, const std::string figureTitle, const int lowerLimit, const int upperLimit)
-{
-    //std::cout<<"curve size(h,w) = "<<curve.size().height<<", "<<curve.size().width<<std::endl;
-    cv::Mat displayedCurveImage = cv::Mat::ones(200, curve.size().height, CV_8U);
-
-    cv::Mat windowNormalizedCurve;
-    normalize(curve, windowNormalizedCurve, 0, 200, cv::NORM_MINMAX, CV_32F);
-
-    displayedCurveImage = cv::Scalar::all(255); // set a white background
-    int binW = cvRound((double)displayedCurveImage.cols/curve.size().height);
-
-    for( int i = 0; i < curve.size().height; i++ )
-        rectangle( displayedCurveImage, cv::Point(i*binW, displayedCurveImage.rows),
-                cv::Point((i+1)*binW, displayedCurveImage.rows - cvRound(windowNormalizedCurve.at<float>(i))),
-                cv::Scalar::all(0), -1, 8, 0 );
-    rectangle( displayedCurveImage, cv::Point(0, 0),
-            cv::Point((lowerLimit)*binW, 200),
-            cv::Scalar::all(128), -1, 8, 0 );
-    rectangle( displayedCurveImage, cv::Point(displayedCurveImage.cols, 0),
-            cv::Point((upperLimit)*binW, 200),
-            cv::Scalar::all(128), -1, 8, 0 );
-
-    cv::imshow(figureTitle, displayedCurveImage);
-}
-
-/*
- * objective : get the gray level map of the input image and rescale it to the range [0-255] if rescale0_255=TRUE, simply trunks else
- */
-static void rescaleGrayLevelMat(const cv::Mat &inputMat, cv::Mat &outputMat, const float histogramClippingLimit, const bool rescale0_255)
- {
-     // adjust output matrix wrt the input size but single channel
-     std::cout<<"Input image rescaling with histogram edges cutting (in order to eliminate bad pixels created during the HDR image creation) :"<<std::endl;
-     //std::cout<<"=> image size (h,w,channels) = "<<inputMat.size().height<<", "<<inputMat.size().width<<", "<<inputMat.channels()<<std::endl;
-     //std::cout<<"=> pixel coding (nbchannel, bytes per channel) = "<<inputMat.elemSize()/inputMat.elemSize1()<<", "<<inputMat.elemSize1()<<std::endl;
-
-     // get min and max values to use afterwards if no 0-255 rescaling is used
-     double maxInput, minInput, histNormRescalefactor=1.f;
-     double histNormOffset=0.f;
-     minMaxLoc(inputMat, &minInput, &maxInput);
-     histNormRescalefactor=255.f/(maxInput-minInput);
-     histNormOffset=minInput;
-     std::cout<<"Hist max,min = "<<maxInput<<", "<<minInput<<" => scale, offset = "<<histNormRescalefactor<<", "<<histNormOffset<<std::endl;
-     // rescale between 0-255, keeping floating point values
-     cv::Mat normalisedImage;
-     cv::normalize(inputMat, normalisedImage, 0.f, 255.f, cv::NORM_MINMAX);
-     if (rescale0_255)
-        normalisedImage.copyTo(outputMat);
-     // extract a 8bit image that will be used for histogram edge cut
-     cv::Mat intGrayImage;
-     if (inputMat.channels()==1)
-     {
-         normalisedImage.convertTo(intGrayImage, CV_8U);
-     }else
-     {
-         cv::Mat rgbIntImg;
-         normalisedImage.convertTo(rgbIntImg, CV_8UC3);
-         cvtColor(rgbIntImg, intGrayImage, cv::COLOR_BGR2GRAY);
-     }
-
-     // get histogram density probability in order to cut values under above edges limits (here 5-95%)... usefull for HDR pixel errors cancellation
-     cv::Mat dst, hist;
-     int histSize = 256;
-     calcHist(&intGrayImage, 1, 0, cv::Mat(), hist, 1, &histSize, 0);
-     cv::Mat normalizedHist;
-
-     normalize(hist, normalizedHist, 1.f, 0.f, cv::NORM_L1, CV_32F); // normalize histogram so that its sum equals 1
-
-     // compute density probability
-     cv::Mat denseProb=cv::Mat::zeros(normalizedHist.size(), CV_32F);
-     denseProb.at<float>(0)=normalizedHist.at<float>(0);
-     int histLowerLimit=0, histUpperLimit=0;
-     for (int i=1;i<normalizedHist.size().height;++i)
-     {
-         denseProb.at<float>(i)=denseProb.at<float>(i-1)+normalizedHist.at<float>(i);
-         //std::cout<<normalizedHist.at<float>(i)<<", "<<denseProb.at<float>(i)<<std::endl;
-         if ( denseProb.at<float>(i)<histogramClippingLimit)
-             histLowerLimit=i;
-         if ( denseProb.at<float>(i)<1.f-histogramClippingLimit)
-             histUpperLimit=i;
-     }
-     // deduce min and max admitted gray levels
-     float minInputValue = (float)histLowerLimit/histSize*255.f;
-     float maxInputValue = (float)histUpperLimit/histSize*255.f;
-
-     std::cout<<"=> Histogram limits "
-             <<"\n\t"<<histogramClippingLimit*100.f<<"% index = "<<histLowerLimit<<" => normalizedHist value = "<<denseProb.at<float>(histLowerLimit)<<" => input gray level = "<<minInputValue
-             <<"\n\t"<<(1.f-histogramClippingLimit)*100.f<<"% index = "<<histUpperLimit<<" => normalizedHist value = "<<denseProb.at<float>(histUpperLimit)<<" => input gray level = "<<maxInputValue
-             <<std::endl;
-     //drawPlot(denseProb, "input histogram density probability", histLowerLimit, histUpperLimit);
-     drawPlot(normalizedHist, "input histogram", histLowerLimit, histUpperLimit);
-
-    if(rescale0_255) // rescale between 0-255 if asked to
-    {
-        cv::threshold( outputMat, outputMat, maxInputValue, maxInputValue, 2 ); //THRESH_TRUNC, clips values above maxInputValue
-        cv::threshold( outputMat, outputMat, minInputValue, minInputValue, 3 ); //THRESH_TOZERO, clips values under minInputValue
-        // rescale image range [minInputValue-maxInputValue] to [0-255]
-        outputMat-=minInputValue;
-        outputMat*=255.f/(maxInputValue-minInputValue);
-    }else
-    {
-        inputMat.copyTo(outputMat);
-        // update threshold in the initial input image range
-        maxInputValue=(float)((maxInputValue-255.f)/histNormRescalefactor+maxInput);
-        minInputValue=(float)(minInputValue/histNormRescalefactor+minInput);
-        std::cout<<"===> Input Hist clipping values (max,min) = "<<maxInputValue<<", "<<minInputValue<<std::endl;
-        cv::threshold( outputMat, outputMat, maxInputValue, maxInputValue, 2 ); //THRESH_TRUNC, clips values above maxInputValue
-        cv::threshold( outputMat, outputMat, minInputValue, minInputValue, 3 ); //
-    }
- }
-
- // basic callback method for interface management
- cv::Mat inputImage;
- cv::Mat imageInputRescaled;
- float globalRescalefactor=1;
- cv::Scalar globalOffset=0;
- int histogramClippingValue;
- static void callBack_rescaleGrayLevelMat(int, void*)
- {
-     std::cout<<"Histogram clipping value changed, current value = "<<histogramClippingValue<<std::endl;
-    // rescale and process
-    inputImage+=globalOffset;
-    inputImage*=globalRescalefactor;
-    inputImage+=cv::Scalar(50, 50, 50, 50); // WARNING value linked to the hardcoded value (200.0) used in the globalRescalefactor in order to center on the 128 mean value... experimental but... basic compromise
-    rescaleGrayLevelMat(inputImage, imageInputRescaled, (float)histogramClippingValue/100.f, true);
-
- }
-
- cv::Ptr<cv::bioinspired::Retina> retina;
- int retinaHcellsGain;
- int localAdaptation_photoreceptors, localAdaptation_Gcells;
- static void callBack_updateRetinaParams(int, void*)
- {
-     retina->setupOPLandIPLParvoChannel(true, true, (float)(localAdaptation_photoreceptors/200.0), 0.5f, 0.43f, (float)retinaHcellsGain, 1.f, 7.f, (float)(localAdaptation_Gcells/200.0));
- }
-
- int colorSaturationFactor;
- static void callback_saturateColors(int, void*)
- {
-     retina->setColorSaturation(true, (float)colorSaturationFactor);
- }
-
-// loadNewFrame : loads a n image wrt filename parameters. it also manages image rescaling/histogram edges cutting (acts differently at first image i.e. if firstTimeread=true)
-static void loadNewFrame(const std::string filenamePrototype, const int currentFileIndex, const bool firstTimeread)
-{
-     char *currentImageName=NULL;
-    currentImageName = (char*)malloc(sizeof(char)*filenamePrototype.size()+10);
-
-    // grab the first frame
-    sprintf(currentImageName, filenamePrototype.c_str(), currentFileIndex);
-
-     //////////////////////////////////////////////////////////////////////////////
-     // checking input media type (still image, video file, live video acquisition)
-     std::cout<<"RetinaDemo: reading image : "<<currentImageName<<std::endl;
-     // image processing case
-     // declare the retina input buffer... that will be fed differently in regard of the input media
-     inputImage = cv::imread(currentImageName, -1); // load image in RGB mode
-     std::cout<<"=> image size (h,w) = "<<inputImage.size().height<<", "<<inputImage.size().width<<std::endl;
-     if (inputImage.empty())
-     {
-        help("could not load image, program end");
-            return;;
-         }
-
-    // rescaling/histogram clipping stage
-    // rescale between 0 and 1
-    // TODO : take care of this step !!! maybe disable of do this in a nicer way ... each successive image should get the same transformation... but it depends on the initial image format
-    double maxInput, minInput;
-    minMaxLoc(inputImage, &minInput, &maxInput);
-    std::cout<<"ORIGINAL IMAGE pixels values range (max,min) : "<<maxInput<<", "<<minInput<<std::endl;
-
-    if (firstTimeread)
-    {
-        /* the first time, get the pixel values range and rougthly update scaling value
-        in order to center values around 128 and getting a range close to [0-255],
-        => actually using a little less in order to let some more flexibility in range evolves...
-        */
-        double maxInput1, minInput1;
-        minMaxLoc(inputImage, &minInput1, &maxInput1);
-        std::cout<<"FIRST IMAGE pixels values range (max,min) : "<<maxInput1<<", "<<minInput1<<std::endl;
-        globalRescalefactor=(float)(50.0/(maxInput1-minInput1)); // less than 255 for flexibility... experimental value to be carefull about
-        double channelOffset = -1.5*minInput;
-        globalOffset= cv::Scalar(channelOffset, channelOffset, channelOffset, channelOffset);
-    }
-    // call the generic input image rescaling callback
-    callBack_rescaleGrayLevelMat(1,NULL);
-}
-
- int main(int argc, char* argv[]) {
-     // welcome message
-     std::cout<<"*********************************************************************************"<<std::endl;
-     std::cout<<"* Retina demonstration for High Dynamic Range compression (tone-mapping) : demonstrates the use of a wrapper class of the Gipsa/Listic Labs retina model."<<std::endl;
-     std::cout<<"* This retina model allows spatio-temporal image processing (applied on still images, video sequences)."<<std::endl;
-     std::cout<<"* This demo focuses demonstration of the dynamic compression capabilities of the model"<<std::endl;
-     std::cout<<"* => the main application is tone mapping of HDR images (i.e. see on a 8bit display a more than 8bits coded (up to 16bits) image with details in high and low luminance ranges"<<std::endl;
-     std::cout<<"* The retina model still have the following properties:"<<std::endl;
-     std::cout<<"* => It applies a spectral whithening (mid-frequency details enhancement)"<<std::endl;
-     std::cout<<"* => high frequency spatio-temporal noise reduction"<<std::endl;
-     std::cout<<"* => low frequency luminance to be reduced (luminance range compression)"<<std::endl;
-     std::cout<<"* => local logarithmic luminance compression allows details to be enhanced in low light conditions\n"<<std::endl;
-     std::cout<<"* for more information, reer to the following papers :"<<std::endl;
-     std::cout<<"* Benoit A., Caplier A., Durette B., Herault, J., \"USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING\", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011"<<std::endl;
-     std::cout<<"* Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891."<<std::endl;
-     std::cout<<"* => reports comments/remarks at benoit.alexandre.vision@gmail.com"<<std::endl;
-     std::cout<<"* => more informations and papers at : http://sites.google.com/site/benoitalexandrevision/"<<std::endl;
-     std::cout<<"*********************************************************************************"<<std::endl;
-     std::cout<<"** WARNING : this sample requires OpenCV to be configured with OpenEXR support **"<<std::endl;
-     std::cout<<"*********************************************************************************"<<std::endl;
-     std::cout<<"*** You can use free tools to generate OpenEXR images from images sets   :    ***"<<std::endl;
-     std::cout<<"*** =>  1. take a set of photos from the same viewpoint using bracketing      ***"<<std::endl;
-     std::cout<<"*** =>  2. generate an OpenEXR image with tools like qtpfsgui.sourceforge.net ***"<<std::endl;
-     std::cout<<"*** =>  3. apply tone mapping with this program                               ***"<<std::endl;
-     std::cout<<"*********************************************************************************"<<std::endl;
-
-     // basic input arguments checking
-     if (argc<4)
-     {
-         help("bad number of parameter");
-         return -1;
-     }
-
-     bool useLogSampling = !strcmp(argv[argc-1], "log"); // check if user wants retina log sampling processing
-
-     int startFrameIndex=0, endFrameIndex=0, currentFrameIndex=0;
-     sscanf(argv[2], "%d", &startFrameIndex);
-     sscanf(argv[3], "%d", &endFrameIndex);
-     std::string inputImageNamePrototype(argv[1]);
-
-     //////////////////////////////////////////////////////////////////////////////
-     // checking input media type (still image, video file, live video acquisition)
-     std::cout<<"RetinaDemo: setting up system with first image..."<<std::endl;
-     loadNewFrame(inputImageNamePrototype, startFrameIndex, true);
-
-     if (inputImage.empty())
-     {
-        help("could not load image, program end");
-            return -1;
-         }
-
-     //////////////////////////////////////////////////////////////////////////////
-     // Program start in a try/catch safety context (Retina may throw errors)
-     try
-     {
-         /* create a retina instance with default parameters setup, uncomment the initialisation you wanna test
-          * -> if the last parameter is 'log', then activate log sampling (favour foveal vision and subsamples peripheral vision)
-          */
-         if (useLogSampling)
-                {
-                     retina = cv::bioinspired::createRetina(inputImage.size(),true, cv::bioinspired::RETINA_COLOR_BAYER, true, 2.0, 10.0);
-                 }
-         else// -> else allocate "classical" retina :
-             retina = cv::bioinspired::createRetina(inputImage.size());
-
-        // save default retina parameters file in order to let you see this and maybe modify it and reload using method "setup"
-        retina->write("RetinaDefaultParameters.xml");
-
-                 // desactivate Magnocellular pathway processing (motion information extraction) since it is not usefull here
-                 retina->activateMovingContoursProcessing(false);
-
-         // declare retina output buffers
-         cv::Mat retinaOutput_parvo;
-
-         /////////////////////////////////////////////
-         // prepare displays and interactions
-         histogramClippingValue=0; // default value... updated with interface slider
-
-         std::string retinaInputCorrected("Retina input image (with cut edges histogram for basic pixels error avoidance)");
-         cv::namedWindow(retinaInputCorrected,1);
-         cv::createTrackbar("histogram edges clipping limit", "Retina input image (with cut edges histogram for basic pixels error avoidance)",&histogramClippingValue,50,callBack_rescaleGrayLevelMat);
-
-         std::string RetinaParvoWindow("Retina Parvocellular pathway output : 16bit=>8bit image retina tonemapping");
-         cv::namedWindow(RetinaParvoWindow, 1);
-         colorSaturationFactor=3;
-         cv::createTrackbar("Color saturation", "Retina Parvocellular pathway output : 16bit=>8bit image retina tonemapping", &colorSaturationFactor,5,callback_saturateColors);
-
-         retinaHcellsGain=40;
-         cv::createTrackbar("Hcells gain", "Retina Parvocellular pathway output : 16bit=>8bit image retina tonemapping",&retinaHcellsGain,100,callBack_updateRetinaParams);
-
-         localAdaptation_photoreceptors=197;
-         localAdaptation_Gcells=190;
-         cv::createTrackbar("Ph sensitivity", "Retina Parvocellular pathway output : 16bit=>8bit image retina tonemapping", &localAdaptation_photoreceptors,199,callBack_updateRetinaParams);
-         cv::createTrackbar("Gcells sensitivity", "Retina Parvocellular pathway output : 16bit=>8bit image retina tonemapping", &localAdaptation_Gcells,199,callBack_updateRetinaParams);
-
-        std::string powerTransformedInput("EXR image with basic processing : 16bits=>8bits with gamma correction");
-
-         /////////////////////////////////////////////
-         // apply default parameters of user interaction variables
-         callBack_updateRetinaParams(1,NULL); // first call for default parameters setup
-         callback_saturateColors(1, NULL);
-
-         // processing loop with stop condition
-         currentFrameIndex=startFrameIndex;
-         while(currentFrameIndex <= endFrameIndex)
-         {
-             loadNewFrame(inputImageNamePrototype, currentFrameIndex, false);
-
-             if (inputImage.empty())
-             {
-                std::cout<<"Could not load new image (index = "<<currentFrameIndex<<"), program end"<<std::endl;
-                return -1;
-             }
-            // display input & process standard power transformation
-            imshow("EXR image original image, 16bits=>8bits linear rescaling ", imageInputRescaled);
-            cv::Mat gammaTransformedImage;
-            cv::pow(imageInputRescaled, 1./5, gammaTransformedImage); // apply gamma curve: img = img ** (1./5)
-            imshow(powerTransformedInput, gammaTransformedImage);
-             // run retina filter
-             retina->run(imageInputRescaled);
-             // Retrieve and display retina output
-             retina->getParvo(retinaOutput_parvo);
-             cv::imshow(retinaInputCorrected, imageInputRescaled/255.f);
-             cv::imshow(RetinaParvoWindow, retinaOutput_parvo);
-             cv::waitKey(4);
-            // jump to next frame
-            ++currentFrameIndex;
-         }
-     }catch(cv::Exception e)
-     {
-         std::cerr<<"Error using Retina : "<<e.what()<<std::endl;
-     }
-
-     // Program end message
-     std::cout<<"Retina demo end"<<std::endl;
-
-     return 0;
- }
diff --git a/samples/cpp/Qt_sample/CMakeLists.txt b/samples/cpp/Qt_sample/CMakeLists.txt
index e831f752f..f465947db 100644
--- a/samples/cpp/Qt_sample/CMakeLists.txt
+++ b/samples/cpp/Qt_sample/CMakeLists.txt
@@ -7,6 +7,6 @@ FIND_PACKAGE( OpenCV REQUIRED )
 find_package (OpenGL REQUIRED)
 
 
-ADD_EXECUTABLE(OpenGL_Qt_Binding main.cpp)
+ADD_EXECUTABLE(OpenGL_Qt_Binding qt_opengl.cpp)
 TARGET_LINK_LIBRARIES(OpenGL_Qt_Binding ${OpenCV_LIBS} ${OPENGL_LIBRARIES} )
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cube4.avi ${CMAKE_CURRENT_BINARY_DIR}/cube4.avi COPYONLY)
diff --git a/samples/cpp/Qt_sample/main.cpp b/samples/cpp/Qt_sample/main.cpp
deleted file mode 100644
index 92bc2b549..000000000
--- a/samples/cpp/Qt_sample/main.cpp
+++ /dev/null
@@ -1,271 +0,0 @@
-//Yannick Verdie 2010
-
-//--- Please read help() below: ---
-
-#include <iostream>
-#include <vector>
-#include <opencv2/core/core_c.h>
-#include <opencv2/calib3d/calib3d_c.h>
-#include <opencv2/imgproc.hpp>
-#include <opencv2/highgui.hpp>
-#include <opencv2/legacy/compat.hpp>
-
-#if defined WIN32 || defined _WIN32 || defined WINCE
-    #include <windows.h>
-    #undef small
-    #undef min
-    #undef max
-    #undef abs
-#endif
-
-#ifdef __APPLE__
-    #include <OpenGL/gl.h>
-#else
-    #include <GL/gl.h>
-#endif
-
-using namespace std;
-using namespace cv;
-
-static void help()
-{
-    cout << "\nThis demo demonstrates the use of the Qt enhanced version of the highgui GUI interface\n"
-            "  and dang if it doesn't throw in the use of of the POSIT 3D tracking algorithm too\n"
-            "It works off of the video: cube4.avi\n"
-            "Using OpenCV version %s\n" << CV_VERSION << "\n\n"
-" 1). This demo is mainly based on work from Javier Barandiaran Martirena\n"
-"     See this page http://code.opencv.org/projects/opencv/wiki/Posit.\n"
-" 2). This is a demo to illustrate how to use **OpenGL Callback**.\n"
-" 3). You need Qt binding to compile this sample with OpenGL support enabled.\n"
-" 4). The features' detection is very basic and could highly be improved \n"
-"     (basic thresholding tuned for the specific video) but 2).\n"
-" 5) THANKS TO Google Summer of Code 2010 for supporting this work!\n" << endl;
-}
-
-#define FOCAL_LENGTH 600
-#define CUBE_SIZE 10
-
-static void renderCube(float size)
-{
-    glBegin(GL_QUADS);
-    // Front Face
-    glNormal3f( 0.0f, 0.0f, 1.0f);
-    glVertex3f( 0.0f,  0.0f,  0.0f);
-    glVertex3f( size,  0.0f,  0.0f);
-    glVertex3f( size,  size,  0.0f);
-    glVertex3f( 0.0f,  size,  0.0f);
-    // Back Face
-    glNormal3f( 0.0f, 0.0f,-1.0f);
-    glVertex3f( 0.0f,  0.0f, size);
-    glVertex3f( 0.0f,  size, size);
-    glVertex3f( size,  size, size);
-    glVertex3f( size,  0.0f, size);
-    // Top Face
-    glNormal3f( 0.0f, 1.0f, 0.0f);
-    glVertex3f( 0.0f,  size,  0.0f);
-    glVertex3f( size,  size,  0.0f);
-    glVertex3f( size,  size, size);
-    glVertex3f( 0.0f,  size, size);
-    // Bottom Face
-    glNormal3f( 0.0f,-1.0f, 0.0f);
-    glVertex3f( 0.0f,  0.0f,  0.0f);
-    glVertex3f( 0.0f,  0.0f, size);
-    glVertex3f( size,  0.0f, size);
-    glVertex3f( size,  0.0f,  0.0f);
-    // Right face
-    glNormal3f( 1.0f, 0.0f, 0.0f);
-    glVertex3f( size,  0.0f, 0.0f);
-    glVertex3f( size,  0.0f, size);
-    glVertex3f( size,  size, size);
-    glVertex3f( size,  size, 0.0f);
-    // Left Face
-    glNormal3f(-1.0f, 0.0f, 0.0f);
-    glVertex3f( 0.0f,  0.0f, 0.0f);
-    glVertex3f( 0.0f,  size, 0.0f);
-    glVertex3f( 0.0f,  size, size);
-    glVertex3f( 0.0f,  0.0f, size);
-    glEnd();
-}
-
-
-static void on_opengl(void* param)
-{
-    //Draw the object with the estimated pose
-    glLoadIdentity();
-    glScalef( 1.0f, 1.0f, -1.0f);
-    glMultMatrixf( (float*)param );
-    glEnable( GL_LIGHTING );
-    glEnable( GL_LIGHT0 );
-    glEnable( GL_BLEND );
-    glBlendFunc(GL_SRC_ALPHA, GL_ONE);
-    renderCube( CUBE_SIZE );
-    glDisable(GL_BLEND);
-    glDisable( GL_LIGHTING );
-}
-
-static void initPOSIT(std::vector<CvPoint3D32f> *modelPoints)
-{
-    //Create the model pointss
-    modelPoints->push_back(cvPoint3D32f(0.0f, 0.0f, 0.0f)); //The first must be (0,0,0)
-    modelPoints->push_back(cvPoint3D32f(0.0f, 0.0f, CUBE_SIZE));
-    modelPoints->push_back(cvPoint3D32f(CUBE_SIZE, 0.0f, 0.0f));
-    modelPoints->push_back(cvPoint3D32f(0.0f, CUBE_SIZE, 0.0f));
-}
-
-static void foundCorners(vector<CvPoint2D32f> *srcImagePoints, const Mat& source, Mat& grayImage)
-{
-    cvtColor(source, grayImage, COLOR_RGB2GRAY);
-    GaussianBlur(grayImage, grayImage, Size(11,11), 0, 0);
-    normalize(grayImage, grayImage, 0, 255, NORM_MINMAX);
-    threshold(grayImage, grayImage, 26, 255, THRESH_BINARY_INV); //25
-
-    vector<vector<Point> > contours;
-    vector<Vec4i> hierarchy;
-    findContours(grayImage, contours, hierarchy, RETR_EXTERNAL, CHAIN_APPROX_NONE);
-
-    Point p;
-    vector<CvPoint2D32f> srcImagePoints_temp(4,cvPoint2D32f(0,0));
-
-    if (contours.size() == srcImagePoints_temp.size())
-    {
-
-        for(size_t i = 0 ; i<contours.size(); i++ )
-        {
-
-            p.x = p.y = 0;
-
-            for(size_t j = 0 ; j<contours[i].size(); j++ )
-                p+=contours[i][j];
-
-            srcImagePoints_temp.at(i)=cvPoint2D32f(float(p.x)/contours[i].size(),float(p.y)/contours[i].size());
-        }
-
-        //Need to keep the same order
-        //> y = 0
-        //> x = 1
-        //< x = 2
-        //< y = 3
-
-        //get point 0;
-        size_t index = 0;
-        for(size_t i = 1 ; i<srcImagePoints_temp.size(); i++ )
-        {
-            if (srcImagePoints_temp.at(i).y > srcImagePoints_temp.at(index).y)
-                index = i;
-        }
-        srcImagePoints->at(0) = srcImagePoints_temp.at(index);
-
-        //get point 1;
-        index = 0;
-        for(size_t i = 1 ; i<srcImagePoints_temp.size(); i++ )
-        {
-            if (srcImagePoints_temp.at(i).x > srcImagePoints_temp.at(index).x)
-                index = i;
-        }
-        srcImagePoints->at(1) = srcImagePoints_temp.at(index);
-
-        //get point 2;
-        index = 0;
-        for(size_t i = 1 ; i<srcImagePoints_temp.size(); i++ )
-        {
-            if (srcImagePoints_temp.at(i).x < srcImagePoints_temp.at(index).x)
-                index = i;
-        }
-        srcImagePoints->at(2) = srcImagePoints_temp.at(index);
-
-        //get point 3;
-        index = 0;
-        for(size_t i = 1 ; i<srcImagePoints_temp.size(); i++ )
-        {
-            if (srcImagePoints_temp.at(i).y < srcImagePoints_temp.at(index).y)
-                index = i;
-        }
-        srcImagePoints->at(3) = srcImagePoints_temp.at(index);
-
-        Mat Msource = source;
-        stringstream ss;
-        for(size_t i = 0 ; i<srcImagePoints_temp.size(); i++ )
-        {
-            ss<<i;
-            circle(Msource,srcImagePoints->at(i),5,Scalar(0,0,255));
-            putText(Msource,ss.str(),srcImagePoints->at(i),FONT_HERSHEY_SIMPLEX,1,Scalar(0,0,255));
-            ss.str("");
-
-            //new coordinate system in the middle of the frame and reversed (camera coordinate system)
-            srcImagePoints->at(i) = cvPoint2D32f(srcImagePoints_temp.at(i).x-source.cols/2,source.rows/2-srcImagePoints_temp.at(i).y);
-        }
-    }
-
-}
-
-static void createOpenGLMatrixFrom(float *posePOSIT,const CvMatr32f &rotationMatrix, const CvVect32f &translationVector)
-{
-
-
-    //coordinate system returned is relative to the first 3D input point
-    for (int f=0; f<3; f++)
-    {
-        for (int c=0; c<3; c++)
-        {
-            posePOSIT[c*4+f] = rotationMatrix[f*3+c];	//transposed
-        }
-    }
-    posePOSIT[3] = 0.0;
-    posePOSIT[7] = 0.0;
-    posePOSIT[11] = 0.0;
-    posePOSIT[12] = translationVector[0];
-    posePOSIT[13] = translationVector[1];
-    posePOSIT[14] = translationVector[2];
-    posePOSIT[15] = 1.0;
-}
-
-int main(void)
-{
-    help();
-    VideoCapture video("cube4.avi");
-    CV_Assert(video.isOpened());
-
-    Mat source, grayImage;
-
-    video >> source;
-
-    namedWindow("original", WINDOW_AUTOSIZE | WINDOW_FREERATIO);
-    namedWindow("POSIT", WINDOW_AUTOSIZE | WINDOW_FREERATIO);
-    displayOverlay("POSIT", "We lost the 4 corners' detection quite often (the red circles disappear). This demo is only to illustrate how to use OpenGL callback.\n -- Press ESC to exit.", 10000);
-
-    float OpenGLMatrix[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-    setOpenGlDrawCallback("POSIT",on_opengl,OpenGLMatrix);
-
-    vector<CvPoint3D32f> modelPoints;
-    initPOSIT(&modelPoints);
-
-    //Create the POSIT object with the model points
-    CvPOSITObject* positObject = cvCreatePOSITObject( &modelPoints[0], (int)modelPoints.size() );
-
-    CvMatr32f rotation_matrix = new float[9];
-    CvVect32f translation_vector = new float[3];
-    CvTermCriteria criteria = cvTermCriteria(CV_TERMCRIT_EPS | CV_TERMCRIT_ITER, 100, 1.0e-4f);
-
-    vector<CvPoint2D32f> srcImagePoints(4,cvPoint2D32f(0,0));
-
-
-    while(waitKey(33) != 27)
-    {
-        video >> source;
-        imshow("original",source);
-
-        foundCorners(&srcImagePoints, source, grayImage);
-        cvPOSIT( positObject, &srcImagePoints[0], FOCAL_LENGTH, criteria, rotation_matrix, translation_vector );
-        createOpenGLMatrixFrom(OpenGLMatrix,rotation_matrix,translation_vector);
-
-        imshow("POSIT",source);
-
-        if (video.get(CAP_PROP_POS_AVI_RATIO) > 0.99)
-            video.set(CAP_PROP_POS_AVI_RATIO, 0);
-    }
-
-    destroyAllWindows();
-    cvReleasePOSITObject(&positObject);
-
-    return 0;
-}
diff --git a/samples/cpp/Qt_sample/qt_opengl.cpp b/samples/cpp/Qt_sample/qt_opengl.cpp
new file mode 100644
index 000000000..ff4a0b0ca
--- /dev/null
+++ b/samples/cpp/Qt_sample/qt_opengl.cpp
@@ -0,0 +1,269 @@
+// Yannick Verdie 2010
+// --- Please read help() below: ---
+
+#include <iostream>
+#include <vector>
+
+#include <opencv2/calib3d/calib3d.hpp>
+#include <opencv2/calib3d/calib3d_c.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/legacy/compat.hpp>
+
+#ifdef __APPLE__
+#include <OpenGL/gl.h>
+#else
+#include <GL/gl.h>
+#endif
+
+using namespace std;
+using namespace cv;
+
+static void help()
+{
+    cout << "This demo demonstrates the use of the Qt enhanced version of the highgui GUI interface\n"
+            "and dang if it doesn't throw in the use of of the POSIT 3D tracking algorithm too\n"
+            "It works off of the video: cube4.avi\n"
+            "Using OpenCV version " << CV_VERSION << "\n\n"
+
+            " 1) This demo is mainly based on work from Javier Barandiaran Martirena\n"
+            "    See this page http://code.opencv.org/projects/opencv/wiki/Posit.\n"
+            " 2) This is a demo to illustrate how to use **OpenGL Callback**.\n"
+            " 3) You need Qt binding to compile this sample with OpenGL support enabled.\n"
+            " 4) The features' detection is very basic and could highly be improved\n"
+            "    (basic thresholding tuned for the specific video) but 2).\n"
+            " 5) Thanks to Google Summer of Code 2010 for supporting this work!\n" << endl;
+}
+
+#define FOCAL_LENGTH 600
+#define CUBE_SIZE 0.5
+
+static void renderCube(float size)
+{
+    glBegin(GL_QUADS);
+    // Front Face
+    glNormal3f( 0.0f, 0.0f, 1.0f);
+    glVertex3f( 0.0f,  0.0f,  0.0f);
+    glVertex3f( size,  0.0f,  0.0f);
+    glVertex3f( size,  size,  0.0f);
+    glVertex3f( 0.0f,  size,  0.0f);
+    // Back Face
+    glNormal3f( 0.0f, 0.0f,-1.0f);
+    glVertex3f( 0.0f,  0.0f, size);
+    glVertex3f( 0.0f,  size, size);
+    glVertex3f( size,  size, size);
+    glVertex3f( size,  0.0f, size);
+    // Top Face
+    glNormal3f( 0.0f, 1.0f, 0.0f);
+    glVertex3f( 0.0f,  size,  0.0f);
+    glVertex3f( size,  size,  0.0f);
+    glVertex3f( size,  size, size);
+    glVertex3f( 0.0f,  size, size);
+    // Bottom Face
+    glNormal3f( 0.0f,-1.0f, 0.0f);
+    glVertex3f( 0.0f,  0.0f,  0.0f);
+    glVertex3f( 0.0f,  0.0f, size);
+    glVertex3f( size,  0.0f, size);
+    glVertex3f( size,  0.0f,  0.0f);
+    // Right face
+    glNormal3f( 1.0f, 0.0f, 0.0f);
+    glVertex3f( size,  0.0f, 0.0f);
+    glVertex3f( size,  0.0f, size);
+    glVertex3f( size,  size, size);
+    glVertex3f( size,  size, 0.0f);
+    // Left Face
+    glNormal3f(-1.0f, 0.0f, 0.0f);
+    glVertex3f( 0.0f,  0.0f, 0.0f);
+    glVertex3f( 0.0f,  size, 0.0f);
+    glVertex3f( 0.0f,  size, size);
+    glVertex3f( 0.0f,  0.0f, size);
+    glEnd();
+}
+
+static void on_opengl(void* param)
+{
+    //Draw the object with the estimated pose
+    glLoadIdentity();
+    glScalef( 1.0f, 1.0f, -1.0f);
+    glMultMatrixf( (float*)param );
+    glEnable( GL_LIGHTING );
+    glEnable( GL_LIGHT0 );
+    glEnable( GL_BLEND );
+    glBlendFunc(GL_SRC_ALPHA, GL_ONE);
+    renderCube( CUBE_SIZE );
+    glDisable(GL_BLEND);
+    glDisable( GL_LIGHTING );
+}
+
+static void initPOSIT(std::vector<CvPoint3D32f> * modelPoints)
+{
+    // Create the model pointss
+    modelPoints->push_back(cvPoint3D32f(0.0f, 0.0f, 0.0f)); // The first must be (0, 0, 0)
+    modelPoints->push_back(cvPoint3D32f(0.0f, 0.0f, CUBE_SIZE));
+    modelPoints->push_back(cvPoint3D32f(CUBE_SIZE, 0.0f, 0.0f));
+    modelPoints->push_back(cvPoint3D32f(0.0f, CUBE_SIZE, 0.0f));
+}
+
+static void foundCorners(vector<CvPoint2D32f> * srcImagePoints, const Mat & source, Mat & grayImage)
+{
+    cvtColor(source, grayImage, COLOR_RGB2GRAY);
+    GaussianBlur(grayImage, grayImage, Size(11, 11), 0, 0);
+    normalize(grayImage, grayImage, 0, 255, NORM_MINMAX);
+    threshold(grayImage, grayImage, 26, 255, THRESH_BINARY_INV); //25
+
+    Mat MgrayImage = grayImage;
+    vector<vector<Point> > contours;
+    vector<Vec4i> hierarchy;
+    findContours(MgrayImage, contours, hierarchy, CV_RETR_EXTERNAL, CV_CHAIN_APPROX_NONE);
+
+    Point p;
+    vector<CvPoint2D32f> srcImagePoints_temp(4, cvPoint2D32f(0, 0));
+
+    if (contours.size() == srcImagePoints_temp.size())
+    {
+        for (size_t i = 0; i < contours.size(); i++ )
+        {
+            p.x = p.y = 0;
+
+            for (size_t j = 0 ; j < contours[i].size(); j++)
+                p += contours[i][j];
+
+            srcImagePoints_temp.at(i) = cvPoint2D32f(float(p.x) / contours[i].size(), float(p.y) / contours[i].size());
+        }
+
+        // Need to keep the same order
+        // > y = 0
+        // > x = 1
+        // < x = 2
+        // < y = 3
+
+        // get point 0;
+        size_t index = 0;
+        for (size_t i = 1 ; i<srcImagePoints_temp.size(); i++)
+            if (srcImagePoints_temp.at(i).y > srcImagePoints_temp.at(index).y)
+                index = i;
+        srcImagePoints->at(0) = srcImagePoints_temp.at(index);
+
+        // get point 1;
+        index = 0;
+        for (size_t i = 1 ; i<srcImagePoints_temp.size(); i++)
+            if (srcImagePoints_temp.at(i).x > srcImagePoints_temp.at(index).x)
+                index = i;
+        srcImagePoints->at(1) = srcImagePoints_temp.at(index);
+
+        // get point 2;
+        index = 0;
+        for (size_t i = 1 ; i<srcImagePoints_temp.size(); i++)
+            if (srcImagePoints_temp.at(i).x < srcImagePoints_temp.at(index).x)
+                index = i;
+        srcImagePoints->at(2) = srcImagePoints_temp.at(index);
+
+        // get point 3;
+        index = 0;
+        for (size_t i = 1 ; i<srcImagePoints_temp.size(); i++ )
+            if (srcImagePoints_temp.at(i).y < srcImagePoints_temp.at(index).y)
+                index = i;
+        srcImagePoints->at(3) = srcImagePoints_temp.at(index);
+
+        Mat Msource = source;
+        stringstream ss;
+        for (size_t i = 0; i<srcImagePoints_temp.size(); i++ )
+        {
+            ss << i;
+            circle(Msource, srcImagePoints->at(i), 5, Scalar(0, 0, 255));
+            putText(Msource, ss.str(), srcImagePoints->at(i), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255));
+            ss.str("");
+
+            // new coordinate system in the middle of the frame and reversed (camera coordinate system)
+            srcImagePoints->at(i) = cvPoint2D32f(srcImagePoints_temp.at(i).x - source.cols / 2,
+                                                 source.rows / 2 - srcImagePoints_temp.at(i).y);
+        }
+    }
+}
+
+static void createOpenGLMatrixFrom(float * posePOSIT, const CvMatr32f & rotationMatrix,
+                                   const CvVect32f & translationVector)
+{
+    // coordinate system returned is relative to the first 3D input point
+    for (int f = 0; f < 3; f++)
+        for (int c = 0; c < 3; c++)
+            posePOSIT[c * 4 + f] = rotationMatrix[f * 3 + c]; // transposed
+
+    posePOSIT[3] = translationVector[0];
+    posePOSIT[7] = translationVector[1];
+    posePOSIT[11] = translationVector[2];
+    posePOSIT[12] = 0.0f;
+    posePOSIT[13] = 0.0f;
+    posePOSIT[14] = 0.0f;
+    posePOSIT[15] = 1.0f;
+}
+
+int main(void)
+{
+    help();
+
+    string fileName = "cube4.avi";
+    VideoCapture video(fileName);
+    if (!video.isOpened())
+    {
+        cerr << "Video file " << fileName << " could not be opened" << endl;
+        return EXIT_FAILURE;
+    }
+
+    Mat source, grayImage;
+    video >> source;
+
+    namedWindow("Original", WINDOW_AUTOSIZE | WINDOW_FREERATIO);
+    namedWindow("POSIT", WINDOW_OPENGL | WINDOW_FREERATIO);
+    resizeWindow("POSIT", source.cols, source.rows);
+
+    displayOverlay("POSIT", "We lost the 4 corners' detection quite often (the red circles disappear).\n"
+                   "This demo is only to illustrate how to use OpenGL callback.\n"
+                   " -- Press ESC to exit.", 10000);
+
+    float OpenGLMatrix[] = { 0, 0, 0, 0,
+                             0, 0, 0, 0,
+                             0, 0, 0, 0,
+                             0, 0, 0, 0 };
+    setOpenGlContext("POSIT");
+    setOpenGlDrawCallback("POSIT", on_opengl, OpenGLMatrix);
+
+    vector<CvPoint3D32f> modelPoints;
+    initPOSIT(&modelPoints);
+
+    // Create the POSIT object with the model points
+    CvPOSITObject* positObject = cvCreatePOSITObject( &modelPoints[0], (int)modelPoints.size());
+
+    CvMatr32f rotation_matrix = new float[9];
+    CvVect32f translation_vector = new float[3];
+    CvTermCriteria criteria = cvTermCriteria(CV_TERMCRIT_EPS | CV_TERMCRIT_ITER, 100, 1e-4f);
+    vector<CvPoint2D32f> srcImagePoints(4, cvPoint2D32f(0, 0));
+
+    while (waitKey(33) != 27)
+    {
+        video >> source;
+        if (source.empty())
+            break;
+
+        imshow("Original", source);
+
+        foundCorners(&srcImagePoints, source, grayImage);
+        cvPOSIT(positObject, &srcImagePoints[0], FOCAL_LENGTH, criteria, rotation_matrix, translation_vector);
+        createOpenGLMatrixFrom(OpenGLMatrix, rotation_matrix, translation_vector);
+
+        updateWindow("POSIT");
+
+        if (video.get(CAP_PROP_POS_AVI_RATIO) > 0.99)
+            video.set(CAP_PROP_POS_AVI_RATIO, 0);
+    }
+
+    setOpenGlDrawCallback("POSIT", NULL, NULL);
+    destroyAllWindows();
+    cvReleasePOSITObject(&positObject);
+
+    delete[]rotation_matrix;
+    delete[]translation_vector;
+
+    return EXIT_SUCCESS;
+}
diff --git a/samples/cpp/calibration_artificial.cpp b/samples/cpp/calibration_artificial.cpp
index 3023127b7..478cd05c4 100644
--- a/samples/cpp/calibration_artificial.cpp
+++ b/samples/cpp/calibration_artificial.cpp
@@ -46,7 +46,7 @@ private:
     Point3f generateChessBoardCenter(const Mat& camMat, const Size& imgSize) const;
     Mat rvec, tvec;
 };
-};
+}
 
 
 
diff --git a/samples/cpp/freak_demo.cpp b/samples/cpp/freak_demo.cpp
index e6429ae83..140b28f90 100644
--- a/samples/cpp/freak_demo.cpp
+++ b/samples/cpp/freak_demo.cpp
@@ -87,7 +87,7 @@ int main( int argc, char** argv ) {
 
     // DESCRIPTOR
     // Our proposed FREAK descriptor
-    // (roation invariance, scale invariance, pattern radius corresponding to SMALLEST_KP_SIZE,
+    // (rotation invariance, scale invariance, pattern radius corresponding to SMALLEST_KP_SIZE,
     // number of octaves, optional vector containing the selected pairs)
     // FREAK extractor(true, true, 22, 4, std::vector<int>());
     FREAK extractor;
diff --git a/samples/cpp/intelperc_capture.cpp b/samples/cpp/intelperc_capture.cpp
new file mode 100644
index 000000000..40349e0fb
--- /dev/null
+++ b/samples/cpp/intelperc_capture.cpp
@@ -0,0 +1,376 @@
+// testOpenCVCam.cpp : Defines the entry point for the console application.
+//
+
+#include "opencv2/highgui/highgui.hpp"
+
+#include <iostream>
+
+using namespace cv;
+using namespace std;
+
+static bool g_printStreamSetting        = false;
+static int g_imageStreamProfileIdx      = -1;
+static int g_depthStreamProfileIdx      = -1;
+static bool g_irStreamShow              = false;
+static double g_imageBrightness         = -DBL_MAX;
+static double g_imageContrast           = -DBL_MAX;
+static bool g_printTiming               = false;
+static bool g_showClosedPoint           = false;
+
+
+static int g_closedDepthPoint[2];
+
+static void printUsage(const char *arg0)
+{
+    const char *filename = arg0;
+    while (*filename)
+        filename++;
+    while ((arg0 <= filename) && ('\\' != *filename) && ('/' != *filename))
+        filename--;
+    filename++;
+
+    cout << "This program demonstrates usage of camera supported\nby Intel Perceptual computing SDK." << endl << endl;
+    cout << "usage: " << filename << "[-ps] [-isp IDX] [-dsp IDX]\n [-ir] [-imb VAL] [-imc VAL]" << endl << endl;
+    cout << "   -ps,            print streams setting and profiles" << endl;
+    cout << "   -isp IDX,       set profile index of the image stream" << endl;
+    cout << "   -dsp IDX,       set profile index of the depth stream" << endl;
+    cout << "   -ir,            show data from IR stream" << endl;
+    cout << "   -imb VAL,       set brighness value for a image stream" << endl;
+    cout << "   -imc VAL,       set contrast value for a image stream" << endl;
+    cout << "   -pts,           print frame index and frame time" << endl;
+    cout << "   --show-closed,  print frame index and frame time" << endl;
+    cout <<  endl;
+}
+
+static void parseCMDLine(int argc, char* argv[])
+{
+    if( argc == 1 )
+    {
+        printUsage(argv[0]);
+    }
+    else
+    {
+        for( int i = 1; i < argc; i++ )
+        {
+            if ((0 == strcmp(argv[i], "--help")) || (0 == strcmp( argv[i], "-h")))
+            {
+                printUsage(argv[0]);
+                exit(0);
+            }
+            else if ((0 == strcmp( argv[i], "--print-streams")) || (0 == strcmp( argv[i], "-ps")))
+            {
+                g_printStreamSetting = true;
+            }
+            else if ((0 == strcmp( argv[i], "--image-stream-prof")) || (0 == strcmp( argv[i], "-isp")))
+            {
+                g_imageStreamProfileIdx = atoi(argv[++i]);
+            }
+            else if ((0 == strcmp( argv[i], "--depth-stream-prof")) || (0 == strcmp( argv[i], "-dsp")))
+            {
+                g_depthStreamProfileIdx = atoi(argv[++i]);
+            }
+            else if (0 == strcmp( argv[i], "-ir"))
+            {
+                g_irStreamShow = true;
+            }
+            else if (0 == strcmp( argv[i], "-imb"))
+            {
+                g_imageBrightness = atof(argv[++i]);
+            }
+            else if (0 == strcmp( argv[i], "-imc"))
+            {
+                g_imageContrast = atof(argv[++i]);
+            }
+            else if (0 == strcmp(argv[i], "-pts"))
+            {
+                g_printTiming = true;
+            }
+            else if (0 == strcmp(argv[i], "--show-closed"))
+            {
+                g_showClosedPoint = true;
+            }
+            else
+            {
+                cout << "Unsupported command line argument: " << argv[i] << "." << endl;
+                exit(-1);
+            }
+        }
+        if (g_showClosedPoint && (-1 == g_depthStreamProfileIdx))
+        {
+            cerr << "For --show-closed depth profile has be selected" << endl;
+            exit(-1);
+        }
+    }
+}
+
+static void printStreamProperties(VideoCapture &capture)
+{
+    size_t profilesCount = (size_t)capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_INTELPERC_PROFILE_COUNT);
+    cout << "Image stream." << endl;
+    cout << "  Brightness = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BRIGHTNESS) << endl;
+    cout << "  Contrast = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_CONTRAST) << endl;
+    cout << "  Saturation = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_SATURATION) << endl;
+    cout << "  Hue = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_HUE) << endl;
+    cout << "  Gamma = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_GAMMA) << endl;
+    cout << "  Sharpness = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_SHARPNESS) << endl;
+    cout << "  Gain = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_GAIN) << endl;
+    cout << "  Backligh = " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BACKLIGHT) << endl;
+    cout << "Image streams profiles:" << endl;
+    for (size_t i = 0; i < profilesCount; i++)
+    {
+        capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)i);
+        cout << "  Profile[" << i << "]: ";
+        cout << "width = " <<
+            (int)capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_FRAME_WIDTH);
+        cout << ", height = " <<
+            (int)capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_FRAME_HEIGHT);
+        cout << ", fps = " <<
+            capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_FPS);
+        cout << endl;
+    }
+
+    profilesCount = (size_t)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_COUNT);
+    cout << "Depth stream." << endl;
+    cout << "  Low confidence value = " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE) << endl;
+    cout << "  Saturation value = " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE) << endl;
+    cout << "  Confidence threshold = " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD) << endl;
+    cout << "  Focal length = (" << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ) << ", "
+        << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT) << ")" << endl;
+    cout << "Depth streams profiles:" << endl;
+    for (size_t i = 0; i < profilesCount; i++)
+    {
+        capture.set(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)i);
+        cout << "  Profile[" << i << "]: ";
+        cout << "width = " <<
+            (int)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_FRAME_WIDTH);
+        cout << ", height = " <<
+            (int)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_FRAME_HEIGHT);
+        cout << ", fps = " <<
+            capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_FPS);
+        cout << endl;
+    }
+}
+
+static void imshowImage(const char *winname, Mat &image, VideoCapture &capture)
+{
+    if (g_showClosedPoint)
+    {
+        Mat uvMap;
+        if (capture.retrieve(uvMap, CAP_INTELPERC_UVDEPTH_MAP))
+        {
+            float *uvmap = (float *)uvMap.ptr() + 2 * (g_closedDepthPoint[0] * uvMap.cols + g_closedDepthPoint[1]);
+            int x = (int)((*uvmap) * image.cols); uvmap++;
+            int y = (int)((*uvmap) * image.rows);
+
+            if ((0 <= x) && (0 <= y))
+            {
+                static const int pointSize = 4;
+                for (int row = y; row < min(y + pointSize, image.rows); row++)
+                {
+                    uchar* ptrDst = image.ptr(row) + x * 3 + 2;//+2 -> Red
+                    for (int col = 0; col < min(pointSize, image.cols - x); col++, ptrDst+=3)
+                    {
+                        *ptrDst = 255;
+                    }
+                }
+            }
+        }
+    }
+    imshow(winname, image);
+}
+static void imshowIR(const char *winname, Mat &ir)
+{
+    Mat image;
+    if (g_showClosedPoint)
+    {
+        image.create(ir.rows, ir.cols, CV_8UC3);
+        for (int row = 0; row < ir.rows; row++)
+        {
+            uchar* ptrDst = image.ptr(row);
+            short* ptrSrc = (short*)ir.ptr(row);
+            for (int col = 0; col < ir.cols; col++, ptrSrc++)
+            {
+                uchar val = (uchar) ((*ptrSrc) >> 2);
+                *ptrDst = val;  ptrDst++;
+                *ptrDst = val;  ptrDst++;
+                *ptrDst = val;  ptrDst++;
+            }
+        }
+
+        static const int pointSize = 4;
+        for (int row = g_closedDepthPoint[0]; row < min(g_closedDepthPoint[0] + pointSize, image.rows); row++)
+        {
+            uchar* ptrDst = image.ptr(row) + g_closedDepthPoint[1] * 3 + 2;//+2 -> Red
+            for (int col = 0; col < min(pointSize, image.cols - g_closedDepthPoint[1]); col++, ptrDst+=3)
+            {
+                *ptrDst = 255;
+            }
+        }
+    }
+    else
+    {
+        image.create(ir.rows, ir.cols, CV_8UC1);
+        for (int row = 0; row < ir.rows; row++)
+        {
+            uchar* ptrDst = image.ptr(row);
+            short* ptrSrc = (short*)ir.ptr(row);
+            for (int col = 0; col < ir.cols; col++, ptrSrc++, ptrDst++)
+            {
+                *ptrDst = (uchar) ((*ptrSrc) >> 2);
+            }
+        }
+    }
+
+    imshow(winname, image);
+}
+static void imshowDepth(const char *winname, Mat &depth, VideoCapture &capture)
+{
+    short lowValue = (short)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE);
+    short saturationValue = (short)capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE);
+
+    Mat image;
+    if (g_showClosedPoint)
+    {
+        image.create(depth.rows, depth.cols, CV_8UC3);
+        for (int row = 0; row < depth.rows; row++)
+        {
+            uchar* ptrDst = image.ptr(row);
+            short* ptrSrc = (short*)depth.ptr(row);
+            for (int col = 0; col < depth.cols; col++, ptrSrc++)
+            {
+                if ((lowValue == (*ptrSrc)) || (saturationValue == (*ptrSrc)))
+                {
+                    *ptrDst = 0; ptrDst++;
+                    *ptrDst = 0; ptrDst++;
+                    *ptrDst = 0; ptrDst++;
+                }
+                else
+                {
+                    uchar val = (uchar) ((*ptrSrc) >> 2);
+                    *ptrDst = val;  ptrDst++;
+                    *ptrDst = val;  ptrDst++;
+                    *ptrDst = val;  ptrDst++;
+                }
+            }
+        }
+
+        static const int pointSize = 4;
+        for (int row = g_closedDepthPoint[0]; row < min(g_closedDepthPoint[0] + pointSize, image.rows); row++)
+        {
+            uchar* ptrDst = image.ptr(row) + g_closedDepthPoint[1] * 3 + 2;//+2 -> Red
+            for (int col = 0; col < min(pointSize, image.cols - g_closedDepthPoint[1]); col++, ptrDst+=3)
+            {
+                *ptrDst = 255;
+            }
+        }
+    }
+    else
+    {
+        image.create(depth.rows, depth.cols, CV_8UC1);
+        for (int row = 0; row < depth.rows; row++)
+        {
+            uchar* ptrDst = image.ptr(row);
+            short* ptrSrc = (short*)depth.ptr(row);
+            for (int col = 0; col < depth.cols; col++, ptrSrc++, ptrDst++)
+            {
+                if ((lowValue == (*ptrSrc)) || (saturationValue == (*ptrSrc)))
+                    *ptrDst = 0;
+                else
+                    *ptrDst = (uchar) ((*ptrSrc) >> 2);
+            }
+        }
+    }
+    imshow(winname, image);
+}
+
+int main(int argc, char* argv[])
+{
+    parseCMDLine(argc, argv);
+
+    VideoCapture capture;
+    capture.open(CAP_INTELPERC);
+    if (!capture.isOpened())
+    {
+        cerr << "Can not open a capture object." << endl;
+        return -1;
+    }
+
+    if (g_printStreamSetting)
+        printStreamProperties(capture);
+
+    if (-1 != g_imageStreamProfileIdx)
+    {
+        if (!capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_imageStreamProfileIdx))
+        {
+            cerr << "Can not setup a image stream." << endl;
+            return -1;
+        }
+    }
+    if (-1 != g_depthStreamProfileIdx)
+    {
+        if (!capture.set(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_depthStreamProfileIdx))
+        {
+            cerr << "Can not setup a depth stream." << endl;
+            return -1;
+        }
+    }
+    else if (g_irStreamShow)
+    {
+        if (!capture.set(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_INTELPERC_PROFILE_IDX, 0.0))
+        {
+            cerr << "Can not setup a IR stream." << endl;
+            return -1;
+        }
+    }
+    else
+    {
+        cout << "Streams not selected" << endl;
+        return 0;
+    }
+
+    //Setup additional properies only after set profile of the stream
+    if ( (-10000.0 < g_imageBrightness) && (g_imageBrightness < 10000.0))
+        capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BRIGHTNESS, g_imageBrightness);
+    if ( (0 < g_imageContrast) && (g_imageContrast < 10000.0))
+        capture.set(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_BRIGHTNESS, g_imageContrast);
+
+    int frame = 0;
+    for(;;frame++)
+    {
+        Mat bgrImage;
+        Mat depthImage;
+        Mat irImage;
+
+        if (!capture.grab())
+        {
+            cout << "Can not grab images." << endl;
+            return -1;
+        }
+
+        if ((-1 != g_depthStreamProfileIdx) && (capture.retrieve(depthImage, CAP_INTELPERC_DEPTH_MAP)))
+        {
+            if (g_showClosedPoint)
+            {
+                double minVal = 0.0; double maxVal = 0.0;
+                minMaxIdx(depthImage, &minVal, &maxVal, g_closedDepthPoint);
+            }
+            imshowDepth("depth image", depthImage, capture);
+        }
+        if ((g_irStreamShow) && (capture.retrieve(irImage, CAP_INTELPERC_IR_MAP)))
+            imshowIR("ir image", irImage);
+        if ((-1 != g_imageStreamProfileIdx) && (capture.retrieve(bgrImage, CAP_INTELPERC_IMAGE)))
+            imshowImage("color image", bgrImage, capture);
+
+        if (g_printTiming)
+        {
+            cout << "Image frame: " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_POS_FRAMES)
+                 << ", Depth(IR) frame: " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_POS_FRAMES) << endl;
+            cout << "Image frame: " << capture.get(CAP_INTELPERC_IMAGE_GENERATOR | CAP_PROP_POS_MSEC)
+                 << ", Depth(IR) frame: " << capture.get(CAP_INTELPERC_DEPTH_GENERATOR | CAP_PROP_POS_MSEC) << endl;
+        }
+        if( waitKey(30) >= 0 )
+            break;
+    }
+
+    return 0;
+}
diff --git a/samples/cpp/kmeans.cpp b/samples/cpp/kmeans.cpp
index 0cc313f75..5a5140280 100644
--- a/samples/cpp/kmeans.cpp
+++ b/samples/cpp/kmeans.cpp
@@ -33,10 +33,10 @@ int main( int /*argc*/, char** /*argv*/ )
     {
         int k, clusterCount = rng.uniform(2, MAX_CLUSTERS+1);
         int i, sampleCount = rng.uniform(1, 1001);
-        Mat points(sampleCount, 1, CV_32FC2), labels;
+        Mat points(sampleCount, 2, CV_32F), labels;
 
         clusterCount = MIN(clusterCount, sampleCount);
-        Mat centers(clusterCount, 1, points.type());
+        Mat centers;
 
         /* generate random sample from multigaussian distribution */
         for( k = 0; k < clusterCount; k++ )
diff --git a/samples/cpp/retinaDemo.cpp b/samples/cpp/retinaDemo.cpp
deleted file mode 100644
index 790e75383..000000000
--- a/samples/cpp/retinaDemo.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-//============================================================================
-// Name        : retinademo.cpp
-// Author      : Alexandre Benoit, benoit.alexandre.vision@gmail.com
-// Version     : 0.1
-// Copyright   : LISTIC/GIPSA French Labs, july 2011
-// Description : Gipsa/LISTIC Labs retina demo in C++, Ansi-style
-//============================================================================
-
-#include <iostream>
-#include <cstring>
-
-#include "opencv2/bioinspired.hpp"
-#include "opencv2/highgui.hpp"
-
-static void help(std::string errorMessage)
-{
-    std::cout<<"Program init error : "<<errorMessage<<std::endl;
-    std::cout<<"\nProgram call procedure : retinaDemo [processing mode] [Optional : media target] [Optional LAST parameter: \"log\" to activate retina log sampling]"<<std::endl;
-    std::cout<<"\t[processing mode] :"<<std::endl;
-    std::cout<<"\t -image : for still image processing"<<std::endl;
-    std::cout<<"\t -video : for video stream processing"<<std::endl;
-    std::cout<<"\t[Optional : media target] :"<<std::endl;
-    std::cout<<"\t if processing an image or video file, then, specify the path and filename of the target to process"<<std::endl;
-    std::cout<<"\t leave empty if processing video stream coming from a connected video device"<<std::endl;
-    std::cout<<"\t[Optional : activate retina log sampling] : an optional last parameter can be specified for retina spatial log sampling"<<std::endl;
-    std::cout<<"\t set \"log\" without quotes to activate this sampling, output frame size will be divided by 4"<<std::endl;
-    std::cout<<"\nExamples:"<<std::endl;
-    std::cout<<"\t-Image processing : ./retinaDemo -image lena.jpg"<<std::endl;
-    std::cout<<"\t-Image processing with log sampling : ./retinaDemo -image lena.jpg log"<<std::endl;
-    std::cout<<"\t-Video processing : ./retinaDemo -video myMovie.mp4"<<std::endl;
-    std::cout<<"\t-Live video processing : ./retinaDemo -video"<<std::endl;
-    std::cout<<"\nPlease start again with new parameters"<<std::endl;
-}
-
-int main(int argc, char* argv[]) {
-    // welcome message
-    std::cout<<"****************************************************"<<std::endl;
-    std::cout<<"* Retina demonstration : demonstrates the use of is a wrapper class of the Gipsa/Listic Labs retina model."<<std::endl;
-    std::cout<<"* This retina model allows spatio-temporal image processing (applied on still images, video sequences)."<<std::endl;
-    std::cout<<"* As a summary, these are the retina model properties:"<<std::endl;
-    std::cout<<"* => It applies a spectral whithening (mid-frequency details enhancement)"<<std::endl;
-    std::cout<<"* => high frequency spatio-temporal noise reduction"<<std::endl;
-    std::cout<<"* => low frequency luminance to be reduced (luminance range compression)"<<std::endl;
-    std::cout<<"* => local logarithmic luminance compression allows details to be enhanced in low light conditions\n"<<std::endl;
-    std::cout<<"* for more information, reer to the following papers :"<<std::endl;
-    std::cout<<"* Benoit A., Caplier A., Durette B., Herault, J., \"USING HUMAN VISUAL SYSTEM MODELING FOR BIO-INSPIRED LOW LEVEL IMAGE PROCESSING\", Elsevier, Computer Vision and Image Understanding 114 (2010), pp. 758-773, DOI: http://dx.doi.org/10.1016/j.cviu.2010.01.011"<<std::endl;
-    std::cout<<"* Vision: Images, Signals and Neural Networks: Models of Neural Processing in Visual Perception (Progress in Neural Processing),By: Jeanny Herault, ISBN: 9814273686. WAPI (Tower ID): 113266891."<<std::endl;
-    std::cout<<"* => reports comments/remarks at benoit.alexandre.vision@gmail.com"<<std::endl;
-    std::cout<<"* => more informations and papers at : http://sites.google.com/site/benoitalexandrevision/"<<std::endl;
-    std::cout<<"****************************************************"<<std::endl;
-    std::cout<<" NOTE : this program generates the default retina parameters file 'RetinaDefaultParameters.xml'"<<std::endl;
-    std::cout<<" => you can use this to fine tune parameters and load them if you save to file 'RetinaSpecificParameters.xml'"<<std::endl;
-
-    // basic input arguments checking
-    if (argc<2)
-    {
-        help("bad number of parameter");
-        return -1;
-    }
-
-    bool useLogSampling = !strcmp(argv[argc-1], "log"); // check if user wants retina log sampling processing
-
-    std::string inputMediaType=argv[1];
-
-    // declare the retina input buffer... that will be fed differently in regard of the input media
-    cv::Mat inputFrame;
-    cv::VideoCapture videoCapture; // in case a video media is used, its manager is declared here
-
-    //////////////////////////////////////////////////////////////////////////////
-    // checking input media type (still image, video file, live video acquisition)
-    if (!strcmp(inputMediaType.c_str(), "-image") && argc >= 3)
-    {
-        std::cout<<"RetinaDemo: processing image "<<argv[2]<<std::endl;
-        // image processing case
-        inputFrame = cv::imread(std::string(argv[2]), 1); // load image in RGB mode
-    }else
-        if (!strcmp(inputMediaType.c_str(), "-video"))
-        {
-            if (argc == 2 || (argc == 3 && useLogSampling)) // attempt to grab images from a video capture device
-            {
-                videoCapture.open(0);
-            }else// attempt to grab images from a video filestream
-            {
-                std::cout<<"RetinaDemo: processing video stream "<<argv[2]<<std::endl;
-                videoCapture.open(argv[2]);
-            }
-
-            // grab a first frame to check if everything is ok
-            videoCapture>>inputFrame;
-        }else
-        {
-            // bad command parameter
-            help("bad command parameter");
-            return -1;
-        }
-
-    if (inputFrame.empty())
-    {
-        help("Input media could not be loaded, aborting");
-        return -1;
-    }
-
-
-    //////////////////////////////////////////////////////////////////////////////
-    // Program start in a try/catch safety context (Retina may throw errors)
-    try
-    {
-        // create a retina instance with default parameters setup, uncomment the initialisation you wanna test
-        cv::Ptr<cv::bioinspired::Retina> myRetina;
-
-        // if the last parameter is 'log', then activate log sampling (favour foveal vision and subsamples peripheral vision)
-        if (useLogSampling)
-                {
-                        myRetina = cv::bioinspired::createRetina(inputFrame.size(), true, cv::bioinspired::RETINA_COLOR_BAYER, true, 2.0, 10.0);
-                }
-        else// -> else allocate "classical" retina :
-            myRetina = cv::bioinspired::createRetina(inputFrame.size());
-
-        // save default retina parameters file in order to let you see this and maybe modify it and reload using method "setup"
-        myRetina->write("RetinaDefaultParameters.xml");
-
-        // load parameters if file exists
-        myRetina->setup("RetinaSpecificParameters.xml");
-        myRetina->clearBuffers();
-
-        // declare retina output buffers
-        cv::Mat retinaOutput_parvo;
-        cv::Mat retinaOutput_magno;
-
-        // processing loop with stop condition
-        bool continueProcessing=true; // FIXME : not yet managed during process...
-        while(continueProcessing)
-        {
-            // if using video stream, then, grabbing a new frame, else, input remains the same
-            if (videoCapture.isOpened())
-                videoCapture>>inputFrame;
-
-            // run retina filter
-            myRetina->run(inputFrame);
-            // Retrieve and display retina output
-            myRetina->getParvo(retinaOutput_parvo);
-            myRetina->getMagno(retinaOutput_magno);
-            cv::imshow("retina input", inputFrame);
-            cv::imshow("Retina Parvo", retinaOutput_parvo);
-            cv::imshow("Retina Magno", retinaOutput_magno);
-
-            cv::waitKey(5);
-        }
-    }catch(cv::Exception e)
-    {
-        std::cerr<<"Error using Retina : "<<e.what()<<std::endl;
-    }
-
-    // Program end message
-    std::cout<<"Retina demo end"<<std::endl;
-
-    return 0;
-}
diff --git a/samples/cpp/stereo_calib.cpp b/samples/cpp/stereo_calib.cpp
index 367df4bfe..a989e2a01 100644
--- a/samples/cpp/stereo_calib.cpp
+++ b/samples/cpp/stereo_calib.cpp
@@ -172,12 +172,12 @@ StereoCalib(const vector<string>& imagelist, Size boardSize, bool useCalibrated=
                     cameraMatrix[0], distCoeffs[0],
                     cameraMatrix[1], distCoeffs[1],
                     imageSize, R, T, E, F,
-                    TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 100, 1e-5),
                     CALIB_FIX_ASPECT_RATIO +
                     CALIB_ZERO_TANGENT_DIST +
                     CALIB_SAME_FOCAL_LENGTH +
                     CALIB_RATIONAL_MODEL +
-                    CALIB_FIX_K3 + CALIB_FIX_K4 + CALIB_FIX_K5);
+                    CALIB_FIX_K3 + CALIB_FIX_K4 + CALIB_FIX_K5,
+                    TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 100, 1e-5) );
     cout << "done with RMS error=" << rms << endl;
 
 // CALIBRATION QUALITY CHECK
diff --git a/samples/cpp/stitching_detailed.cpp b/samples/cpp/stitching_detailed.cpp
index b6eefc6f9..5eb3df46c 100644
--- a/samples/cpp/stitching_detailed.cpp
+++ b/samples/cpp/stitching_detailed.cpp
@@ -71,8 +71,11 @@ static void printUsage()
         "  --preview\n"
         "      Run stitching in the preview mode. Works faster than usual mode,\n"
         "      but output image will have lower resolution.\n"
-        "  --try_gpu (yes|no)\n"
-        "      Try to use GPU. The default value is 'no'. All default values\n"
+        "  --try_cuda (yes|no)\n"
+        "      Try to use CUDA. The default value is 'no'. All default values\n"
+        "      are for CPU mode.\n"
+        "  --try_ocl (yes|no)\n"
+        "      Try to use OpenCL. The default value is 'no'. All default values\n"
         "      are for CPU mode.\n"
         "\nMotion Estimation Flags:\n"
         "  --work_megapix <float>\n"
@@ -123,7 +126,8 @@ static void printUsage()
 // Default command line args
 vector<String> img_names;
 bool preview = false;
-bool try_gpu = false;
+bool try_cuda = false;
+bool try_ocl = false;
 double work_megapix = 0.6;
 double seam_megapix = 0.1;
 double compose_megapix = -1;
@@ -161,15 +165,28 @@ static int parseCmdArgs(int argc, char** argv)
         {
             preview = true;
         }
-        else if (string(argv[i]) == "--try_gpu")
+        else if (string(argv[i]) == "--try_cuda")
         {
             if (string(argv[i + 1]) == "no")
-                try_gpu = false;
+                try_cuda = false;
             else if (string(argv[i + 1]) == "yes")
-                try_gpu = true;
+                try_cuda = true;
             else
             {
-                cout << "Bad --try_gpu flag value\n";
+                cout << "Bad --try_cuda flag value\n";
+                return -1;
+            }
+            i++;
+        }
+        else if (string(argv[i]) == "--try_ocl")
+        {
+            if (string(argv[i + 1]) == "no")
+                try_ocl = false;
+            else if (string(argv[i + 1]) == "yes")
+                try_ocl = true;
+            else
+            {
+                cout << "Bad --try_ocl flag value\n";
                 return -1;
             }
             i++;
@@ -357,7 +374,7 @@ int main(int argc, char* argv[])
     if (features_type == "surf")
     {
 #ifdef HAVE_OPENCV_NONFREE
-        if (try_gpu && cuda::getCudaEnabledDeviceCount() > 0)
+        if (try_cuda && cuda::getCudaEnabledDeviceCount() > 0)
             finder = makePtr<SurfFeaturesFinderGpu>();
         else
 #endif
@@ -430,7 +447,7 @@ int main(int argc, char* argv[])
     t = getTickCount();
 #endif
     vector<MatchesInfo> pairwise_matches;
-    BestOf2NearestMatcher matcher(try_gpu, match_conf);
+    BestOf2NearestMatcher matcher(try_cuda, match_conf);
     matcher(features, pairwise_matches);
     matcher.collectGarbage();
     LOGLN("Pairwise matching, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec");
@@ -552,8 +569,17 @@ int main(int argc, char* argv[])
     // Warp images and their masks
 
     Ptr<WarperCreator> warper_creator;
+    if (try_ocl)
+    {
+        if (warp_type == "plane")
+            warper_creator = makePtr<cv::PlaneWarperOcl>();
+        else if (warp_type == "cylindrical")
+            warper_creator = makePtr<cv::CylindricalWarperOcl>();
+        else if (warp_type == "spherical")
+            warper_creator = makePtr<cv::SphericalWarperOcl>();
+    }
 #ifdef HAVE_OPENCV_CUDAWARPING
-    if (try_gpu && cuda::getCudaEnabledDeviceCount() > 0)
+    else if (try_cuda && cuda::getCudaEnabledDeviceCount() > 0)
     {
         if (warp_type == "plane")
             warper_creator = makePtr<cv::PlaneWarperGpu>();
@@ -636,7 +662,7 @@ int main(int argc, char* argv[])
     else if (seam_find_type == "gc_color")
     {
 #ifdef HAVE_OPENCV_CUDA
-        if (try_gpu && cuda::getCudaEnabledDeviceCount() > 0)
+        if (try_cuda && cuda::getCudaEnabledDeviceCount() > 0)
             seam_finder = makePtr<detail::GraphCutSeamFinderGpu>(GraphCutSeamFinderBase::COST_COLOR);
         else
 #endif
@@ -645,7 +671,7 @@ int main(int argc, char* argv[])
     else if (seam_find_type == "gc_colorgrad")
     {
 #ifdef HAVE_OPENCV_CUDA
-        if (try_gpu && cuda::getCudaEnabledDeviceCount() > 0)
+        if (try_cuda && cuda::getCudaEnabledDeviceCount() > 0)
             seam_finder = makePtr<detail::GraphCutSeamFinderGpu>(GraphCutSeamFinderBase::COST_COLOR_GRAD);
         else
 #endif
@@ -755,11 +781,11 @@ int main(int argc, char* argv[])
 
         if (!blender)
         {
-            blender = Blender::createDefault(blend_type, try_gpu);
+            blender = Blender::createDefault(blend_type, try_cuda);
             Size dst_sz = resultRoi(corners, sizes).size();
             float blend_width = sqrt(static_cast<float>(dst_sz.area())) * blend_strength / 100.f;
             if (blend_width < 1.f)
-                blender = Blender::createDefault(Blender::NO, try_gpu);
+                blender = Blender::createDefault(Blender::NO, try_cuda);
             else if (blend_type == Blender::MULTI_BAND)
             {
                 MultiBandBlender* mb = dynamic_cast<MultiBandBlender*>(blender.get());
diff --git a/samples/cpp/train_HOG.cpp b/samples/cpp/train_HOG.cpp
new file mode 100644
index 000000000..e3ee190fc
--- /dev/null
+++ b/samples/cpp/train_HOG.cpp
@@ -0,0 +1,461 @@
+#include <opencv2/opencv.hpp>
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include <time.h>
+
+using namespace cv;
+using namespace std;
+
+void get_svm_detector(const SVM& svm, vector< float > & hog_detector );
+void convert_to_ml(const std::vector< cv::Mat > & train_samples, cv::Mat& trainData );
+void load_images( const string & prefix, const string & filename, vector< Mat > & img_lst );
+void sample_neg( const vector< Mat > & full_neg_lst, vector< Mat > & neg_lst, const Size & size );
+Mat get_hogdescriptor_visu(const Mat& color_origImg, vector<float>& descriptorValues, const Size & size );
+void compute_hog( const vector< Mat > & img_lst, vector< Mat > & gradient_lst, const Size & size );
+void train_svm( const vector< Mat > & gradient_lst, const vector< int > & labels );
+void draw_locations( Mat & img, const vector< Rect > & locations, const Scalar & color );
+void test_it( const Size & size );
+
+void get_svm_detector(const SVM& svm, vector< float > & hog_detector )
+{
+    // get the number of variables
+    const int var_all = svm.get_var_count();
+    // get the number of support vectors
+    const int sv_total = svm.get_support_vector_count();
+    // get the decision function
+    const CvSVMDecisionFunc* decision_func = svm.get_decision_function();
+    // get the support vectors
+    const float** sv = new const float*[ sv_total ];
+    for( int i = 0 ; i < sv_total ; ++i )
+        sv[ i ] = svm.get_support_vector(i);
+
+    CV_Assert( var_all > 0 &&
+        sv_total > 0 &&
+        decision_func != 0 &&
+        decision_func->alpha != 0 &&
+        decision_func->sv_count == sv_total );
+
+    float svi = 0.f;
+
+    hog_detector.clear(); //clear stuff in vector.
+    hog_detector.reserve( var_all + 1 ); //reserve place for memory efficiency.
+
+     /**
+    * hog_detector^i = \sum_j support_vector_j^i * \alpha_j
+    * hog_detector^dim = -\rho
+    */
+   for( int i = 0 ; i < var_all ; ++i )
+    {
+        svi = 0.f;
+        for( int j = 0 ; j < sv_total ; ++j )
+        {
+            if( decision_func->sv_index != NULL ) // sometime the sv_index isn't store on YML/XML.
+                svi += (float)( sv[decision_func->sv_index[j]][i] * decision_func->alpha[ j ] );
+            else
+                svi += (float)( sv[j][i] * decision_func->alpha[ j ] );
+        }
+        hog_detector.push_back( svi );
+    }
+    hog_detector.push_back( (float)-decision_func->rho );
+
+    delete[] sv;
+}
+
+
+/*
+* Convert training/testing set to be used by OpenCV Machine Learning algorithms.
+* TrainData is a matrix of size (#samples x max(#cols,#rows) per samples), in 32FC1.
+* Transposition of samples are made if needed.
+*/
+void convert_to_ml(const std::vector< cv::Mat > & train_samples, cv::Mat& trainData )
+{
+    //--Convert data
+    const int rows = (int)train_samples.size();
+    const int cols = (int)std::max( train_samples[0].cols, train_samples[0].rows );
+    cv::Mat tmp(1, cols, CV_32FC1); //< used for transposition if needed
+    trainData = cv::Mat(rows, cols, CV_32FC1 );
+    vector< Mat >::const_iterator itr = train_samples.begin();
+    vector< Mat >::const_iterator end = train_samples.end();
+    for( int i = 0 ; itr != end ; ++itr, ++i )
+    {
+        CV_Assert( itr->cols == 1 ||
+            itr->rows == 1 );
+        if( itr->cols == 1 )
+        {
+            transpose( *(itr), tmp );
+            tmp.copyTo( trainData.row( i ) );
+        }
+        else if( itr->rows == 1 )
+        {
+            itr->copyTo( trainData.row( i ) );
+        }
+    }
+}
+
+void load_images( const string & prefix, const string & filename, vector< Mat > & img_lst )
+{
+    string line;
+    ifstream file;
+
+    file.open( (prefix+filename).c_str() );
+    if( !file.is_open() )
+    {
+        cerr << "Unable to open the list of images from " << filename << " filename." << endl;
+        exit( -1 );
+    }
+
+    bool end_of_parsing = false;
+    while( !end_of_parsing )
+    {
+        getline( file, line );
+        if( line == "" ) // no more file to read
+        {
+            end_of_parsing = true;
+            break;
+        }
+        Mat img = imread( (prefix+line).c_str() ); // load the image
+        if( !img.data ) // invalid image, just skip it.
+            continue;
+#ifdef _DEBUG
+        imshow( "image", img );
+        waitKey( 10 );
+#endif
+        img_lst.push_back( img.clone() );
+    }
+}
+
+void sample_neg( const vector< Mat > & full_neg_lst, vector< Mat > & neg_lst, const Size & size )
+{
+    Rect box;
+    box.width = size.width;
+    box.height = size.height;
+
+    const int size_x = box.width;
+    const int size_y = box.height;
+
+    srand( (unsigned int)time( NULL ) );
+
+    vector< Mat >::const_iterator img = full_neg_lst.begin();
+    vector< Mat >::const_iterator end = full_neg_lst.end();
+    for( ; img != end ; ++img )
+    {
+        box.x = rand() % (img->cols - size_x);
+        box.y = rand() % (img->rows - size_y);
+        Mat roi = (*img)(box);
+        neg_lst.push_back( roi.clone() );
+#ifdef _DEBUG
+        imshow( "img", roi.clone() );
+        waitKey( 10 );
+#endif
+    }
+}
+
+// From http://www.juergenwiki.de/work/wiki/doku.php?id=public:hog_descriptor_computation_and_visualization
+Mat get_hogdescriptor_visu(const Mat& color_origImg, vector<float>& descriptorValues, const Size & size )
+{
+    const int DIMX = size.width;
+    const int DIMY = size.height;
+    float zoomFac = 3;
+    Mat visu;
+    resize(color_origImg, visu, Size( (int)(color_origImg.cols*zoomFac), (int)(color_origImg.rows*zoomFac) ) );
+
+    int cellSize        = 8;
+    int gradientBinSize = 9;
+    float radRangeForOneBin = (float)(CV_PI/(float)gradientBinSize); // dividing 180� into 9 bins, how large (in rad) is one bin?
+
+    // prepare data structure: 9 orientation / gradient strenghts for each cell
+    int cells_in_x_dir = DIMX / cellSize;
+    int cells_in_y_dir = DIMY / cellSize;
+    float*** gradientStrengths = new float**[cells_in_y_dir];
+    int** cellUpdateCounter   = new int*[cells_in_y_dir];
+    for (int y=0; y<cells_in_y_dir; y++)
+    {
+        gradientStrengths[y] = new float*[cells_in_x_dir];
+        cellUpdateCounter[y] = new int[cells_in_x_dir];
+        for (int x=0; x<cells_in_x_dir; x++)
+        {
+            gradientStrengths[y][x] = new float[gradientBinSize];
+            cellUpdateCounter[y][x] = 0;
+
+            for (int bin=0; bin<gradientBinSize; bin++)
+                gradientStrengths[y][x][bin] = 0.0;
+        }
+    }
+
+    // nr of blocks = nr of cells - 1
+    // since there is a new block on each cell (overlapping blocks!) but the last one
+    int blocks_in_x_dir = cells_in_x_dir - 1;
+    int blocks_in_y_dir = cells_in_y_dir - 1;
+
+    // compute gradient strengths per cell
+    int descriptorDataIdx = 0;
+    int cellx = 0;
+    int celly = 0;
+
+    for (int blockx=0; blockx<blocks_in_x_dir; blockx++)
+    {
+        for (int blocky=0; blocky<blocks_in_y_dir; blocky++)
+        {
+            // 4 cells per block ...
+            for (int cellNr=0; cellNr<4; cellNr++)
+            {
+                // compute corresponding cell nr
+                cellx = blockx;
+                celly = blocky;
+                if (cellNr==1) celly++;
+                if (cellNr==2) cellx++;
+                if (cellNr==3)
+                {
+                    cellx++;
+                    celly++;
+                }
+
+                for (int bin=0; bin<gradientBinSize; bin++)
+                {
+                    float gradientStrength = descriptorValues[ descriptorDataIdx ];
+                    descriptorDataIdx++;
+
+                    gradientStrengths[celly][cellx][bin] += gradientStrength;
+
+                } // for (all bins)
+
+
+                // note: overlapping blocks lead to multiple updates of this sum!
+                // we therefore keep track how often a cell was updated,
+                // to compute average gradient strengths
+                cellUpdateCounter[celly][cellx]++;
+
+            } // for (all cells)
+
+
+        } // for (all block x pos)
+    } // for (all block y pos)
+
+
+    // compute average gradient strengths
+    for (celly=0; celly<cells_in_y_dir; celly++)
+    {
+        for (cellx=0; cellx<cells_in_x_dir; cellx++)
+        {
+
+            float NrUpdatesForThisCell = (float)cellUpdateCounter[celly][cellx];
+
+            // compute average gradient strenghts for each gradient bin direction
+            for (int bin=0; bin<gradientBinSize; bin++)
+            {
+                gradientStrengths[celly][cellx][bin] /= NrUpdatesForThisCell;
+            }
+        }
+    }
+
+    // draw cells
+    for (celly=0; celly<cells_in_y_dir; celly++)
+    {
+        for (cellx=0; cellx<cells_in_x_dir; cellx++)
+        {
+            int drawX = cellx * cellSize;
+            int drawY = celly * cellSize;
+
+            int mx = drawX + cellSize/2;
+            int my = drawY + cellSize/2;
+
+            rectangle(visu, Point((int)(drawX*zoomFac), (int)(drawY*zoomFac)), Point((int)((drawX+cellSize)*zoomFac), (int)((drawY+cellSize)*zoomFac)), CV_RGB(100,100,100), 1);
+
+            // draw in each cell all 9 gradient strengths
+            for (int bin=0; bin<gradientBinSize; bin++)
+            {
+                float currentGradStrength = gradientStrengths[celly][cellx][bin];
+
+                // no line to draw?
+                if (currentGradStrength==0)
+                    continue;
+
+                float currRad = bin * radRangeForOneBin + radRangeForOneBin/2;
+
+                float dirVecX = cos( currRad );
+                float dirVecY = sin( currRad );
+                float maxVecLen = (float)(cellSize/2.f);
+                float scale = 2.5; // just a visualization scale, to see the lines better
+
+                // compute line coordinates
+                float x1 = mx - dirVecX * currentGradStrength * maxVecLen * scale;
+                float y1 = my - dirVecY * currentGradStrength * maxVecLen * scale;
+                float x2 = mx + dirVecX * currentGradStrength * maxVecLen * scale;
+                float y2 = my + dirVecY * currentGradStrength * maxVecLen * scale;
+
+                // draw gradient visualization
+                line(visu, Point((int)(x1*zoomFac),(int)(y1*zoomFac)), Point((int)(x2*zoomFac),(int)(y2*zoomFac)), CV_RGB(0,255,0), 1);
+
+            } // for (all bins)
+
+        } // for (cellx)
+    } // for (celly)
+
+
+    // don't forget to free memory allocated by helper data structures!
+    for (int y=0; y<cells_in_y_dir; y++)
+    {
+        for (int x=0; x<cells_in_x_dir; x++)
+        {
+            delete[] gradientStrengths[y][x];
+        }
+        delete[] gradientStrengths[y];
+        delete[] cellUpdateCounter[y];
+    }
+    delete[] gradientStrengths;
+    delete[] cellUpdateCounter;
+
+    return visu;
+
+} // get_hogdescriptor_visu
+
+void compute_hog( const vector< Mat > & img_lst, vector< Mat > & gradient_lst, const Size & size )
+{
+    HOGDescriptor hog;
+    hog.winSize = size;
+    Mat gray;
+    vector< Point > location;
+    vector< float > descriptors;
+
+    vector< Mat >::const_iterator img = img_lst.begin();
+    vector< Mat >::const_iterator end = img_lst.end();
+    for( ; img != end ; ++img )
+    {
+        cvtColor( *img, gray, COLOR_BGR2GRAY );
+        hog.compute( gray, descriptors, Size( 8, 8 ), Size( 0, 0 ), location );
+        gradient_lst.push_back( Mat( descriptors ).clone() );
+#ifdef _DEBUG
+        imshow( "gradient", get_hogdescriptor_visu( img->clone(), descriptors, size ) );
+        waitKey( 10 );
+#endif
+    }
+}
+
+void train_svm( const vector< Mat > & gradient_lst, const vector< int > & labels )
+{
+    SVM svm;
+
+    /* Default values to train SVM */
+    SVMParams params;
+    params.coef0 = 0.0;
+    params.degree = 3;
+    params.term_crit.epsilon = 1e-3;
+    params.gamma = 0;
+    params.kernel_type = SVM::LINEAR;
+    params.nu = 0.5;
+    params.p = 0.1; // for EPSILON_SVR, epsilon in loss function?
+    params.C = 0.01; // From paper, soft classifier
+    params.svm_type = SVM::EPS_SVR; // C_SVC; // EPSILON_SVR; // may be also NU_SVR; // do regression task
+
+    Mat train_data;
+    convert_to_ml( gradient_lst, train_data );
+
+    clog << "Start training...";
+    svm.train( train_data, Mat( labels ), Mat(), Mat(), params );
+    clog << "...[done]" << endl;
+
+    svm.save( "my_people_detector.yml" );
+}
+
+void draw_locations( Mat & img, const vector< Rect > & locations, const Scalar & color )
+{
+    if( !locations.empty() )
+    {
+        vector< Rect >::const_iterator loc = locations.begin();
+        vector< Rect >::const_iterator end = locations.end();
+        for( ; loc != end ; ++loc )
+        {
+            rectangle( img, *loc, color, 2 );
+        }
+    }
+}
+
+void test_it( const Size & size )
+{
+    char key = 27;
+    Scalar reference( 0, 255, 0 );
+    Scalar trained( 0, 0, 255 );
+    Mat img, draw;
+    SVM svm;
+    HOGDescriptor hog;
+    HOGDescriptor my_hog;
+    my_hog.winSize = size;
+    VideoCapture video;
+    vector< Rect > locations;
+
+    // Load the trained SVM.
+    svm.load( "my_people_detector.yml" );
+    // Set the trained svm to my_hog
+    vector< float > hog_detector;
+    get_svm_detector( svm, hog_detector );
+    my_hog.setSVMDetector( hog_detector );
+    // Set the people detector.
+    hog.setSVMDetector( hog.getDefaultPeopleDetector() );
+    // Open the camera.
+    video.open(0);
+    if( !video.isOpened() )
+    {
+        cerr << "Unable to open the device 0" << endl;
+        exit( -1 );
+    }
+
+    bool end_of_process = false;
+    while( !end_of_process )
+    {
+        video >> img;
+        if( !img.data )
+            break;
+
+        draw = img.clone();
+
+        locations.clear();
+        hog.detectMultiScale( img, locations );
+        draw_locations( draw, locations, reference );
+
+        locations.clear();
+        my_hog.detectMultiScale( img, locations );
+        draw_locations( draw, locations, trained );
+
+        imshow( "Video", draw );
+        key = (char)waitKey( 10 );
+        if( 27 == key )
+            end_of_process = true;
+    }
+}
+
+int main( int argc, char** argv )
+{
+    if( argc != 4 )
+    {
+        cout << "Wrong number of parameters." << endl
+            << "Usage: " << argv[0] << " pos_dir pos.lst neg_dir neg.lst" << endl
+            << "example: " << argv[0] << " /INRIA_dataset/ Train/pos.lst /INRIA_dataset/ Train/neg.lst" << endl;
+        exit( -1 );
+    }
+    vector< Mat > pos_lst;
+    vector< Mat > full_neg_lst;
+    vector< Mat > neg_lst;
+    vector< Mat > gradient_lst;
+    vector< int > labels;
+
+    load_images( argv[1], argv[2], pos_lst );
+    labels.assign( pos_lst.size(), +1 );
+    const unsigned int old = (unsigned int)labels.size();
+    load_images( argv[3], argv[4], full_neg_lst );
+    sample_neg( full_neg_lst, neg_lst, Size( 96,160 ) );
+    labels.insert( labels.end(), neg_lst.size(), -1 );
+    CV_Assert( old < labels.size() );
+
+    compute_hog( pos_lst, gradient_lst, Size( 96, 160 ) );
+    compute_hog( neg_lst, gradient_lst, Size( 96, 160 ) );
+
+    train_svm( gradient_lst, labels );
+
+    test_it( Size( 96, 160 ) ); // change with your parameters
+
+    return 0;
+}
diff --git a/samples/cpp/tutorial_code/ImgProc/Threshold.cpp b/samples/cpp/tutorial_code/ImgProc/Threshold.cpp
index d98cc1182..96d5686a8 100644
--- a/samples/cpp/tutorial_code/ImgProc/Threshold.cpp
+++ b/samples/cpp/tutorial_code/ImgProc/Threshold.cpp
@@ -14,7 +14,7 @@ using namespace cv;
 /// Global variables
 
 int threshold_value = 0;
-int threshold_type = 3;;
+int threshold_type = 3;
 int const max_value = 255;
 int const max_type = 4;
 int const max_BINARY_value = 255;
diff --git a/samples/cpp/tutorial_code/bioinspired/retina_tutorial.cpp b/samples/cpp/tutorial_code/bioinspired/retina_tutorial.cpp
deleted file mode 100644
index 994d881de..000000000
--- a/samples/cpp/tutorial_code/bioinspired/retina_tutorial.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-//============================================================================
-// Name        : retina_tutorial.cpp
-// Author      : Alexandre Benoit, benoit.alexandre.vision@gmail.com
-// Version     : 0.1
-// Copyright   : LISTIC/GIPSA French Labs, july 2012
-// Description : Gipsa/LISTIC Labs retina demo in C++, Ansi-style
-//============================================================================
-
-#include <iostream>
-#include <cstring>
-
-#include "opencv2/bioinspired.hpp"
-#include "opencv2/highgui.hpp"
-
-static void help(std::string errorMessage)
-{
-    std::cout<<"Program init error : "<<errorMessage<<std::endl;
-    std::cout<<"\nProgram call procedure : retinaDemo [processing mode] [Optional : media target] [Optional LAST parameter: \"log\" to activate retina log sampling]"<<std::endl;
-    std::cout<<"\t[processing mode] :"<<std::endl;
-    std::cout<<"\t -image : for still image processing"<<std::endl;
-    std::cout<<"\t -video : for video stream processing"<<std::endl;
-    std::cout<<"\t[Optional : media target] :"<<std::endl;
-    std::cout<<"\t if processing an image or video file, then, specify the path and filename of the target to process"<<std::endl;
-    std::cout<<"\t leave empty if processing video stream coming from a connected video device"<<std::endl;
-    std::cout<<"\t[Optional : activate retina log sampling] : an optional last parameter can be specified for retina spatial log sampling"<<std::endl;
-    std::cout<<"\t set \"log\" without quotes to activate this sampling, output frame size will be divided by 4"<<std::endl;
-    std::cout<<"\nExamples:"<<std::endl;
-    std::cout<<"\t-Image processing : ./retinaDemo -image lena.jpg"<<std::endl;
-    std::cout<<"\t-Image processing with log sampling : ./retinaDemo -image lena.jpg log"<<std::endl;
-    std::cout<<"\t-Video processing : ./retinaDemo -video myMovie.mp4"<<std::endl;
-    std::cout<<"\t-Live video processing : ./retinaDemo -video"<<std::endl;
-    std::cout<<"\nPlease start again with new parameters"<<std::endl;
-    std::cout<<"****************************************************"<<std::endl;
-    std::cout<<" NOTE : this program generates the default retina parameters file 'RetinaDefaultParameters.xml'"<<std::endl;
-    std::cout<<" => you can use this to fine tune parameters and load them if you save to file 'RetinaSpecificParameters.xml'"<<std::endl;
-}
-
-int main(int argc, char* argv[]) {
-    // welcome message
-    std::cout<<"****************************************************"<<std::endl;
-    std::cout<<"* Retina demonstration : demonstrates the use of is a wrapper class of the Gipsa/Listic Labs retina model."<<std::endl;
-    std::cout<<"* This demo will try to load the file 'RetinaSpecificParameters.xml' (if exists).\nTo create it, copy the autogenerated template 'RetinaDefaultParameters.xml'.\nThen tweak it with your own retina parameters."<<std::endl;
-    // basic input arguments checking
-    if (argc<2)
-    {
-        help("bad number of parameter");
-        return -1;
-    }
-
-    bool useLogSampling = !strcmp(argv[argc-1], "log"); // check if user wants retina log sampling processing
-
-    std::string inputMediaType=argv[1];
-
-    // declare the retina input buffer... that will be fed differently in regard of the input media
-    cv::Mat inputFrame;
-    cv::VideoCapture videoCapture; // in case a video media is used, its manager is declared here
-
-    //////////////////////////////////////////////////////////////////////////////
-    // checking input media type (still image, video file, live video acquisition)
-    if (!strcmp(inputMediaType.c_str(), "-image") && argc >= 3)
-    {
-        std::cout<<"RetinaDemo: processing image "<<argv[2]<<std::endl;
-        // image processing case
-        inputFrame = cv::imread(std::string(argv[2]), 1); // load image in RGB mode
-    }else
-        if (!strcmp(inputMediaType.c_str(), "-video"))
-        {
-            if (argc == 2 || (argc == 3 && useLogSampling)) // attempt to grab images from a video capture device
-            {
-                videoCapture.open(0);
-            }else// attempt to grab images from a video filestream
-            {
-                std::cout<<"RetinaDemo: processing video stream "<<argv[2]<<std::endl;
-                videoCapture.open(argv[2]);
-            }
-
-            // grab a first frame to check if everything is ok
-            videoCapture>>inputFrame;
-        }else
-        {
-            // bad command parameter
-            help("bad command parameter");
-            return -1;
-        }
-
-    if (inputFrame.empty())
-    {
-        help("Input media could not be loaded, aborting");
-        return -1;
-    }
-
-
-    //////////////////////////////////////////////////////////////////////////////
-    // Program start in a try/catch safety context (Retina may throw errors)
-    try
-    {
-        // create a retina instance with default parameters setup, uncomment the initialisation you wanna test
-        cv::Ptr<cv::bioinspired::Retina> myRetina;
-
-        // if the last parameter is 'log', then activate log sampling (favour foveal vision and subsamples peripheral vision)
-        if (useLogSampling)
-        {
-            myRetina = cv::bioinspired::createRetina(inputFrame.size(), true, cv::bioinspired::RETINA_COLOR_BAYER, true, 2.0, 10.0);
-        }
-        else// -> else allocate "classical" retina :
-        {
-            myRetina = cv::bioinspired::createRetina(inputFrame.size());
-        }
-
-        // save default retina parameters file in order to let you see this and maybe modify it and reload using method "setup"
-        myRetina->write("RetinaDefaultParameters.xml");
-
-        // load parameters if file exists
-        myRetina->setup("RetinaSpecificParameters.xml");
-
-        // reset all retina buffers (imagine you close your eyes for a long time)
-        myRetina->clearBuffers();
-
-        // declare retina output buffers
-        cv::Mat retinaOutput_parvo;
-        cv::Mat retinaOutput_magno;
-
-        // processing loop with no stop condition
-        for(;;)
-        {
-            // if using video stream, then, grabbing a new frame, else, input remains the same
-            if (videoCapture.isOpened())
-                videoCapture>>inputFrame;
-
-            // run retina filter on the loaded input frame
-            myRetina->run(inputFrame);
-            // Retrieve and display retina output
-            myRetina->getParvo(retinaOutput_parvo);
-            myRetina->getMagno(retinaOutput_magno);
-            cv::imshow("retina input", inputFrame);
-            cv::imshow("Retina Parvo", retinaOutput_parvo);
-            cv::imshow("Retina Magno", retinaOutput_magno);
-            cv::waitKey(10);
-        }
-    }catch(cv::Exception e)
-    {
-        std::cerr<<"Error using Retina or end of video sequence reached : "<<e.what()<<std::endl;
-    }
-
-    // Program end message
-    std::cout<<"Retina demo end"<<std::endl;
-
-    return 0;
-}
diff --git a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
index 480229b53..1c8dbd24a 100644
--- a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
+++ b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
@@ -32,13 +32,13 @@ int main()
     for (int i = 0; i < image.rows; ++i)
         for (int j = 0; j < image.cols; ++j)
         {
-            Mat sampleMat = (Mat_<float>(1,2) << i,j);
+            Mat sampleMat = (Mat_<float>(1,2) << j,i);
             float response = SVM.predict(sampleMat);
 
             if (response == 1)
-                image.at<Vec3b>(j, i)  = green;
+                image.at<Vec3b>(i,j)  = green;
             else if (response == -1)
-                 image.at<Vec3b>(j, i)  = blue;
+                 image.at<Vec3b>(i,j)  = blue;
         }
 
     // Show the training data
diff --git a/samples/cpp/ufacedetect.cpp b/samples/cpp/ufacedetect.cpp
index a1726a57e..5e13a8211 100644
--- a/samples/cpp/ufacedetect.cpp
+++ b/samples/cpp/ufacedetect.cpp
@@ -231,9 +231,14 @@ void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade,
     smallImg.copyTo(canvas);
 
     double fps = getTickFrequency()/t;
+    static double avgfps = 0;
+    static int nframes = 0;
+    nframes++;
+    double alpha = nframes > 50 ? 0.01 : 1./nframes;
+    avgfps = avgfps*(1-alpha) + fps*alpha;
 
-    putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", fps), Point(250, 50),
-            FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3);
+    putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", avgfps), Point(50, 30),
+            FONT_HERSHEY_SIMPLEX, 0.8, Scalar(0,255,0), 2);
 
     for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
     {
diff --git a/samples/directx/d3d_base.inl.hpp b/samples/directx/d3d_base.inl.hpp
index 6b6de300d..0ef5c00a0 100644
--- a/samples/directx/d3d_base.inl.hpp
+++ b/samples/directx/d3d_base.inl.hpp
@@ -134,7 +134,7 @@ static void renderToD3DObject(void)
 
     const float fps = getFps();
 
-    String deviceName = cv::ocl::useOpenCL() ? cv::ocl::Context2::getDefault().device(0).name() : "No OpenCL device";
+    String deviceName = cv::ocl::useOpenCL() ? cv::ocl::Context::getDefault().device(0).name() : "No OpenCL device";
 
     if ((frame % std::max(1, (int)(fps / 25))) == 0)
     {
@@ -360,7 +360,7 @@ static cv::Mat getInputTexture()
     {
         cv::resize(inputMat, inputMat, cv::Size(WIDTH, HEIGHT));
     }
-    String deviceName = cv::ocl::useOpenCL() ? cv::ocl::Context2::getDefault().device(0).name() : "No OpenCL device";
+    String deviceName = cv::ocl::useOpenCL() ? cv::ocl::Context::getDefault().device(0).name() : "No OpenCL device";
     cv::Scalar color(64, 255, 64, 255);
     cv::putText(inputMat,
             cv::format("OpenCL Device name: %s", deviceName.c_str()),
@@ -396,13 +396,13 @@ static int mainLoop()
     if (cv::ocl::haveOpenCL())
     {
 #if defined(USE_D3D9)
-        cv::ocl::Context2& ctx = cv::directx::ocl::initializeContextFromDirect3DDevice9(dev);
+        cv::ocl::Context& ctx = cv::directx::ocl::initializeContextFromDirect3DDevice9(dev);
 #elif defined (USE_D3DEX)
-        cv::ocl::Context2& ctx = cv::directx::ocl::initializeContextFromDirect3DDevice9Ex(dev);
+        cv::ocl::Context& ctx = cv::directx::ocl::initializeContextFromDirect3DDevice9Ex(dev);
 #elif defined(USE_D3D10)
-        cv::ocl::Context2& ctx = cv::directx::ocl::initializeContextFromD3D10Device(dev);
+        cv::ocl::Context& ctx = cv::directx::ocl::initializeContextFromD3D10Device(dev);
 #elif defined(USE_D3D11)
-        cv::ocl::Context2& ctx = cv::directx::ocl::initializeContextFromD3D11Device(dev);
+        cv::ocl::Context& ctx = cv::directx::ocl::initializeContextFromD3D11Device(dev);
 #else
 #error "Invalid USE_D3D value"
 #endif
diff --git a/samples/gpu/CMakeLists.txt b/samples/gpu/CMakeLists.txt
index 64c25fc09..46b465a87 100644
--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt
@@ -76,7 +76,7 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
       if(MSVC AND NOT BUILD_SHARED_LIBS)
         set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
       endif()
-      install(TARGETS ${the_target} RUNTIME DESTINATION "${OPENCV_SAMPLES_BIN_INSTALL_PATH}/${project}" COMPONENT main)
+      install(TARGETS ${the_target} RUNTIME DESTINATION "${OPENCV_SAMPLES_BIN_INSTALL_PATH}/${project}" COMPONENT samples)
     endif()
   ENDMACRO()
 
@@ -91,9 +91,9 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
   include("performance/CMakeLists.txt")
 endif()
 
-if (INSTALL_C_EXAMPLES AND NOT WIN32)
+if(INSTALL_C_EXAMPLES AND NOT WIN32)
   file(GLOB install_list *.c *.cpp *.jpg *.png *.data makefile.* build_all.sh *.dsp *.cmd )
   install(FILES ${install_list}
-          DESTINATION share/OpenCV/samples/${project}
-          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ)
+          DESTINATION ${OPENCV_SAMPLES_SRC_INSTALL_PATH}/gpu
+          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ COMPONENT samples)
 endif()
diff --git a/samples/gpu/performance/CMakeLists.txt b/samples/gpu/performance/CMakeLists.txt
index 22657b56e..9289180af 100644
--- a/samples/gpu/performance/CMakeLists.txt
+++ b/samples/gpu/performance/CMakeLists.txt
@@ -23,12 +23,13 @@ if(ENABLE_SOLUTION_FOLDERS)
 endif()
 
 if(WIN32)
-  install(TARGETS ${the_target} RUNTIME DESTINATION "${OPENCV_SAMPLES_BIN_INSTALL_PATH}/gpu" COMPONENT main)
+  install(TARGETS ${the_target} RUNTIME DESTINATION "${OPENCV_SAMPLES_BIN_INSTALL_PATH}/gpu" COMPONENT samples)
 endif()
 
 if(INSTALL_C_EXAMPLES AND NOT WIN32)
   file(GLOB CUDA_FILES performance/*.cpp performance/*.h)
   install(FILES ${CUDA_FILES}
-          DESTINATION share/OpenCV/samples/gpu/performance
-          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ)
+          DESTINATION ${OPENCV_SAMPLES_SRC_INSTALL_PATH}/gpu/performance
+          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ
+          COMPONENT samples)
 endif()
diff --git a/samples/gpu/super_resolution.cpp b/samples/gpu/super_resolution.cpp
index 67d0532a6..3066e8f74 100644
--- a/samples/gpu/super_resolution.cpp
+++ b/samples/gpu/super_resolution.cpp
@@ -10,14 +10,10 @@
 #include "opencv2/superres/optical_flow.hpp"
 #include "opencv2/opencv_modules.hpp"
 
-#if defined(HAVE_OPENCV_OCL)
-#include "opencv2/ocl/ocl.hpp"
-#endif
-
 using namespace std;
 using namespace cv;
 using namespace cv::superres;
-bool useOclChanged;
+
 #define MEASURE_TIME(op) \
     { \
         TickMeter tm; \
@@ -50,43 +46,13 @@ static Ptr<DenseOpticalFlowExt> createOptFlow(const string& name, bool useGpu)
     else if (name == "pyrlk")
         return createOptFlow_PyrLK_CUDA();
     else
-    {
         cerr << "Incorrect Optical Flow algorithm - " << name << endl;
-    }
+
     return Ptr<DenseOpticalFlowExt>();
 }
-#if defined(HAVE_OPENCV_OCL)
-static Ptr<DenseOpticalFlowExt> createOptFlow(const string& name)
-{
-    if (name == "farneback")
-    {
-        return createOptFlow_Farneback_OCL();
-    }
-    else if (name == "simple")
-    {
-        useOclChanged = true;
-        std::cout<<"simple on OpenCL has not been implemented. Use CPU instead!\n";
-        return createOptFlow_Simple();
-    }
-    else if (name == "tvl1")
-        return createOptFlow_DualTVL1_OCL();
-    else if (name == "brox")
-    {
-        std::cout<<"brox has not been implemented!\n";
-        return Ptr<DenseOpticalFlowExt>();
-    }
-    else if (name == "pyrlk")
-        return createOptFlow_PyrLK_OCL();
-    else
-    {
-        cerr << "Incorrect Optical Flow algorithm - " << name << endl;
-    }
-    return Ptr<DenseOpticalFlowExt>();
-}
-#endif
+
 int main(int argc, const char* argv[])
 {
-    useOclChanged = false;
     CommandLineParser cmd(argc, argv,
         "{ v video      |           | Input video }"
         "{ o output     |           | Output video }"
@@ -94,7 +60,7 @@ int main(int argc, const char* argv[])
         "{ i iterations | 180       | Iteration count }"
         "{ t temporal   | 4         | Radius of the temporal search area }"
         "{ f flow       | farneback | Optical flow algorithm (farneback, simple, tvl1, brox, pyrlk) }"
-        "{ g            | false     | CPU as default device, cuda for CUDA and ocl for OpenCL }"
+        "{ g            | false     | CPU as default device, cuda for CUDA }"
         "{ h help       | false     | Print help message }"
     );
 
@@ -102,7 +68,7 @@ int main(int argc, const char* argv[])
     {
         cout << "This sample demonstrates Super Resolution algorithms for video sequence" << endl;
         cmd.printMessage();
-        return 0;
+        return EXIT_SUCCESS;
     }
 
     const string inputVideoName = cmd.get<string>("video");
@@ -115,60 +81,19 @@ int main(int argc, const char* argv[])
 
     std::transform(gpuOption.begin(), gpuOption.end(), gpuOption.begin(), ::tolower);
 
-    bool useCuda = false;
-    bool useOcl = false;
-
-    if(gpuOption.compare("ocl") == 0)
-        useOcl = true;
-    else if(gpuOption.compare("cuda") == 0)
-        useCuda = true;
-
-#ifndef HAVE_OPENCV_OCL
-    if(useOcl)
-    {
-        {
-            cout<<"OPENCL is not compiled\n";
-            return 0;
-        }
-    }
-#endif
-#if defined(HAVE_OPENCV_OCL)
-    if(useCuda)
-    {
-        CV_Assert(!useOcl);
-    }
-#endif
+    bool useCuda = gpuOption.compare("cuda") == 0;
     Ptr<SuperResolution> superRes;
 
-
-#if defined(HAVE_OPENCV_OCL)
-    if(useOcl)
-    {
-        Ptr<DenseOpticalFlowExt> of = createOptFlow(optFlow);
-        if (of.empty())
-            exit(-1);
-        if(useOclChanged)
-        {
-            superRes = createSuperResolution_BTVL1();
-            useOcl = !useOcl;
-        }else
-            superRes = createSuperResolution_BTVL1_OCL();
-        superRes->set("opticalFlow", of);
-    }
+    if (useCuda)
+        superRes = createSuperResolution_BTVL1_CUDA();
     else
-#endif
-    {
-        if (useCuda)
-            superRes = createSuperResolution_BTVL1_CUDA();
-        else
-            superRes = createSuperResolution_BTVL1();
+        superRes = createSuperResolution_BTVL1();
 
-        Ptr<DenseOpticalFlowExt> of = createOptFlow(optFlow, useCuda);
+    Ptr<DenseOpticalFlowExt> of = createOptFlow(optFlow, useCuda);
 
-        if (of.empty())
-            exit(-1);
-        superRes->set("opticalFlow", of);
-    }
+    if (of.empty())
+        return EXIT_FAILURE;
+    superRes->set("opticalFlow", of);
 
     superRes->set("scale", scale);
     superRes->set("iterations", iterations);
@@ -201,11 +126,7 @@ int main(int argc, const char* argv[])
         cout << "Iterations      : " << iterations << endl;
         cout << "Temporal radius : " << temporalAreaRadius << endl;
         cout << "Optical Flow    : " << optFlow << endl;
-#if defined(HAVE_OPENCV_OCL)
-        cout << "Mode            : " << (useCuda ? "CUDA" : useOcl? "OpenCL" : "CPU") << endl;
-#else
         cout << "Mode            : " << (useCuda ? "CUDA" : "CPU") << endl;
-#endif
     }
 
     superRes->setInput(frameSource);
@@ -217,32 +138,8 @@ int main(int argc, const char* argv[])
         cout << '[' << setw(3) << i << "] : ";
         Mat result;
 
-#if defined(HAVE_OPENCV_OCL)
-        cv::ocl::oclMat result_;
+        MEASURE_TIME(superRes->nextFrame(result));
 
-        if(useOcl)
-        {
-            MEASURE_TIME(
-            {
-                superRes->nextFrame(result_);
-                ocl::finish();
-            });
-        }
-        else
-#endif
-        {
-            MEASURE_TIME(superRes->nextFrame(result));
-        }
-
-#ifdef HAVE_OPENCV_OCL
-        if(useOcl)
-        {
-            if(!result_.empty())
-            {
-                result_.download(result);
-            }
-        }
-#endif
         if (result.empty())
             break;
 
diff --git a/samples/ocl/adaptive_bilateral_filter.cpp b/samples/ocl/adaptive_bilateral_filter.cpp
deleted file mode 100644
index df085c49d..000000000
--- a/samples/ocl/adaptive_bilateral_filter.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-// This sample shows the difference of adaptive bilateral filter and bilateral filter.
-#include "opencv2/core.hpp"
-#include "opencv2/core/utility.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/highgui.hpp"
-#include "opencv2/ocl.hpp"
-
-using namespace cv;
-using namespace std;
-
-
-int main( int argc, const char** argv )
-{
-    const char* keys =
-        "{ i input   |          | specify input image }"
-        "{ k ksize   |     11   | specify kernel size }"
-        "{ s sSpace  |     3    | specify sigma space }"
-        "{ c sColor  |     30   | specify max color }"
-        "{ h help    | false    | print help message }";
-
-    CommandLineParser cmd(argc, argv, keys);
-    if (cmd.has("help"))
-    {
-        cout << "Usage : adaptive_bilateral_filter [options]" << endl;
-        cout << "Available options:" << endl;
-        cmd.printMessage();
-        return EXIT_SUCCESS;
-    }
-
-    string src_path = cmd.get<string>("i");
-    int ks = cmd.get<int>("k");
-    const char * winName[] = {"input", "ABF OpenCL", "BF OpenCL"};
-
-    Mat src = imread(src_path);
-    if (src.empty())
-    {
-        cout << "error read image: " << src_path << endl;
-        return EXIT_FAILURE;
-    }
-
-    double sigmaSpace = cmd.get<int>("s");
-
-    // sigma for checking pixel values. This is used as is in the "normal" bilateral filter,
-    // and it is used as an upper clamp on the adaptive case.
-    double sigmacolor = cmd.get<int>("c");
-
-    ocl::oclMat dsrc(src), dABFilter, dBFilter;
-    Size ksize(ks, ks);
-
-    // ksize is the total width/height of neighborhood used to calculate local variance.
-    // sigmaSpace is not a priori related to ksize/2.
-    ocl::adaptiveBilateralFilter(dsrc, dABFilter, ksize, sigmaSpace, sigmacolor);
-    ocl::bilateralFilter(dsrc, dBFilter, ks, sigmacolor, sigmaSpace);
-    Mat abFilter = dABFilter, bFilter = dBFilter;
-
-    ocl::finish();
-
-    imshow(winName[0], src);
-    imshow(winName[1], abFilter);
-    imshow(winName[2], bFilter);
-
-    waitKey();
-
-    return EXIT_SUCCESS;
-}
diff --git a/samples/ocl/bgfg_segm.cpp b/samples/ocl/bgfg_segm.cpp
deleted file mode 100644
index 19d87ef03..000000000
--- a/samples/ocl/bgfg_segm.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-#include <iostream>
-#include <string>
-
-#include "opencv2/core.hpp"
-#include "opencv2/core/utility.hpp"
-#include "opencv2/ocl.hpp"
-#include "opencv2/highgui.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::ocl;
-
-#define M_MOG  1
-#define M_MOG2 2
-
-int main(int argc, const char** argv)
-{
-    cv::CommandLineParser cmd(argc, argv,
-        "{ c camera | false       | use camera }"
-        "{ f file   | 768x576.avi | input video file }"
-        "{ m method | mog         | method (mog, mog2) }"
-        "{ h help   | false       | print help message }");
-
-    if (cmd.has("help"))
-    {
-        cout << "Usage : bgfg_segm [options]" << endl;
-        cout << "Available options:" << endl;
-        cmd.printMessage();
-        return EXIT_SUCCESS;
-    }
-
-    bool useCamera = cmd.get<bool>("camera");
-    string file = cmd.get<string>("file");
-    string method = cmd.get<string>("method");
-
-    if (method != "mog" && method != "mog2")
-    {
-        cerr << "Incorrect method" << endl;
-        return EXIT_FAILURE;
-    }
-
-    int m = method == "mog" ? M_MOG : M_MOG2;
-
-    VideoCapture cap;
-    if (useCamera)
-        cap.open(0);
-    else
-        cap.open(file);
-
-    if (!cap.isOpened())
-    {
-        cout << "can not open camera or video file" << endl;
-        return EXIT_FAILURE;
-    }
-
-    Mat frame;
-    cap >> frame;
-
-    oclMat d_frame(frame);
-
-    cv::ocl::MOG mog;
-    cv::ocl::MOG2 mog2;
-
-    oclMat d_fgmask, d_fgimg, d_bgimg;
-
-    d_fgimg.create(d_frame.size(), d_frame.type());
-
-    Mat fgmask, fgimg, bgimg;
-
-    switch (m)
-    {
-    case M_MOG:
-        mog(d_frame, d_fgmask, 0.01f);
-        break;
-
-    case M_MOG2:
-        mog2(d_frame, d_fgmask);
-        break;
-    }
-
-    for (;;)
-    {
-        cap >> frame;
-        if (frame.empty())
-            break;
-        d_frame.upload(frame);
-
-        int64 start = cv::getTickCount();
-
-        //update the model
-        switch (m)
-        {
-        case M_MOG:
-            mog(d_frame, d_fgmask, 0.01f);
-            mog.getBackgroundImage(d_bgimg);
-            break;
-
-        case M_MOG2:
-            mog2(d_frame, d_fgmask);
-            mog2.getBackgroundImage(d_bgimg);
-            break;
-        }
-
-        double fps = cv::getTickFrequency() / (cv::getTickCount() - start);
-        std::cout << "FPS : " << fps << std::endl;
-
-        d_fgimg.setTo(Scalar::all(0));
-        d_frame.copyTo(d_fgimg, d_fgmask);
-
-        d_fgmask.download(fgmask);
-        d_fgimg.download(fgimg);
-        if (!d_bgimg.empty())
-            d_bgimg.download(bgimg);
-
-        imshow("image", frame);
-        imshow("foreground mask", fgmask);
-        imshow("foreground image", fgimg);
-        if (!bgimg.empty())
-            imshow("mean background image", bgimg);
-
-        if (27 == waitKey(30))
-            break;
-    }
-
-    return EXIT_SUCCESS;
-}
diff --git a/samples/ocl/facedetect.cpp b/samples/ocl/facedetect.cpp
deleted file mode 100644
index fd570b515..000000000
--- a/samples/ocl/facedetect.cpp
+++ /dev/null
@@ -1,381 +0,0 @@
-#include "opencv2/objdetect/objdetect.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/ocl/ocl.hpp"
-
-#include "opencv2/highgui/highgui_c.h"
-
-#include <iostream>
-#include <stdio.h>
-
-#if defined(_MSC_VER) && (_MSC_VER >= 1700)
-    # include <thread>
-#endif
-
-using namespace std;
-using namespace cv;
-#define LOOP_NUM 1
-
-///////////////////////////single-threading faces detecting///////////////////////////////
-
-const static Scalar colors[] =  { CV_RGB(0,0,255),
-                                  CV_RGB(0,128,255),
-                                  CV_RGB(0,255,255),
-                                  CV_RGB(0,255,0),
-                                  CV_RGB(255,128,0),
-                                  CV_RGB(255,255,0),
-                                  CV_RGB(255,0,0),
-                                  CV_RGB(255,0,255)
-                                } ;
-
-
-int64 work_begin = 0;
-int64 work_end = 0;
-string inputName, outputName, cascadeName;
-
-static void workBegin()
-{
-    work_begin = getTickCount();
-}
-
-static void workEnd()
-{
-    work_end += (getTickCount() - work_begin);
-}
-
-static double getTime()
-{
-    return work_end /((double)cvGetTickFrequency() * 1000.);
-}
-
-
-static void detect( Mat& img, vector<Rect>& faces,
-             ocl::OclCascadeClassifier& cascade,
-             double scale);
-
-
-static void detectCPU( Mat& img, vector<Rect>& faces,
-                CascadeClassifier& cascade,
-                double scale);
-
-static void Draw(Mat& img, vector<Rect>& faces, double scale);
-
-
-// This function test if gpu_rst matches cpu_rst.
-// If the two vectors are not equal, it will return the difference in vector size
-// Else if will return (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels)
-double checkRectSimilarity(Size sz, vector<Rect>& cpu_rst, vector<Rect>& gpu_rst);
-
-static int facedetect_one_thread(bool useCPU, double scale )
-{
-    CvCapture* capture = 0;
-    Mat frame, frameCopy0, frameCopy, image;
-
-    ocl::OclCascadeClassifier cascade;
-    CascadeClassifier  cpu_cascade;
-
-    if( !cascade.load( cascadeName ) || !cpu_cascade.load(cascadeName) )
-    {
-        cout << "ERROR: Could not load classifier cascade: " << cascadeName << endl;
-        return EXIT_FAILURE;
-    }
-
-    if( inputName.empty() )
-    {
-        capture = cvCaptureFromCAM(0);
-        if(!capture)
-            cout << "Capture from CAM 0 didn't work" << endl;
-    }
-    else
-    {
-        image = imread( inputName, CV_LOAD_IMAGE_COLOR );
-        if( image.empty() )
-        {
-            capture = cvCaptureFromAVI( inputName.c_str() );
-            if(!capture)
-                cout << "Capture from AVI didn't work" << endl;
-            return EXIT_FAILURE;
-        }
-    }
-
-    cvNamedWindow( "result", 1 );
-    if( capture )
-    {
-        cout << "In capture ..." << endl;
-        for(;;)
-        {
-            IplImage* iplImg = cvQueryFrame( capture );
-            frame = cv::cvarrToMat(iplImg);
-            vector<Rect> faces;
-            if( frame.empty() )
-                break;
-            if( iplImg->origin == IPL_ORIGIN_TL )
-                frame.copyTo( frameCopy0 );
-            else
-                flip( frame, frameCopy0, 0 );
-            if( scale == 1)
-                frameCopy0.copyTo(frameCopy);
-            else
-                resize(frameCopy0, frameCopy, Size(), 1./scale, 1./scale, INTER_LINEAR);
-
-            work_end = 0;
-            if(useCPU)
-                detectCPU(frameCopy, faces, cpu_cascade, 1);
-            else
-                detect(frameCopy, faces, cascade, 1);
-
-            Draw(frameCopy, faces, 1);
-            if( waitKey( 10 ) >= 0 )
-                break;
-        }
-        cvReleaseCapture( &capture );
-    }
-    else
-    {
-        cout << "In image read" << endl;
-        vector<Rect> faces;
-        vector<Rect> ref_rst;
-        double accuracy = 0.;
-        detectCPU(image, ref_rst, cpu_cascade, scale);
-        work_end = 0;
-
-        for(int i = 0; i <= LOOP_NUM; i ++)
-        {
-            cout << "loop" << i << endl;
-            if(useCPU)
-                detectCPU(image, faces, cpu_cascade, scale);
-            else
-            {
-                detect(image, faces, cascade, scale);
-                if(i == 0)
-                {
-                    accuracy = checkRectSimilarity(image.size(), ref_rst, faces);
-                }
-            }
-            if (i == LOOP_NUM)
-            {
-                if (useCPU)
-                    cout << "average CPU time (noCamera) : ";
-                else
-                    cout << "average GPU time (noCamera) : ";
-                cout << getTime() / LOOP_NUM << " ms" << endl;
-                cout << "accuracy value: " << accuracy <<endl;
-            }
-        }
-        Draw(image, faces, scale);
-        waitKey(0);
-    }
-
-    cvDestroyWindow("result");
-    std::cout<< "single-threaded sample has finished" <<std::endl;
-    return 0;
-}
-
-///////////////////////////////////////detectfaces with multithreading////////////////////////////////////////////
-#if defined(_MSC_VER) && (_MSC_VER >= 1700)
-
-#define MAX_THREADS 10
-
-static void detectFaces(std::string fileName)
-{
-    ocl::OclCascadeClassifier cascade;
-    if(!cascade.load(cascadeName))
-    {
-        std::cout << "ERROR: Could not load classifier cascade: " << cascadeName << std::endl;
-        return;
-    }
-
-    Mat img = imread(fileName, CV_LOAD_IMAGE_COLOR);
-    if (img.empty())
-    {
-        std::cout << "cann't open file " + fileName <<std::endl;
-        return;
-    }
-
-    ocl::oclMat d_img;
-    d_img.upload(img);
-
-    std::vector<Rect> oclfaces;
-    cascade.detectMultiScale(d_img, oclfaces,  1.1, 3, 0 | CASCADE_SCALE_IMAGE, Size(30, 30), Size(0, 0));
-
-    for(unsigned int i = 0; i<oclfaces.size(); i++)
-        rectangle(img, Point(oclfaces[i].x, oclfaces[i].y), Point(oclfaces[i].x + oclfaces[i].width, oclfaces[i].y + oclfaces[i].height), colors[i%8], 3);
-
-    std::string::size_type pos = outputName.rfind('.');
-    std::string outputNameTid = outputName + '-' + std::to_string(_threadid);
-    if(pos == std::string::npos)
-    {
-        std::cout << "Invalid output file name: " << outputName << std::endl;
-    }
-    else
-    {
-        outputNameTid = outputName.substr(0, pos) + "_" + std::to_string(_threadid) + outputName.substr(pos);
-        imwrite(outputNameTid, img);
-    }
-    imshow(outputNameTid, img);
-    waitKey(0);
-}
-
-static void facedetect_multithreading(int nthreads)
-{
-    int thread_number = MAX_THREADS < nthreads ? MAX_THREADS : nthreads;
-    std::vector<std::thread> threads;
-    for(int i = 0; i<thread_number; i++)
-        threads.push_back(std::thread(detectFaces, inputName));
-    for(int i = 0; i<thread_number; i++)
-        threads[i].join();
-}
-#endif
-
-int main( int argc, const char** argv )
-{
-
-    const char* keys =
-        "{ h help       | false       | print help message }"
-        "{ i input      |             | specify input image }"
-        "{ t template   | haarcascade_frontalface_alt.xml |"
-        " specify template file path }"
-        "{ c scale      |   1.0       | scale image }"
-        "{ s use_cpu    | false       | use cpu or gpu to process the image }"
-        "{ o output     | facedetect_output.jpg  |"
-        " specify output image save path(only works when input is images) }"
-        "{ n thread_num |      1      | set number of threads >= 1 }";
-
-    CommandLineParser cmd(argc, argv, keys);
-    if (cmd.has("help"))
-    {
-        cout << "Usage : facedetect [options]" << endl;
-        cout << "Available options:" << endl;
-        cmd.printMessage();
-        return EXIT_SUCCESS;
-    }
-    bool useCPU = cmd.get<bool>("s");
-    inputName = cmd.get<string>("i");
-    outputName = cmd.get<string>("o");
-    cascadeName = cmd.get<string>("t");
-    double scale = cmd.get<double>("c");
-    int n = cmd.get<int>("n");
-
-    if(n > 1)
-    {
-#if defined(_MSC_VER) && (_MSC_VER >= 1700)
-            std::cout<<"multi-threaded sample is running" <<std::endl;
-            facedetect_multithreading(n);
-            std::cout<<"multi-threaded sample has finished" <<std::endl;
-            return 0;
-#else
-            std::cout << "std::thread is not supported, running a single-threaded version" << std::endl;
-#endif
-    }
-    if (n<0)
-        std::cout<<"incorrect number of threads:" << n << ", running a single-threaded version" <<std::endl;
-    else
-        std::cout<<"single-threaded sample is running" <<std::endl;
-    return facedetect_one_thread(useCPU, scale);
-
-}
-
-void detect( Mat& img, vector<Rect>& faces,
-             ocl::OclCascadeClassifier& cascade,
-             double scale)
-{
-    ocl::oclMat image(img);
-    ocl::oclMat gray, smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
-    workBegin();
-    ocl::cvtColor( image, gray, COLOR_BGR2GRAY );
-    ocl::resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
-    ocl::equalizeHist( smallImg, smallImg );
-
-    cascade.detectMultiScale( smallImg, faces, 1.1,
-                              3, 0
-                              |CASCADE_SCALE_IMAGE
-                              , Size(30,30), Size(0, 0) );
-    workEnd();
-}
-
-void detectCPU( Mat& img, vector<Rect>& faces,
-                CascadeClassifier& cascade,
-                double scale)
-{
-    workBegin();
-    Mat cpu_gray, cpu_smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
-    cvtColor(img, cpu_gray, COLOR_BGR2GRAY);
-    resize(cpu_gray, cpu_smallImg, cpu_smallImg.size(), 0, 0, INTER_LINEAR);
-    equalizeHist(cpu_smallImg, cpu_smallImg);
-    cascade.detectMultiScale(cpu_smallImg, faces, 1.1,
-                             3, 0 | CASCADE_SCALE_IMAGE,
-                             Size(30, 30), Size(0, 0));
-    workEnd();
-}
-
-
-void Draw(Mat& img, vector<Rect>& faces, double scale)
-{
-    int i = 0;
-    putText(img, format("fps: %.1f", 1000./getTime()), Point(450, 50),
-            FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3);
-    for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
-    {
-        Point center;
-        Scalar color = colors[i%8];
-        int radius;
-        center.x = cvRound((r->x + r->width*0.5)*scale);
-        center.y = cvRound((r->y + r->height*0.5)*scale);
-        radius = cvRound((r->width + r->height)*0.25*scale);
-        circle( img, center, radius, color, 3, 8, 0 );
-    }
-    //imwrite( outputName, img );
-    if(abs(scale-1.0)>.001)
-    {
-        resize(img, img, Size((int)(img.cols/scale), (int)(img.rows/scale)));
-    }
-    imshow( "result", img );
-
-}
-
-
-double checkRectSimilarity(Size sz, vector<Rect>& ob1, vector<Rect>& ob2)
-{
-    double final_test_result = 0.0;
-    size_t sz1 = ob1.size();
-    size_t sz2 = ob2.size();
-
-    if(sz1 != sz2)
-    {
-        return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
-    }
-    else
-    {
-        if(sz1==0 && sz2==0)
-            return 0;
-        Mat cpu_result(sz, CV_8UC1);
-        cpu_result.setTo(0);
-
-        for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
-        {
-            Mat cpu_result_roi(cpu_result, *r);
-            cpu_result_roi.setTo(1);
-            cpu_result.copyTo(cpu_result);
-        }
-        int cpu_area = countNonZero(cpu_result > 0);
-
-
-        Mat gpu_result(sz, CV_8UC1);
-        gpu_result.setTo(0);
-        for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
-        {
-            cv::Mat gpu_result_roi(gpu_result, *r2);
-            gpu_result_roi.setTo(1);
-            gpu_result.copyTo(gpu_result);
-        }
-
-        Mat result_;
-        multiply(cpu_result, gpu_result, result_);
-        int result = countNonZero(result_ > 0);
-        if(cpu_area!=0 && result!=0)
-            final_test_result = 1.0 - (double)result/(double)cpu_area;
-        else if(cpu_area==0 && result!=0)
-            final_test_result = -1;
-    }
-    return final_test_result;
-}
diff --git a/samples/ocl/retina_ocl.cpp b/samples/ocl/retina_ocl.cpp
deleted file mode 100644
index 6c58f7325..000000000
--- a/samples/ocl/retina_ocl.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-#include <iostream>
-#include <cstring>
-
-#include "opencv2/core.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/highgui.hpp"
-#include "opencv2/ocl.hpp"
-#include "opencv2/bioinspired.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-using namespace std;
-
-const int total_loop_count = 50;
-
-static void help(CommandLineParser cmd, const String& errorMessage)
-{
-    cout << errorMessage << endl;
-    cout << "Avaible options:" << endl;
-    cmd.printMessage();
-}
-
-int main(int argc, char* argv[])
-{
-    //set this to save kernel compile time from second time you run
-    ocl::setBinaryDiskCache();
-    const char* keys =
-        "{ h   | help     | false           | print help message }"
-        "{ c   | cpu      | false           | use cpu (original version) or gpu(OpenCL) to process the image }"
-        "{ i   | image    | cat.jpg         | specify the input image }";
-
-    CommandLineParser cmd(argc, argv, keys);
-
-    if(cmd.get<bool>("help"))
-    {
-        help(cmd, "Usage: ./retina_ocl [options]");
-        return EXIT_FAILURE;
-    }
-
-    String fname = cmd.get<String>("i");
-    bool useCPU = cmd.get<bool>("c");
-
-    cv::Mat input = imread(fname);
-    if(input.empty())
-    {
-        help(cmd, "Error opening: " + fname);
-        return EXIT_FAILURE;
-    }
-    //////////////////////////////////////////////////////////////////////////////
-    // Program start in a try/catch safety context (Retina may throw errors)
-    try
-    {
-        // create a retina instance with default parameters setup, uncomment the initialisation you wanna test
-        cv::Ptr<cv::bioinspired::Retina> oclRetina;
-        cv::Ptr<cv::bioinspired::Retina> retina;
-        // declare retina output buffers
-        cv::ocl::oclMat retina_parvo_ocl;
-        cv::ocl::oclMat retina_magno_ocl;
-        cv::Mat retina_parvo;
-        cv::Mat retina_magno;
-
-        if(useCPU)
-        {
-            retina = cv::bioinspired::createRetina(input.size());
-            retina->clearBuffers();
-        }
-        else
-        {
-            oclRetina = cv::bioinspired::createRetina_OCL(input.size());
-            oclRetina->clearBuffers();
-        }
-
-        int64 temp_time = 0, total_time = 0;
-
-        int loop_counter = 0;
-        for(; loop_counter <= total_loop_count; ++loop_counter)
-        {
-            if(useCPU)
-            {
-                temp_time = cv::getTickCount();
-                retina->run(input);
-                retina->getParvo(retina_parvo);
-                retina->getMagno(retina_magno);
-            }
-            else
-            {
-                cv::ocl::oclMat input_ocl(input);
-                temp_time = cv::getTickCount();
-                oclRetina->run(input_ocl);
-                oclRetina->getParvo(retina_parvo_ocl);
-                oclRetina->getMagno(retina_magno_ocl);
-            }
-            // will not count the first loop, which is considered as warm-up period
-            if(loop_counter > 0)
-            {
-                temp_time = (cv::getTickCount() - temp_time);
-                total_time += temp_time;
-                printf("Frame id %2d: %3.4fms\n", loop_counter, (double)temp_time / cv::getTickFrequency() * 1000.0);
-            }
-            if(!useCPU)
-            {
-                retina_parvo = retina_parvo_ocl;
-                retina_magno = retina_magno_ocl;
-            }
-            cv::imshow("retina input", input);
-            cv::imshow("Retina Parvo", retina_parvo);
-            cv::imshow("Retina Magno", retina_magno);
-            cv::waitKey(10);
-        }
-        printf("Average: %.4fms\n", (double)total_time / total_loop_count / cv::getTickFrequency() * 1000.0);
-    }
-    catch(cv::Exception e)
-    {
-        std::cerr << "Error using Retina : " << e.what() << std::endl;
-    }
-    // Program end message
-    std::cout << "Retina demo end" << std::endl;
-    return EXIT_SUCCESS;
-}
diff --git a/samples/ocl/squares.cpp b/samples/ocl/squares.cpp
deleted file mode 100644
index b53648f3f..000000000
--- a/samples/ocl/squares.cpp
+++ /dev/null
@@ -1,341 +0,0 @@
-// The "Square Detector" program.
-// It loads several images sequentially and tries to find squares in
-// each image
-
-#include "opencv2/core.hpp"
-#include "opencv2/core/utility.hpp"
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/ocl/ocl.hpp"
-#include <iostream>
-#include <math.h>
-#include <string.h>
-
-using namespace cv;
-using namespace std;
-
-#define ACCURACY_CHECK
-
-#ifdef ACCURACY_CHECK
-// check if two vectors of vector of points are near or not
-// prior assumption is that they are in correct order
-static bool checkPoints(
-    vector< vector<Point> > set1,
-    vector< vector<Point> > set2,
-    int maxDiff = 5)
-{
-    if(set1.size() != set2.size())
-    {
-        return false;
-    }
-
-    for(vector< vector<Point> >::iterator it1 = set1.begin(), it2 = set2.begin();
-            it1 < set1.end() && it2 < set2.end(); it1 ++, it2 ++)
-    {
-        vector<Point> pts1 = *it1;
-        vector<Point> pts2 = *it2;
-
-
-        if(pts1.size() != pts2.size())
-        {
-            return false;
-        }
-        for(size_t i = 0; i < pts1.size(); i ++)
-        {
-            Point pt1 = pts1[i], pt2 = pts2[i];
-            if(std::abs(pt1.x - pt2.x) > maxDiff ||
-                    std::abs(pt1.y - pt2.y) > maxDiff)
-            {
-                return false;
-            }
-        }
-    }
-    return true;
-}
-#endif
-
-int thresh = 50, N = 11;
-const char* wndname = "OpenCL Square Detection Demo";
-
-
-// helper function:
-// finds a cosine of angle between vectors
-// from pt0->pt1 and from pt0->pt2
-static double angle( Point pt1, Point pt2, Point pt0 )
-{
-    double dx1 = pt1.x - pt0.x;
-    double dy1 = pt1.y - pt0.y;
-    double dx2 = pt2.x - pt0.x;
-    double dy2 = pt2.y - pt0.y;
-    return (dx1*dx2 + dy1*dy2)/sqrt((dx1*dx1 + dy1*dy1)*(dx2*dx2 + dy2*dy2) + 1e-10);
-}
-
-
-// returns sequence of squares detected on the image.
-// the sequence is stored in the specified memory storage
-static void findSquares( const Mat& image, vector<vector<Point> >& squares )
-{
-    squares.clear();
-    Mat pyr, timg, gray0(image.size(), CV_8U), gray;
-
-    // down-scale and upscale the image to filter out the noise
-    pyrDown(image, pyr, Size(image.cols/2, image.rows/2));
-    pyrUp(pyr, timg, image.size());
-    vector<vector<Point> > contours;
-
-    // find squares in every color plane of the image
-    for( int c = 0; c < 3; c++ )
-    {
-        int ch[] = {c, 0};
-        mixChannels(&timg, 1, &gray0, 1, ch, 1);
-
-        // try several threshold levels
-        for( int l = 0; l < N; l++ )
-        {
-            // hack: use Canny instead of zero threshold level.
-            // Canny helps to catch squares with gradient shading
-            if( l == 0 )
-            {
-                // apply Canny. Take the upper threshold from slider
-                // and set the lower to 0 (which forces edges merging)
-                Canny(gray0, gray, 0, thresh, 5);
-                // dilate canny output to remove potential
-                // holes between edge segments
-                dilate(gray, gray, Mat(), Point(-1,-1));
-            }
-            else
-            {
-                // apply threshold if l!=0:
-                //     tgray(x,y) = gray(x,y) < (l+1)*255/N ? 255 : 0
-                cv::threshold(gray0, gray, (l+1)*255/N, 255, THRESH_BINARY);
-            }
-
-            // find contours and store them all as a list
-            findContours(gray, contours, RETR_LIST, CHAIN_APPROX_SIMPLE);
-
-            vector<Point> approx;
-
-            // test each contour
-            for( size_t i = 0; i < contours.size(); i++ )
-            {
-                // approximate contour with accuracy proportional
-                // to the contour perimeter
-                approxPolyDP(Mat(contours[i]), approx, arcLength(Mat(contours[i]), true)*0.02, true);
-
-                // square contours should have 4 vertices after approximation
-                // relatively large area (to filter out noisy contours)
-                // and be convex.
-                // Note: absolute value of an area is used because
-                // area may be positive or negative - in accordance with the
-                // contour orientation
-                if( approx.size() == 4 &&
-                        fabs(contourArea(Mat(approx))) > 1000 &&
-                        isContourConvex(Mat(approx)) )
-                {
-                    double maxCosine = 0;
-
-                    for( int j = 2; j < 5; j++ )
-                    {
-                        // find the maximum cosine of the angle between joint edges
-                        double cosine = fabs(angle(approx[j%4], approx[j-2], approx[j-1]));
-                        maxCosine = MAX(maxCosine, cosine);
-                    }
-
-                    // if cosines of all angles are small
-                    // (all angles are ~90 degree) then write quandrange
-                    // vertices to resultant sequence
-                    if( maxCosine < 0.3 )
-                        squares.push_back(approx);
-                }
-            }
-        }
-    }
-}
-
-
-// returns sequence of squares detected on the image.
-// the sequence is stored in the specified memory storage
-static void findSquares_ocl( const Mat& image, vector<vector<Point> >& squares )
-{
-    squares.clear();
-
-    Mat gray;
-    cv::ocl::oclMat pyr_ocl, timg_ocl, gray0_ocl, gray_ocl;
-
-    // down-scale and upscale the image to filter out the noise
-    ocl::pyrDown(ocl::oclMat(image), pyr_ocl);
-    ocl::pyrUp(pyr_ocl, timg_ocl);
-
-    vector<vector<Point> > contours;
-    vector<cv::ocl::oclMat> gray0s;
-    ocl::split(timg_ocl, gray0s); // split 3 channels into a vector of oclMat
-    // find squares in every color plane of the image
-    for( int c = 0; c < 3; c++ )
-    {
-        gray0_ocl = gray0s[c];
-        // try several threshold levels
-        for( int l = 0; l < N; l++ )
-        {
-            // hack: use Canny instead of zero threshold level.
-            // Canny helps to catch squares with gradient shading
-            if( l == 0 )
-            {
-                // do canny on OpenCL device
-                // apply Canny. Take the upper threshold from slider
-                // and set the lower to 0 (which forces edges merging)
-                cv::ocl::Canny(gray0_ocl, gray_ocl, 0, thresh, 5);
-                // dilate canny output to remove potential
-                // holes between edge segments
-                ocl::dilate(gray_ocl, gray_ocl, Mat(), Point(-1,-1));
-                gray = Mat(gray_ocl);
-            }
-            else
-            {
-                // apply threshold if l!=0:
-                //     tgray(x,y) = gray(x,y) < (l+1)*255/N ? 255 : 0
-                cv::ocl::threshold(gray0_ocl, gray_ocl, (l+1)*255/N, 255, THRESH_BINARY);
-                gray = gray_ocl;
-            }
-
-            // find contours and store them all as a list
-            findContours(gray, contours, RETR_LIST, CHAIN_APPROX_SIMPLE);
-
-            vector<Point> approx;
-            // test each contour
-            for( size_t i = 0; i < contours.size(); i++ )
-            {
-                // approximate contour with accuracy proportional
-                // to the contour perimeter
-                approxPolyDP(Mat(contours[i]), approx, arcLength(Mat(contours[i]), true)*0.02, true);
-
-                // square contours should have 4 vertices after approximation
-                // relatively large area (to filter out noisy contours)
-                // and be convex.
-                // Note: absolute value of an area is used because
-                // area may be positive or negative - in accordance with the
-                // contour orientation
-                if( approx.size() == 4 &&
-                        fabs(contourArea(Mat(approx))) > 1000 &&
-                        isContourConvex(Mat(approx)) )
-                {
-                    double maxCosine = 0;
-                    for( int j = 2; j < 5; j++ )
-                    {
-                        // find the maximum cosine of the angle between joint edges
-                        double cosine = fabs(angle(approx[j%4], approx[j-2], approx[j-1]));
-                        maxCosine = MAX(maxCosine, cosine);
-                    }
-
-                    // if cosines of all angles are small
-                    // (all angles are ~90 degree) then write quandrange
-                    // vertices to resultant sequence
-                    if( maxCosine < 0.3 )
-                        squares.push_back(approx);
-                }
-            }
-        }
-    }
-}
-
-
-// the function draws all the squares in the image
-static void drawSquares( Mat& image, const vector<vector<Point> >& squares )
-{
-    for( size_t i = 0; i < squares.size(); i++ )
-    {
-        const Point* p = &squares[i][0];
-        int n = (int)squares[i].size();
-        polylines(image, &p, &n, 1, true, Scalar(0,255,0), 3, LINE_AA);
-    }
-}
-
-
-// draw both pure-C++ and ocl square results onto a single image
-static Mat drawSquaresBoth( const Mat& image,
-                            const vector<vector<Point> >& sqsCPP,
-                            const vector<vector<Point> >& sqsOCL
-)
-{
-    Mat imgToShow(Size(image.cols * 2, image.rows), image.type());
-    Mat lImg = imgToShow(Rect(Point(0, 0), image.size()));
-    Mat rImg = imgToShow(Rect(Point(image.cols, 0), image.size()));
-    image.copyTo(lImg);
-    image.copyTo(rImg);
-    drawSquares(lImg, sqsCPP);
-    drawSquares(rImg, sqsOCL);
-    float fontScale = 0.8f;
-    Scalar white = Scalar::all(255), black = Scalar::all(0);
-
-    putText(lImg, "C++", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, black, 2);
-    putText(rImg, "OCL", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, black, 2);
-    putText(lImg, "C++", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, white, 1);
-    putText(rImg, "OCL", Point(10, 20), FONT_HERSHEY_COMPLEX_SMALL, fontScale, white, 1);
-
-    return imgToShow;
-}
-
-
-int main(int argc, char** argv)
-{
-    const char* keys =
-        "{ i | input   |                    | specify input image }"
-        "{ o | output  | squares_output.jpg | specify output save path}"
-        "{ h | help    | false              | print help message }";
-    CommandLineParser cmd(argc, argv, keys);
-    string inputName = cmd.get<string>("i");
-    string outfile = cmd.get<string>("o");
-
-    if(cmd.get<bool>("help"))
-    {
-        cout << "Usage : squares [options]" << endl;
-        cout << "Available options:" << endl;
-        cmd.printMessage();
-        return EXIT_SUCCESS;
-    }
-
-    int iterations = 10;
-    namedWindow( wndname, WINDOW_AUTOSIZE );
-    vector<vector<Point> > squares_cpu, squares_ocl;
-
-    Mat image = imread(inputName, 1);
-    if( image.empty() )
-    {
-        cout << "Couldn't load " << inputName << endl;
-        return EXIT_FAILURE;
-    }
-
-    int j = iterations;
-    int64 t_ocl = 0, t_cpp = 0;
-    //warm-ups
-    cout << "warming up ..." << endl;
-    findSquares(image, squares_cpu);
-    findSquares_ocl(image, squares_ocl);
-
-
-#ifdef ACCURACY_CHECK
-    cout << "Checking ocl accuracy ... " << endl;
-    cout << (checkPoints(squares_cpu, squares_ocl) ? "Pass" : "Failed") << endl;
-#endif
-    do
-    {
-        int64 t_start = cv::getTickCount();
-        findSquares(image, squares_cpu);
-        t_cpp += cv::getTickCount() - t_start;
-
-
-        t_start  = cv::getTickCount();
-        findSquares_ocl(image, squares_ocl);
-        t_ocl += cv::getTickCount() - t_start;
-        cout << "run loop: " << j << endl;
-    }
-    while(--j);
-    cout << "cpp average time: " << 1000.0f * (double)t_cpp / getTickFrequency() / iterations << "ms" << endl;
-    cout << "ocl average time: " << 1000.0f * (double)t_ocl / getTickFrequency() / iterations << "ms" << endl;
-
-    Mat result = drawSquaresBoth(image, squares_cpu, squares_ocl);
-    imshow(wndname, result);
-    imwrite(outfile, result);
-    waitKey(0);
-
-    return EXIT_SUCCESS;
-}
diff --git a/samples/ocl/stereo_match.cpp b/samples/ocl/stereo_match.cpp
deleted file mode 100644
index 880ad51c0..000000000
--- a/samples/ocl/stereo_match.cpp
+++ /dev/null
@@ -1,384 +0,0 @@
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <iomanip>
-#include <stdexcept>
-
-#include "opencv2/core/utility.hpp"
-#include "opencv2/ocl/ocl.hpp"
-#include "opencv2/highgui/highgui.hpp"
-
-using namespace cv;
-using namespace std;
-using namespace ocl;
-
-
-struct App
-{
-    App(CommandLineParser& cmd);
-    void run();
-    void handleKey(char key);
-    void printParams() const;
-
-    void workBegin()
-    {
-        work_begin = getTickCount();
-    }
-    void workEnd()
-    {
-        int64 d = getTickCount() - work_begin;
-        double f = getTickFrequency();
-        work_fps = f / d;
-    }
-    string method_str() const
-    {
-        switch (method)
-        {
-        case BM:
-            return "BM";
-        case BP:
-            return "BP";
-        case CSBP:
-            return "CSBP";
-        }
-        return "";
-    }
-    string text() const
-    {
-        stringstream ss;
-        ss << "(" << method_str() << ") FPS: " << setiosflags(ios::left)
-           << setprecision(4) << work_fps;
-        return ss.str();
-    }
-private:
-    bool running, write_once;
-
-    Mat left_src, right_src;
-    Mat left, right;
-    oclMat d_left, d_right;
-
-    StereoBM_OCL bm;
-    StereoBeliefPropagation bp;
-    StereoConstantSpaceBP csbp;
-
-    int64 work_begin;
-    double work_fps;
-
-    string l_img, r_img;
-    string out_img;
-    enum {BM, BP, CSBP} method;
-    int ndisp; // Max disparity + 1
-    enum {GPU, CPU} type;
-};
-
-int main(int argc, char** argv)
-{
-    const char* keys =
-        "{ h | help     | false                     | print help message }"
-        "{ l | left     |                           | specify left image }"
-        "{ r | right    |                           | specify right image }"
-        "{ m | method   | BM                        | specify match method(BM/BP/CSBP) }"
-        "{ n | ndisp    | 64                        | specify number of disparity levels }"
-        "{ o | output   | stereo_match_output.jpg   | specify output path when input is images}";
-
-    CommandLineParser cmd(argc, argv, keys);
-    if (cmd.get<bool>("help"))
-    {
-        cout << "Available options:" << endl;
-        cmd.printMessage();
-        return 0;
-    }
-
-    try
-    {
-        App app(cmd);
-        cout << "Device name:" << cv::ocl::Context::getContext()->getDeviceInfo().deviceName << endl;
-
-        app.run();
-    }
-    catch (const exception& e)
-    {
-        cout << "error: " << e.what() << endl;
-    }
-
-    return EXIT_SUCCESS;
-}
-
-App::App(CommandLineParser& cmd)
-    : running(false),method(BM)
-{
-    cout << "stereo_match_ocl sample\n";
-    cout << "\nControls:\n"
-         << "\tesc - exit\n"
-         << "\to - save output image once\n"
-         << "\tp - print current parameters\n"
-         << "\tg - convert source images into gray\n"
-         << "\tm - change stereo match method\n"
-         << "\ts - change Sobel prefiltering flag (for BM only)\n"
-         << "\t1/q - increase/decrease maximum disparity\n"
-         << "\t2/w - increase/decrease window size (for BM only)\n"
-         << "\t3/e - increase/decrease iteration count (for BP and CSBP only)\n"
-         << "\t4/r - increase/decrease level count (for BP and CSBP only)\n";
-
-    l_img = cmd.get<string>("l");
-    r_img = cmd.get<string>("r");
-    string mstr = cmd.get<string>("m");
-    if(mstr == "BM") method = BM;
-    else if(mstr == "BP") method = BP;
-    else if(mstr == "CSBP") method = CSBP;
-    else cout << "unknown method!\n";
-    ndisp = cmd.get<int>("n");
-    out_img = cmd.get<string>("o");
-    write_once = false;
-}
-
-
-void App::run()
-{
-    // Load images
-    left_src = imread(l_img);
-    right_src = imread(r_img);
-    if (left_src.empty()) throw runtime_error("can't open file \"" + l_img + "\"");
-    if (right_src.empty()) throw runtime_error("can't open file \"" + r_img + "\"");
-
-    cvtColor(left_src, left, COLOR_BGR2GRAY);
-    cvtColor(right_src, right, COLOR_BGR2GRAY);
-
-    d_left.upload(left);
-    d_right.upload(right);
-
-    imshow("left", left);
-    imshow("right", right);
-
-    // Set common parameters
-    bm.ndisp = ndisp;
-    bp.ndisp = ndisp;
-    csbp.ndisp = ndisp;
-
-    cout << endl;
-    printParams();
-
-    running = true;
-    while (running)
-    {
-        // Prepare disparity map of specified type
-        Mat disp;
-        oclMat d_disp;
-        workBegin();
-        switch (method)
-        {
-        case BM:
-            if (d_left.channels() > 1 || d_right.channels() > 1)
-            {
-                cout << "BM doesn't support color images\n";
-                cvtColor(left_src, left, COLOR_BGR2GRAY);
-                cvtColor(right_src, right, COLOR_BGR2GRAY);
-                cout << "image_channels: " << left.channels() << endl;
-                d_left.upload(left);
-                d_right.upload(right);
-                imshow("left", left);
-                imshow("right", right);
-            }
-            bm(d_left, d_right, d_disp);
-            break;
-        case BP:
-            bp(d_left, d_right, d_disp);
-            break;
-        case CSBP:
-            csbp(d_left, d_right, d_disp);
-            break;
-        }
-
-        // Show results
-        d_disp.download(disp);
-        workEnd();
-
-        if (method != BM)
-        {
-            disp.convertTo(disp, 0);
-        }
-        putText(disp, text(), Point(5, 25), FONT_HERSHEY_SIMPLEX, 1.0, Scalar::all(255));
-        imshow("disparity", disp);
-        if(write_once)
-        {
-            imwrite(out_img, disp);
-            write_once = false;
-        }
-        handleKey((char)waitKey(3));
-    }
-}
-
-
-void App::printParams() const
-{
-    cout << "--- Parameters ---\n";
-    cout << "image_size: (" << left.cols << ", " << left.rows << ")\n";
-    cout << "image_channels: " << left.channels() << endl;
-    cout << "method: " << method_str() << endl
-         << "ndisp: " << ndisp << endl;
-    switch (method)
-    {
-    case BM:
-        cout << "win_size: " << bm.winSize << endl;
-        cout << "prefilter_sobel: " << bm.preset << endl;
-        break;
-    case BP:
-        cout << "iter_count: " << bp.iters << endl;
-        cout << "level_count: " << bp.levels << endl;
-        break;
-    case CSBP:
-        cout << "iter_count: " << csbp.iters << endl;
-        cout << "level_count: " << csbp.levels << endl;
-        break;
-    }
-    cout << endl;
-}
-
-
-void App::handleKey(char key)
-{
-    switch (key)
-    {
-    case 27:
-        running = false;
-        break;
-    case 'p':
-    case 'P':
-        printParams();
-        break;
-    case 'g':
-    case 'G':
-        if (left.channels() == 1 && method != BM)
-        {
-            left = left_src;
-            right = right_src;
-        }
-        else
-        {
-            cvtColor(left_src, left, COLOR_BGR2GRAY);
-            cvtColor(right_src, right, COLOR_BGR2GRAY);
-        }
-        d_left.upload(left);
-        d_right.upload(right);
-        cout << "image_channels: " << left.channels() << endl;
-        imshow("left", left);
-        imshow("right", right);
-        break;
-    case 'm':
-    case 'M':
-        switch (method)
-        {
-        case BM:
-            method = BP;
-            break;
-        case BP:
-            method = CSBP;
-            break;
-        case CSBP:
-            method = BM;
-            break;
-        }
-        cout << "method: " << method_str() << endl;
-        break;
-    case 's':
-    case 'S':
-        if (method == BM)
-        {
-            switch (bm.preset)
-            {
-            case StereoBM_OCL::BASIC_PRESET:
-                bm.preset = StereoBM_OCL::PREFILTER_XSOBEL;
-                break;
-            case StereoBM_OCL::PREFILTER_XSOBEL:
-                bm.preset = StereoBM_OCL::BASIC_PRESET;
-                break;
-            }
-            cout << "prefilter_sobel: " << bm.preset << endl;
-        }
-        break;
-    case '1':
-        ndisp == 1 ? ndisp = 8 : ndisp += 8;
-        cout << "ndisp: " << ndisp << endl;
-        bm.ndisp = ndisp;
-        bp.ndisp = ndisp;
-        csbp.ndisp = ndisp;
-        break;
-    case 'q':
-    case 'Q':
-        ndisp = max(ndisp - 8, 1);
-        cout << "ndisp: " << ndisp << endl;
-        bm.ndisp = ndisp;
-        bp.ndisp = ndisp;
-        csbp.ndisp = ndisp;
-        break;
-    case '2':
-        if (method == BM)
-        {
-            bm.winSize = min(bm.winSize + 1, 51);
-            cout << "win_size: " << bm.winSize << endl;
-        }
-        break;
-    case 'w':
-    case 'W':
-        if (method == BM)
-        {
-            bm.winSize = max(bm.winSize - 1, 2);
-            cout << "win_size: " << bm.winSize << endl;
-        }
-        break;
-    case '3':
-        if (method == BP)
-        {
-            bp.iters += 1;
-            cout << "iter_count: " << bp.iters << endl;
-        }
-        else if (method == CSBP)
-        {
-            csbp.iters += 1;
-            cout << "iter_count: " << csbp.iters << endl;
-        }
-        break;
-    case 'e':
-    case 'E':
-        if (method == BP)
-        {
-            bp.iters = max(bp.iters - 1, 1);
-            cout << "iter_count: " << bp.iters << endl;
-        }
-        else if (method == CSBP)
-        {
-            csbp.iters = max(csbp.iters - 1, 1);
-            cout << "iter_count: " << csbp.iters << endl;
-        }
-        break;
-    case '4':
-        if (method == BP)
-        {
-            bp.levels += 1;
-            cout << "level_count: " << bp.levels << endl;
-        }
-        else if (method == CSBP)
-        {
-            csbp.levels += 1;
-            cout << "level_count: " << csbp.levels << endl;
-        }
-        break;
-    case 'r':
-    case 'R':
-        if (method == BP)
-        {
-            bp.levels = max(bp.levels - 1, 1);
-            cout << "level_count: " << bp.levels << endl;
-        }
-        else if (method == CSBP)
-        {
-            csbp.levels = max(csbp.levels - 1, 1);
-            cout << "level_count: " << csbp.levels << endl;
-        }
-        break;
-    case 'o':
-    case 'O':
-        write_once = true;
-        break;
-    }
-}
diff --git a/samples/ocl/surf_matcher.cpp b/samples/ocl/surf_matcher.cpp
deleted file mode 100644
index f88678b7b..000000000
--- a/samples/ocl/surf_matcher.cpp
+++ /dev/null
@@ -1,329 +0,0 @@
-#include <iostream>
-#include <stdio.h>
-#include "opencv2/core/core.hpp"
-#include "opencv2/core/utility.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/ocl/ocl.hpp"
-#include "opencv2/nonfree/ocl.hpp"
-#include "opencv2/calib3d/calib3d.hpp"
-#include "opencv2/nonfree/nonfree.hpp"
-
-using namespace cv;
-using namespace cv::ocl;
-
-const int LOOP_NUM = 10;
-const int GOOD_PTS_MAX = 50;
-const float GOOD_PORTION = 0.15f;
-
-int64 work_begin = 0;
-int64 work_end = 0;
-
-static void workBegin()
-{
-    work_begin = getTickCount();
-}
-
-static void workEnd()
-{
-    work_end = getTickCount() - work_begin;
-}
-
-static double getTime()
-{
-    return work_end /((double)getTickFrequency() * 1000.);
-}
-
-template<class KPDetector>
-struct SURFDetector
-{
-    KPDetector surf;
-    SURFDetector(double hessian = 800.0)
-        :surf(hessian)
-    {
-    }
-    template<class T>
-    void operator()(const T& in, const T& mask, std::vector<cv::KeyPoint>& pts, T& descriptors, bool useProvided = false)
-    {
-        surf(in, mask, pts, descriptors, useProvided);
-    }
-};
-
-template<class KPMatcher>
-struct SURFMatcher
-{
-    KPMatcher matcher;
-    template<class T>
-    void match(const T& in1, const T& in2, std::vector<cv::DMatch>& matches)
-    {
-        matcher.match(in1, in2, matches);
-    }
-};
-
-static Mat drawGoodMatches(
-    const Mat& cpu_img1,
-    const Mat& cpu_img2,
-    const std::vector<KeyPoint>& keypoints1,
-    const std::vector<KeyPoint>& keypoints2,
-    std::vector<DMatch>& matches,
-    std::vector<Point2f>& scene_corners_
-    )
-{
-    //-- Sort matches and preserve top 10% matches
-    std::sort(matches.begin(), matches.end());
-    std::vector< DMatch > good_matches;
-    double minDist = matches.front().distance,
-           maxDist = matches.back().distance;
-
-    const int ptsPairs = std::min(GOOD_PTS_MAX, (int)(matches.size() * GOOD_PORTION));
-    for( int i = 0; i < ptsPairs; i++ )
-    {
-        good_matches.push_back( matches[i] );
-    }
-    std::cout << "\nMax distance: " << maxDist << std::endl;
-    std::cout << "Min distance: " << minDist << std::endl;
-
-    std::cout << "Calculating homography using " << ptsPairs << " point pairs." << std::endl;
-
-    // drawing the results
-    Mat img_matches;
-    drawMatches( cpu_img1, keypoints1, cpu_img2, keypoints2,
-                 good_matches, img_matches, Scalar::all(-1), Scalar::all(-1),
-                 std::vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS  );
-
-    //-- Localize the object
-    std::vector<Point2f> obj;
-    std::vector<Point2f> scene;
-
-    for( size_t i = 0; i < good_matches.size(); i++ )
-    {
-        //-- Get the keypoints from the good matches
-        obj.push_back( keypoints1[ good_matches[i].queryIdx ].pt );
-        scene.push_back( keypoints2[ good_matches[i].trainIdx ].pt );
-    }
-    //-- Get the corners from the image_1 ( the object to be "detected" )
-    std::vector<Point2f> obj_corners(4);
-    obj_corners[0] = Point(0,0);
-    obj_corners[1] = Point( cpu_img1.cols, 0 );
-    obj_corners[2] = Point( cpu_img1.cols, cpu_img1.rows );
-    obj_corners[3] = Point( 0, cpu_img1.rows );
-    std::vector<Point2f> scene_corners(4);
-
-    Mat H = findHomography( obj, scene, RANSAC );
-    perspectiveTransform( obj_corners, scene_corners, H);
-
-    scene_corners_ = scene_corners;
-
-    //-- Draw lines between the corners (the mapped object in the scene - image_2 )
-    line( img_matches,
-          scene_corners[0] + Point2f( (float)cpu_img1.cols, 0), scene_corners[1] + Point2f( (float)cpu_img1.cols, 0),
-          Scalar( 0, 255, 0), 2, LINE_AA );
-    line( img_matches,
-          scene_corners[1] + Point2f( (float)cpu_img1.cols, 0), scene_corners[2] + Point2f( (float)cpu_img1.cols, 0),
-          Scalar( 0, 255, 0), 2, LINE_AA );
-    line( img_matches,
-          scene_corners[2] + Point2f( (float)cpu_img1.cols, 0), scene_corners[3] + Point2f( (float)cpu_img1.cols, 0),
-          Scalar( 0, 255, 0), 2, LINE_AA );
-    line( img_matches,
-          scene_corners[3] + Point2f( (float)cpu_img1.cols, 0), scene_corners[0] + Point2f( (float)cpu_img1.cols, 0),
-          Scalar( 0, 255, 0), 2, LINE_AA );
-    return img_matches;
-}
-
-////////////////////////////////////////////////////
-// This program demonstrates the usage of SURF_OCL.
-// use cpu findHomography interface to calculate the transformation matrix
-int main(int argc, char* argv[])
-{
-    const char* keys =
-        "{ help h    | false           | print help message  }"
-        "{ left l    |                 | specify left image  }"
-        "{ right r   |                 | specify right image }"
-        "{ output o  | SURF_output.jpg | specify output save path (only works in CPU or GPU only mode) }"
-        "{ use_cpu c | false           | use CPU algorithms  }"
-        "{ use_all a | false           | use both CPU and GPU algorithms}";
-
-    CommandLineParser cmd(argc, argv, keys);
-    if (cmd.get<bool>("help"))
-    {
-        std::cout << "Usage: surf_matcher [options]" << std::endl;
-        std::cout << "Available options:" << std::endl;
-        cmd.printMessage();
-        return EXIT_SUCCESS;
-    }
-
-    Mat cpu_img1, cpu_img2, cpu_img1_grey, cpu_img2_grey;
-    oclMat img1, img2;
-    bool useCPU = cmd.get<bool>("c");
-    bool useGPU = false;
-    bool useALL = cmd.get<bool>("a");
-
-    std::string outpath = cmd.get<std::string>("o");
-
-    cpu_img1 = imread(cmd.get<std::string>("l"));
-    CV_Assert(!cpu_img1.empty());
-    cvtColor(cpu_img1, cpu_img1_grey, COLOR_BGR2GRAY);
-    img1 = cpu_img1_grey;
-
-    cpu_img2 = imread(cmd.get<std::string>("r"));
-    CV_Assert(!cpu_img2.empty());
-    cvtColor(cpu_img2, cpu_img2_grey, COLOR_BGR2GRAY);
-    img2 = cpu_img2_grey;
-
-    if (useALL)
-        useCPU = useGPU = false;
-    else if(!useCPU && !useALL)
-        useGPU = true;
-
-    if(!useCPU)
-        std::cout
-                << "Device name:"
-                << cv::ocl::Context::getContext()->getDeviceInfo().deviceName
-                << std::endl;
-
-    double surf_time = 0.;
-
-    //declare input/output
-    std::vector<KeyPoint> keypoints1, keypoints2;
-    std::vector<DMatch> matches;
-
-    std::vector<KeyPoint> gpu_keypoints1;
-    std::vector<KeyPoint> gpu_keypoints2;
-    std::vector<DMatch> gpu_matches;
-
-    Mat descriptors1CPU, descriptors2CPU;
-
-    oclMat keypoints1GPU, keypoints2GPU;
-    oclMat descriptors1GPU, descriptors2GPU;
-
-    //instantiate detectors/matchers
-    SURFDetector<SURF>     cpp_surf;
-    SURFDetector<SURF_OCL> ocl_surf;
-
-    SURFMatcher<BFMatcher>      cpp_matcher;
-    SURFMatcher<BFMatcher_OCL>  ocl_matcher;
-
-    //-- start of timing section
-    if (useCPU)
-    {
-        for (int i = 0; i <= LOOP_NUM; i++)
-        {
-            if(i == 1) workBegin();
-            cpp_surf(cpu_img1_grey, Mat(), keypoints1, descriptors1CPU);
-            cpp_surf(cpu_img2_grey, Mat(), keypoints2, descriptors2CPU);
-            cpp_matcher.match(descriptors1CPU, descriptors2CPU, matches);
-        }
-        workEnd();
-        std::cout << "CPP: FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
-        std::cout << "CPP: FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
-
-        surf_time = getTime();
-        std::cout << "SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl<<"\n";
-    }
-    else if(useGPU)
-    {
-        for (int i = 0; i <= LOOP_NUM; i++)
-        {
-            if(i == 1) workBegin();
-            ocl_surf(img1, oclMat(), keypoints1, descriptors1GPU);
-            ocl_surf(img2, oclMat(), keypoints2, descriptors2GPU);
-            ocl_matcher.match(descriptors1GPU, descriptors2GPU, matches);
-        }
-        workEnd();
-        std::cout << "OCL: FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
-        std::cout << "OCL: FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
-
-        surf_time = getTime();
-        std::cout << "SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl<<"\n";
-    }
-    else
-    {
-        //cpu runs
-        for (int i = 0; i <= LOOP_NUM; i++)
-        {
-            if(i == 1) workBegin();
-            cpp_surf(cpu_img1_grey, Mat(), keypoints1, descriptors1CPU);
-            cpp_surf(cpu_img2_grey, Mat(), keypoints2, descriptors2CPU);
-            cpp_matcher.match(descriptors1CPU, descriptors2CPU, matches);
-        }
-        workEnd();
-        std::cout << "\nCPP: FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
-        std::cout << "CPP: FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
-
-        surf_time = getTime();
-        std::cout << "(CPP)SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl;
-
-        //gpu runs
-        for (int i = 0; i <= LOOP_NUM; i++)
-        {
-            if(i == 1) workBegin();
-            ocl_surf(img1, oclMat(), gpu_keypoints1, descriptors1GPU);
-            ocl_surf(img2, oclMat(), gpu_keypoints2, descriptors2GPU);
-            ocl_matcher.match(descriptors1GPU, descriptors2GPU, gpu_matches);
-        }
-        workEnd();
-        std::cout << "\nOCL: FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
-        std::cout << "OCL: FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
-
-        surf_time = getTime();
-        std::cout << "(OCL)SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl<<"\n";
-
-    }
-
-    //--------------------------------------------------------------------------
-    std::vector<Point2f> cpu_corner;
-    Mat img_matches = drawGoodMatches(cpu_img1, cpu_img2, keypoints1, keypoints2, matches, cpu_corner);
-
-    std::vector<Point2f> gpu_corner;
-    Mat ocl_img_matches;
-    if(useALL || (!useCPU&&!useGPU))
-    {
-        ocl_img_matches = drawGoodMatches(cpu_img1, cpu_img2, gpu_keypoints1, gpu_keypoints2, gpu_matches, gpu_corner);
-
-        //check accuracy
-        std::cout<<"\nCheck accuracy:\n";
-
-        if(cpu_corner.size()!=gpu_corner.size())
-            std::cout<<"Failed\n";
-        else
-        {
-            bool result = false;
-            for(size_t i = 0; i < cpu_corner.size(); i++)
-            {
-                if((std::abs(cpu_corner[i].x - gpu_corner[i].x) > 10)
-                        ||(std::abs(cpu_corner[i].y - gpu_corner[i].y) > 10))
-                {
-                    std::cout<<"Failed\n";
-                    result = false;
-                    break;
-                }
-                result = true;
-            }
-            if(result)
-                std::cout<<"Passed\n";
-        }
-    }
-
-    //-- Show detected matches
-    if (useCPU)
-    {
-        namedWindow("cpu surf matches", 0);
-        imshow("cpu surf matches", img_matches);
-        imwrite(outpath, img_matches);
-    }
-    else if(useGPU)
-    {
-        namedWindow("ocl surf matches", 0);
-        imshow("ocl surf matches", img_matches);
-        imwrite(outpath, img_matches);
-    }
-    else
-    {
-        namedWindow("cpu surf matches", 0);
-        imshow("cpu surf matches", img_matches);
-
-        namedWindow("ocl surf matches", 0);
-        imshow("ocl surf matches", ocl_img_matches);
-    }
-    waitKey(0);
-    return EXIT_SUCCESS;
-}
diff --git a/samples/python2/grabcut.py b/samples/python2/grabcut.py
index 1d5b823dd..42d9743be 100644
--- a/samples/python2/grabcut.py
+++ b/samples/python2/grabcut.py
@@ -5,7 +5,7 @@ Interactive Image Segmentation using GrabCut algorithm.
 
 This sample shows interactive image segmentation using grabcut algorithm.
 
-USAGE :
+USAGE:
     python grabcut.py <filename>
 
 README FIRST:
@@ -63,14 +63,14 @@ def onmouse(event,x,y,flags,param):
         if rectangle == True:
             img = img2.copy()
             cv2.rectangle(img,(ix,iy),(x,y),BLUE,2)
-            rect = (ix,iy,abs(ix-x),abs(iy-y))
+            rect = (min(ix,x),min(iy,y),abs(ix-x),abs(iy-y))
             rect_or_mask = 0
 
     elif event == cv2.EVENT_RBUTTONUP:
         rectangle = False
         rect_over = True
         cv2.rectangle(img,(ix,iy),(x,y),BLUE,2)
-        rect = (ix,iy,abs(ix-x),abs(iy-y))
+        rect = (min(ix,x),min(iy,y),abs(ix-x),abs(iy-y))
         rect_or_mask = 0
         print " Now press the key 'n' a few times until no further change \n"
 
@@ -103,7 +103,7 @@ if len(sys.argv) == 2:
     filename = sys.argv[1] # for drawing purposes
 else:
     print "No input image given, so loading default image, lena.jpg \n"
-    print "Correct Usage : python grabcut.py <filename> \n"
+    print "Correct Usage: python grabcut.py <filename> \n"
     filename = '../cpp/lena.jpg'
 
 img = cv2.imread(filename)
@@ -117,7 +117,7 @@ cv2.namedWindow('input')
 cv2.setMouseCallback('input',onmouse)
 cv2.moveWindow('input',img.shape[1]+10,90)
 
-print " Instructions : \n"
+print " Instructions: \n"
 print " Draw a rectangle around the object using right mouse button \n"
 
 while(1):
diff --git a/samples/ocl/CMakeLists.txt b/samples/tapi/CMakeLists.txt
similarity index 62%
rename from samples/ocl/CMakeLists.txt
rename to samples/tapi/CMakeLists.txt
index b4f7afa21..e1fc8552c 100644
--- a/samples/ocl/CMakeLists.txt
+++ b/samples/tapi/CMakeLists.txt
@@ -1,30 +1,23 @@
-SET(OPENCV_OCL_SAMPLES_REQUIRED_DEPS opencv_core opencv_flann opencv_imgproc opencv_highgui
-                                     opencv_ml opencv_video opencv_objdetect opencv_features2d
-                                     opencv_calib3d opencv_legacy opencv_contrib opencv_ocl
-                                     opencv_nonfree opencv_bioinspired)
+SET(OPENCV_TAPI_SAMPLES_REQUIRED_DEPS opencv_core opencv_imgproc opencv_video opencv_highgui opencv_objdetect opencv_features2d opencv_calib3d opencv_nonfree opencv_flann)
 
-ocv_check_dependencies(${OPENCV_OCL_SAMPLES_REQUIRED_DEPS})
+ocv_check_dependencies(${OPENCV_TAPI_SAMPLES_REQUIRED_DEPS})
 
 if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
-  set(project "ocl")
+  set(project "tapi")
   string(TOUPPER "${project}" project_upper)
 
   project("${project}_samples")
 
-  ocv_include_modules(${OPENCV_OCL_SAMPLES_REQUIRED_DEPS})
-
-  if(HAVE_OPENCL)
-    ocv_include_directories(${OPENCL_INCLUDE_DIR})
-  endif()
+  ocv_include_modules(${OPENCV_TAPI_SAMPLES_REQUIRED_DEPS})
 
   # ---------------------------------------------
   #      Define executable targets
   # ---------------------------------------------
-  MACRO(OPENCV_DEFINE_OCL_EXAMPLE name srcs)
+  MACRO(OPENCV_DEFINE_TAPI_EXAMPLE name srcs)
     set(the_target "example_${project}_${name}")
     add_executable(${the_target} ${srcs})
 
-    target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${OPENCV_OCL_SAMPLES_REQUIRED_DEPS})
+    target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${OPENCV_TAPI_SAMPLES_REQUIRED_DEPS})
 
     set_target_properties(${the_target} PROPERTIES
       OUTPUT_NAME "${project}-example-${name}"
@@ -47,13 +40,13 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
   foreach(sample_filename ${all_samples})
     get_filename_component(sample ${sample_filename} NAME_WE)
     file(GLOB sample_srcs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${sample}.*)
-    OPENCV_DEFINE_OCL_EXAMPLE(${sample} ${sample_srcs})
+    OPENCV_DEFINE_TAPI_EXAMPLE(${sample} ${sample_srcs})
   endforeach()
 endif()
 
-if (INSTALL_C_EXAMPLES AND NOT WIN32)
+if(INSTALL_C_EXAMPLES AND NOT WIN32)
   file(GLOB install_list *.c *.cpp *.jpg *.png *.data makefile.* build_all.sh *.dsp *.cmd )
   install(FILES ${install_list}
-          DESTINATION share/OpenCV/samples/${project}
-          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ)
+          DESTINATION ${OPENCV_SAMPLES_SRC_INSTALL_PATH}/tapi
+          PERMISSIONS OWNER_READ GROUP_READ WORLD_READ COMPONENT samples)
 endif()
diff --git a/samples/tapi/bgfg_segm.cpp b/samples/tapi/bgfg_segm.cpp
new file mode 100644
index 000000000..2fa12bba9
--- /dev/null
+++ b/samples/tapi/bgfg_segm.cpp
@@ -0,0 +1,122 @@
+#include <iostream>
+#include <string>
+
+#include "opencv2/core.hpp"
+#include "opencv2/core/ocl.hpp"
+#include "opencv2/core/utility.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/video.hpp"
+
+using namespace std;
+using namespace cv;
+
+#define M_MOG  1
+#define M_MOG2 2
+
+int main(int argc, const char** argv)
+{
+    CommandLineParser cmd(argc, argv,
+        "{ c camera   | false       | use camera }"
+        "{ f file     | 768x576.avi | input video file }"
+        "{ t type     | mog         | method's type (mog, mog2) }"
+        "{ h help     | false       | print help message }"
+        "{ m cpu_mode | false       | press 'm' to switch OpenCL<->CPU}");
+
+    if (cmd.has("help"))
+    {
+        cout << "Usage : bgfg_segm [options]" << endl;
+        cout << "Available options:" << endl;
+        cmd.printMessage();
+        return EXIT_SUCCESS;
+    }
+
+    bool useCamera = cmd.has("camera");
+    string file = cmd.get<string>("file");
+    string method = cmd.get<string>("type");
+
+    if (method != "mog" && method != "mog2")
+    {
+        cerr << "Incorrect method" << endl;
+        return EXIT_FAILURE;
+    }
+
+    int m = method == "mog" ? M_MOG : M_MOG2;
+
+    VideoCapture cap;
+    if (useCamera)
+        cap.open(0);
+    else
+        cap.open(file);
+
+    if (!cap.isOpened())
+    {
+        cout << "can not open camera or video file" << endl;
+        return EXIT_FAILURE;
+    }
+
+    UMat frame, fgmask, fgimg;
+    cap >> frame;
+    fgimg.create(frame.size(), frame.type());
+
+    Ptr<BackgroundSubtractorMOG> mog = createBackgroundSubtractorMOG();
+    Ptr<BackgroundSubtractorMOG2> mog2 = createBackgroundSubtractorMOG2();
+
+    switch (m)
+    {
+    case M_MOG:
+        mog->apply(frame, fgmask, 0.01f);
+        break;
+
+    case M_MOG2:
+        mog2->apply(frame, fgmask);
+        break;
+    }
+    bool running=true;
+    for (;;)
+    {
+        if(!running)
+            break;
+        cap >> frame;
+        if (frame.empty())
+            break;
+
+        int64 start = getTickCount();
+
+        //update the model
+        switch (m)
+        {
+        case M_MOG:
+            mog->apply(frame, fgmask, 0.01f);
+            break;
+
+        case M_MOG2:
+            mog2->apply(frame, fgmask);
+            break;
+        }
+
+        double fps = getTickFrequency() / (getTickCount() - start);
+        std::cout << "FPS : " << fps << std::endl;
+        std::cout << fgimg.size() << std::endl;
+        fgimg.setTo(Scalar::all(0));
+        frame.copyTo(fgimg, fgmask);
+
+        imshow("image", frame);
+        imshow("foreground mask", fgmask);
+        imshow("foreground image", fgimg);
+
+        char key = (char)waitKey(30);
+
+        switch (key)
+        {
+        case 27:
+            running = false;
+            break;
+        case 'm':
+        case 'M':
+            ocl::setUseOpenCL(!ocl::useOpenCL());
+            cout << "Switched to " << (ocl::useOpenCL() ? "OpenCL enabled" : "CPU") << " mode\n";
+            break;
+        }
+    }
+    return EXIT_SUCCESS;
+}
diff --git a/samples/tapi/camshift.cpp b/samples/tapi/camshift.cpp
new file mode 100644
index 000000000..22c65bf69
--- /dev/null
+++ b/samples/tapi/camshift.cpp
@@ -0,0 +1,226 @@
+#include "opencv2/core/utility.hpp"
+#include "opencv2/core/ocl.hpp"
+#include "opencv2/video/tracking.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/highgui/highgui.hpp"
+
+#include <iostream>
+#include <cctype>
+
+static cv::UMat image;
+static bool backprojMode = false;
+static bool selectObject = false;
+static int trackObject = 0;
+static bool showHist = true;
+static cv::Rect selection;
+static int vmin = 10, vmax = 256, smin = 30;
+
+static void onMouse(int event, int x, int y, int, void*)
+{
+    static cv::Point origin;
+
+    if (selectObject)
+    {
+        selection.x = std::min(x, origin.x);
+        selection.y = std::min(y, origin.y);
+        selection.width = std::abs(x - origin.x);
+        selection.height = std::abs(y - origin.y);
+
+        selection &= cv::Rect(0, 0, image.cols, image.rows);
+    }
+
+    switch (event)
+    {
+    case cv::EVENT_LBUTTONDOWN:
+        origin = cv::Point(x, y);
+        selection = cv::Rect(x, y, 0, 0);
+        selectObject = true;
+        break;
+    case cv::EVENT_LBUTTONUP:
+        selectObject = false;
+        if (selection.width > 0 && selection.height > 0)
+            trackObject = -1;
+        break;
+    default:
+        break;
+    }
+}
+
+static void help()
+{
+    std::cout << "\nThis is a demo that shows mean-shift based tracking using Transparent API\n"
+            "You select a color objects such as your face and it tracks it.\n"
+            "This reads from video camera (0 by default, or the camera number the user enters\n"
+            "Usage: \n"
+            "   ./camshiftdemo [camera number]\n";
+
+    std::cout << "\n\nHot keys: \n"
+            "\tESC - quit the program\n"
+            "\ts - stop the tracking\n"
+            "\tb - switch to/from backprojection view\n"
+            "\th - show/hide object histogram\n"
+            "\tp - pause video\n"
+            "\tc - use OpenCL or not\n"
+            "To initialize tracking, select the object with mouse\n";
+}
+
+int main(int argc, const char ** argv)
+{
+    help();
+
+    cv::VideoCapture cap;
+    cv::Rect trackWindow;
+    int hsize = 16;
+    float hranges[2] = { 0, 180 };
+
+    const char * const keys = { "{@camera_number| 0 | camera number}" };
+    cv::CommandLineParser parser(argc, argv, keys);
+    int camNum = parser.get<int>(0);
+
+    cap.open(camNum);
+
+    if (!cap.isOpened())
+    {
+        help();
+
+        std::cout << "***Could not initialize capturing...***\n";
+        std::cout << "Current parameter's value: \n";
+        parser.printMessage();
+
+        return EXIT_FAILURE;
+    }
+
+    cv::namedWindow("Histogram", cv::WINDOW_NORMAL);
+    cv::namedWindow("CamShift Demo", cv::WINDOW_NORMAL);
+    cv::setMouseCallback("CamShift Demo", onMouse);
+    cv::createTrackbar("Vmin", "CamShift Demo", &vmin, 256);
+    cv::createTrackbar("Vmax", "CamShift Demo", &vmax, 256);
+    cv::createTrackbar("Smin", "CamShift Demo", &smin, 256);
+
+    cv::Mat frame, histimg(200, 320, CV_8UC3, cv::Scalar::all(0));
+    cv::UMat hsv, hist, hue, mask, backproj;
+    bool paused = false;
+
+    for ( ; ; )
+    {
+        if (!paused)
+        {
+            cap >> frame;
+            if (frame.empty())
+                break;
+        }
+
+        frame.copyTo(image);
+
+        if (!paused)
+        {
+            cv::cvtColor(image, hsv, cv::COLOR_BGR2HSV);
+
+            if (trackObject)
+            {
+                int _vmin = vmin, _vmax = vmax;
+
+                cv::inRange(hsv, cv::Scalar(0, smin, std::min(_vmin, _vmax)),
+                        cv::Scalar(180, 256, std::max(_vmin, _vmax)), mask);
+
+                int fromTo[2] = { 0,0 };
+                hue.create(hsv.size(), hsv.depth());
+                cv::mixChannels(std::vector<cv::UMat>(1, hsv), std::vector<cv::UMat>(1, hue), fromTo, 1);
+
+                if (trackObject < 0)
+                {
+                    cv::UMat roi(hue, selection), maskroi(mask, selection);
+                    cv::calcHist(std::vector<cv::Mat>(1, roi.getMat(cv::ACCESS_READ)), std::vector<int>(1, 0),
+                                 maskroi, hist, std::vector<int>(1, hsize), std::vector<float>(hranges, hranges + 2));
+                    cv::normalize(hist, hist, 0, 255, cv::NORM_MINMAX);
+
+                    trackWindow = selection;
+                    trackObject = 1;
+
+                    histimg = cv::Scalar::all(0);
+                    int binW = histimg.cols / hsize;
+                    cv::Mat buf (1, hsize, CV_8UC3);
+                    for (int i = 0; i < hsize; i++)
+                        buf.at<cv::Vec3b>(i) = cv::Vec3b(cv::saturate_cast<uchar>(i*180./hsize), 255, 255);
+                    cv::cvtColor(buf, buf, cv::COLOR_HSV2BGR);
+
+                    {
+                        cv::Mat _hist = hist.getMat(cv::ACCESS_READ);
+                        for (int i = 0; i < hsize; i++)
+                        {
+                            int val = cv::saturate_cast<int>(_hist.at<float>(i)*histimg.rows/255);
+                            cv::rectangle(histimg, cv::Point(i*binW, histimg.rows),
+                                       cv::Point((i+1)*binW, histimg.rows - val),
+                                       cv::Scalar(buf.at<cv::Vec3b>(i)), -1, 8);
+                        }
+                    }
+                }
+
+                cv::calcBackProject(std::vector<cv::UMat>(1, hue), std::vector<int>(1, 0), hist, backproj,
+                                    std::vector<float>(hranges, hranges + 2), 1.0);
+                cv::bitwise_and(backproj, mask, backproj);
+
+                cv::RotatedRect trackBox = cv::CamShift(backproj, trackWindow,
+                                    cv::TermCriteria(cv::TermCriteria::EPS | cv::TermCriteria::COUNT, 10, 1));
+                if (trackWindow.area() <= 1)
+                {
+                    int cols = backproj.cols, rows = backproj.rows, r = (std::min(cols, rows) + 5)/6;
+                    trackWindow = cv::Rect(trackWindow.x - r, trackWindow.y - r,
+                                       trackWindow.x + r, trackWindow.y + r) &
+                                  cv::Rect(0, 0, cols, rows);
+                }
+
+                if (backprojMode)
+                    cv::cvtColor(backproj, image, cv::COLOR_GRAY2BGR);
+
+                {
+                    cv::Mat _image = image.getMat(cv::ACCESS_RW);
+                    cv::ellipse(_image, trackBox, cv::Scalar(0, 0, 255), 3, cv::LINE_AA);
+                }
+            }
+        }
+        else if (trackObject < 0)
+            paused = false;
+
+        if (selectObject && selection.width > 0 && selection.height > 0)
+        {
+            cv::UMat roi(image, selection);
+            cv::bitwise_not(roi, roi);
+        }
+
+        cv::imshow("CamShift Demo", image);
+        if (showHist)
+            cv::imshow("Histogram", histimg);
+
+        char c = (char)cv::waitKey(10);
+        if (c == 27)
+            break;
+
+        switch(c)
+        {
+        case 'b':
+            backprojMode = !backprojMode;
+            break;
+        case 't':
+            trackObject = 0;
+            histimg = cv::Scalar::all(0);
+            break;
+        case 'h':
+            showHist = !showHist;
+            if (!showHist)
+                cv::destroyWindow("Histogram");
+            else
+                cv::namedWindow("Histogram", cv::WINDOW_AUTOSIZE);
+            break;
+        case 'p':
+            paused = !paused;
+            break;
+        case 'c':
+            cv::ocl::setUseOpenCL(!cv::ocl::useOpenCL());
+        default:
+            break;
+        }
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/samples/ocl/clahe.cpp b/samples/tapi/clahe.cpp
similarity index 69%
rename from samples/ocl/clahe.cpp
rename to samples/tapi/clahe.cpp
index 894a41483..a28f2ab05 100644
--- a/samples/ocl/clahe.cpp
+++ b/samples/tapi/clahe.cpp
@@ -1,9 +1,10 @@
 #include <iostream>
 #include "opencv2/core/core.hpp"
+#include "opencv2/core/ocl.hpp"
 #include "opencv2/core/utility.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/highgui/highgui.hpp"
-#include "opencv2/ocl/ocl.hpp"
+
 using namespace cv;
 using namespace std;
 
@@ -27,11 +28,10 @@ static void Clip_Callback(int)
 int main(int argc, char** argv)
 {
     const char* keys =
-        "{ i input   |                    | specify input image }"
-        "{ c camera  |    0               | specify camera id   }"
-        "{ s use_cpu |    false           | use cpu algorithm   }"
-        "{ o output  | clahe_output.jpg   | specify output save path}"
-        "{ h help    | false              | print help message }";
+        "{ i input    |                    | specify input image }"
+        "{ c camera   |  0                 | specify camera id   }"
+        "{ o output   | clahe_output.jpg   | specify output save path}"
+        "{ h help     | false              | print help message }";
 
     cv::CommandLineParser cmd(argc, argv, keys);
     if (cmd.has("help"))
@@ -44,19 +44,17 @@ int main(int argc, char** argv)
 
     string infile = cmd.get<string>("i"), outfile = cmd.get<string>("o");
     int camid = cmd.get<int>("c");
-    bool use_cpu = cmd.get<bool>("s");
     VideoCapture capture;
 
     namedWindow("CLAHE");
     createTrackbar("Tile Size", "CLAHE", &tilesize, 32, (TrackbarCallback)TSize_Callback);
     createTrackbar("Clip Limit", "CLAHE", &cliplimit, 20, (TrackbarCallback)Clip_Callback);
 
-    Mat frame, outframe;
-    ocl::oclMat d_outframe, d_frame;
+    UMat frame, outframe;
 
     int cur_clip;
     Size cur_tilesize;
-    pFilter = use_cpu ? createCLAHE() : ocl::createCLAHE();
+    pFilter = createCLAHE();
 
     cur_clip = (int)pFilter->getClipLimit();
     cur_tilesize = pFilter->getTilesGridSize();
@@ -65,7 +63,7 @@ int main(int argc, char** argv)
 
     if(infile != "")
     {
-        frame = imread(infile);
+        imread(infile).copyTo(frame);
         if(frame.empty())
         {
             cout << "error read image: " << infile << endl;
@@ -77,6 +75,7 @@ int main(int argc, char** argv)
 
     cout << "\nControls:\n"
          << "\to - save output image\n"
+         << "\tm - switch OpenCL <-> CPU mode"
          << "\tESC - exit\n";
 
     for (;;)
@@ -84,21 +83,12 @@ int main(int argc, char** argv)
         if(capture.isOpened())
             capture.read(frame);
         else
-            frame = imread(infile);
+            imread(infile).copyTo(frame);
         if(frame.empty())
             continue;
 
-        if(use_cpu)
-        {
-            cvtColor(frame, frame, COLOR_BGR2GRAY);
-            pFilter->apply(frame, outframe);
-        }
-        else
-        {
-            ocl::cvtColor(d_frame = frame, d_outframe, COLOR_BGR2GRAY);
-            pFilter->apply(d_outframe, d_outframe);
-            d_outframe.download(outframe);
-        }
+        cvtColor(frame, frame, COLOR_BGR2GRAY);
+        pFilter->apply(frame, outframe);
 
         imshow("CLAHE", outframe);
 
@@ -107,6 +97,11 @@ int main(int argc, char** argv)
             imwrite(outfile, outframe);
         else if(key == 27)
             break;
+        else if(key == 'm')
+        {
+            ocl::setUseOpenCL(!cv::ocl::useOpenCL());
+            cout << "Switched to " << (ocl::useOpenCL() ? "OpenCL enabled" : "CPU") << " mode\n";
+        }
     }
     return EXIT_SUCCESS;
 }
diff --git a/samples/ocl/hog.cpp b/samples/tapi/hog.cpp
similarity index 65%
rename from samples/ocl/hog.cpp
rename to samples/tapi/hog.cpp
index a3c5c9936..ee537b310 100644
--- a/samples/ocl/hog.cpp
+++ b/samples/tapi/hog.cpp
@@ -4,9 +4,11 @@
 #include <sstream>
 #include <iomanip>
 #include <stdexcept>
+#include <opencv2/core/ocl.hpp>
 #include <opencv2/core/utility.hpp>
-#include "opencv2/ocl.hpp"
-#include "opencv2/highgui.hpp"
+#include <opencv2/highgui.hpp>
+#include <opencv2/objdetect.hpp>
+#include <opencv2/imgproc.hpp>
 
 using namespace std;
 using namespace cv;
@@ -38,7 +40,6 @@ private:
 
     //Args args;
     bool running;
-    bool use_gpu;
     bool make_gray;
     double scale;
     double resize_scale;
@@ -64,14 +65,13 @@ private:
 int main(int argc, char** argv)
 {
     const char* keys =
-        "{ h |  help    | false          | print help message }"
-        "{ i |  input   |                | specify input image}"
-        "{ c | camera   | -1             | enable camera capturing }"
-        "{ v | video    |                | use video as input }"
-        "{ g |  gray    | false          | convert image to gray one or not}"
-        "{ s |  scale   | 1.0            | resize the image before detect}"
-        "{ l |larger_win| false          | use 64x128 window}"
-        "{ o |  output  |                | specify output path when input is images}";
+        "{ h help      | false          | print help message }"
+        "{ i input     |                | specify input image}"
+        "{ c camera    | -1             | enable camera capturing }"
+        "{ v video     | 768x576.avi    | use video as input }"
+        "{ g gray      | false          | convert image to gray one or not}"
+        "{ s scale     | 1.0            | resize the image before detect}"
+        "{ o output    |                | specify output path when input is images}";
     CommandLineParser cmd(argc, argv, keys);
     if (cmd.has("help"))
     {
@@ -114,21 +114,19 @@ App::App(CommandLineParser& cmd)
          << "\t4/r - increase/decrease hit threshold\n"
          << endl;
 
-
-    use_gpu = true;
-    make_gray = cmd.get<bool>("g");
+    make_gray = cmd.has("gray");
     resize_scale = cmd.get<double>("s");
-    win_width = cmd.get<bool>("l") == true ? 64 : 48;
     vdo_source = cmd.get<string>("v");
     img_source = cmd.get<string>("i");
     output = cmd.get<string>("o");
     camera_id = cmd.get<int>("c");
 
+    win_width = 48;
     win_stride_width = 8;
     win_stride_height = 8;
     gr_threshold = 8;
     nlevels = 13;
-    hit_threshold = win_width == 48 ? 1.4 : 0.;
+    hit_threshold = 1.4;
     scale = 1.05;
     gamma_corr = true;
     write_once = false;
@@ -151,25 +149,15 @@ void App::run()
     Size win_stride(win_stride_width, win_stride_height);
 
     // Create HOG descriptors and detectors here
-    vector<float> detector;
-    if (win_size == Size(64, 128))
-        detector = ocl::HOGDescriptor::getPeopleDetector64x128();
-    else
-        detector = ocl::HOGDescriptor::getPeopleDetector48x96();
 
-
-    ocl::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
-                               ocl::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
-                               ocl::HOGDescriptor::DEFAULT_NLEVELS);
-    HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
+    HOGDescriptor hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
                           HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
-    gpu_hog.setSVMDetector(detector);
-    cpu_hog.setSVMDetector(detector);
+    hog.setSVMDetector( HOGDescriptor::getDaimlerPeopleDetector() );
 
     while (running)
     {
         VideoCapture vc;
-        Mat frame;
+        UMat frame;
 
         if (vdo_source!="")
         {
@@ -191,23 +179,21 @@ void App::run()
         }
         else
         {
-            frame = imread(img_source);
+            imread(img_source).copyTo(frame);
             if (frame.empty())
                 throw runtime_error(string("can't open image file: " + img_source));
         }
 
-        Mat img_aux, img, img_to_show;
-        ocl::oclMat gpu_img;
+        UMat img_aux, img;
+        Mat img_to_show;
 
         // Iterate over all frames
-        bool verify = false;
         while (running && !frame.empty())
         {
             workBegin();
 
             // Change format of the image
-            if (make_gray) cvtColor(frame, img_aux, COLOR_BGR2GRAY);
-            else if (use_gpu) cvtColor(frame, img_aux, COLOR_BGR2BGRA);
+            if (make_gray) cvtColor(frame, img_aux, COLOR_BGR2GRAY );
             else frame.copyTo(img_aux);
 
             // Resize image
@@ -217,32 +203,15 @@ void App::run()
                 resize(img_aux, img, sz);
             }
             else img = img_aux;
-            img_to_show = img;
-            gpu_hog.nlevels = nlevels;
-            cpu_hog.nlevels = nlevels;
+            img.copyTo(img_to_show);
+            hog.nlevels = nlevels;
             vector<Rect> found;
 
             // Perform HOG classification
             hogWorkBegin();
-            if (use_gpu)
-            {
-                gpu_img.upload(img);
-                gpu_hog.detectMultiScale(gpu_img, found, hit_threshold, win_stride,
-                                         Size(0, 0), scale, gr_threshold);
-                if (!verify)
-                {
-                    // verify if GPU output same objects with CPU at 1st run
-                    verify = true;
-                    vector<Rect> ref_rst;
-                    cvtColor(img, img, COLOR_BGRA2BGR);
-                    cpu_hog.detectMultiScale(img, ref_rst, hit_threshold, win_stride,
-                                             Size(0, 0), scale, gr_threshold-2);
-                    double accuracy = checkRectSimilarity(img.size(), ref_rst, found);
-                    cout << "\naccuracy value: " << accuracy << endl;
-                }
-            }
-            else cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
-                                              Size(0, 0), scale, gr_threshold);
+
+            hog.detectMultiScale(img.getMat(ACCESS_READ), found, hit_threshold, win_stride,
+                    Size(0, 0), scale, gr_threshold);
             hogWorkEnd();
 
 
@@ -253,13 +222,10 @@ void App::run()
                 rectangle(img_to_show, r.tl(), r.br(), Scalar(0, 255, 0), 3);
             }
 
-            if (use_gpu)
-                putText(img_to_show, "Mode: GPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
-            else
-                putText(img_to_show, "Mode: CPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
+            putText(img_to_show, "Mode: CPU", Point(5, 25), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
             putText(img_to_show, "FPS (HOG only): " + hogWorkFps(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
             putText(img_to_show, "FPS (total): " + workFps(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255, 100, 0), 2);
-            imshow("opencv_gpu_hog", img_to_show);
+            imshow("opencv_hog", img_to_show);
             if (vdo_source!="" || camera_id!=-1) vc >> frame;
 
             workEnd();
@@ -284,7 +250,7 @@ void App::run()
                     if (make_gray) cvtColor(img_to_show, img, COLOR_GRAY2BGR);
                     else cvtColor(img_to_show, img, COLOR_BGRA2BGR);
 
-                    video_writer << img;
+                    video_writer << img.getMat(ACCESS_READ);
                 }
             }
 
@@ -302,8 +268,8 @@ void App::handleKey(char key)
         break;
     case 'm':
     case 'M':
-        use_gpu = !use_gpu;
-        cout << "Switched to " << (use_gpu ? "CUDA" : "CPU") << " mode\n";
+        ocl::setUseOpenCL(!cv::ocl::useOpenCL());
+        cout << "Switched to " << (ocl::useOpenCL() ? "OpenCL enabled" : "CPU") << " mode\n";
         break;
     case 'g':
     case 'G':
@@ -396,53 +362,3 @@ inline string App::workFps() const
     ss << work_fps;
     return ss.str();
 }
-
-
-double App::checkRectSimilarity(Size sz,
-                                std::vector<Rect>& ob1,
-                                std::vector<Rect>& ob2)
-{
-    double final_test_result = 0.0;
-    size_t sz1 = ob1.size();
-    size_t sz2 = ob2.size();
-
-    if(sz1 != sz2)
-    {
-        return sz1 > sz2 ? (double)(sz1 - sz2) : (double)(sz2 - sz1);
-    }
-    else
-    {
-        if(sz1==0 && sz2==0)
-            return 0;
-        cv::Mat cpu_result(sz, CV_8UC1);
-        cpu_result.setTo(0);
-
-
-        for(vector<Rect>::const_iterator r = ob1.begin(); r != ob1.end(); r++)
-        {
-            cv::Mat cpu_result_roi(cpu_result, *r);
-            cpu_result_roi.setTo(1);
-            cpu_result.copyTo(cpu_result);
-        }
-        int cpu_area = cv::countNonZero(cpu_result > 0);
-
-
-        cv::Mat gpu_result(sz, CV_8UC1);
-        gpu_result.setTo(0);
-        for(vector<Rect>::const_iterator r2 = ob2.begin(); r2 != ob2.end(); r2++)
-        {
-            cv::Mat gpu_result_roi(gpu_result, *r2);
-            gpu_result_roi.setTo(1);
-            gpu_result.copyTo(gpu_result);
-        }
-
-        cv::Mat result_;
-        multiply(cpu_result, gpu_result, result_);
-        int result = cv::countNonZero(result_ > 0);
-        if(cpu_area!=0 && result!=0)
-            final_test_result = 1.0 - (double)result/(double)cpu_area;
-        else if(cpu_area==0 && result!=0)
-            final_test_result = -1;
-    }
-    return final_test_result;
-}
diff --git a/samples/ocl/pyrlk_optical_flow.cpp b/samples/tapi/pyrlk_optical_flow.cpp
similarity index 63%
rename from samples/ocl/pyrlk_optical_flow.cpp
rename to samples/tapi/pyrlk_optical_flow.cpp
index 89137d96e..d4b77294b 100644
--- a/samples/ocl/pyrlk_optical_flow.cpp
+++ b/samples/tapi/pyrlk_optical_flow.cpp
@@ -4,12 +4,11 @@
 
 #include "opencv2/core/utility.hpp"
 #include "opencv2/highgui/highgui.hpp"
-#include "opencv2/ocl/ocl.hpp"
+#include "opencv2/core/ocl.hpp"
 #include "opencv2/video/video.hpp"
 
 using namespace std;
 using namespace cv;
-using namespace cv::ocl;
 
 typedef unsigned char uchar;
 #define LOOP_NUM 10
@@ -29,25 +28,10 @@ static double getTime()
     return work_end * 1000. / getTickFrequency();
 }
 
-static void download(const oclMat& d_mat, vector<Point2f>& vec)
-{
-    vec.clear();
-    vec.resize(d_mat.cols);
-    Mat mat(1, d_mat.cols, CV_32FC2, (void*)&vec[0]);
-    d_mat.download(mat);
-}
-
-static void download(const oclMat& d_mat, vector<uchar>& vec)
-{
-    vec.clear();
-    vec.resize(d_mat.cols);
-    Mat mat(1, d_mat.cols, CV_8UC1, (void*)&vec[0]);
-    d_mat.download(mat);
-}
-
-static void drawArrows(Mat& frame, const vector<Point2f>& prevPts, const vector<Point2f>& nextPts, const vector<uchar>& status,
+static void drawArrows(UMat& _frame, const vector<Point2f>& prevPts, const vector<Point2f>& nextPts, const vector<uchar>& status,
                        Scalar line_color = Scalar(0, 0, 255))
 {
+    Mat frame = _frame.getMat(ACCESS_WRITE);
     for (size_t i = 0; i < prevPts.size(); ++i)
     {
         if (status[i])
@@ -89,15 +73,15 @@ static void drawArrows(Mat& frame, const vector<Point2f>& prevPts, const vector<
 int main(int argc, const char* argv[])
 {
     const char* keys =
-        "{ help h           | false           | print help message }"
-        "{ left l           |                 | specify left image }"
-        "{ right r          |                 | specify right image }"
-        "{ camera c         | 0               | enable camera capturing }"
-        "{ use_cpu s        | false           | use cpu or gpu to process the image }"
-        "{ video v          |                 | use video as input }"
-        "{ output o         | pyrlk_output.jpg| specify output save path when input is images }"
+        "{ h help           | false           | print help message }"
+        "{ l left           |                 | specify left image }"
+        "{ r right          |                 | specify right image }"
+        "{ c camera         | 0               | enable camera capturing }"
+        "{ v video          |                 | use video as input }"
+        "{ o output         | pyrlk_output.jpg| specify output save path when input is images }"
         "{ points           | 1000            | specify points count [GoodFeatureToTrack] }"
-        "{ min_dist         | 0               | specify minimal distance between points [GoodFeatureToTrack] }";
+        "{ min_dist         | 0               | specify minimal distance between points [GoodFeatureToTrack] }"
+        "{ m cpu_mode       | false           | run without OpenCL }";
 
     CommandLineParser cmd(argc, argv, keys);
 
@@ -109,21 +93,20 @@ int main(int argc, const char* argv[])
         return EXIT_SUCCESS;
     }
 
-    bool defaultPicturesFail = false;
+    bool defaultPicturesFail = true;
     string fname0 = cmd.get<string>("left");
     string fname1 = cmd.get<string>("right");
     string vdofile = cmd.get<string>("video");
     string outfile = cmd.get<string>("output");
     int points = cmd.get<int>("points");
     double minDist = cmd.get<double>("min_dist");
-    bool useCPU = cmd.has("s");
     int inputName = cmd.get<int>("c");
 
-    oclMat d_nextPts, d_status;
-    GoodFeaturesToTrackDetector_OCL d_features(points);
-    Mat frame0 = imread(fname0, cv::IMREAD_GRAYSCALE);
-    Mat frame1 = imread(fname1, cv::IMREAD_GRAYSCALE);
-    PyrLKOpticalFlow d_pyrLK;
+    UMat frame0;
+    imread(fname0, cv::IMREAD_GRAYSCALE).copyTo(frame0);
+    UMat frame1;
+    imread(fname1, cv::IMREAD_GRAYSCALE).copyTo(frame1);
+
     vector<cv::Point2f> pts(points);
     vector<cv::Point2f> nextPts(points);
     vector<unsigned char> status(points);
@@ -134,9 +117,9 @@ int main(int argc, const char* argv[])
     if (frame0.empty() || frame1.empty())
     {
         VideoCapture capture;
-        Mat frame, frameCopy;
-        Mat frame0Gray, frame1Gray;
-        Mat ptr0, ptr1;
+        UMat frame, frameCopy;
+        UMat frame0Gray, frame1Gray;
+        UMat ptr0, ptr1;
 
         if(vdofile.empty())
             capture.open( inputName );
@@ -183,25 +166,13 @@ int main(int argc, const char* argv[])
                     ptr1 = frame0Gray;
                 }
 
-                if (useCPU)
-                {
-                    pts.clear();
-                    goodFeaturesToTrack(ptr0, pts, points, 0.01, 0.0);
-                    if(pts.size() == 0)
-                        continue;
-                    calcOpticalFlowPyrLK(ptr0, ptr1, pts, nextPts, status, err);
-                }
-                else
-                {
-                    oclMat d_img(ptr0), d_prevPts;
-                    d_features(d_img, d_prevPts);
-                    if(!d_prevPts.rows || !d_prevPts.cols)
-                        continue;
-                    d_pyrLK.sparse(d_img, oclMat(ptr1), d_prevPts, d_nextPts, d_status);
-                    d_features.downloadPoints(d_prevPts,pts);
-                    download(d_nextPts, nextPts);
-                    download(d_status, status);
-                }
+
+                pts.clear();
+                goodFeaturesToTrack(ptr0, pts, points, 0.01, 0.0);
+                if(pts.size() == 0)
+                    continue;
+                calcOpticalFlowPyrLK(ptr0, ptr1, pts, nextPts, status, err);
+
                 if (i%2 == 1)
                     frame1.copyTo(frameCopy);
                 else
@@ -209,45 +180,40 @@ int main(int argc, const char* argv[])
                 drawArrows(frameCopy, pts, nextPts, status, Scalar(255, 0, 0));
                 imshow("PyrLK [Sparse]", frameCopy);
             }
+            char key = (char)waitKey(10);
 
-            if( waitKey( 10 ) >= 0 )
+            if (key == 27)
                 break;
+            else if (key == 'm' || key == 'M')
+            {
+                ocl::setUseOpenCL(!cv::ocl::useOpenCL());
+                cout << "Switched to " << (ocl::useOpenCL() ? "OpenCL" : "CPU") << " mode\n";
+            }
         }
-
         capture.release();
     }
     else
     {
 nocamera:
+        if (cmd.has("cpu_mode"))
+        {
+            ocl::setUseOpenCL(false);
+            std::cout << "OpenCL was disabled" << std::endl;
+        }
         for(int i = 0; i <= LOOP_NUM; i ++)
         {
             cout << "loop" << i << endl;
             if (i > 0) workBegin();
 
-            if (useCPU)
-            {
-                goodFeaturesToTrack(frame0, pts, points, 0.01, minDist);
-                calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-            }
-            else
-            {
-                oclMat d_img(frame0), d_prevPts;
-                d_features(d_img, d_prevPts);
-                d_pyrLK.sparse(d_img, oclMat(frame1), d_prevPts, d_nextPts, d_status);
-                d_features.downloadPoints(d_prevPts, pts);
-                download(d_nextPts, nextPts);
-                download(d_status, status);
-            }
+            goodFeaturesToTrack(frame0, pts, points, 0.01, minDist);
+            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
 
             if (i > 0 && i <= LOOP_NUM)
                 workEnd();
 
             if (i == LOOP_NUM)
             {
-                if (useCPU)
-                    cout << "average CPU time (noCamera) : ";
-                else
-                    cout << "average GPU time (noCamera) : ";
+                cout << "average time (noCamera) : ";
 
                 cout << getTime() / LOOP_NUM << " ms" << endl;
 
diff --git a/samples/tapi/squares.cpp b/samples/tapi/squares.cpp
new file mode 100644
index 000000000..402702e49
--- /dev/null
+++ b/samples/tapi/squares.cpp
@@ -0,0 +1,204 @@
+// The "Square Detector" program.
+// It loads several images sequentially and tries to find squares in
+// each image
+
+#include "opencv2/core.hpp"
+#include "opencv2/core/ocl.hpp"
+#include "opencv2/core/utility.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/highgui/highgui.hpp"
+#include <iostream>
+#include <string.h>
+
+using namespace cv;
+using namespace std;
+
+int thresh = 50, N = 11;
+const char* wndname = "Square Detection Demo";
+
+// helper function:
+// finds a cosine of angle between vectors
+// from pt0->pt1 and from pt0->pt2
+static double angle( Point pt1, Point pt2, Point pt0 )
+{
+    double dx1 = pt1.x - pt0.x;
+    double dy1 = pt1.y - pt0.y;
+    double dx2 = pt2.x - pt0.x;
+    double dy2 = pt2.y - pt0.y;
+    return (dx1*dx2 + dy1*dy2)/sqrt((dx1*dx1 + dy1*dy1)*(dx2*dx2 + dy2*dy2) + 1e-10);
+}
+
+
+// returns sequence of squares detected on the image.
+// the sequence is stored in the specified memory storage
+static void findSquares( const UMat& image, vector<vector<Point> >& squares )
+{
+    squares.clear();
+    UMat pyr, timg, gray0(image.size(), CV_8U), gray;
+
+    // down-scale and upscale the image to filter out the noise
+    pyrDown(image, pyr, Size(image.cols/2, image.rows/2));
+    pyrUp(pyr, timg, image.size());
+    vector<vector<Point> > contours;
+
+    // find squares in every color plane of the image
+    for( int c = 0; c < 3; c++ )
+    {
+        int ch[] = {c, 0};
+        mixChannels(timg, gray0, ch, 1);
+
+        // try several threshold levels
+        for( int l = 0; l < N; l++ )
+        {
+            // hack: use Canny instead of zero threshold level.
+            // Canny helps to catch squares with gradient shading
+            if( l == 0 )
+            {
+                // apply Canny. Take the upper threshold from slider
+                // and set the lower to 0 (which forces edges merging)
+                Canny(gray0, gray, 0, thresh, 5);
+                // dilate canny output to remove potential
+                // holes between edge segments
+                dilate(gray, gray, UMat(), Point(-1,-1));
+            }
+            else
+            {
+                // apply threshold if l!=0:
+                //     tgray(x,y) = gray(x,y) < (l+1)*255/N ? 255 : 0
+                cv::threshold(gray0, gray, (l+1)*255/N, 255, THRESH_BINARY);
+            }
+
+            // find contours and store them all as a list
+            findContours(gray, contours, RETR_LIST, CHAIN_APPROX_SIMPLE);
+
+            vector<Point> approx;
+
+            // test each contour
+            for( size_t i = 0; i < contours.size(); i++ )
+            {
+                // approximate contour with accuracy proportional
+                // to the contour perimeter
+
+                approxPolyDP(Mat(contours[i]), approx, arcLength(Mat(contours[i]), true)*0.02, true);
+
+                // square contours should have 4 vertices after approximation
+                // relatively large area (to filter out noisy contours)
+                // and be convex.
+                // Note: absolute value of an area is used because
+                // area may be positive or negative - in accordance with the
+                // contour orientation
+                if( approx.size() == 4 &&
+                        fabs(contourArea(Mat(approx))) > 1000 &&
+                        isContourConvex(Mat(approx)) )
+                {
+                    double maxCosine = 0;
+
+                    for( int j = 2; j < 5; j++ )
+                    {
+                        // find the maximum cosine of the angle between joint edges
+                        double cosine = fabs(angle(approx[j%4], approx[j-2], approx[j-1]));
+                        maxCosine = MAX(maxCosine, cosine);
+                    }
+
+                    // if cosines of all angles are small
+                    // (all angles are ~90 degree) then write quandrange
+                    // vertices to resultant sequence
+                    if( maxCosine < 0.3 )
+                        squares.push_back(approx);
+                }
+            }
+        }
+    }
+}
+
+// the function draws all the squares in the image
+static void drawSquares( UMat& _image, const vector<vector<Point> >& squares )
+{
+    Mat image = _image.getMat(ACCESS_WRITE);
+    for( size_t i = 0; i < squares.size(); i++ )
+    {
+        const Point* p = &squares[i][0];
+        int n = (int)squares[i].size();
+        polylines(image, &p, &n, 1, true, Scalar(0,255,0), 3, LINE_AA);
+    }
+}
+
+
+// draw both pure-C++ and ocl square results onto a single image
+static UMat drawSquaresBoth( const UMat& image,
+                            const vector<vector<Point> >& sqs)
+{
+    UMat imgToShow(Size(image.cols, image.rows), image.type());
+    image.copyTo(imgToShow);
+
+    drawSquares(imgToShow, sqs);
+
+    return imgToShow;
+}
+
+
+int main(int argc, char** argv)
+{
+    const char* keys =
+        "{ i input    | pic1.png           | specify input image }"
+        "{ o output   | squares_output.jpg | specify output save path}"
+        "{ h help     | false              | print help message }"
+        "{ m cpu_mode | false              | run without OpenCL }";
+
+    CommandLineParser cmd(argc, argv, keys);
+
+    if(cmd.has("help"))
+    {
+        cout << "Usage : squares [options]" << endl;
+        cout << "Available options:" << endl;
+        cmd.printMessage();
+        return EXIT_SUCCESS;
+    }
+    if (cmd.has("cpu_mode"))
+    {
+        ocl::setUseOpenCL(false);
+        std::cout << "OpenCL was disabled" << std::endl;
+    }
+
+    string inputName = cmd.get<string>("i");
+    string outfile = cmd.get<string>("o");
+
+    int iterations = 10;
+    namedWindow( wndname, WINDOW_AUTOSIZE );
+    vector<vector<Point> > squares;
+
+    UMat image;
+    imread(inputName, 1).copyTo(image);
+    if( image.empty() )
+    {
+        cout << "Couldn't load " << inputName << endl;
+        cmd.printMessage();
+        return EXIT_FAILURE;
+    }
+
+    int j = iterations;
+    int64 t_cpp = 0;
+    //warm-ups
+    cout << "warming up ..." << endl;
+    findSquares(image, squares);
+
+    do
+    {
+        int64 t_start = cv::getTickCount();
+        findSquares(image, squares);
+        t_cpp += cv::getTickCount() - t_start;
+
+        t_start  = cv::getTickCount();
+
+        cout << "run loop: " << j << endl;
+    }
+    while(--j);
+    cout << "average time: " << 1000.0f * (double)t_cpp / getTickFrequency() / iterations << "ms" << endl;
+
+    UMat result = drawSquaresBoth(image, squares);
+    imshow(wndname, result);
+    imwrite(outfile, result);
+    waitKey(0);
+
+    return EXIT_SUCCESS;
+}
diff --git a/samples/tapi/surf_matcher.cpp b/samples/tapi/surf_matcher.cpp
new file mode 100644
index 000000000..9066bfd3f
--- /dev/null
+++ b/samples/tapi/surf_matcher.cpp
@@ -0,0 +1,224 @@
+#include <iostream>
+#include <stdio.h>
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/utility.hpp"
+#include "opencv2/core/ocl.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/features2d.hpp"
+#include "opencv2/calib3d.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/nonfree.hpp"
+
+using namespace cv;
+
+const int LOOP_NUM = 10;
+const int GOOD_PTS_MAX = 50;
+const float GOOD_PORTION = 0.15f;
+
+int64 work_begin = 0;
+int64 work_end = 0;
+
+static void workBegin()
+{
+    work_begin = getTickCount();
+}
+
+static void workEnd()
+{
+    work_end = getTickCount() - work_begin;
+}
+
+static double getTime()
+{
+    return work_end /((double)getTickFrequency() )* 1000.;
+}
+
+template<class KPDetector>
+struct SURFDetector
+{
+    KPDetector surf;
+    SURFDetector(double hessian = 800.0)
+        :surf(hessian)
+    {
+    }
+    template<class T>
+    void operator()(const T& in, const T& mask, std::vector<cv::KeyPoint>& pts, T& descriptors, bool useProvided = false)
+    {
+        surf(in, mask, pts, descriptors, useProvided);
+    }
+};
+
+template<class KPMatcher>
+struct SURFMatcher
+{
+    KPMatcher matcher;
+    template<class T>
+    void match(const T& in1, const T& in2, std::vector<cv::DMatch>& matches)
+    {
+        matcher.match(in1, in2, matches);
+    }
+};
+
+static Mat drawGoodMatches(
+    const Mat& img1,
+    const Mat& img2,
+    const std::vector<KeyPoint>& keypoints1,
+    const std::vector<KeyPoint>& keypoints2,
+    std::vector<DMatch>& matches,
+    std::vector<Point2f>& scene_corners_
+    )
+{
+    //-- Sort matches and preserve top 10% matches
+    std::sort(matches.begin(), matches.end());
+    std::vector< DMatch > good_matches;
+    double minDist = matches.front().distance;
+    double maxDist = matches.back().distance;
+
+    const int ptsPairs = std::min(GOOD_PTS_MAX, (int)(matches.size() * GOOD_PORTION));
+    for( int i = 0; i < ptsPairs; i++ )
+    {
+        good_matches.push_back( matches[i] );
+    }
+    std::cout << "\nMax distance: " << maxDist << std::endl;
+    std::cout << "Min distance: " << minDist << std::endl;
+
+    std::cout << "Calculating homography using " << ptsPairs << " point pairs." << std::endl;
+
+    // drawing the results
+    Mat img_matches;
+
+    drawMatches( img1, keypoints1, img2, keypoints2,
+                 good_matches, img_matches, Scalar::all(-1), Scalar::all(-1),
+                 std::vector<char>(), DrawMatchesFlags::NOT_DRAW_SINGLE_POINTS  );
+
+    //-- Localize the object
+    std::vector<Point2f> obj;
+    std::vector<Point2f> scene;
+
+    for( size_t i = 0; i < good_matches.size(); i++ )
+    {
+        //-- Get the keypoints from the good matches
+        obj.push_back( keypoints1[ good_matches[i].queryIdx ].pt );
+        scene.push_back( keypoints2[ good_matches[i].trainIdx ].pt );
+    }
+    //-- Get the corners from the image_1 ( the object to be "detected" )
+    std::vector<Point2f> obj_corners(4);
+    obj_corners[0] = Point(0,0);
+    obj_corners[1] = Point( img1.cols, 0 );
+    obj_corners[2] = Point( img1.cols, img1.rows );
+    obj_corners[3] = Point( 0, img1.rows );
+    std::vector<Point2f> scene_corners(4);
+
+    Mat H = findHomography( obj, scene, RANSAC );
+    perspectiveTransform( obj_corners, scene_corners, H);
+
+    scene_corners_ = scene_corners;
+
+    //-- Draw lines between the corners (the mapped object in the scene - image_2 )
+    line( img_matches,
+          scene_corners[0] + Point2f( (float)img1.cols, 0), scene_corners[1] + Point2f( (float)img1.cols, 0),
+          Scalar( 0, 255, 0), 2, LINE_AA );
+    line( img_matches,
+          scene_corners[1] + Point2f( (float)img1.cols, 0), scene_corners[2] + Point2f( (float)img1.cols, 0),
+          Scalar( 0, 255, 0), 2, LINE_AA );
+    line( img_matches,
+          scene_corners[2] + Point2f( (float)img1.cols, 0), scene_corners[3] + Point2f( (float)img1.cols, 0),
+          Scalar( 0, 255, 0), 2, LINE_AA );
+    line( img_matches,
+          scene_corners[3] + Point2f( (float)img1.cols, 0), scene_corners[0] + Point2f( (float)img1.cols, 0),
+          Scalar( 0, 255, 0), 2, LINE_AA );
+    return img_matches;
+}
+
+////////////////////////////////////////////////////
+// This program demonstrates the usage of SURF_OCL.
+// use cpu findHomography interface to calculate the transformation matrix
+int main(int argc, char* argv[])
+{
+    const char* keys =
+        "{ h help     | false            | print help message  }"
+        "{ l left     | box.png          | specify left image  }"
+        "{ r right    | box_in_scene.png | specify right image }"
+        "{ o output   | SURF_output.jpg  | specify output save path }"
+        "{ m cpu_mode | false            | run without OpenCL }";
+
+    CommandLineParser cmd(argc, argv, keys);
+    if (cmd.has("help"))
+    {
+        std::cout << "Usage: surf_matcher [options]" << std::endl;
+        std::cout << "Available options:" << std::endl;
+        cmd.printMessage();
+        return EXIT_SUCCESS;
+    }
+    if (cmd.has("cpu_mode"))
+    {
+        ocl::setUseOpenCL(false);
+        std::cout << "OpenCL was disabled" << std::endl;
+    }
+
+    UMat img1, img2;
+
+    std::string outpath = cmd.get<std::string>("o");
+
+    std::string leftName = cmd.get<std::string>("l");
+    imread(leftName, IMREAD_GRAYSCALE).copyTo(img1);
+    if(img1.empty())
+    {
+        std::cout << "Couldn't load " << leftName << std::endl;
+        cmd.printMessage();
+        return EXIT_FAILURE;
+    }
+
+    std::string rightName = cmd.get<std::string>("r");
+    imread(rightName, IMREAD_GRAYSCALE).copyTo(img2);
+    if(img2.empty())
+    {
+        std::cout << "Couldn't load " << rightName << std::endl;
+        cmd.printMessage();
+        return EXIT_FAILURE;
+    }
+
+    double surf_time = 0.;
+
+    //declare input/output
+    std::vector<KeyPoint> keypoints1, keypoints2;
+    std::vector<DMatch> matches;
+
+    UMat _descriptors1, _descriptors2;
+    Mat descriptors1 = _descriptors1.getMat(ACCESS_RW),
+        descriptors2 = _descriptors2.getMat(ACCESS_RW);
+
+    //instantiate detectors/matchers
+    SURFDetector<SURF> surf;
+
+    SURFMatcher<BFMatcher> matcher;
+
+    //-- start of timing section
+
+    for (int i = 0; i <= LOOP_NUM; i++)
+    {
+        if(i == 1) workBegin();
+        surf(img1.getMat(ACCESS_READ), Mat(), keypoints1, descriptors1);
+        surf(img2.getMat(ACCESS_READ), Mat(), keypoints2, descriptors2);
+        matcher.match(descriptors1, descriptors2, matches);
+    }
+    workEnd();
+    std::cout << "FOUND " << keypoints1.size() << " keypoints on first image" << std::endl;
+    std::cout << "FOUND " << keypoints2.size() << " keypoints on second image" << std::endl;
+
+    surf_time = getTime();
+    std::cout << "SURF run time: " << surf_time / LOOP_NUM << " ms" << std::endl<<"\n";
+
+
+    std::vector<Point2f> corner;
+    Mat img_matches = drawGoodMatches(img1.getMat(ACCESS_READ), img2.getMat(ACCESS_READ), keypoints1, keypoints2, matches, corner);
+
+    //-- Show detected matches
+
+    namedWindow("surf matches", 0);
+    imshow("surf matches", img_matches);
+    imwrite(outpath, img_matches);
+
+    waitKey(0);
+    return EXIT_SUCCESS;
+}
diff --git a/samples/ocl/tvl1_optical_flow.cpp b/samples/tapi/tvl1_optical_flow.cpp
similarity index 72%
rename from samples/ocl/tvl1_optical_flow.cpp
rename to samples/tapi/tvl1_optical_flow.cpp
index f678dd6fd..436ba715c 100644
--- a/samples/ocl/tvl1_optical_flow.cpp
+++ b/samples/tapi/tvl1_optical_flow.cpp
@@ -2,14 +2,13 @@
 #include <vector>
 #include <iomanip>
 
+#include "opencv2/core/ocl.hpp"
 #include "opencv2/core/utility.hpp"
 #include "opencv2/highgui/highgui.hpp"
-#include "opencv2/ocl/ocl.hpp"
 #include "opencv2/video/video.hpp"
 
 using namespace std;
 using namespace cv;
-using namespace cv::ocl;
 
 typedef unsigned char uchar;
 #define LOOP_NUM 10
@@ -82,17 +81,17 @@ static void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
 int main(int argc, const char* argv[])
 {
     const char* keys =
-        "{ h   | help       | false           | print help message }"
-        "{ l   | left       |                 | specify left image }"
-        "{ r   | right      |                 | specify right image }"
-        "{ o   | output     | tvl1_output.jpg | specify output save path }"
-        "{ c   | camera     | 0               | enable camera capturing }"
-        "{ s   | use_cpu    | false           | use cpu or gpu to process the image }"
-        "{ v   | video      |                 | use video as input }";
+        "{ h help     | false           | print help message }"
+        "{ l left     |                 | specify left image }"
+        "{ r right    |                 | specify right image }"
+        "{ o output   | tvl1_output.jpg | specify output save path }"
+        "{ c camera   | 0               | enable camera capturing }"
+        "{ m cpu_mode | false           | run without OpenCL }"
+        "{ v video    |                 | use video as input }";
 
     CommandLineParser cmd(argc, argv, keys);
 
-    if (cmd.get<bool>("help"))
+    if (cmd.has("help"))
     {
         cout << "Usage: pyrlk_optical_flow [options]" << endl;
         cout << "Available options:" << endl;
@@ -108,22 +107,23 @@ int main(int argc, const char* argv[])
     bool useCamera = cmd.get<bool>("c");
     int inputName = cmd.get<int>("c");
 
-    Mat frame0 = imread(fname0, cv::IMREAD_GRAYSCALE);
-    Mat frame1 = imread(fname1, cv::IMREAD_GRAYSCALE);
+    UMat frame0, frame1;
+    imread(fname0, cv::IMREAD_GRAYSCALE).copyTo(frame0);
+    imread(fname1, cv::IMREAD_GRAYSCALE).copyTo(frame1);
     cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
-    cv::ocl::OpticalFlowDual_TVL1_OCL d_alg;
 
-    Mat flow, show_flow;
-    Mat flow_vec[2];
+    UMat flow;
+    Mat show_flow;
+    vector<UMat> flow_vec;
     if (frame0.empty() || frame1.empty())
         useCamera = true;
 
     if (useCamera)
     {
         VideoCapture capture;
-        Mat frame, frameCopy;
-        Mat frame0Gray, frame1Gray;
-        Mat ptr0, ptr1;
+        UMat frame, frameCopy;
+        UMat frame0Gray, frame1Gray;
+        UMat ptr0, ptr1;
 
         if(vdofile.empty())
             capture.open( inputName );
@@ -167,28 +167,25 @@ int main(int argc, const char* argv[])
                     ptr1 = frame0Gray;
                 }
 
-                if (useCPU)
-                {
-                    alg->calc(ptr0, ptr1, flow);
-                    split(flow, flow_vec);
-                }
-                else
-                {
-                    oclMat d_flowx, d_flowy;
-                    d_alg(oclMat(ptr0), oclMat(ptr1), d_flowx, d_flowy);
-                    d_flowx.download(flow_vec[0]);
-                    d_flowy.download(flow_vec[1]);
-                }
+                alg->calc(ptr0, ptr1, flow);
+                split(flow, flow_vec);
+
                 if (i%2 == 1)
                     frame1.copyTo(frameCopy);
                 else
                     frame0.copyTo(frameCopy);
-                getFlowField(flow_vec[0], flow_vec[1], show_flow);
+                getFlowField(flow_vec[0].getMat(ACCESS_READ), flow_vec[1].getMat(ACCESS_READ), show_flow);
                 imshow("tvl1 optical flow field", show_flow);
             }
 
-            if( waitKey( 10 ) >= 0 )
+            char key = (char)waitKey(10);
+            if (key == 27)
                 break;
+            else if (key == 'm' || key == 'M')
+            {
+                ocl::setUseOpenCL(!cv::ocl::useOpenCL());
+                cout << "Switched to " << (ocl::useOpenCL() ? "OpenCL" : "CPU") << " mode\n";
+            }
         }
 
         capture.release();
@@ -196,23 +193,20 @@ int main(int argc, const char* argv[])
     else
     {
 nocamera:
-        oclMat d_flowx, d_flowy;
+        if (cmd.has("cpu_mode"))
+        {
+            ocl::setUseOpenCL(false);
+            std::cout << "OpenCL was disabled" << std::endl;
+        }
         for(int i = 0; i <= LOOP_NUM; i ++)
         {
             cout << "loop" << i << endl;
 
             if (i > 0) workBegin();
-            if (useCPU)
-            {
-                alg->calc(frame0, frame1, flow);
-                split(flow, flow_vec);
-            }
-            else
-            {
-                d_alg(oclMat(frame0), oclMat(frame1), d_flowx, d_flowy);
-                d_flowx.download(flow_vec[0]);
-                d_flowy.download(flow_vec[1]);
-            }
+
+            alg->calc(frame0, frame1, flow);
+            split(flow, flow_vec);
+
             if (i > 0 && i <= LOOP_NUM)
                 workEnd();
 
@@ -224,7 +218,7 @@ nocamera:
                     cout << "average GPU time (noCamera) : ";
                 cout << getTime() / LOOP_NUM << " ms" << endl;
 
-                getFlowField(flow_vec[0], flow_vec[1], show_flow);
+                getFlowField(flow_vec[0].getMat(ACCESS_READ), flow_vec[1].getMat(ACCESS_READ), show_flow);
                 imshow("PyrLK [Sparse]", show_flow);
                 imwrite(outpath, show_flow);
             }
diff --git a/samples/winrt/ImageManipulations/common/LayoutAwarePage.cpp b/samples/winrt/ImageManipulations/common/LayoutAwarePage.cpp
index 07092bb74..f3f4be234 100644
--- a/samples/winrt/ImageManipulations/common/LayoutAwarePage.cpp
+++ b/samples/winrt/ImageManipulations/common/LayoutAwarePage.cpp
@@ -235,7 +235,7 @@ void LayoutAwarePage::CoreWindow_PointerPressed(CoreWindow^ sender, PointerEvent
     if (properties->IsLeftButtonPressed || properties->IsRightButtonPressed ||
         properties->IsMiddleButtonPressed) return;
 
-    // If back or foward are pressed (but not both) navigate appropriately
+    // If back or forward are pressed (but not both) navigate appropriately
     bool backPressed = properties->IsXButton1Pressed;
     bool forwardPressed = properties->IsXButton2Pressed;
     if (backPressed ^ forwardPressed)