diff --git a/CMakeLists.txt b/CMakeLists.txt index 470c5520d..742d3e1fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -413,15 +413,6 @@ endif() # --- OpenCL --- if(WITH_OPENCL) include(cmake/OpenCVDetectOpenCL.cmake) - if(OPENCL_FOUND) - set(HAVE_OPENCL 1) - endif() - if(WITH_OPENCLAMDFFT AND CLAMDFFT_INCLUDE_DIR) - set(HAVE_CLAMDFFT 1) - endif() - if(WITH_OPENCLAMDBLAS AND CLAMDBLAS_INCLUDE_DIR) - set(HAVE_CLAMDBLAS 1) - endif() endif() # ---------------------------------------------------------------------------- @@ -799,11 +790,11 @@ if(HAVE_CUDA) status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO) endif() -if(HAVE_OPENCL AND BUILD_opencv_ocl) +if(HAVE_OPENCL) status("") status(" OpenCL") if(OPENCL_INCLUDE_DIR) - status(" Include:" ${OPENCL_INCLUDE_DIR}) + status(" Include path:" ${OPENCL_INCLUDE_DIRS}) endif() if(OPENCL_LIBRARIES) status(" libraries:" ${OPENCL_LIBRARIES}) diff --git a/android/service/engine/jni/BinderComponent/HardwareDetector.cpp b/android/service/engine/jni/BinderComponent/HardwareDetector.cpp index b5e0fa600..eab49ac5f 100644 --- a/android/service/engine/jni/BinderComponent/HardwareDetector.cpp +++ b/android/service/engine/jni/BinderComponent/HardwareDetector.cpp @@ -163,22 +163,13 @@ int DetectKnownPlatforms() { int tegra_status = DetectTegra(); - if (3 == tegra_status) + // All Tegra platforms since Tegra3 + if (2 < tegra_status) { - return PLATFORM_TEGRA3; + return PLATFORM_TEGRA + tegra_status - 1; } else { return PLATFORM_UNKNOWN; } - - // NOTE: Uncomment when all Tegras will be supported - /*if (tegra_status > 0) - * { - * return PLATFORM_TEGRA + tegra_status - 1; - } - else - { - return PLATFORM_UNKNOWN; - }*/ } \ No newline at end of file diff --git a/android/service/engine/jni/BinderComponent/HardwareDetector.h b/android/service/engine/jni/BinderComponent/HardwareDetector.h index e049db9c5..135684418 100644 --- a/android/service/engine/jni/BinderComponent/HardwareDetector.h +++ b/android/service/engine/jni/BinderComponent/HardwareDetector.h @@ -27,6 +27,7 @@ #define PLATFORM_TEGRA 1L #define PLATFORM_TEGRA2 2L #define PLATFORM_TEGRA3 3L +#define PLATFORM_TEGRA4 4L int DetectKnownPlatforms(); int GetProcessorCount(); diff --git a/android/service/engine/jni/BinderComponent/TegraDetector.cpp b/android/service/engine/jni/BinderComponent/TegraDetector.cpp index f7db1fa85..3933efe49 100644 --- a/android/service/engine/jni/BinderComponent/TegraDetector.cpp +++ b/android/service/engine/jni/BinderComponent/TegraDetector.cpp @@ -7,6 +7,7 @@ #define KERNEL_CONFIG_TEGRA_MAGIC "CONFIG_ARCH_TEGRA=y" #define KERNEL_CONFIG_TEGRA2_MAGIC "CONFIG_ARCH_TEGRA_2x_SOC=y" #define KERNEL_CONFIG_TEGRA3_MAGIC "CONFIG_ARCH_TEGRA_3x_SOC=y" +#define KERNEL_CONFIG_TEGRA4_MAGIC "CONFIG_ARCH_TEGRA_11x_SOC=y" #define MAX_DATA_LEN 4096 int DetectTegra() @@ -19,9 +20,11 @@ int DetectTegra() const char *tegra_config = KERNEL_CONFIG_TEGRA_MAGIC; const char *tegra2_config = KERNEL_CONFIG_TEGRA2_MAGIC; const char *tegra3_config = KERNEL_CONFIG_TEGRA3_MAGIC; + const char *tegra4_config = KERNEL_CONFIG_TEGRA4_MAGIC; int len = strlen(tegra_config); int len2 = strlen(tegra2_config); int len3 = strlen(tegra3_config); + int len4 = strlen(tegra4_config); while (0 != gzgets(kernelConfig, tmpbuf, KERNEL_CONFIG_MAX_LINE_WIDTH)) { if (0 == strncmp(tmpbuf, tegra_config, len)) @@ -41,6 +44,11 @@ int DetectTegra() break; } + if (0 == strncmp(tmpbuf, tegra4_config, len4)) + { + result = 4; + break; + } } gzclose(kernelConfig); } diff --git 
a/android/service/engine/jni/NativeService/CommonPackageManager.cpp b/android/service/engine/jni/NativeService/CommonPackageManager.cpp index dbcd8ff68..eaa03d4d8 100644 --- a/android/service/engine/jni/NativeService/CommonPackageManager.cpp +++ b/android/service/engine/jni/NativeService/CommonPackageManager.cpp @@ -197,6 +197,7 @@ std::vector<std::pair<int, int> > CommonPackageManager::InitArmRating() result.push_back(std::pair<int, int>(PLATFORM_UNKNOWN, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON)); result.push_back(std::pair<int, int>(PLATFORM_UNKNOWN, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_VFPv3d16 | FEATURES_HAS_NEON)); result.push_back(std::pair<int, int>(PLATFORM_TEGRA3, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON)); + result.push_back(std::pair<int, int>(PLATFORM_TEGRA4, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON)); return result; } @@ -218,8 +219,8 @@ std::vector<std::pair<int, int> > CommonPackageManager::InitMipsRating() } const std::vector<std::pair<int, int> > CommonPackageManager::ArchRatings[] = { - CommonPackageManager::InitArmRating(), - CommonPackageManager::InitIntelRating(), + CommonPackageManager::InitArmRating(), + CommonPackageManager::InitIntelRating(), CommonPackageManager::InitMipsRating() }; diff --git a/android/service/engine/jni/NativeService/PackageInfo.cpp b/android/service/engine/jni/NativeService/PackageInfo.cpp index 2eb823073..2f8dde043 100644 --- a/android/service/engine/jni/NativeService/PackageInfo.cpp +++ b/android/service/engine/jni/NativeService/PackageInfo.cpp @@ -18,6 +18,7 @@ map<int, string> PackageInfo::InitPlatformNameMap() result[PLATFORM_TEGRA] = PLATFORM_TEGRA_NAME; result[PLATFORM_TEGRA2] = PLATFORM_TEGRA2_NAME; result[PLATFORM_TEGRA3] = PLATFORM_TEGRA3_NAME; + result[PLATFORM_TEGRA4] = PLATFORM_TEGRA4_NAME; return result; } @@ -186,6 +187,10 @@ inline int SplitPlatfrom(const vector<string>& features) { result = PLATFORM_TEGRA3; } + else if (PLATFORM_TEGRA4_NAME == tmp) + { + result = PLATFORM_TEGRA4; + } } else { @@ -425,6 +430,10 @@ InstallPath(install_path) { CpuID = ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON; } break; + case PLATFORM_TEGRA4: + { + CpuID = ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON; + } break; } } else diff --git a/android/service/engine/jni/NativeService/PackageInfo.h b/android/service/engine/jni/NativeService/PackageInfo.h index b86ef7a92..2ce561e2f 100644 --- a/android/service/engine/jni/NativeService/PackageInfo.h +++ b/android/service/engine/jni/NativeService/PackageInfo.h @@ -12,7 +12,6 @@ #define ARCH_ARMv7_NAME "armv7a" #define ARCH_ARMv8_NAME "armv8" - #define FEATURES_HAS_VFPv3d16_NAME "vfpv3d16" #define FEATURES_HAS_VFPv3_NAME "vfpv3" #define FEATURES_HAS_NEON_NAME "neon" @@ -25,7 +24,7 @@ #define PLATFORM_TEGRA_NAME "tegra" #define PLATFORM_TEGRA2_NAME "tegra2" #define PLATFORM_TEGRA3_NAME "tegra3" - +#define PLATFORM_TEGRA4_NAME "tegra4" class PackageInfo { diff --git a/android/service/engine/jni/Tests/OpenCVEngineTest.cpp b/android/service/engine/jni/Tests/OpenCVEngineTest.cpp index 4e390386f..ce5159f81 100644 --- a/android/service/engine/jni/Tests/OpenCVEngineTest.cpp +++ b/android/service/engine/jni/Tests/OpenCVEngineTest.cpp @@ -201,6 +201,24 @@ TEST(OpenCVEngineTest, GetPathForCompatiblePackage2) #endif } +TEST(OpenCVEngineTest, GetPathForCompatiblePackage3) +{ + sp<IOpenCVEngine> Engine = InitConnect(); + Starter.PackageManager->InstalledPackages.clear(); + Starter.PackageManager->InstallVersion(2040400, PLATFORM_TEGRA4, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON); + EXPECT_FALSE(NULL == Engine.get()); + String16 result = Engine->GetLibPathByVersion(String16("2.4")); +
#ifdef __SUPPORT_TEGRA3 + EXPECT_STREQ("/data/data/org.opencv.lib_v24_tegra4/lib", String8(result).string()); + #else + #ifdef __SUPPORT_ARMEABI_V7A_FEATURES + EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a_neon/lib", String8(result).string()); + #else + EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a/lib", String8(result).string()); + #endif + #endif +} + TEST(OpenCVEngineTest, InstallAndGetVersion) { sp<IOpenCVEngine> Engine = InitConnect(); diff --git a/android/service/engine/jni/Tests/PackageInfoTest.cpp b/android/service/engine/jni/Tests/PackageInfoTest.cpp index 6bc84856c..6cbb06943 100644 --- a/android/service/engine/jni/Tests/PackageInfoTest.cpp +++ b/android/service/engine/jni/Tests/PackageInfoTest.cpp @@ -85,6 +85,21 @@ TEST(PackageInfo, FullNameTegra3) #endif } +TEST(PackageInfo, FullNameTegra4) +{ + PackageInfo info(2040400, PLATFORM_TEGRA4, ARCH_ARMv7 | FEATURES_HAS_NEON); + string name = info.GetFullName(); + #ifdef __SUPPORT_TEGRA3 + EXPECT_STREQ("org.opencv.lib_v24_tegra4", name.c_str()); + #else + #ifdef __SUPPORT_ARMEABI_V7A_FEATURES + EXPECT_STREQ("org.opencv.lib_v24_armv7a_neon", name.c_str()); + #else + EXPECT_STREQ("org.opencv.lib_v24_armv7a", name.c_str()); + #endif + #endif +} + TEST(PackageInfo, FullNameX86SSE2) { PackageInfo info(2030000, PLATFORM_UNKNOWN, ARCH_X86 | FEATURES_HAS_SSE2); @@ -148,6 +163,13 @@ TEST(PackageInfo, Tegra3FromFullName) EXPECT_EQ(PLATFORM_TEGRA3, info.GetPlatform()); } +TEST(PackageInfo, Tegra4FromFullName) +{ + PackageInfo info("org.opencv.lib_v24_tegra4", "/data/data/org.opencv.lib_v24_tegra4"); + EXPECT_EQ(2040000, info.GetVersion()); + EXPECT_EQ(PLATFORM_TEGRA4, info.GetPlatform()); +} + #ifdef __SUPPORT_MIPS TEST(PackageInfo, MipsFromFullName) { diff --git a/android/service/engine/jni/Tests/PackageManagmentTest.cpp b/android/service/engine/jni/Tests/PackageManagmentTest.cpp index f9ccb7484..e21dcf760 100644 --- a/android/service/engine/jni/Tests/PackageManagmentTest.cpp +++ b/android/service/engine/jni/Tests/PackageManagmentTest.cpp @@ -102,6 +102,22 @@ TEST(PackageManager, GetPackagePathForTegra3) #endif } +TEST(PackageManager, GetPackagePathForTegra4) +{ + PackageManagerStub pm; + EXPECT_TRUE(pm.InstallVersion(2040400, PLATFORM_TEGRA4, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON)); + string path = pm.GetPackagePathByVersion(2040400, PLATFORM_TEGRA4, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON); + #ifdef __SUPPORT_TEGRA3 + EXPECT_STREQ("/data/data/org.opencv.lib_v24_tegra4/lib", path.c_str()); + #else + #ifdef __SUPPORT_ARMEABI_V7A_FEATURES + EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a_neon/lib", path.c_str()); + #else + EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a/lib", path.c_str()); + #endif + #endif +} + #ifdef __SUPPORT_MIPS TEST(PackageManager, GetPackagePathForMips) { diff --git a/android/service/engine/src/org/opencv/engine/HardwareDetector.java b/android/service/engine/src/org/opencv/engine/HardwareDetector.java index 7e2a6135f..67320865a 100644 --- a/android/service/engine/src/org/opencv/engine/HardwareDetector.java +++ b/android/service/engine/src/org/opencv/engine/HardwareDetector.java @@ -33,6 +33,8 @@ public class HardwareDetector public static final int PLATFORM_TEGRA = 1; public static final int PLATFORM_TEGRA2 = 2; public static final int PLATFORM_TEGRA3 = 3; + public static final int PLATFORM_TEGRA4 = 4; + public static final int PLATFORM_UNKNOWN = 0; diff --git a/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java 
b/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java index fad279772..5213d9149 100644 --- a/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java +++ b/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java @@ -83,10 +83,14 @@ public class ManagerActivity extends Activity { HardwarePlatformView.setText("Tegra 2"); } - else + else if (HardwareDetector.PLATFORM_TEGRA3 == Platfrom) { HardwarePlatformView.setText("Tegra 3"); } + else + { + HardwarePlatformView.setText("Tegra 4"); + } } else { @@ -367,10 +371,10 @@ public class ManagerActivity extends Activity temp.put("Version", NormalizeVersion(OpenCVersion, VersionName)); // HACK: OpenCV Manager for Armv7-a Neon already has Tegra3 optimizations // that is enabled on proper hardware - if (HardwareDetector.DetectKnownPlatforms() == HardwareDetector.PLATFORM_TEGRA3 && + if (HardwareDetector.DetectKnownPlatforms() >= HardwareDetector.PLATFORM_TEGRA3 && HardwareName.equals("armv7a neon ") && Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) { - temp.put("Hardware", "Tegra 3"); + temp.put("Hardware", "Tegra"); if (Tags == null) { Tags = "optimized"; diff --git a/cmake/OpenCVDetectAndroidSDK.cmake b/cmake/OpenCVDetectAndroidSDK.cmake index 0e0240ca8..b125561d4 100644 --- a/cmake/OpenCVDetectAndroidSDK.cmake +++ b/cmake/OpenCVDetectAndroidSDK.cmake @@ -264,13 +264,23 @@ macro(add_android_project target path) ocv_list_filterout(android_proj_jni_files "\\\\.svn") if(android_proj_jni_files AND EXISTS ${path}/jni/Android.mk AND NOT DEFINED JNI_LIB_NAME) + # find local module name in Android.mk file to build native lib file(STRINGS "${path}/jni/Android.mk" JNI_LIB_NAME REGEX "LOCAL_MODULE[ ]*:=[ ]*.*" ) string(REGEX REPLACE "LOCAL_MODULE[ ]*:=[ ]*([a-zA-Z_][a-zA-Z_0-9]*)[ ]*" "\\1" JNI_LIB_NAME "${JNI_LIB_NAME}") + # find using of native app glue to determine native activity + file(STRINGS "${path}/jni/Android.mk" NATIVE_APP_GLUE REGEX ".*(call import-module,android/native_app_glue)" ) + if(JNI_LIB_NAME) ocv_include_modules_recurse(${android_proj_NATIVE_DEPS}) ocv_include_directories("${path}/jni") + if (NATIVE_APP_GLUE) + include_directories(${ANDROID_NDK}/sources/android/native_app_glue) + list(APPEND android_proj_jni_files ${ANDROID_NDK}/sources/android/native_app_glue/android_native_app_glue.c) + set(android_proj_NATIVE_DEPS ${android_proj_NATIVE_DEPS} android) + endif() + add_library(${JNI_LIB_NAME} MODULE ${android_proj_jni_files}) target_link_libraries(${JNI_LIB_NAME} ${OPENCV_LINKER_LIBS} ${android_proj_NATIVE_DEPS}) diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake index 12ab9d3ea..76f76ebc1 100644 --- a/cmake/OpenCVDetectOpenCL.cmake +++ b/cmake/OpenCVDetectOpenCL.cmake @@ -1,154 +1,104 @@ if(APPLE) set(OPENCL_FOUND YES) - set(OPENCL_LIBRARIES "-framework OpenCL") -else() + set(OPENCL_LIBRARY "-framework OpenCL" CACHE STRING "OpenCL library") + set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory") + mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY) +else(APPLE) find_package(OpenCL QUIET) - if(WITH_OPENCLAMDFFT) - set(CLAMDFFT_SEARCH_PATH $ENV{CLAMDFFT_PATH}) - if(NOT CLAMDFFT_SEARCH_PATH) - if(WIN32) - set( CLAMDFFT_SEARCH_PATH "C:\\Program Files (x86)\\AMD\\clAmdFft" ) - endif() - endif() - set( CLAMDFFT_INCLUDE_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}/include ) - if(UNIX) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(CLAMDFFT_LIB_SEARCH_PATH /usr/lib) - else() - set(CLAMDFFT_LIB_SEARCH_PATH /usr/lib64) - endif() - else() - 
if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(CLAMDFFT_LIB_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}\\lib32\\import) - else() - set(CLAMDFFT_LIB_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}\\lib64\\import) - endif() - endif() - find_path(CLAMDFFT_INCLUDE_DIR - NAMES clAmdFft.h - PATHS ${CLAMDFFT_INCLUDE_SEARCH_PATH} - PATH_SUFFIXES clAmdFft - NO_DEFAULT_PATH) - find_library(CLAMDFFT_LIBRARY - NAMES clAmdFft.Runtime - PATHS ${CLAMDFFT_LIB_SEARCH_PATH} - NO_DEFAULT_PATH) - if(CLAMDFFT_LIBRARY) - set(CLAMDFFT_LIBRARIES ${CLAMDFFT_LIBRARY}) - else() - set(CLAMDFFT_LIBRARIES "") - endif() - endif() - if(WITH_OPENCLAMDBLAS) - set(CLAMDBLAS_SEARCH_PATH $ENV{CLAMDBLAS_PATH}) - if(NOT CLAMDBLAS_SEARCH_PATH) - if(WIN32) - set( CLAMDBLAS_SEARCH_PATH "C:\\Program Files (x86)\\AMD\\clAmdBlas" ) - endif() - endif() - set( CLAMDBLAS_INCLUDE_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}/include ) - if(UNIX) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(CLAMDBLAS_LIB_SEARCH_PATH /usr/lib) - else() - set(CLAMDBLAS_LIB_SEARCH_PATH /usr/lib64) - endif() - else() - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(CLAMDBLAS_LIB_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}\\lib32\\import) - else() - set(CLAMDBLAS_LIB_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}\\lib64\\import) - endif() - endif() - find_path(CLAMDBLAS_INCLUDE_DIR - NAMES clAmdBlas.h - PATHS ${CLAMDBLAS_INCLUDE_SEARCH_PATH} - PATH_SUFFIXES clAmdBlas - NO_DEFAULT_PATH) - find_library(CLAMDBLAS_LIBRARY - NAMES clAmdBlas - PATHS ${CLAMDBLAS_LIB_SEARCH_PATH} - NO_DEFAULT_PATH) - if(CLAMDBLAS_LIBRARY) - set(CLAMDBLAS_LIBRARIES ${CLAMDBLAS_LIBRARY}) - else() - set(CLAMDBLAS_LIBRARIES "") - endif() - endif() - # Try AMD/ATI Stream SDK + if (NOT OPENCL_FOUND) - set(ENV_AMDSTREAMSDKROOT $ENV{AMDAPPSDKROOT}) - set(ENV_AMDAPPSDKROOT $ENV{AMDAPPSDKROOT}) - set(ENV_OPENCLROOT $ENV{OPENCLROOT}) - set(ENV_CUDA_PATH $ENV{CUDA_PATH}) - set(ENV_INTELOCLSDKROOT $ENV{INTELOCLSDKROOT}) - if(ENV_AMDSTREAMSDKROOT) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDAPPSDKROOT}/include) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDAPPSDKROOT}/lib/x86) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDAPPSDKROOT}/lib/x86_64) - endif() - elseif(ENV_AMDSTREAMSDKROOT) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDSTREAMSDKROOT}/include) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86_64) - endif() - elseif(ENV_CUDA_PATH AND WIN32) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_CUDA_PATH}/include) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/Win32) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/x64) - endif() - elseif(ENV_OPENCLROOT AND UNIX) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_OPENCLROOT}/inc) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib64) - endif() - elseif(ENV_INTELOCLSDKROOT) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_INTELOCLSDKROOT}/include) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_INTELOCLSDKROOT}/lib/x86) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_INTELOCLSDKROOT}/lib/x64) - endif() + find_path(OPENCL_ROOT_DIR + NAMES OpenCL/cl.h CL/cl.h include/CL/cl.h include/nvidia-current/CL/cl.h + PATHS ENV OCLROOT 
ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT + DOC "OpenCL root directory" + NO_DEFAULT_PATH) + + find_path(OPENCL_INCLUDE_DIR + NAMES OpenCL/cl.h CL/cl.h + HINTS ${OPENCL_ROOT_DIR} + PATH_SUFFIXES include include/nvidia-current + DOC "OpenCL include directory") + + if (X86_64) + set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win64 lib/x86_64 lib/x64) + elseif (X86) + set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win32 lib/x86) endif() - if(OPENCL_INCLUDE_SEARCH_PATH) - find_path(OPENCL_INCLUDE_DIR - NAMES CL/cl.h OpenCL/cl.h - PATHS ${OPENCL_INCLUDE_SEARCH_PATH} - NO_DEFAULT_PATH) - else() - find_path(OPENCL_INCLUDE_DIR - NAMES CL/cl.h OpenCL/cl.h) - endif() - - if(OPENCL_LIB_SEARCH_PATH) - find_library(OPENCL_LIBRARY NAMES OpenCL PATHS ${OPENCL_LIB_SEARCH_PATH} NO_DEFAULT_PATH) - else() - find_library(OPENCL_LIBRARY NAMES OpenCL) - endif() + find_library(OPENCL_LIBRARY + NAMES OpenCL + HINTS ${OPENCL_ROOT_DIR} + PATH_SUFFIXES ${OPENCL_POSSIBLE_LIB_SUFFIXES} + DOC "OpenCL library") + mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY) include(FindPackageHandleStandardArgs) - find_package_handle_standard_args( - OPENCL - DEFAULT_MSG - OPENCL_LIBRARY OPENCL_INCLUDE_DIR - ) + FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL DEFAULT_MSG OPENCL_LIBRARY OPENCL_INCLUDE_DIR ) + endif() endif(APPLE) - if(OPENCL_FOUND) - set(OPENCL_LIBRARIES ${OPENCL_LIBRARY}) - set(HAVE_OPENCL 1) - else() - set(OPENCL_LIBRARIES) +if(OPENCL_FOUND) + set(HAVE_OPENCL 1) + set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR}) + set(OPENCL_LIBRARIES ${OPENCL_LIBRARY}) + + if (X86_64) + set(CLAMD_POSSIBLE_LIB_SUFFIXES lib64/import) + elseif (X86) + set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import) + endif() + + if(WITH_OPENCLAMDFFT) + find_path(CLAMDFFT_ROOT_DIR + NAMES include/clAmdFft.h + PATHS ENV CLAMDFFT_PATH ENV ProgramFiles + PATH_SUFFIXES clAmdFft AMD/clAmdFft + DOC "AMD FFT root directory" + NO_DEFAULT_PATH) + + find_path(CLAMDFFT_INCLUDE_DIR + NAMES clAmdFft.h + HINTS ${CLAMDFFT_ROOT_DIR} + PATH_SUFFIXES include + DOC "clAmdFft include directory") + + find_library(CLAMDFFT_LIBRARY + NAMES clAmdFft.Runtime + HINTS ${CLAMDFFT_ROOT_DIR} + PATH_SUFFIXES ${CLAMD_POSSIBLE_LIB_SUFFIXES} + DOC "clAmdFft library") + + if(CLAMDFFT_LIBRARY AND CLAMDFFT_INCLUDE_DIR) + set(HAVE_CLAMDFFT 1) + list(APPEND OPENCL_INCLUDE_DIRS "${CLAMDFFT_INCLUDE_DIR}") + list(APPEND OPENCL_LIBRARIES "${CLAMDFFT_LIBRARY}") + endif() + endif() + + if(WITH_OPENCLAMDBLAS) + find_path(CLAMDBLAS_ROOT_DIR + NAMES include/clAmdBlas.h + PATHS ENV CLAMDBLAS_PATH ENV ProgramFiles + PATH_SUFFIXES clAmdBlas AMD/clAmdBlas + DOC "AMD BLAS root directory" + NO_DEFAULT_PATH) + + find_path(CLAMDBLAS_INCLUDE_DIR + NAMES clAmdBlas.h + HINTS ${CLAMDBLAS_ROOT_DIR} + PATH_SUFFIXES include + DOC "clAmdBlas include directory") + + find_library(CLAMDBLAS_LIBRARY + NAMES clAmdBlas + HINTS ${CLAMDBLAS_ROOT_DIR} + PATH_SUFFIXES ${CLAMD_POSSIBLE_LIB_SUFFIXES} + DOC "clAmdBlas library") + + if(CLAMDBLAS_LIBRARY AND CLAMDBLAS_INCLUDE_DIR) + set(HAVE_CLAMDBLAS 1) + list(APPEND OPENCL_INCLUDE_DIRS "${CLAMDBLAS_INCLUDE_DIR}") + list(APPEND OPENCL_LIBRARIES "${CLAMDBLAS_LIBRARY}") endif() - else() - set(HAVE_OPENCL 1) endif() endif() diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index 48aa7139a..75f91a1a2 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -444,6 +444,18 @@ macro(ocv_glob_module_sources) source_group("Src\\Cuda" FILES ${lib_device_srcs} ${lib_device_hdrs}) endif() + file(GLOB cl_kernels "src/opencl/*.cl") + + if(HAVE_OPENCL AND cl_kernels) + 
ocv_include_directories(${OPENCL_INCLUDE_DIRS}) + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" + COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake" + DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake") + source_group("Src\\OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp") + list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp") + endif() + ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} SOURCES ${lib_srcs} ${lib_int_hdrs} ${device_objs} ${lib_device_srcs} ${lib_device_hdrs}) @@ -465,6 +477,9 @@ macro(ocv_create_module) if (HAVE_CUDA) target_link_libraries(${the_module} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) endif() + if(HAVE_OPENCL AND OPENCL_LIBRARIES) + target_link_libraries(${the_module} ${OPENCL_LIBRARIES}) + endif() endif() add_dependencies(opencv_modules ${the_module}) diff --git a/modules/ocl/cl2cpp.cmake b/cmake/cl2cpp.cmake similarity index 100% rename from modules/ocl/cl2cpp.cmake rename to cmake/cl2cpp.cmake diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index 158881bd9..0955a1a3a 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -195,9 +195,9 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero ) d1 = _mm_sub_epi16(d1, c1); __m128i c2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z); - __m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z); + __m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x - 1)), z); __m128i d2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z); - __m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z); + __m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x + 1)), z); d2 = _mm_sub_epi16(d2, c2); d3 = _mm_sub_epi16(d3, c3); diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index bf376c875..53ea2d40e 100644 --- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -531,12 +531,12 @@ template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double #endif template<typename _Tp> void -JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, int m, int n, int n1, double minval) +JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, + int m, int n, int n1, double minval, _Tp eps) { VBLAS<_Tp> vblas; AutoBuffer<double> Wbuf(n); double* W = Wbuf; - _Tp eps = DBL_EPSILON*10; int i, j, k, iter, max_iter = std::max(m, 30); _Tp c, s; double sd; @@ -729,12 +729,12 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, int m, int static void JacobiSVD(float* At, size_t astep, float* W, float* Vt, size_t vstep, int m, int n, int n1=-1) { - JacobiSVDImpl_(At, astep, W, Vt, vstep, m, n, !Vt ? 0 : n1 < 0 ? n : n1, FLT_MIN); + JacobiSVDImpl_(At, astep, W, Vt, vstep, m, n, !Vt ? 0 : n1 < 0 ? n : n1, FLT_MIN, FLT_EPSILON*2); } static void JacobiSVD(double* At, size_t astep, double* W, double* Vt, size_t vstep, int m, int n, int n1=-1) { - JacobiSVDImpl_(At, astep, W, Vt, vstep, m, n, !Vt ? 0 : n1 < 0 ? 
n : n1, DBL_MIN, DBL_EPSILON*10); } /* y[0:m,0:n] += diag(a[0:1,0:m]) * x[0:m,0:n] */ diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index e02f78c54..bbe754b33 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -2599,6 +2599,35 @@ TEST(Core_Trace, accuracy) { Core_TraceTest test; test.safe_run(); } TEST(Core_SolvePoly, accuracy) { Core_SolvePolyTest test; test.safe_run(); } TEST(Core_Phase, accuracy) { Core_PhaseTest test; test.safe_run(); } + +TEST(Core_SVD, flt) +{ + float a[] = { + 1.23377746e+011f, -7.05490125e+010f, -4.18380882e+010f, -11693456.f, + -39091328.f, 77492224.f, -7.05490125e+010f, 2.36211143e+011f, + -3.51093473e+010f, 70773408.f, -4.83386156e+005f, -129560368.f, + -4.18380882e+010f, -3.51093473e+010f, 9.25311222e+010f, -49052424.f, + 43922752.f, 12176842.f, -11693456.f, 70773408.f, -49052424.f, 8.40836094e+004f, + 5.17475293e+003f, -1.16122949e+004f, -39091328.f, -4.83386156e+005f, + 43922752.f, 5.17475293e+003f, 5.16047969e+004f, 5.68887842e+003f, 77492224.f, + -129560368.f, 12176842.f, -1.16122949e+004f, 5.68887842e+003f, + 1.28060578e+005f + }; + + float b[] = { + 283751232.f, 2.61604198e+009f, -745033216.f, 2.31125625e+005f, + -4.52429188e+005f, -1.37596525e+006f + }; + + Mat A(6, 6, CV_32F, a); + Mat B(6, 1, CV_32F, b); + Mat X, B1; + solve(A, B, X, DECOMP_SVD); + B1 = A*X; + EXPECT_LE(norm(B1, B, NORM_L2 + NORM_RELATIVE), FLT_EPSILON*10); +} + + // TODO: eigenvv, invsqrt, cbrt, fastarctan, (round, floor, ceil(?)), diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index 26f806fb8..9db4e5f09 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -3,7 +3,7 @@ if(ANDROID OR IOS) endif() set(the_description "GPU-accelerated Computer Vision") -ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_photo opencv_legacy) +ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy) ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda") diff --git a/modules/gpu/doc/feature_detection_and_description.rst b/modules/gpu/doc/feature_detection_and_description.rst index 0c4caf77c..8a0288e15 100644 --- a/modules/gpu/doc/feature_detection_and_description.rst +++ b/modules/gpu/doc/feature_detection_and_description.rst @@ -5,109 +5,6 @@ Feature Detection and Description -gpu::SURF_GPU ------------- -.. ocv:class:: gpu::SURF_GPU -Class used for extracting Speeded Up Robust Features (SURF) from an image. :: - class SURF_GPU - { - public: - enum KeypointLayout - { - X_ROW = 0, - Y_ROW, - LAPLACIAN_ROW, - OCTAVE_ROW, - SIZE_ROW, - ANGLE_ROW, - HESSIAN_ROW, - ROWS_COUNT - }; - - //! the default constructor - SURF_GPU(); - //! the full constructor taking all the necessary parameters - explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4, - int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f); - - //! returns the descriptor size in float's (64 or 128) - int descriptorSize() const; - - //! upload host keypoints to device memory - void uploadKeypoints(const vector<KeyPoint>& keypoints, - GpuMat& keypointsGPU); - //! download keypoints from device to host memory - void downloadKeypoints(const GpuMat& keypointsGPU, - vector<KeyPoint>& keypoints); - - //! 
download descriptors from device to host memory - void downloadDescriptors(const GpuMat& descriptorsGPU, - vector<float>& descriptors); - - void operator()(const GpuMat& img, const GpuMat& mask, - GpuMat& keypoints); - - void operator()(const GpuMat& img, const GpuMat& mask, - GpuMat& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints = false, - bool calcOrientation = true); - - void operator()(const GpuMat& img, const GpuMat& mask, - std::vector<KeyPoint>& keypoints); - - void operator()(const GpuMat& img, const GpuMat& mask, - std::vector<KeyPoint>& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints = false, - bool calcOrientation = true); - - void operator()(const GpuMat& img, const GpuMat& mask, - std::vector<KeyPoint>& keypoints, - std::vector<float>& descriptors, - bool useProvidedKeypoints = false, - bool calcOrientation = true); - - void releaseMemory(); - - // SURF parameters - double hessianThreshold; - int nOctaves; - int nOctaveLayers; - bool extended; - bool upright; - - //! max keypoints = keypointsRatio * img.size().area() - float keypointsRatio; - - GpuMat sum, mask1, maskSum, intBuffer; - - GpuMat det, trace; - - GpuMat maxPosBuffer; - }; - - -The class ``SURF_GPU`` implements Speeded Up Robust Features descriptor. There is a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option). But the descriptors can also be computed for the user-specified keypoints. Only 8-bit grayscale images are supported. - -The class ``SURF_GPU`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``GpuMat``. The ``keypoints`` matrix is :math:`\texttt{nFeatures} \times 7` matrix with the ``CV_32FC1`` type. - -* ``keypoints.ptr<float>(X_ROW)[i]`` contains x coordinate of the i-th feature. -* ``keypoints.ptr<float>(Y_ROW)[i]`` contains y coordinate of the i-th feature. -* ``keypoints.ptr<float>(LAPLACIAN_ROW)[i]`` contains the laplacian sign of the i-th feature. -* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i-th feature. -* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i-th feature. -* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contain orientation of the i-th feature. -* ``keypoints.ptr<float>(HESSIAN_ROW)[i]`` contains the response of the i-th feature. -The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type. -The class ``SURF_GPU`` uses some buffers and provides access to it. All buffers can be safely released between function calls. -.. seealso:: :ocv:class:`SURF` - - - gpu::FAST_GPU ------------- .. ocv:class:: gpu::FAST_GPU diff --git a/modules/gpu/doc/video.rst b/modules/gpu/doc/video.rst index fc5b1fb6c..284bb17fa 100644 --- a/modules/gpu/doc/video.rst +++ b/modules/gpu/doc/video.rst @@ -579,76 +579,6 @@ Releases all inner buffer's memory. -gpu::VIBE_GPU ------------- .. ocv:class:: gpu::VIBE_GPU -Class used for background/foreground segmentation. :: - class VIBE_GPU - { - public: - explicit VIBE_GPU(unsigned long rngSeed = 1234567); - - void initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null()); - - void operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null()); - - void release(); - - ... - }; - -The class discriminates between foreground and background pixels by building and maintaining a model of the background. 
Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [VIBE2011]_. - - - -gpu::VIBE_GPU::VIBE_GPU ------------------------ -The constructor. - -.. ocv:function:: gpu::VIBE_GPU::VIBE_GPU(unsigned long rngSeed = 1234567) - - :param rngSeed: Value used to initiate a random sequence. - -Default constructor sets all parameters to default values. - - - -gpu::VIBE_GPU::initialize -------------------------- -Initialize background model and allocates all inner buffers. - -.. ocv:function:: void gpu::VIBE_GPU::initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null()) - - :param firstFrame: First frame from video sequence. - - :param stream: Stream for the asynchronous version. - - - -gpu::VIBE_GPU::operator() -------------------------- -Updates the background model and returns the foreground mask - -.. ocv:function:: void gpu::VIBE_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null()) - - :param frame: Next video frame. - - :param fgmask: The output foreground mask as an 8-bit binary image. - - :param stream: Stream for the asynchronous version. - - - -gpu::VIBE_GPU::release ----------------------- -Releases all inner buffer's memory. - -.. ocv:function:: void gpu::VIBE_GPU::release() - - - gpu::GMG_GPU ------------ .. ocv:class:: gpu::GMG_GPU @@ -1209,5 +1139,4 @@ Parse next video frame. Implementation must call this method after new frame was .. [MOG2001] P. KadewTraKuPong and R. Bowden. *An improved adaptive background mixture model for real-time tracking with shadow detection*. Proc. 2nd European Workshop on Advanced Video-Based Surveillance Systems, 2001 .. [MOG2004] Z. Zivkovic. *Improved adaptive Gausian mixture model for background subtraction*. International Conference Pattern Recognition, UK, August, 2004 .. [ShadowDetect2003] Prati, Mikic, Trivedi and Cucchiarra. *Detecting Moving Shadows...*. IEEE PAMI, 2003 -.. [VIBE2011] O. Barnich and M. Van D Roogenbroeck. *ViBe: A universal background subtraction algorithm for video sequences*. IEEE Transactions on Image Processing, 20(6) :1709-1724, June 2011 .. [GMG2012] A. Godbehere, A. Matsukawa and K. Goldberg. *Visual Tracking of Human Visitors under Variable-Lighting Conditions for a Responsive Audio Art Installation*. American Control Conference, Montreal, June 2012 diff --git a/modules/gpu/include/opencv2/gpu.hpp b/modules/gpu/include/opencv2/gpu.hpp index 21a03dc20..e0933342b 100644 --- a/modules/gpu/include/opencv2/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu.hpp @@ -491,6 +491,26 @@ CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& //! converts image from one color space to another CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null()); +enum +{ + // Bayer Demosaicing (Malvar, He, and Cutler) + COLOR_BayerBG2BGR_MHT = 256, + COLOR_BayerGB2BGR_MHT = 257, + COLOR_BayerRG2BGR_MHT = 258, + COLOR_BayerGR2BGR_MHT = 259, + + COLOR_BayerBG2RGB_MHT = COLOR_BayerRG2BGR_MHT, + COLOR_BayerGB2RGB_MHT = COLOR_BayerGR2BGR_MHT, + COLOR_BayerRG2RGB_MHT = COLOR_BayerBG2BGR_MHT, + COLOR_BayerGR2RGB_MHT = COLOR_BayerGB2BGR_MHT, + + COLOR_BayerBG2GRAY_MHT = 260, + COLOR_BayerGB2GRAY_MHT = 261, + COLOR_BayerRG2GRAY_MHT = 262, + COLOR_BayerGR2GRAY_MHT = 263 +}; +CV_EXPORTS void demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn = -1, Stream& stream = Stream::Null()); + //! swap channels //! 
dstOrder - Integer array describing how channel values are permutated. The n-th entry +//! of the array contains the number of the channel that is stored in the n-th channel of @@ -894,9 +914,11 @@ CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels //! Calculates histogram for 8u one channel image //! Output hist will have one row, 256 cols and CV32SC1 type. CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null()); +CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); //! normalizes the grayscale image brightness and contrast by normalizing its histogram CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); +CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null()); CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); //////////////////////////////// StereoBM_GPU //////////////////////////////// @@ -1386,82 +1408,6 @@ private: friend class CascadeClassifier_GPU_LBP; }; -////////////////////////////////// SURF ////////////////////////////////////////// -class CV_EXPORTS SURF_GPU -{ -public: - enum KeypointLayout - { - X_ROW = 0, - Y_ROW, - LAPLACIAN_ROW, - OCTAVE_ROW, - SIZE_ROW, - ANGLE_ROW, - HESSIAN_ROW, - ROWS_COUNT - }; - - //! the default constructor - SURF_GPU(); - //! the full constructor taking all the necessary parameters - explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4, - int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false); - - //! returns the descriptor size in float's (64 or 128) - int descriptorSize() const; - - //! upload host keypoints to device memory - static void uploadKeypoints(const std::vector<KeyPoint>& keypoints, GpuMat& keypointsGPU); - //! download keypoints from device to host memory - static void downloadKeypoints(const GpuMat& keypointsGPU, std::vector<KeyPoint>& keypoints); - - //! download descriptors from device to host memory - static void downloadDescriptors(const GpuMat& descriptorsGPU, std::vector<float>& descriptors); - - //! finds the keypoints using fast hessian detector used in SURF - //! supports CV_8UC1 images - //! keypoints will have nFeature cols and 6 rows - //! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature - //! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature - //! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature - //! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature - //! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature - //! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature - //! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature - void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints); - //! finds the keypoints and computes their descriptors. - //! 
Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction - void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints = false); - - void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints); - void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints = false); - - void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors, - bool useProvidedKeypoints = false); - - void releaseMemory(); - - // SURF parameters - double hessianThreshold; - int nOctaves; - int nOctaveLayers; - bool extended; - bool upright; - - //! max keypoints = min(keypointsRatio * img.size().area(), 65535) - float keypointsRatio; - - GpuMat sum, mask1, maskSum, intBuffer; - - GpuMat det, trace; - - GpuMat maxPosBuffer; -}; - ////////////////////////////////// FAST ////////////////////////////////////////// class CV_EXPORTS FAST_GPU @@ -2129,41 +2075,6 @@ private: GpuMat bgmodelUsedModes_; //keep track of number of modes per pixel }; -/*! - * The class implements the following algorithm: - * "ViBe: A universal background subtraction algorithm for video sequences" - * O. Barnich and M. Van D Roogenbroeck - * IEEE Transactions on Image Processing, 20(6) :1709-1724, June 2011 - */ -class CV_EXPORTS VIBE_GPU -{ -public: - //! the default constructor - explicit VIBE_GPU(unsigned long rngSeed = 1234567); - - //! re-initiaization method - void initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null()); - - //! the update operator - void operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null()); - - //! releases all inner buffers - void release(); - - int nbSamples; // number of samples per pixel - int reqMatches; // #_min - int radius; // R - int subsamplingFactor; // amount of random subsampling - -private: - Size frameSize_; - - unsigned long rngSeed_; - GpuMat randStates_; - - GpuMat samples_; -}; - /** * Background Subtractor module. Takes a series of images and returns a sequence of mask (8UC1) * images of the same size, where 255 indicates Foreground and 0 represents Background. diff --git a/modules/gpu/include/opencv2/gpu/device/simd_functions.hpp b/modules/gpu/include/opencv2/gpu/device/simd_functions.hpp new file mode 100644 index 000000000..55b1f247f --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/simd_functions.hpp @@ -0,0 +1,910 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2010-2013, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* + * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * Neither the name of NVIDIA Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __OPENCV_GPU_SIMD_FUNCTIONS_HPP__ +#define __OPENCV_GPU_SIMD_FUNCTIONS_HPP__ + +#include "common.hpp" + +/* + This header file contains inline functions that implement intra-word SIMD + operations, that are hardware accelerated on sm_3x (Kepler) GPUs. Efficient + emulation code paths are provided for earlier architectures (sm_1x, sm_2x) + to make the code portable across all GPUs supported by CUDA. 
The following + functions are currently implemented: + + vadd2(a,b) per-halfword unsigned addition, with wrap-around: a + b + vsub2(a,b) per-halfword unsigned subtraction, with wrap-around: a - b + vabsdiff2(a,b) per-halfword unsigned absolute difference: |a - b| + vavg2(a,b) per-halfword unsigned average: (a + b) / 2 + vavrg2(a,b) per-halfword unsigned rounded average: (a + b + 1) / 2 + vseteq2(a,b) per-halfword unsigned comparison: a == b ? 1 : 0 + vcmpeq2(a,b) per-halfword unsigned comparison: a == b ? 0xffff : 0 + vsetge2(a,b) per-halfword unsigned comparison: a >= b ? 1 : 0 + vcmpge2(a,b) per-halfword unsigned comparison: a >= b ? 0xffff : 0 + vsetgt2(a,b) per-halfword unsigned comparison: a > b ? 1 : 0 + vcmpgt2(a,b) per-halfword unsigned comparison: a > b ? 0xffff : 0 + vsetle2(a,b) per-halfword unsigned comparison: a <= b ? 1 : 0 + vcmple2(a,b) per-halfword unsigned comparison: a <= b ? 0xffff : 0 + vsetlt2(a,b) per-halfword unsigned comparison: a < b ? 1 : 0 + vcmplt2(a,b) per-halfword unsigned comparison: a < b ? 0xffff : 0 + vsetne2(a,b) per-halfword unsigned comparison: a != b ? 1 : 0 + vcmpne2(a,b) per-halfword unsigned comparison: a != b ? 0xffff : 0 + vmax2(a,b) per-halfword unsigned maximum: max(a, b) + vmin2(a,b) per-halfword unsigned minimum: min(a, b) + + vadd4(a,b) per-byte unsigned addition, with wrap-around: a + b + vsub4(a,b) per-byte unsigned subtraction, with wrap-around: a - b + vabsdiff4(a,b) per-byte unsigned absolute difference: |a - b| + vavg4(a,b) per-byte unsigned average: (a + b) / 2 + vavrg4(a,b) per-byte unsigned rounded average: (a + b + 1) / 2 + vseteq4(a,b) per-byte unsigned comparison: a == b ? 1 : 0 + vcmpeq4(a,b) per-byte unsigned comparison: a == b ? 0xff : 0 + vsetge4(a,b) per-byte unsigned comparison: a >= b ? 1 : 0 + vcmpge4(a,b) per-byte unsigned comparison: a >= b ? 0xff : 0 + vsetgt4(a,b) per-byte unsigned comparison: a > b ? 1 : 0 + vcmpgt4(a,b) per-byte unsigned comparison: a > b ? 0xff : 0 + vsetle4(a,b) per-byte unsigned comparison: a <= b ? 1 : 0 + vcmple4(a,b) per-byte unsigned comparison: a <= b ? 0xff : 0 + vsetlt4(a,b) per-byte unsigned comparison: a < b ? 1 : 0 + vcmplt4(a,b) per-byte unsigned comparison: a < b ? 0xff : 0 + vsetne4(a,b) per-byte unsigned comparison: a != b ? 1: 0 + vcmpne4(a,b) per-byte unsigned comparison: a != b ? 
0xff: 0 + vmax4(a,b) per-byte unsigned maximum: max(a, b) + vmin4(a,b) per-byte unsigned minimum: min(a, b) +*/ + +namespace cv { namespace gpu { namespace device +{ + // 2 + + static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = a ^ b; // sum bits + r = a + b; // actual sum + s = s ^ r; // determine carry-ins for each bit position + s = s & 0x00010000; // carry-in to high word (= carry-out from low word) + r = r - s; // subtract out carry-out from low word + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = a ^ b; // sum bits + r = a - b; // actual sum + s = s ^ r; // determine carry-ins for each bit position + s = s & 0x00010000; // borrow to high word + r = r + s; // compensate for borrow from low word + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t, u, v; + s = a & 0x0000ffff; // extract low halfword + r = b & 0x0000ffff; // extract low halfword + u = ::max(r, s); // maximum of low halfwords + v = ::min(r, s); // minimum of low halfwords + s = a & 0xffff0000; // extract high halfword + r = b & 0xffff0000; // extract high halfword + t = ::max(r, s); // maximum of high halfwords + s = ::min(r, s); // minimum of high halfwords + r = u | t; // maximum of both halfwords + s = v | s; // minimum of both halfwords + r = r - s; // |a - b| = max(a,b) - min(a,b); + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b) + { + unsigned int r, s; + + // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==> + // (a + b) / 2 = (a & b) + ((a ^ b) >> 1) + s = a ^ b; + r = a & b; + s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries + s = s >> 1; + s = r + s; + + return s; + } + + static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==> + // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1) + unsigned int s; + s = a ^ b; + r = a | b; + s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries + s = s >> 1; + r = r - s; + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, 
unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + unsigned int c; + r = a ^ b; // 0x0000 if a == b + c = r | 0x80008000; // set msbs, to catch carry out + r = r ^ c; // extract msbs, msb = 1 if r < 0x8000 + c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000 + c = r & ~c; // msb = 1, if r was 0x0000 + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vseteq2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + r = a ^ b; // 0x0000 if a == b + c = r | 0x80008000; // set msbs, to catch carry out + r = r ^ c; // extract msbs, msb = 1 if r < 0x8000 + c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000 + c = r & ~c; // msb = 1, if r was 0x0000 + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2 + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetge2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2 + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down] + c = c & 0x80008000; // msbs = carry-outs + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetgt2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down] + c = c & 0x80008000; // msbs = carry-outs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2 + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert to bool + #endif + + 
return r; + } + + static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetle2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2 + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down] + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetlt2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down] + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + unsigned int c; + r = a ^ b; // 0x0000 if a == b + c = r | 0x80008000; // set msbs, to catch carry out + c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000 + c = r | c; // msb = 1, if r was not 0x0000 + c = c & 0x80008000; // extract msbs + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetne2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + r = a ^ b; // 0x0000 if a == b + c = r | 0x80008000; // set msbs, to catch carry out + c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000 + c = r | c; // msb = 1, if r was not 0x0000 + c = c & 0x80008000; // extract msbs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t, u; + r = a & 0x0000ffff; // extract low halfword + s = b & 0x0000ffff; // extract low halfword + t = ::max(r, s); // maximum of low halfwords + r = a & 0xffff0000; // extract high halfword + s = b & 0xffff0000; // extract high halfword + u = ::max(r, s); // maximum of high halfwords + r = t | u; // combine halfword maximums + #endif + + return r; + } + 
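+    // A short usage sketch (illustrative only; the kernel below is a
+    // hypothetical example, not something this header provides): each 32-bit
+    // word packs two unsigned halfwords, so one thread can combine two
+    // 16-bit elements per call.
+    //
+    //   __global__ void max2Kernel(const unsigned int* a, const unsigned int* b,
+    //                              unsigned int* dst, int n)
+    //   {
+    //       const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    //       if (i < n)
+    //           dst[i] = cv::gpu::device::vmax2(a[i], b[i]); // per-halfword max
+    //   }
+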
+ static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t, u; + r = a & 0x0000ffff; // extract low halfword + s = b & 0x0000ffff; // extract low halfword + t = ::min(r, s); // minimum of low halfwords + r = a & 0xffff0000; // extract high halfword + s = b & 0xffff0000; // extract high halfword + u = ::min(r, s); // minimum of high halfwords + r = t | u; // combine halfword minimums + #endif + + return r; + } + + // 4 + + static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t; + s = a ^ b; // sum bits + r = a & 0x7f7f7f7f; // clear msbs + t = b & 0x7f7f7f7f; // clear msbs + s = s & 0x80808080; // msb sum bits + r = r + t; // add without msbs, record carry-out in msbs + r = r ^ s; // sum of msb sum and carry-in bits, w/o carry-out + #endif /* __CUDA_ARCH__ >= 300 */ + + return r; + } + + static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t; + s = a ^ ~b; // inverted sum bits + r = a | 0x80808080; // set msbs + t = b & 0x7f7f7f7f; // clear msbs + s = s & 0x80808080; // inverted msb sum bits + r = r - t; // subtract w/o msbs, record inverted borrows in msb + r = r ^ s; // combine inverted msb sum bits and borrows + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b) + { + unsigned int r, s; + + // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==> + // (a + b) / 2 = (a & b) + ((a ^ b) >> 1) + s = a ^ b; + r = a & b; + s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries + s = s >> 1; + s = r + s; + + return s; + } + + static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==> + // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1) + unsigned int c; + c = a ^ b; + r = a | b; + c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries + c = c >> 
1; + r = r - c; + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + unsigned int c; + r = a ^ b; // 0x00 if a == b + c = r | 0x80808080; // set msbs, to catch carry out + r = r ^ c; // extract msbs, msb = 1 if r < 0x80 + c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80 + c = r & ~c; // msb = 1, if r was 0x00 + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b) + { + unsigned int r, t; + + #if __CUDA_ARCH__ >= 300 + r = vseteq4(a, b); + t = r << 8; // convert bool + r = t - r; // to mask + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + t = a ^ b; // 0x00 if a == b + r = t | 0x80808080; // set msbs, to catch carry out + t = t ^ r; // extract msbs, msb = 1 if t < 0x80 + r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80 + r = t & ~r; // msb = 1, if t was 0x00 + t = r >> 7; // build mask + t = r - t; // from + r = t | r; // msbs + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2 + c = c & 0x80808080; // msb = carry-outs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetle4(a, b); + c = r << 8; // convert bool + r = c - r; // to mask + #else + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2 + c = c & 0x80808080; // msbs = carry-outs + r = c >> 7; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down] + c = c & 0x80808080; // msb = carry-outs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetlt4(a, b); + c = r << 8; // convert bool + r = c - r; // to mask + #else + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down] + c = c & 0x80808080; // msbs = carry-outs + r = c >> 7; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavrg4(a, b); // (a + ~b + 1) / 2 = (a - b) / 2 + c = c & 
0x80808080; // msb = carry-outs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b) + { + unsigned int r, s; + + #if __CUDA_ARCH__ >= 300 + r = vsetge4(a, b); + s = r << 8; // convert bool + r = s - r; // to mask + #else + asm ("not.b32 %0,%0;" : "+r"(b)); + r = vavrg4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2 + r = r & 0x80808080; // msb = carry-outs + s = r >> 7; // build mask + s = r - s; // from + r = s | r; // msbs + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down] + c = c & 0x80808080; // msb = carry-outs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetgt4(a, b); + c = r << 8; // convert bool + r = c - r; // to mask + #else + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down] + c = c & 0x80808080; // msb = carry-outs + r = c >> 7; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + unsigned int c; + r = a ^ b; // 0x00 if a == b + c = r | 0x80808080; // set msbs, to catch carry out + c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80 + c = r | c; // msb = 1, if r was not 0x00 + c = c & 0x80808080; // extract msbs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetne4(a, b); + c = r << 8; // convert bool + r = c - r; // to mask + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + r = a ^ b; // 0x00 if a == b + c = r | 0x80808080; // set msbs, to catch carry out + c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80 + c = r | c; // msb = 1, if r was not 0x00 + c = c & 0x80808080; // extract msbs + r = c >> 7; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = vcmpge4(a, b); // mask = 0xff if a >= b + r = a ^ b; // + s = (r & s) ^ b; // select a 
when a >= b, else select b => max(a,b) + r = s ^ r; // select a when b >= a, else select b => min(a,b) + r = s - r; // |a - b| = max(a,b) - min(a,b); + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = vcmpge4(a, b); // mask = 0xff if a >= b + r = a & s; // select a when b >= a + s = b & ~s; // select b when b < a + r = r | s; // combine byte selections + #endif + + return r; // byte-wise unsigned maximum + } + + static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = vcmpge4(b, a); // mask = 0xff if a >= b + r = a & s; // select a when b >= a + s = b & ~s; // select b when b < a + r = r | s; // combine byte selections + #endif + + return r; + } +}}} + +#endif // __OPENCV_GPU_SIMD_FUNCTIONS_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index d82211bf3..159f4b968 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -7,7 +7,7 @@ // copy or use the software. // // -// License Agreement +// License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 
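Editorial aside: the header closed above exposes vadd4 and friends as plain device functions in cv::gpu::device, so a kernel can process four packed 8-bit pixels per 32-bit word. The following is a minimal usage sketch, not part of the patch; the kernel name, launch shape, and the assumption that the byte buffers are 4-byte aligned and padded to whole words are all illustrative.

    #include "opencv2/gpu/device/simd_functions.hpp"

    // Hypothetical kernel: per-byte saturating add of two CV_8UC1 buffers that
    // have been reinterpreted as arrays of n4 32-bit words.
    __global__ void addBytesSat(const unsigned int* a, const unsigned int* b,
                                unsigned int* dst, int n4)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n4)
            dst[i] = cv::gpu::device::vadd4(a[i], b[i]); // 4 uchar lanes per call
    }
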
diff --git a/modules/gpu/perf/perf_calib3d.cpp b/modules/gpu/perf/perf_calib3d.cpp index b174d9a12..8019c0349 100644 --- a/modules/gpu/perf/perf_calib3d.cpp +++ b/modules/gpu/perf/perf_calib3d.cpp @@ -2,6 +2,7 @@ using namespace std; using namespace testing; +using namespace perf; ////////////////////////////////////////////////////////////////////// // StereoBM @@ -12,7 +13,7 @@ DEF_PARAM_TEST_1(ImagePair, pair_string); PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png"))) { - declare.time(5.0); + declare.time(300.0); const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE); ASSERT_FALSE(imgLeft.empty()); @@ -53,7 +54,7 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBM, PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, Values(pair_string("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png"))) { - declare.time(10.0); + declare.time(300.0); const cv::Mat imgLeft = readImage(GET_PARAM(0)); ASSERT_FALSE(imgLeft.empty()); @@ -87,7 +88,7 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP, Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png"))) { - declare.time(10.0); + declare.time(300.0); const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE); ASSERT_FALSE(imgLeft.empty());
diff --git a/modules/gpu/perf/perf_core.cpp b/modules/gpu/perf/perf_core.cpp index 22840f9f4..70bb8f24f 100644 --- a/modules/gpu/perf/perf_core.cpp +++ b/modules/gpu/perf/perf_core.cpp @@ -1748,7 +1748,10 @@ PERF_TEST_P(Sz_Depth_Norm, Core_Norm, const int normType = GET_PARAM(2); cv::Mat src(size, depth); - declare.in(src, WARMUP_RNG); + if (depth == CV_8U) + cv::randu(src, 0, 254); + else + declare.in(src, WARMUP_RNG); if (PERF_RUN_GPU()) { @@ -1923,7 +1926,10 @@ PERF_TEST_P(Sz_Depth, Core_MinMax, const int depth = GET_PARAM(1); cv::Mat src(size, depth); - declare.in(src, WARMUP_RNG); + if (depth == CV_8U) + cv::randu(src, 0, 254); + else + declare.in(src, WARMUP_RNG); if (PERF_RUN_GPU()) { @@ -1958,7 +1964,10 @@ PERF_TEST_P(Sz_Depth, Core_MinMaxLoc, const int depth = GET_PARAM(1); cv::Mat src(size, depth); - declare.in(src, WARMUP_RNG); + if (depth == CV_8U) + cv::randu(src, 0, 254); + else + declare.in(src, WARMUP_RNG); if (PERF_RUN_GPU()) {
diff --git a/modules/gpu/perf/perf_denoising.cpp b/modules/gpu/perf/perf_denoising.cpp index 6f03994bd..970122568 100644 --- a/modules/gpu/perf/perf_denoising.cpp +++ b/modules/gpu/perf/perf_denoising.cpp @@ -2,6 +2,7 @@ using namespace std; using namespace testing; +using namespace perf; #define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::sz720p) @@ -63,7 +64,7 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans, Values(21), Values(5))) { - declare.time(60.0); + declare.time(600.0); const cv::Size size = GET_PARAM(0); const int depth = GET_PARAM(1);
diff --git a/modules/gpu/perf/perf_features2d.cpp b/modules/gpu/perf/perf_features2d.cpp index 480f58238..e5a6ef3c8 100644 --- a/modules/gpu/perf/perf_features2d.cpp +++ b/modules/gpu/perf/perf_features2d.cpp @@ -2,105 +2,7 @@ using namespace std; using namespace testing; - -struct KeypointIdxCompare - { - std::vector<cv::KeyPoint>* keypoints; - - explicit KeypointIdxCompare(std::vector<cv::KeyPoint>* _keypoints) : keypoints(_keypoints) {} - - bool operator ()(size_t i1, size_t i2) const - { - cv::KeyPoint kp1 = (*keypoints)[i1]; - cv::KeyPoint kp2 = (*keypoints)[i2]; - if (kp1.pt.x != kp2.pt.x) - return kp1.pt.x < kp2.pt.x; - if (kp1.pt.y != kp2.pt.y) - return kp1.pt.y < kp2.pt.y; - if (kp1.response != kp2.response) - return kp1.response < kp2.response; - return kp1.octave < kp2.octave; - } -}; - -static void sortKeyPoints(std::vector<cv::KeyPoint>& keypoints, cv::InputOutputArray _descriptors = cv::noArray()) -{ - std::vector<size_t> indexies(keypoints.size()); - for (size_t i = 0; i < indexies.size(); ++i) - indexies[i] = i; - - std::sort(indexies.begin(), indexies.end(), KeypointIdxCompare(&keypoints)); - - std::vector<cv::KeyPoint> new_keypoints; - cv::Mat new_descriptors; - - new_keypoints.resize(keypoints.size()); - - cv::Mat descriptors; - if (_descriptors.needed()) - { - descriptors = _descriptors.getMat(); - new_descriptors.create(descriptors.size(), descriptors.type()); - } - - for (size_t i = 0; i < indexies.size(); ++i) - { - size_t new_idx = indexies[i]; - new_keypoints[i] = keypoints[new_idx]; - if (!new_descriptors.empty()) - descriptors.row((int) new_idx).copyTo(new_descriptors.row((int) i)); - } - - keypoints.swap(new_keypoints); - if (_descriptors.needed()) - new_descriptors.copyTo(_descriptors); -} - -////////////////////////////////////////////////////////////////////// -// SURF - -DEF_PARAM_TEST_1(Image, string); - -PERF_TEST_P(Image, Features2D_SURF, - Values("gpu/perf/aloe.png")) -{ - declare.time(50.0); - - const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE); - ASSERT_FALSE(img.empty()); - - if (PERF_RUN_GPU()) - { - cv::gpu::SURF_GPU d_surf; - - const cv::gpu::GpuMat d_img(img); - cv::gpu::GpuMat d_keypoints, d_descriptors; - - TEST_CYCLE() d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors); - - std::vector<cv::KeyPoint> gpu_keypoints; - d_surf.downloadKeypoints(d_keypoints, gpu_keypoints); - - cv::Mat gpu_descriptors(d_descriptors); - - sortKeyPoints(gpu_keypoints, gpu_descriptors); - - SANITY_CHECK_KEYPOINTS(gpu_keypoints); - SANITY_CHECK(gpu_descriptors, 1e-3); - } - else - { - cv::SURF surf; - - std::vector<cv::KeyPoint> cpu_keypoints; - cv::Mat cpu_descriptors; - - TEST_CYCLE() surf(img, cv::noArray(), cpu_keypoints, cpu_descriptors); - - SANITY_CHECK_KEYPOINTS(cpu_keypoints); - SANITY_CHECK(cpu_descriptors); - } -} +using namespace perf; ////////////////////////////////////////////////////////////////////// // FAST @@ -153,6 +55,8 @@ PERF_TEST_P(Image_NFeatures, Features2D_ORB, Combine(Values("gpu/perf/aloe.png"), Values(4000))) { + declare.time(300.0); + const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE); ASSERT_FALSE(img.empty());
diff --git a/modules/gpu/perf/perf_filters.cpp b/modules/gpu/perf/perf_filters.cpp index 3516954a6..a343d1057 100644 --- a/modules/gpu/perf/perf_filters.cpp +++ b/modules/gpu/perf/perf_filters.cpp @@ -2,6 +2,7 @@ using namespace std; using namespace testing; +using namespace perf; ////////////////////////////////////////////////////////////////////// // Blur
diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index be6eb4877..84cb0e200 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -632,7 +632,7 @@ DEF_PARAM_TEST_1(Image, string); PERF_TEST_P(Image, ImgProc_MeanShiftFiltering, Values("gpu/meanshift/cones.png")) { - declare.time(15.0); + declare.time(300.0); const cv::Mat img = readImage(GetParam()); ASSERT_FALSE(img.empty()); @@ -668,7 +668,7 @@ PERF_TEST_P(Image, ImgProc_MeanShiftFiltering, PERF_TEST_P(Image, ImgProc_MeanShiftProc, Values("gpu/meanshift/cones.png")) { - declare.time(5.0); + declare.time(300.0); const cv::Mat img = readImage(GetParam()); ASSERT_FALSE(img.empty()); @@ -702,7 +702,7 @@ PERF_TEST_P(Image, ImgProc_MeanShiftProc,
PERF_TEST_P(Image, ImgProc_MeanShiftSegmentation, Values("gpu/meanshift/cones.png")) { - declare.time(5.0); + declare.time(300.0); const cv::Mat img = readImage(GetParam()); ASSERT_FALSE(img.empty()); @@ -830,6 +830,8 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, ImgProc_MatchTemplate8U, GPU_CHANNELS_1_3_4, ALL_TEMPLATE_METHODS)) { + declare.time(300.0); + const cv::Size size = GET_PARAM(0); const cv::Size templ_size = GET_PARAM(1); const int cn = GET_PARAM(2); @@ -868,6 +870,8 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, ImgProc_MatchTemplate32F, GPU_CHANNELS_1_3_4, Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR)))) { + declare.time(300.0); + const cv::Size size = GET_PARAM(0); const cv::Size templ_size = GET_PARAM(1); const int cn = GET_PARAM(2); @@ -1034,7 +1038,7 @@ PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, ImgProc_CornerHarris, TEST_CYCLE() cv::gpu::cornerHarris(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, k, borderMode); - GPU_SANITY_CHECK(dst); + GPU_SANITY_CHECK(dst, 1e-4); } else { @@ -1077,7 +1081,7 @@ PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, ImgProc_CornerMinEigenVal, TEST_CYCLE() cv::gpu::cornerMinEigenVal(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, borderMode); - GPU_SANITY_CHECK(dst); + GPU_SANITY_CHECK(dst, 1e-4); } else { @@ -1341,7 +1345,12 @@ PERF_TEST_P(Sz_Depth_Code, ImgProc_CvtColorBayer, Values(CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR), CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR), CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR), - CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR)))) + CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR), + + CvtColorInfo(1, 1, cv::COLOR_BayerBG2GRAY), + CvtColorInfo(1, 1, cv::COLOR_BayerGB2GRAY), + CvtColorInfo(1, 1, cv::COLOR_BayerRG2GRAY), + CvtColorInfo(1, 1, cv::COLOR_BayerGR2GRAY)))) { const cv::Size size = GET_PARAM(0); const int depth = GET_PARAM(1); @@ -1369,6 +1378,50 @@ PERF_TEST_P(Sz_Depth_Code, ImgProc_CvtColorBayer, } } +CV_ENUM(DemosaicingCode, + cv::COLOR_BayerBG2BGR, cv::COLOR_BayerGB2BGR, cv::COLOR_BayerRG2BGR, cv::COLOR_BayerGR2BGR, + cv::COLOR_BayerBG2GRAY, cv::COLOR_BayerGB2GRAY, cv::COLOR_BayerRG2GRAY, cv::COLOR_BayerGR2GRAY, + cv::gpu::COLOR_BayerBG2BGR_MHT, cv::gpu::COLOR_BayerGB2BGR_MHT, cv::gpu::COLOR_BayerRG2BGR_MHT, cv::gpu::COLOR_BayerGR2BGR_MHT, + cv::gpu::COLOR_BayerBG2GRAY_MHT, cv::gpu::COLOR_BayerGB2GRAY_MHT, cv::gpu::COLOR_BayerRG2GRAY_MHT, cv::gpu::COLOR_BayerGR2GRAY_MHT) + +DEF_PARAM_TEST(Sz_Code, cv::Size, DemosaicingCode); + +PERF_TEST_P(Sz_Code, ImgProc_Demosaicing, + Combine(GPU_TYPICAL_MAT_SIZES, + ValuesIn(DemosaicingCode::all()))) +{ + const cv::Size size = GET_PARAM(0); + const int code = GET_PARAM(1); + + cv::Mat src(size, CV_8UC1); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::demosaicing(d_src, dst, code); + + GPU_SANITY_CHECK(dst); + } + else + { + if (code >= cv::COLOR_COLORCVT_MAX) + { + FAIL_NO_CPU(); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::cvtColor(src, dst, code); + + CPU_SANITY_CHECK(dst); + } + } +} + ////////////////////////////////////////////////////////////////////// // SwapChannels diff --git a/modules/gpu/perf/perf_labeling.cpp b/modules/gpu/perf/perf_labeling.cpp index f3ad12c94..cbc9ff0a2 100644 --- a/modules/gpu/perf/perf_labeling.cpp +++ b/modules/gpu/perf/perf_labeling.cpp @@ -2,6 +2,7 @@ using namespace std; using namespace testing; +using namespace perf; DEF_PARAM_TEST_1(Image, string); diff --git a/modules/gpu/perf/perf_main.cpp 
b/modules/gpu/perf/perf_main.cpp index 312b74448..07c1b519c 100644 --- a/modules/gpu/perf/perf_main.cpp +++ b/modules/gpu/perf/perf_main.cpp @@ -1,70 +1,5 @@ #include "perf_precomp.hpp" -static void printOsInfo() -{ -#if defined _WIN32 -# if defined _WIN64 - printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"), fflush(stdout); -# else - printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"), fflush(stdout); -# endif -#elif defined linux -# if defined _LP64 - printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"), fflush(stdout); -# else - printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"), fflush(stdout); -# endif -#elif defined __APPLE__ -# if defined _LP64 - printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"), fflush(stdout); -# else - printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"), fflush(stdout); -# endif -#endif - -} - -static void printCudaInfo() -{ - printOsInfo(); -#ifndef HAVE_CUDA - printf("[----------]\n[ GPU INFO ] \tOpenCV was built without CUDA support.\n[----------]\n"), fflush(stdout); -#else - int driver; - cudaDriverGetVersion(&driver); - - printf("[----------]\n"), fflush(stdout); - printf("[ GPU INFO ] \tCUDA Driver version: %d.\n", driver), fflush(stdout); - printf("[ GPU INFO ] \tCUDA Runtime version: %d.\n", CUDART_VERSION), fflush(stdout); - printf("[----------]\n"), fflush(stdout); - - printf("[----------]\n"), fflush(stdout); - printf("[ GPU INFO ] \tGPU module was compiled for the following GPU archs.\n"), fflush(stdout); - printf("[ BIN ] \t%s.\n", CUDA_ARCH_BIN), fflush(stdout); - printf("[ PTX ] \t%s.\n", CUDA_ARCH_PTX), fflush(stdout); - printf("[----------]\n"), fflush(stdout); - - printf("[----------]\n"), fflush(stdout); - int deviceCount = cv::gpu::getCudaEnabledDeviceCount(); - printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount), fflush(stdout); - printf("[----------]\n"), fflush(stdout); - - for (int i = 0; i < deviceCount; ++i) - { - cv::gpu::DeviceInfo info(i); - - printf("[----------]\n"), fflush(stdout); - printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()), fflush(stdout); - printf("[ ] \tCompute capability: %d.%d\n", (int)info.majorVersion(), (int)info.minorVersion()), fflush(stdout); - printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()), fflush(stdout); - printf("[ ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)), fflush(stdout); - printf("[ ] \tFree memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0)), fflush(stdout); - if (!info.isCompatible()) - printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n"); - printf("[----------]\n"), fflush(stdout); - } - -#endif -} +using namespace perf; CV_PERF_TEST_MAIN(gpu, printCudaInfo())
diff --git a/modules/gpu/perf/perf_matop.cpp b/modules/gpu/perf/perf_matop.cpp index 1696e3a7e..f2803f0f2 100644 --- a/modules/gpu/perf/perf_matop.cpp +++ b/modules/gpu/perf/perf_matop.cpp @@ -2,6 +2,7 @@ using namespace std; using namespace testing; +using namespace perf; ////////////////////////////////////////////////////////////////////// // SetTo
diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index 969ac1076..4f8e56853 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -2,6 +2,7 @@ using namespace std; using namespace testing; +using namespace perf; /////////////////////////////////////////////////////////////// // HOG @@ -18,6 +19,8 @@ PERF_TEST_P(Image, ObjDetect_HOG, "gpu/caltech/image_00000527_0.png", "gpu/caltech/image_00000574_0.png")) { + declare.time(300.0); + const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE); ASSERT_FALSE(img.empty());
diff --git a/modules/gpu/perf/perf_precomp.hpp b/modules/gpu/perf/perf_precomp.hpp index 71fe9e7d0..322cac094 100644 --- a/modules/gpu/perf/perf_precomp.hpp +++ b/modules/gpu/perf/perf_precomp.hpp @@ -19,6 +19,7 @@ #endif #include "opencv2/ts.hpp" +#include "opencv2/ts/gpu_perf.hpp" #include "opencv2/core.hpp" #include "opencv2/highgui.hpp" @@ -26,12 +27,9 @@ #include "opencv2/calib3d.hpp" #include "opencv2/imgproc.hpp" #include "opencv2/video.hpp" -#include "opencv2/nonfree.hpp" #include "opencv2/legacy.hpp" #include "opencv2/photo.hpp" -#include "utility.hpp" - #ifdef GTEST_CREATE_SHARED_LIBRARY #error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined #endif
diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp index 61c2687ca..b998ff95f 100644 --- a/modules/gpu/perf/perf_video.cpp +++ b/modules/gpu/perf/perf_video.cpp @@ -4,6 +4,18 @@ using namespace std; using namespace testing; using namespace perf; +#if defined(HAVE_XINE) || \ + defined(HAVE_GSTREAMER) || \ + defined(HAVE_QUICKTIME) || \ + defined(HAVE_AVFOUNDATION) || \ + defined(HAVE_FFMPEG) || \ + defined(WIN32) /* assume that we have ffmpeg */ + +# define BUILD_WITH_VIDEO_INPUT_SUPPORT 1 +#else +# define BUILD_WITH_VIDEO_INPUT_SUPPORT 0 +#endif + namespace cv { template<> void Ptr<CvBGStatModel>::delete_obj() @@ -142,7 +154,7 @@ PERF_TEST_P(Image_MinDistance, Video_GoodFeaturesToTrack, PERF_TEST_P(ImagePair, Video_BroxOpticalFlow, Values(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png"))) { - declare.time(10); + declare.time(300); cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); ASSERT_FALSE(frame0.empty()); @@ -372,8 +384,8 @@ PERF_TEST_P(ImagePair, Video_OpticalFlowDual_TVL1, TEST_CYCLE() d_alg(d_frame0, d_frame1, u, v); - GPU_SANITY_CHECK(u, 1e-4); - GPU_SANITY_CHECK(v, 1e-4); + GPU_SANITY_CHECK(u, 1e-2); + GPU_SANITY_CHECK(v, 1e-2); } else { @@ -482,6 +494,8 @@ PERF_TEST_P(ImagePair, Video_FastOpticalFlowBM, ////////////////////////////////////////////////////// // FGDStatModel +#if BUILD_WITH_VIDEO_INPUT_SUPPORT + DEF_PARAM_TEST_1(Video, string); PERF_TEST_P(Video, Video_FGDStatModel, @@ -548,9 +562,13 @@ PERF_TEST_P(Video, Video_FGDStatModel, } } +#endif + ////////////////////////////////////////////////////// // MOG +#if BUILD_WITH_VIDEO_INPUT_SUPPORT + DEF_PARAM_TEST(Video_Cn_LearningRate, string, MatCn, double); PERF_TEST_P(Video_Cn_LearningRate, Video_MOG, @@ -643,9 +661,13 @@ PERF_TEST_P(Video_Cn_LearningRate, Video_MOG, } } +#endif + ////////////////////////////////////////////////////// // MOG2 +#if BUILD_WITH_VIDEO_INPUT_SUPPORT + DEF_PARAM_TEST(Video_Cn, string, int); PERF_TEST_P(Video_Cn, Video_MOG2, @@ -740,9 +762,13 @@ PERF_TEST_P(Video_Cn, Video_MOG2, } } +#endif + ////////////////////////////////////////////////////// // MOG2GetBackgroundImage +#if BUILD_WITH_VIDEO_INPUT_SUPPORT + PERF_TEST_P(Video_Cn, Video_MOG2GetBackgroundImage, Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"), GPU_CHANNELS_1_3_4)) @@ -818,74 +844,13 @@ PERF_TEST_P(Video_Cn, Video_MOG2GetBackgroundImage, } } -////////////////////////////////////////////////////// -// VIBE - -PERF_TEST_P(Video_Cn, Video_VIBE, - 
Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"), - GPU_CHANNELS_1_3_4)) -{ - const string inputFile = perf::TestBase::getDataPath(GET_PARAM(0)); - const int cn = GET_PARAM(1); - - cv::VideoCapture cap(inputFile); - ASSERT_TRUE(cap.isOpened()); - - cv::Mat frame; - cap >> frame; - ASSERT_FALSE(frame.empty()); - - if (cn != 3) - { - cv::Mat temp; - if (cn == 1) - cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY); - else - cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA); - cv::swap(temp, frame); - } - - if (PERF_RUN_GPU()) - { - cv::gpu::GpuMat d_frame(frame); - cv::gpu::VIBE_GPU vibe; - cv::gpu::GpuMat foreground; - - vibe(d_frame, foreground); - - for (int i = 0; i < 10; ++i) - { - cap >> frame; - ASSERT_FALSE(frame.empty()); - - if (cn != 3) - { - cv::Mat temp; - if (cn == 1) - cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY); - else - cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA); - cv::swap(temp, frame); - } - - d_frame.upload(frame); - - startTimer(); next(); - vibe(d_frame, foreground); - stopTimer(); - } - - GPU_SANITY_CHECK(foreground); - } - else - { - FAIL_NO_CPU(); - } -} +#endif ////////////////////////////////////////////////////// // GMG +#if BUILD_WITH_VIDEO_INPUT_SUPPORT + DEF_PARAM_TEST(Video_Cn_MaxFeatures, string, MatCn, int); PERF_TEST_P(Video_Cn_MaxFeatures, Video_GMG, @@ -993,11 +958,13 @@ PERF_TEST_P(Video_Cn_MaxFeatures, Video_GMG, } } -#ifdef HAVE_NVCUVID +#endif ////////////////////////////////////////////////////// // VideoReader +#if defined(HAVE_NVCUVID) && BUILD_WITH_VIDEO_INPUT_SUPPORT + PERF_TEST_P(Video, Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi")) { declare.time(20); @@ -1028,10 +995,12 @@ PERF_TEST_P(Video, Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video } } +#endif + ////////////////////////////////////////////////////// // VideoWriter -#ifdef WIN32 +#if defined(HAVE_NVCUVID) && defined(WIN32) PERF_TEST_P(Video, Video_VideoWriter, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi")) { @@ -1089,6 +1058,4 @@ PERF_TEST_P(Video, Video_VideoWriter, Values("gpu/video/768x576.avi", "gpu/video SANITY_CHECK(frame); } -#endif // WIN32 - -#endif // HAVE_NVCUVID +#endif diff --git a/modules/gpu/perf/utility.cpp b/modules/gpu/perf/utility.cpp deleted file mode 100644 index 16c61e0c7..000000000 --- a/modules/gpu/perf/utility.cpp +++ /dev/null @@ -1,184 +0,0 @@ -#include "perf_precomp.hpp" - -using namespace std; -using namespace cv; - -Mat readImage(const string& fileName, int flags) -{ - return imread(perf::TestBase::getDataPath(fileName), flags); -} - -void PrintTo(const CvtColorInfo& info, ostream* os) -{ - static const char* str[] = - { - "BGR2BGRA", - "BGRA2BGR", - "BGR2RGBA", - "RGBA2BGR", - "BGR2RGB", - "BGRA2RGBA", - - "BGR2GRAY", - "RGB2GRAY", - "GRAY2BGR", - "GRAY2BGRA", - "BGRA2GRAY", - "RGBA2GRAY", - - "BGR2BGR565", - "RGB2BGR565", - "BGR5652BGR", - "BGR5652RGB", - "BGRA2BGR565", - "RGBA2BGR565", - "BGR5652BGRA", - "BGR5652RGBA", - - "GRAY2BGR565", - "BGR5652GRAY", - - "BGR2BGR555", - "RGB2BGR555", - "BGR5552BGR", - "BGR5552RGB", - "BGRA2BGR555", - "RGBA2BGR555", - "BGR5552BGRA", - "BGR5552RGBA", - - "GRAY2BGR555", - "BGR5552GRAY", - - "BGR2XYZ", - "RGB2XYZ", - "XYZ2BGR", - "XYZ2RGB", - - "BGR2YCrCb", - "RGB2YCrCb", - "YCrCb2BGR", - "YCrCb2RGB", - - "BGR2HSV", - "RGB2HSV", - - "", - "", - - "BGR2Lab", - "RGB2Lab", - - "BayerBG2BGR", - "BayerGB2BGR", - "BayerRG2BGR", - "BayerGR2BGR", - - "BGR2Luv", - "RGB2Luv", - - "BGR2HLS", - "RGB2HLS", - - "HSV2BGR", - "HSV2RGB", - - "Lab2BGR", - "Lab2RGB", - 
"Luv2BGR", - "Luv2RGB", - - "HLS2BGR", - "HLS2RGB", - - "BayerBG2BGR_VNG", - "BayerGB2BGR_VNG", - "BayerRG2BGR_VNG", - "BayerGR2BGR_VNG", - - "BGR2HSV_FULL", - "RGB2HSV_FULL", - "BGR2HLS_FULL", - "RGB2HLS_FULL", - - "HSV2BGR_FULL", - "HSV2RGB_FULL", - "HLS2BGR_FULL", - "HLS2RGB_FULL", - - "LBGR2Lab", - "LRGB2Lab", - "LBGR2Luv", - "LRGB2Luv", - - "Lab2LBGR", - "Lab2LRGB", - "Luv2LBGR", - "Luv2LRGB", - - "BGR2YUV", - "RGB2YUV", - "YUV2BGR", - "YUV2RGB", - - "BayerBG2GRAY", - "BayerGB2GRAY", - "BayerRG2GRAY", - "BayerGR2GRAY", - - //YUV 4:2:0 formats family - "YUV2RGB_NV12", - "YUV2BGR_NV12", - "YUV2RGB_NV21", - "YUV2BGR_NV21", - - "YUV2RGBA_NV12", - "YUV2BGRA_NV12", - "YUV2RGBA_NV21", - "YUV2BGRA_NV21", - - "YUV2RGB_YV12", - "YUV2BGR_YV12", - "YUV2RGB_IYUV", - "YUV2BGR_IYUV", - - "YUV2RGBA_YV12", - "YUV2BGRA_YV12", - "YUV2RGBA_IYUV", - "YUV2BGRA_IYUV", - - "YUV2GRAY_420", - - //YUV 4:2:2 formats family - "YUV2RGB_UYVY", - "YUV2BGR_UYVY", - "YUV2RGB_VYUY", - "YUV2BGR_VYUY", - - "YUV2RGBA_UYVY", - "YUV2BGRA_UYVY", - "YUV2RGBA_VYUY", - "YUV2BGRA_VYUY", - - "YUV2RGB_YUY2", - "YUV2BGR_YUY2", - "YUV2RGB_YVYU", - "YUV2BGR_YVYU", - - "YUV2RGBA_YUY2", - "YUV2BGRA_YUY2", - "YUV2RGBA_YVYU", - "YUV2BGRA_YVYU", - - "YUV2GRAY_UYVY", - "YUV2GRAY_YUY2", - - // alpha premultiplication - "RGBA2mRGBA", - "mRGBA2RGBA", - - "COLORCVT_MAX" - }; - - *os << str[info.code]; -} diff --git a/modules/gpu/perf/utility.hpp b/modules/gpu/perf/utility.hpp deleted file mode 100644 index 18c85854a..000000000 --- a/modules/gpu/perf/utility.hpp +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__ -#define __OPENCV_PERF_GPU_UTILITY_HPP__ - -#include "opencv2/core.hpp" -#include "opencv2/imgproc.hpp" -#include "opencv2/ts/ts_perf.hpp" - -cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR); - -using perf::MatType; -using perf::MatDepth; - -CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP) -#define ALL_BORDER_MODES testing::ValuesIn(BorderMode::all()) - -CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA) -#define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all()) - -CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING, cv::NORM_MINMAX) - -enum { Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4 }; -CV_ENUM(MatCn, Gray, TwoChannel, BGR, BGRA) -#define GPU_CHANNELS_1_3_4 testing::Values(MatCn(Gray), MatCn(BGR), MatCn(BGRA)) -#define GPU_CHANNELS_1_3 testing::Values(MatCn(Gray), MatCn(BGR)) - -struct CvtColorInfo -{ - int scn; - int dcn; - int code; - - CvtColorInfo() {} - explicit CvtColorInfo(int scn_, int dcn_, int code_) : scn(scn_), dcn(dcn_), code(code_) {} -}; -void PrintTo(const CvtColorInfo& info, std::ostream* os); - -#define GET_PARAM(k) std::tr1::get< k >(GetParam()) - -#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name -#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name - -DEF_PARAM_TEST_1(Sz, cv::Size); -typedef perf::Size_MatType Sz_Type; -DEF_PARAM_TEST(Sz_Depth, cv::Size, MatDepth); -DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, MatCn); - -#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::sz720p, perf::szSXGA, perf::sz1080p) - -#define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy" - -#define GPU_SANITY_CHECK(mat, ...) 
\ - do{ \ - cv::Mat gpu_##mat(mat); \ - SANITY_CHECK(gpu_##mat, ## __VA_ARGS__); \ - } while(0) - -#define CPU_SANITY_CHECK(mat, ...) \ - do{ \ - cv::Mat cpu_##mat(mat); \ - SANITY_CHECK(cpu_##mat, ## __VA_ARGS__); \ - } while(0) - -#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
diff --git a/modules/gpu/perf4au/main.cpp b/modules/gpu/perf4au/main.cpp index f4a04432b..162a15b2f 100644 --- a/modules/gpu/perf4au/main.cpp +++ b/modules/gpu/perf4au/main.cpp @@ -8,69 +8,19 @@ #include "opencv2/video.hpp" #include "opencv2/legacy.hpp" #include "opencv2/ts.hpp" - -static void printOsInfo() -{ -#if defined _WIN32 -# if defined _WIN64 - printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"); fflush(stdout); -# else - printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"); fflush(stdout); -# endif -#elif defined linux -# if defined _LP64 - printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"); fflush(stdout); -# else - printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"); fflush(stdout); -# endif -#elif defined __APPLE__ -# if defined _LP64 - printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"); fflush(stdout); -# else - printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"); fflush(stdout); -# endif -#endif -} - -static void printCudaInfo() -{ - const int deviceCount = cv::gpu::getCudaEnabledDeviceCount(); - - printf("[----------]\n"); fflush(stdout); - printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount); fflush(stdout); - printf("[----------]\n"); fflush(stdout); - - for (int i = 0; i < deviceCount; ++i) - { - cv::gpu::DeviceInfo info(i); - - printf("[----------]\n"); fflush(stdout); - printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()); fflush(stdout); - printf("[ ] \tCompute capability: %d.%d\n", info.majorVersion(), info.minorVersion()); fflush(stdout); - printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()); fflush(stdout); - printf("[ ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)); fflush(stdout); - printf("[ ] \tFree memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0)); fflush(stdout); - if (!info.isCompatible()) - printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n"); - printf("[----------]\n"); fflush(stdout); - } -} +#include "opencv2/ts/gpu_perf.hpp" int main(int argc, char* argv[]) { - printOsInfo(); - printCudaInfo(); + perf::printCudaInfo(); - perf::Regression::Init("nv_perf_test"); + perf::Regression::Init("gpu_perf4au"); perf::TestBase::Init(argc, argv); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } -#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name -#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name - ////////////////////////////////////////////////////////// // HoughLinesP
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 851ac938e..47a29881f 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -318,40 +318,14 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream) void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s) { - class LevelsInit - { - public: - Npp32s pLevels[256]; - const Npp32s* pLevels3[3]; - int nValues3[3]; + const int cn = src.channels(); -#if (CUDA_VERSION > 4020) - GpuMat d_pLevels; -#endif + CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 ); + CV_Assert( lut.depth() == CV_8U ); + CV_Assert( lut.channels() == 1 || lut.channels() == cn ); + CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() ); - LevelsInit() - { - nValues3[0] = nValues3[1] = nValues3[2] = 256; - for (int i = 0; i < 256; ++i) - pLevels[i] = i; - - -#if (CUDA_VERSION <= 4020) - pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels; -#else - d_pLevels.upload(Mat(1, 256, CV_32S, pLevels)); - pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>(); -#endif - } - }; - static LevelsInit lvls; - - int cn = src.channels(); - - CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3); - CV_Assert(lut.depth() == CV_8U && (lut.channels() == 1 || lut.channels() == cn) && lut.rows * lut.cols == 256 && lut.isContinuous()); - - dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn)); + dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn)); NppiSize sz; sz.height = src.rows; @@ -360,19 +334,34 @@ Mat nppLut; lut.convertTo(nppLut, CV_32S); - cudaStream_t stream = StreamAccessor::getStream(s); + int nValues3[] = {256, 256, 256}; + Npp32s pLevels[256]; + for (int i = 0; i < 256; ++i) + pLevels[i] = i; + + const Npp32s* pLevels3[3]; + +#if (CUDA_VERSION <= 4020) + pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels; +#else + GpuMat d_pLevels; + d_pLevels.upload(Mat(1, 256, CV_32S, pLevels)); + pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>(); +#endif + + cudaStream_t stream = StreamAccessor::getStream(s); NppStreamHandler h(stream); if (src.type() == CV_8UC1) { #if (CUDA_VERSION <= 4020) nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) ); + dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) ); #else GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data)); nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), lvls.d_pLevels.ptr<Npp32s>(), 256) ); + dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) ); #endif } else @@ -409,7 +398,7 @@ } nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) ); + dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) ); } if (stream == 0)
diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp index 05d460900..76793d520 100644 --- a/modules/gpu/src/color.cpp +++ b/modules/gpu/src/color.cpp @@ -48,6 +48,7 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::demosaicing(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); } void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_nogpu(); } void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); } @@ -62,6 +63,9 @@ namespace cv { namespace gpu { void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); template <int cn> void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); + + template <int cn> + void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream); } }} @@ -1620,26 +1624,56 @@ namespace funcs[src.depth()][dcn - 1](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream)); } - void bayerBG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream) { bayer_to_bgr(src, dst, dcn, false, false, stream); } - void bayerGB_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream) { bayer_to_bgr(src, dst, dcn, false, true, stream); } - void bayerRG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream) { bayer_to_bgr(src, dst, dcn, true, false, stream); } - void bayerGR_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream) { bayer_to_bgr(src, dst, dcn, true, true, stream); } + + void bayer_to_gray(const GpuMat& src, GpuMat& dst, bool blue_last, bool start_with_green, Stream& stream) + { + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); + static const func_t funcs[3] = + { + Bayer2BGR_8u_gpu<1>, + 0, + Bayer2BGR_16u_gpu<1>, + }; + + CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1); + CV_Assert(src.rows > 2 && src.cols > 2); + + dst.create(src.size(), CV_MAKETYPE(src.depth(), 1)); + + funcs[src.depth()](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream)); + } + void bayerBG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream) + { + bayer_to_gray(src, dst, false, false, stream); + } + void bayerGB_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream) + { + bayer_to_gray(src, dst, false, true, stream); + } + void bayerRG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream) + { + bayer_to_gray(src, dst, true, false, stream); + } + void bayerGR_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream) + { + bayer_to_gray(src, dst, true, true, stream); + } } void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream) @@ -1756,10 +1790,10 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream yuv_to_bgr, // CV_YUV2BGR = 84 yuv_to_rgb, // CV_YUV2RGB = 85 - 0, // CV_BayerBG2GRAY = 86 - 0, // CV_BayerGB2GRAY = 87 - 0, // CV_BayerRG2GRAY = 88 - 0, // CV_BayerGR2GRAY = 89 + bayerBG_to_gray, // CV_BayerBG2GRAY = 86 + bayerGB_to_gray, // CV_BayerGB2GRAY = 87 + bayerRG_to_gray, // CV_BayerRG2GRAY = 88 + bayerGR_to_gray, // CV_BayerGR2GRAY = 89 //YUV 4:2:0 formats family 0, // CV_YUV2RGB_NV12 = 90, @@ -1825,6 +1859,74 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream func(src, dst, dcn, stream); } +void cv::gpu::demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream) +{ + const int depth = src.depth(); + + CV_Assert( src.channels() == 1 ); + + switch (code) + { + case 
CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY: + bayer_to_gray(src, dst, code == CV_BayerBG2GRAY || code == CV_BayerGB2GRAY, code == CV_BayerGB2GRAY || code == CV_BayerGR2GRAY, stream); + break; + + case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR: + bayer_to_bgr(src, dst, dcn, code == CV_BayerBG2BGR || code == CV_BayerGB2BGR, code == CV_BayerGB2BGR || code == CV_BayerGR2BGR, stream); + break; + + case COLOR_BayerBG2BGR_MHT: case COLOR_BayerGB2BGR_MHT: case COLOR_BayerRG2BGR_MHT: case COLOR_BayerGR2BGR_MHT: + { + if (dcn <= 0) + dcn = 3; + + CV_Assert( depth == CV_8U ); + CV_Assert( dcn == 3 || dcn == 4 ); + + dst.create(src.size(), CV_MAKETYPE(depth, dcn)); + dst.setTo(Scalar::all(0)); + + Size wholeSize; + Point ofs; + src.locateROI(wholeSize, ofs); + PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step); + + const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1, + code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1); + + if (dcn == 3) + device::MHCdemosaic<3>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream)); + else + device::MHCdemosaic<4>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream)); + + break; + } + + case COLOR_BayerBG2GRAY_MHT: case COLOR_BayerGB2GRAY_MHT: case COLOR_BayerRG2GRAY_MHT: case COLOR_BayerGR2GRAY_MHT: + { + CV_Assert( depth == CV_8U ); + + dst.create(src.size(), CV_MAKETYPE(depth, 1)); + dst.setTo(Scalar::all(0)); + + Size wholeSize; + Point ofs; + src.locateROI(wholeSize, ofs); + PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step); + + const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1, + code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 
0 : 1); + + device::MHCdemosaic<1>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream)); + + break; + } + + default: + CV_Error( CV_StsBadFlag, "Unknown / unsupported color conversion code" ); + } +} + void cv::gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& s) { CV_Assert(image.type() == CV_8UC4); diff --git a/modules/gpu/src/cuda/debayer.cu b/modules/gpu/src/cuda/debayer.cu index 57322ed81..1d2f18e7a 100644 --- a/modules/gpu/src/cuda/debayer.cu +++ b/modules/gpu/src/cuda/debayer.cu @@ -42,42 +42,38 @@ #if !defined CUDA_DISABLER -#include -#include -#include -#include +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/vec_traits.hpp" +#include "opencv2/gpu/device/vec_math.hpp" +#include "opencv2/gpu/device/limits.hpp" +#include "opencv2/gpu/device/color.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" -namespace cv { namespace gpu { - namespace device +namespace cv { namespace gpu { namespace device +{ + template struct Bayer2BGR; + + template <> struct Bayer2BGR { - template - __global__ void Bayer2BGR_8u(const PtrStepb src, PtrStepSz dst, const bool blue_last, const bool start_with_green) + uchar3 res0; + uchar3 res1; + uchar3 res2; + uchar3 res3; + + __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green) { - const int s_x = blockIdx.x * blockDim.x + threadIdx.x; - int s_y = blockIdx.y * blockDim.y + threadIdx.y; - - if (s_y >= dst.rows || (s_x << 2) >= dst.cols) - return; - - s_y = ::min(::max(s_y, 1), dst.rows - 2); - uchar4 patch[3][3]; patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x]; patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)]; - patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)]; + patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)]; patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x]; patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)]; - patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)]; + patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)]; patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x]; patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)]; - patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)]; - - D res0 = VecTraits::all(numeric_limits::max()); - D res1 = VecTraits::all(numeric_limits::max()); - D res2 = VecTraits::all(numeric_limits::max()); - D res3 = VecTraits::all(numeric_limits::max()); + patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)]; if ((s_y & 1) ^ start_with_green) { @@ -181,45 +177,69 @@ namespace cv { namespace gpu { res3.z = t7; } } - - const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2; - const int d_y = blockIdx.y * blockDim.y + threadIdx.y; - - dst(d_y, d_x) = res0; - if (d_x + 1 < dst.cols) - dst(d_y, d_x + 1) = res1; - if (d_x + 2 < dst.cols) - dst(d_y, d_x + 2) = res2; - if (d_x + 3 < dst.cols) - dst(d_y, d_x + 3) = res3; } + }; - template - __global__ void Bayer2BGR_16u(const PtrStepb src, PtrStepSz dst, const bool blue_last, const bool start_with_green) + template __device__ __forceinline__ D toDst(const uchar3& pix); + template <> __device__ __forceinline__ uchar toDst(const uchar3& pix) + { + typename bgr_to_gray_traits::functor_type f = bgr_to_gray_traits::create_functor(); + return f(pix); + } + template <> __device__ 
__forceinline__ uchar3 toDst(const uchar3& pix) + { + return pix; + } + template <> __device__ __forceinline__ uchar4 toDst(const uchar3& pix) + { + return make_uchar4(pix.x, pix.y, pix.z, 255); + } + + template + __global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep dst, const bool blue_last, const bool start_with_green) + { + const int s_x = blockIdx.x * blockDim.x + threadIdx.x; + int s_y = blockIdx.y * blockDim.y + threadIdx.y; + + if (s_y >= src.rows || (s_x << 2) >= src.cols) + return; + + s_y = ::min(::max(s_y, 1), src.rows - 2); + + Bayer2BGR bayer; + bayer.apply(src, s_x, s_y, blue_last, start_with_green); + + const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2; + const int d_y = blockIdx.y * blockDim.y + threadIdx.y; + + dst(d_y, d_x) = toDst(bayer.res0); + if (d_x + 1 < src.cols) + dst(d_y, d_x + 1) = toDst(bayer.res1); + if (d_x + 2 < src.cols) + dst(d_y, d_x + 2) = toDst(bayer.res2); + if (d_x + 3 < src.cols) + dst(d_y, d_x + 3) = toDst(bayer.res3); + } + + template <> struct Bayer2BGR + { + ushort3 res0; + ushort3 res1; + + __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green) { - const int s_x = blockIdx.x * blockDim.x + threadIdx.x; - int s_y = blockIdx.y * blockDim.y + threadIdx.y; - - if (s_y >= dst.rows || (s_x << 1) >= dst.cols) - return; - - s_y = ::min(::max(s_y, 1), dst.rows - 2); - ushort2 patch[3][3]; patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x]; patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)]; - patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)]; + patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)]; patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x]; patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)]; - patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)]; + patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)]; patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x]; patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)]; - patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)]; - - D res0 = VecTraits::all(numeric_limits::max()); - D res1 = VecTraits::all(numeric_limits::max()); + patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)]; if ((s_y & 1) ^ start_with_green) { @@ -279,53 +299,246 @@ namespace cv { namespace gpu { res1.z = t3; } } - - const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1; - const int d_y = blockIdx.y * blockDim.y + threadIdx.y; - - dst(d_y, d_x) = res0; - if (d_x + 1 < dst.cols) - dst(d_y, d_x + 1) = res1; } + }; - template - void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream) - { - typedef typename TypeVec::vec_type dst_t; - - const dim3 block(32, 8); - const dim3 grid(divUp(dst.cols, 4 * block.x), divUp(dst.rows, block.y)); - - cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u, cudaFuncCachePreferL1) ); - - Bayer2BGR_8u<<>>(src, (PtrStepSz)dst, blue_last, start_with_green); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - template - void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream) - { - typedef typename TypeVec::vec_type dst_t; - - const dim3 block(32, 8); - const dim3 grid(divUp(dst.cols, 2 * 
block.x), divUp(dst.rows, block.y)); - - cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u, cudaFuncCachePreferL1) ); - - Bayer2BGR_16u<<>>(src, (PtrStepSz)dst, blue_last, start_with_green); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); - template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); - template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); - template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); + template __device__ __forceinline__ D toDst(const ushort3& pix); + template <> __device__ __forceinline__ ushort toDst(const ushort3& pix) + { + typename bgr_to_gray_traits::functor_type f = bgr_to_gray_traits::create_functor(); + return f(pix); + } + template <> __device__ __forceinline__ ushort3 toDst(const ushort3& pix) + { + return pix; + } + template <> __device__ __forceinline__ ushort4 toDst(const ushort3& pix) + { + return make_ushort4(pix.x, pix.y, pix.z, numeric_limits::max()); } -}} -#endif /* CUDA_DISABLER */ \ No newline at end of file + template + __global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep dst, const bool blue_last, const bool start_with_green) + { + const int s_x = blockIdx.x * blockDim.x + threadIdx.x; + int s_y = blockIdx.y * blockDim.y + threadIdx.y; + + if (s_y >= src.rows || (s_x << 1) >= src.cols) + return; + + s_y = ::min(::max(s_y, 1), src.rows - 2); + + Bayer2BGR bayer; + bayer.apply(src, s_x, s_y, blue_last, start_with_green); + + const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1; + const int d_y = blockIdx.y * blockDim.y + threadIdx.y; + + dst(d_y, d_x) = toDst(bayer.res0); + if (d_x + 1 < src.cols) + dst(d_y, d_x + 1) = toDst(bayer.res1); + } + + template + void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream) + { + typedef typename TypeVec::vec_type dst_t; + + const dim3 block(32, 8); + const dim3 grid(divUp(src.cols, 4 * block.x), divUp(src.rows, block.y)); + + cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u, cudaFuncCachePreferL1) ); + + Bayer2BGR_8u<<>>(src, (PtrStepSz)dst, blue_last, start_with_green); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream) + { + typedef typename TypeVec::vec_type dst_t; + + const dim3 block(32, 8); + const dim3 grid(divUp(src.cols, 2 * block.x), divUp(src.rows, block.y)); + + cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u, cudaFuncCachePreferL1) ); + + Bayer2BGR_16u<<>>(src, (PtrStepSz)dst, blue_last, start_with_green); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); + template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); + template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); + + template void 
+
+        //////////////////////////////////////////////////////////////
+        // Bayer Demosaicing (Malvar, He, and Cutler)
+        //
+        // by Morgan McGuire, Williams College
+        // http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
+        //
+        // ported to CUDA
+
+        texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
+
+        template <typename DstType>
+        __global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
+        {
+            const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/;
+            const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ;
+            const float kCx = 4.0f / 8.0f, kCy = 6.0f / 8.0f, kCz = 5.0f / 8.0f /*kCw = 5.0f / 8.0f*/;
+            const float /*kDx = 0.0f / 8.0f,*/ kDy = 2.0f / 8.0f, kDz = -1.0f / 8.0f /*kDw = -1.0f / 8.0f*/;
+            const float kEx = -1.0f / 8.0f, kEy = -1.5f / 8.0f, /*kEz = -1.0f / 8.0f,*/ kEw = 0.5f / 8.0f ;
+            const float kFx = 2.0f / 8.0f, /*kFy = 0.0f / 8.0f,*/ kFz = 4.0f / 8.0f /*kFw = 0.0f / 8.0f*/;
+
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1)
+                return;
+
+            int2 center;
+            center.x = x + sourceOffset.x;
+            center.y = y + sourceOffset.y;
+
+            int4 xCoord;
+            xCoord.x = center.x - 2;
+            xCoord.y = center.x - 1;
+            xCoord.z = center.x + 1;
+            xCoord.w = center.x + 2;
+
+            int4 yCoord;
+            yCoord.x = center.y - 2;
+            yCoord.y = center.y - 1;
+            yCoord.z = center.y + 1;
+            yCoord.w = center.y + 2;
+
+            float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
+
+            float4 Dvec;
+            Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
+            Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
+            Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
+            Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
+
+            float4 value;
+            value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
+            value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
+            value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
+            value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
+
+            // (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
+            value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
+            value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
+            value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
+            value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
+
+            float4 PATTERN;
+            PATTERN.x = kCx * C;
+            PATTERN.y = kCy * C;
+            PATTERN.z = kCz * C;
+            PATTERN.w = PATTERN.z;
+
+            float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w;
+
+            // There are five filter patterns (identity, cross, checker,
+            // theta, phi). Precompute the terms from all of them and then
+            // use swizzles to assign to color channels.
+            //
+            // Channel Matches
+            //   x     cross   (e.g., EE G)
+            //   y     checker (e.g., EE B)
+            //   z     theta   (e.g., EO R)
+            //   w     phi     (e.g., EO B)
+
+            #define A value.x // A0 + A1
+            #define B value.y // B0 + B1
+            #define E value.z // E0 + E1
+            #define F value.w // F0 + F1
+
+            float3 temp;
+
+            // PATTERN.yzw += (kD.yz * D).xyy;
+            temp.x = kDy * D;
+            temp.y = kDz * D;
+            PATTERN.y += temp.x;
+            PATTERN.z += temp.y;
+            PATTERN.w += temp.y;
+
+            // PATTERN += (kA.xyz * A).xyzx;
+            temp.x = kAx * A;
+            temp.y = kAy * A;
+            temp.z = kAz * A;
+            PATTERN.x += temp.x;
+            PATTERN.y += temp.y;
+            PATTERN.z += temp.z;
+            PATTERN.w += temp.x;
+
+            // PATTERN += (kE.xyw * E).xyxz;
+            temp.x = kEx * E;
+            temp.y = kEy * E;
+            temp.z = kEw * E;
+            PATTERN.x += temp.x;
+            PATTERN.y += temp.y;
+            PATTERN.z += temp.x;
+            PATTERN.w += temp.z;
+
+            // PATTERN.xw += kB.xw * B;
+            PATTERN.x += kBx * B;
+            PATTERN.w += kBw * B;
+
+            // PATTERN.xz += kF.xz * F;
+            PATTERN.x += kFx * F;
+            PATTERN.z += kFz * F;
+
+            // Determine which of four types of pixels we are on.
+            int2 alternate;
+            alternate.x = (x + firstRed.x) % 2;
+            alternate.y = (y + firstRed.y) % 2;
+
+            // in BGR sequence;
+            uchar3 pixelColor =
+                (alternate.y == 0) ?
+                    ((alternate.x == 0) ?
+                        make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
+                        make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
+                    ((alternate.x == 0) ?
+                        make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
+                        make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));
+
+            dst(y, x) = toDst<DstType>(pixelColor);
+        }
+
+        template <int cn>
+        void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
+        {
+            typedef typename TypeVec<uchar, cn>::vec_type dst_t;
+
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+
+            bindTexture(&sourceTex, src);
+
+            MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+        template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+        template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+}}}
+
+#endif /* CUDA_DISABLER */
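
[editor's note] The new MHCdemosaic path samples the raw Bayer plane through sourceTex (point filtering, clamped addressing, so the 5x5 taps never read outside the image) and rebuilds the two missing channels per pixel from the fixed 1/8-denominator filter bank; the kernel leaves the one-pixel border unwritten. A hypothetical host-side call; the GpuMat plumbing below is illustrative only and not part of this patch, and it assumes the launcher is reachable as cv::gpu::device::MHCdemosaic:

    // Illustrative only: demosaic an 8-bit Bayer plane to BGR on `stream`.
    void demosaicMHC(const cv::gpu::GpuMat& bayer, cv::gpu::GpuMat& bgr, cudaStream_t stream)
    {
        bgr.create(bayer.size(), CV_8UC3);
        const int2 sourceOffset = make_int2(0, 0); // ROI offset into the source plane
        const int2 firstRed     = make_int2(0, 0); // position of the first red sample in the pattern
        cv::gpu::device::MHCdemosaic<3>(bayer, sourceOffset, bgr, firstRed, stream);
    }
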
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index 5165b352a..e9397e534 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -48,6 +48,7 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/simd_functions.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -154,170 +155,28 @@ namespace arithm
 
 namespace arithm
 {
-    template <typename T, typename D> struct VAdd4;
-    template <> struct VAdd4<uint, uint> : binary_function<uint, uint, uint>
+    struct VAdd4 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vadd4(a, b);
         }
 
         __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
-    };
-    template <> struct VAdd4<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
-    };
-    template <> struct VAdd4<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
-    };
-    template <> struct VAdd4<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
+        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
     };
 
     ////////////////////////////////////
 
-    template <typename T, typename D> struct VAdd2;
-    template <> struct VAdd2<uint, uint> : binary_function<uint, uint, uint>
+    struct VAdd2 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vadd2(a, b);
         }
 
         __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
-    };
-    template <> struct VAdd2<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
-    };
-    template <> struct VAdd2<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
-    };
-    template <> struct VAdd2<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
+        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
     };
 
     ////////////////////////////////////
@@ -336,13 +195,13 @@ namespace arithm
 
 namespace cv { namespace gpu { namespace device
 {
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
 
    ////////////////////////////////////
 
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
@@ -355,28 +214,16 @@ namespace cv { namespace gpu { namespace device
 
 namespace arithm
 {
-    template <typename T, typename D>
-    void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd4<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
    }
 
-    template void vadd4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd2<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
     }
 
-    template void vadd2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
     template <typename T, typename D>
     void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
     {
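
[editor's note] The hunk above collapses four per-signature VAdd4/VAdd2 specializations, each with hand-repeated inline PTX, into a single functor per operation over packed 32-bit words; the PTX moves behind reusable intrinsics in simd_functions.hpp, and addMat_v4/addMat_v2 now take PtrStepSz<uint> directly instead of reinterpret-casting inside the wrapper. Roughly what such an intrinsic looks like, as a sketch assembled from the deleted lines rather than the verbatim header (the real simd_functions.hpp also carries fallbacks for devices without the vadd4/vadd instructions):

    // Per-byte saturating add on a packed 32-bit word.
    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;
    #if __CUDA_ARCH__ >= 300
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #endif
        return r;
    }
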
@@ -543,170 +390,28 @@ namespace arithm
 
 namespace arithm
 {
-    template <typename T, typename D> struct VSub4;
-    template <> struct VSub4<uint, uint> : binary_function<uint, uint, uint>
+    struct VSub4 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vsub4(a, b);
         }
 
         __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
-    };
-    template <> struct VSub4<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
-    };
-    template <> struct VSub4<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
-    };
-    template <> struct VSub4<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
+        __device__ __forceinline__ VSub4(const VSub4& other) {}
     };
 
     ////////////////////////////////////
 
-    template <typename T, typename D> struct VSub2;
-    template <> struct VSub2<uint, uint> : binary_function<uint, uint, uint>
+    struct VSub2 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vsub2(a, b);
         }
 
         __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
-    };
-    template <> struct VSub2<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
-    };
-    template <> struct VSub2<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
-    };
-    template <> struct VSub2<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
+        __device__ __forceinline__ VSub2(const VSub2& other) {}
     };
 
     ////////////////////////////////////
@@ -725,13 +430,13 @@ namespace arithm
 
 namespace cv { namespace gpu { namespace device
 {
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
 
     ////////////////////////////////////
 
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
@@ -744,28 +449,16 @@ namespace cv { namespace gpu { namespace device
 
 namespace arithm
 {
-    template <typename T, typename D>
-    void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub4<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
     }
 
-    template void vsub4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub2<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
     }
 
-    template void vsub2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
     template <typename T, typename D>
     void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
     {
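
[editor's note] subMat_v4/subMat_v2 follow the same pattern as addMat_v4/_v2: the word-wise kernels only make sense when every row of all three matrices starts 4-byte aligned and spans a whole number of 32-bit words. An illustrative host-side guard; the actual selection logic lives in the host wrappers (element_operations.cpp) and may differ:

    // Hypothetical check before routing to a *_v4 path.
    static bool canUseV4(const PtrStepSzb& a, const PtrStepSzb& b, const PtrStepSzb& d)
    {
        const size_t word = sizeof(unsigned int);
        return reinterpret_cast<size_t>(a.data) % word == 0 && a.step % word == 0
            && reinterpret_cast<size_t>(b.data) % word == 0 && b.step % word == 0
            && reinterpret_cast<size_t>(d.data) % word == 0 && d.step % word == 0
            && a.cols % word == 0; // cols is in bytes for PtrStepSzb
    }
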
@@ -1496,90 +1189,28 @@ namespace arithm
 
 namespace arithm
 {
-    template <typename T> struct VAbsDiff4;
-    template <> struct VAbsDiff4<uint> : binary_function<uint, uint, uint>
+    struct VAbsDiff4 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vabsdiff4(a, b);
         }
 
         __device__ __forceinline__ VAbsDiff4() {}
-        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
-    };
-    template <> struct VAbsDiff4<int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAbsDiff4() {}
-        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
+        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
     };
 
     ////////////////////////////////////
 
-    template <typename T> struct VAbsDiff2;
-    template <> struct VAbsDiff2<uint> : binary_function<uint, uint, uint>
+    struct VAbsDiff2 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vabsdiff2(a, b);
         }
 
         __device__ __forceinline__ VAbsDiff2() {}
-        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
-    };
-    template <> struct VAbsDiff2<int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAbsDiff2() {}
-        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
+        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
     };
 
     ////////////////////////////////////
@@ -1611,13 +1242,13 @@ namespace arithm
 
 namespace cv { namespace gpu { namespace device
 {
-    template <typename T> struct TransformFunctorTraits< arithm::VAbsDiff4<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
 
    ////////////////////////////////////
 
-    template <typename T> struct TransformFunctorTraits< arithm::VAbsDiff2<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
@@ -1630,24 +1261,16 @@ namespace cv { namespace gpu { namespace device
 
 namespace arithm
 {
-    template <typename T>
-    void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff4<T>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
     }
 
-    template void vabsDiff4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vabsDiff4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T>
-    void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff2<T>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
     }
 
-    template void vabsDiff2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vabsDiff2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
     template <typename T>
     void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
@@ -1877,6 +1500,49 @@ namespace arithm
 
 namespace arithm
 {
+    struct VCmpEq4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmpeq4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpEq4() {}
+        __device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
+    };
+    struct VCmpNe4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmpne4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpNe4() {}
+        __device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
+    };
+    struct VCmpLt4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmplt4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpLt4() {}
+        __device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
+    };
+    struct VCmpLe4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmple4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpLe4() {}
+        __device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
+    };
+
+    ////////////////////////////////////
+
     template <class Op, typename T>
     struct Cmp : binary_function<T, T, uchar>
     {
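
[editor's note] The vcmpXX4 primitives return a per-byte mask: 0xff in every byte lane where the predicate holds, 0x00 elsewhere, which matches OpenCV's 8-bit "true == 255" convention, so the packed result can be stored straight into the destination. A host-side reference emulation of vcmpeq4, for intuition only (the device code reaches the PTX compare instructions via simd_functions.hpp):

    static inline unsigned int vcmpeq4_ref(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;
        for (int i = 0; i < 4; ++i)
        {
            const unsigned int byteA = (a >> (8 * i)) & 0xffu;
            const unsigned int byteB = (b >> (8 * i)) & 0xffu;
            if (byteA == byteB)
                r |= 0xffu << (8 * i); // 255 marks "true"
        }
        return r;
    }
    // e.g. vcmpeq4_ref(0x10ff3041, 0x10003042) == 0xff00ff00
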
@@ -1890,6 +1556,21 @@ namespace cv { namespace gpu { namespace device
 {
+    template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+    template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+    template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+    template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+
+    ////////////////////////////////////
+
     template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
     {
     };
@@ -1897,6 +1578,23 @@ namespace cv { namespace gpu { namespace device
 
 namespace arithm
 {
+    void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
+    }
+    void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
+    }
+    void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
+    }
+    void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
+    }
+
     template