Merge remote-tracking branch 'origin/master'

This commit is contained in:
Vsevolod Glumov 2012-08-23 14:58:41 +04:00
commit 5648e49d59
169 changed files with 14121 additions and 9349 deletions

View File

@ -89,7 +89,7 @@ endif(WIN32)
ocv_warnings_disable(CMAKE_C_FLAGS -Wno-unused-but-set-variable -Wmissing-prototypes -Wmissing-declarations -Wundef -Wunused -Wsign-compare
-Wcast-align -Wshadow -Wno-maybe-uninitialized -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-declarations -Wunused-parameter /wd4100 /wd4244 /wd4706 /wd4127 /wd4701 /wd4018 /wd4267 /wd4306 /wd4305 /wd4312 /wd4311)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-declarations -Wunused-parameter /wd4100 /wd4244 /wd4706 /wd4127 /wd4701 /wd4018 /wd4267 /wd4306 /wd4305 /wd4312 /wd4311 /wd4703)
if(UNIX AND (CMAKE_COMPILER_IS_GNUCXX OR CV_ICC))
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")

View File

@ -189,11 +189,11 @@ OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add
OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CMAKE_COMPILER_IS_GNUCXX )
OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE "Enable SSE instructions" ON IF (MSVC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE2 "Enable SSE2 instructions" ON IF (MSVC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" OFF IF (CV_ICC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE "Enable SSE instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE2 "Enable SSE2 instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE3 "Enable SSE3 instructions" ON IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF (CV_ICC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF )
OCV_OPTION(OPENCV_WARNINGS_ARE_ERRORS "Treat warnings as errors" OFF )
@ -336,6 +336,7 @@ include(cmake/OpenCVCompilerOptions.cmake REQUIRED)
# ----------------------------------------------------------------------------
if(MSVC)
include(cmake/OpenCVCRTLinkage.cmake REQUIRED)
add_definitions(-D_VARIADIC_MAX=10)
endif(MSVC)

View File

@ -1,6 +1,6 @@
# ------------------------------------------------------------------------------
# Android CMake toolchain file, for use with the Android NDK r5-r8
# Requires cmake 2.6.3 or newer (2.8.3 or newer is recommended).
# Requires cmake 2.6.3 or newer (2.8.5 or newer is recommended).
# See home page: http://code.google.com/p/android-cmake/
#
# The file is maintained by the OpenCV project and can also be found at
@ -44,7 +44,8 @@
# ANDROID_ABI=armeabi-v7a - specifies the target Application Binary
# Interface (ABI). This option nearly matches to the APP_ABI variable
# used by ndk-build tool from Android NDK.
# Possible values are:
#
# Possible targets are:
# "armeabi" - matches to the NDK ABI with the same name.
# See ${ANDROID_NDK}/docs/CPU-ARCH-ABIS.html for the documentation.
# "armeabi-v7a" - matches to the NDK ABI with the same name.
@ -56,6 +57,8 @@
# "armeabi-v6 with VFP" - tuned for ARMv6 processors having VFP.
# "x86" - matches to the NDK ABI with the same name.
# See ${ANDROID_NDK}/docs/CPU-ARCH-ABIS.html for the documentation.
# "mips" - matches to the NDK ABI with the same name
# (not tested on real devices)
#
# ANDROID_NATIVE_API_LEVEL=android-8 - level of Android API compile for.
# Option is read-only when standalone toolchain used.
@ -183,12 +186,13 @@
# - modified August 2012
# [+] updated for NDK r8b
# [~] all intermediate files generated by toolchain are moved into CMakeFiles
# [~] libstdc++ and libsupc++ are removed from explicit link libraries
# ------------------------------------------------------------------------------
cmake_minimum_required( VERSION 2.6.3 )
if( DEFINED CMAKE_CROSSCOMPILING )
#subsequent toolchain loading is not really needed
# subsequent toolchain loading is not really needed
return()
endif()
@ -199,7 +203,7 @@ endif()
# this one is important
set( CMAKE_SYSTEM_NAME Linux )
#this one not so much
# this one not so much
set( CMAKE_SYSTEM_VERSION 1 )
set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r8b -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" )
@ -331,11 +335,11 @@ macro( __COPY_IF_DIFFERENT _source _destination )
endmacro()
#stl version: by default gnustl_static will be used
# stl version: by default gnustl_static will be used
set( ANDROID_USE_STLPORT FALSE CACHE BOOL "Experimental: use stlport_static instead of gnustl_static")
mark_as_advanced( ANDROID_USE_STLPORT )
#fight against cygwin
# fight against cygwin
set( ANDROID_FORBID_SYGWIN TRUE CACHE BOOL "Prevent cmake from working under cygwin and using cygwin tools")
mark_as_advanced( ANDROID_FORBID_SYGWIN )
if( ANDROID_FORBID_SYGWIN )
@ -344,7 +348,7 @@ if( ANDROID_FORBID_SYGWIN )
endif()
if( CMAKE_HOST_WIN32 )
#remove cygwin from PATH
# remove cygwin from PATH
set( __new_path "$ENV{PATH}")
__LIST_FILTER( __new_path "cygwin" )
set(ENV{PATH} "${__new_path}")
@ -352,7 +356,7 @@ if( ANDROID_FORBID_SYGWIN )
endif()
endif()
#detect current host platform
# detect current host platform
set( TOOL_OS_SUFFIX "" )
if( CMAKE_HOST_APPLE )
set( ANDROID_NDK_HOST_SYSTEM_NAME "darwin-x86" )
@ -365,10 +369,10 @@ else()
message( FATAL_ERROR "Cross-compilation on your platform is not supported by this cmake toolchain" )
endif()
#see if we have path to Android NDK
# see if we have path to Android NDK
__INIT_VARIABLE( ANDROID_NDK PATH ENV_ANDROID_NDK )
if( NOT ANDROID_NDK )
#see if we have path to Android standalone toolchain
# see if we have path to Android standalone toolchain
__INIT_VARIABLE( ANDROID_STANDALONE_TOOLCHAIN PATH ENV_ANDROID_STANDALONE_TOOLCHAIN OBSOLETE_ANDROID_NDK_TOOLCHAIN_ROOT OBSOLETE_ENV_ANDROID_NDK_TOOLCHAIN_ROOT )
if( NOT ANDROID_STANDALONE_TOOLCHAIN )
@ -397,10 +401,10 @@ if( NOT ANDROID_NDK )
endif( NOT ANDROID_STANDALONE_TOOLCHAIN )
endif( NOT ANDROID_NDK )
#remember found paths
# remember found paths
if( ANDROID_NDK )
get_filename_component( ANDROID_NDK "${ANDROID_NDK}" ABSOLUTE )
#try to detect change
# try to detect change
if( CMAKE_AR )
string( LENGTH "${ANDROID_NDK}" __length )
string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidNdkPreviousPath )
@ -414,7 +418,7 @@ if( ANDROID_NDK )
set( BUILD_WITH_ANDROID_NDK True )
elseif( ANDROID_STANDALONE_TOOLCHAIN )
get_filename_component( ANDROID_STANDALONE_TOOLCHAIN "${ANDROID_STANDALONE_TOOLCHAIN}" ABSOLUTE )
#try to detect change
# try to detect change
if( CMAKE_AR )
string( LENGTH "${ANDROID_STANDALONE_TOOLCHAIN}" __length )
string( SUBSTRING "${CMAKE_AR}" 0 ${__length} __androidStandaloneToolchainPreviousPath )
@ -438,7 +442,7 @@ else()
sudo ln -s ~/my-android-toolchain ${ANDROID_STANDALONE_TOOLCHAIN_SEARCH_PATH}" )
endif()
#get all the details about standalone toolchain
# get all the details about standalone toolchain
if( BUILD_WITH_STANDALONE_TOOLCHAIN )
__DETECT_NATIVE_API_LEVEL( ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" )
set( ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
@ -455,7 +459,7 @@ if( BUILD_WITH_STANDALONE_TOOLCHAIN )
set( __availableToolchainArchs "mipsel" )
endif()
if( ANDROID_COMPILER_VERSION )
#do not run gcc every time because it is relatively expensive
# do not run gcc every time because it is relatively expensive
set( __availableToolchainCompilerVersions "${ANDROID_COMPILER_VERSION}" )
else()
execute_process( COMMAND "${ANDROID_STANDALONE_TOOLCHAIN}/bin/${__availableToolchainMachines}-gcc${TOOL_OS_SUFFIX}" --version
@ -464,7 +468,7 @@ if( BUILD_WITH_STANDALONE_TOOLCHAIN )
endif()
endif()
#get all the details about NDK
# get all the details about NDK
if( BUILD_WITH_ANDROID_NDK )
file( GLOB ANDROID_SUPPORTED_NATIVE_API_LEVELS RELATIVE "${ANDROID_NDK}/platforms" "${ANDROID_NDK}/platforms/android-*" )
string( REPLACE "android-" "" ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_SUPPORTED_NATIVE_API_LEVELS}" )
@ -490,7 +494,7 @@ if( BUILD_WITH_ANDROID_NDK )
endif()
endif()
#build list of available ABIs
# build list of available ABIs
if( NOT ANDROID_SUPPORTED_ABIS )
set( ANDROID_SUPPORTED_ABIS "" )
set( __uniqToolchainArchNames ${__availableToolchainArchs} )
@ -505,9 +509,9 @@ if( NOT ANDROID_SUPPORTED_ABIS )
endif()
endif()
#choose target ABI
# choose target ABI
__INIT_VARIABLE( ANDROID_ABI OBSOLETE_ARM_TARGET OBSOLETE_ARM_TARGETS VALUES ${ANDROID_SUPPORTED_ABIS} )
#verify that target ABI is supported
# verify that target ABI is supported
list( FIND ANDROID_SUPPORTED_ABIS "${ANDROID_ABI}" __androidAbiIdx )
if( __androidAbiIdx EQUAL -1 )
string( REPLACE ";" "\", \"" PRINTABLE_ANDROID_SUPPORTED_ABIS "${ANDROID_SUPPORTED_ABIS}" )
@ -517,10 +521,10 @@ if( __androidAbiIdx EQUAL -1 )
endif()
unset( __androidAbiIdx )
#remember target ABI
# remember target ABI
set( ANDROID_ABI "${ANDROID_ABI}" CACHE STRING "The target ABI for Android. If arm, then armeabi-v7a is recommended for hardware floating point." FORCE )
#set target ABI options
# set target ABI options
if( ANDROID_ABI STREQUAL "x86" )
set( X86 true )
set( ANDROID_NDK_ABI_NAME "x86" )
@ -545,7 +549,7 @@ elseif( ANDROID_ABI STREQUAL "armeabi-v6 with VFP" )
set( ANDROID_ARCH_NAME "arm" )
set( ANDROID_ARCH_FULLNAME "arm" )
set( CMAKE_SYSTEM_PROCESSOR "armv6" )
#need always fallback to older platform
# need always fallback to older platform
set( ARMEABI true )
elseif( ANDROID_ABI STREQUAL "armeabi-v7a")
set( ARMEABI_V7A true )
@ -573,8 +577,8 @@ else()
endif()
if( CMAKE_BINARY_DIR AND EXISTS "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeSystem.cmake" )
#really dirty hack
#it is not possible to change CMAKE_SYSTEM_PROCESSOR after the first run...
# really dirty hack
# it is not possible to change CMAKE_SYSTEM_PROCESSOR after the first run...
file( APPEND "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeSystem.cmake" "SET(CMAKE_SYSTEM_PROCESSOR \"${CMAKE_SYSTEM_PROCESSOR}\")\n" )
endif()
@ -592,7 +596,7 @@ else()
unset( ANDROID_FORCE_ARM_BUILD CACHE )
endif()
#choose toolchain
# choose toolchain
if( ANDROID_TOOLCHAIN_NAME )
list( FIND __availableToolchains "${ANDROID_TOOLCHAIN_NAME}" __toolchainIdx )
if( __toolchainIdx EQUAL -1 )
@ -637,10 +641,10 @@ unset( __availableToolchainMachines )
unset( __availableToolchainArchs )
unset( __availableToolchainCompilerVersions )
#choose native API level
# choose native API level
__INIT_VARIABLE( ANDROID_NATIVE_API_LEVEL ENV_ANDROID_NATIVE_API_LEVEL ANDROID_API_LEVEL ENV_ANDROID_API_LEVEL ANDROID_STANDALONE_TOOLCHAIN_API_LEVEL ANDROID_DEFAULT_NDK_API_LEVEL_${ANDROID_ARCH_NAME} ANDROID_DEFAULT_NDK_API_LEVEL )
string( REGEX MATCH "[0-9]+" ANDROID_NATIVE_API_LEVEL "${ANDROID_NATIVE_API_LEVEL}" )
#validate
# validate
list( FIND ANDROID_SUPPORTED_NATIVE_API_LEVELS "${ANDROID_NATIVE_API_LEVEL}" __levelIdx )
if( __levelIdx EQUAL -1 )
message( SEND_ERROR "Specified Android native API level (${ANDROID_NATIVE_API_LEVEL}) is not supported by your NDK/toolchain." )
@ -659,7 +663,7 @@ if( CMAKE_VERSION VERSION_GREATER "2.8" )
set_property( CACHE ANDROID_NATIVE_API_LEVEL PROPERTY STRINGS ${ANDROID_SUPPORTED_NATIVE_API_LEVELS} )
endif()
#setup paths
# setup paths
if( BUILD_WITH_STANDALONE_TOOLCHAIN )
set( ANDROID_TOOLCHAIN_ROOT "${ANDROID_STANDALONE_TOOLCHAIN}" )
set( ANDROID_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot" )
@ -689,7 +693,7 @@ set( CMAKE_ASM_COMPILER "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHI
if( CMAKE_VERSION VERSION_LESS 2.8.5 )
set( CMAKE_ASM_COMPILER_ARG1 "-c" )
endif()
#there may be a way to make cmake deduce these TODO deduce the rest of the tools
# there may be a way to make cmake deduce these TODO deduce the rest of the tools
set( CMAKE_STRIP "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-strip${TOOL_OS_SUFFIX}" CACHE PATH "strip" )
set( CMAKE_AR "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-ar${TOOL_OS_SUFFIX}" CACHE PATH "archive" )
set( CMAKE_LINKER "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_MACHINE_NAME}-ld${TOOL_OS_SUFFIX}" CACHE PATH "linker" )
@ -705,11 +709,12 @@ if( APPLE )
endif()
mark_as_advanced( CMAKE_INSTALL_NAME_TOOL )
endif()
#export directories
# export directories
set( ANDROID_SYSTEM_INCLUDE_DIRS "" )
set( ANDROID_SYSTEM_LIB_DIRS "" )
#setup output directories
# setup output directories
set( LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_SOURCE_DIR} CACHE PATH "root for library output, set this to change where android libs are installed to" )
set( CMAKE_INSTALL_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/user" CACHE STRING "path for installing" )
@ -722,13 +727,13 @@ if(NOT _CMAKE_IN_TRY_COMPILE)
set( LIBRARY_OUTPUT_PATH "${LIBRARY_OUTPUT_PATH_ROOT}/libs/${ANDROID_NDK_ABI_NAME}" CACHE PATH "path for android libs" )
endif()
#includes
# includes
list( APPEND ANDROID_SYSTEM_INCLUDE_DIRS "${ANDROID_SYSROOT}/usr/include" )
if( __stlIncludePath AND EXISTS "${__stlIncludePath}" )
list( APPEND ANDROID_SYSTEM_INCLUDE_DIRS "${__stlIncludePath}" )
endif()
#STL bits includes
# c++ bits includes
if( __stlLibPath AND EXISTS "${__stlLibPath}/include" )
list( APPEND ANDROID_SYSTEM_INCLUDE_DIRS "${__stlLibPath}/include" )
endif()
@ -742,7 +747,7 @@ elseif( EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/incl
list( APPEND ANDROID_SYSTEM_INCLUDE_DIRS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/include/c++/${ANDROID_COMPILER_VERSION}/${ANDROID_TOOLCHAIN_MACHINE_NAME}" )
endif()
#flags and definitions
# flags and definitions
if(ANDROID_SYSROOT MATCHES "[ ;\"]")
set( ANDROID_CXX_FLAGS "--sysroot=\"${ANDROID_SYSROOT}\"" )
# quotes will break try_compile and compiler identification
@ -766,7 +771,7 @@ set( CMAKE_CXX_PLATFORM_ID Linux )
set( CMAKE_CXX_SIZEOF_DATA_PTR 4 )
set( CMAKE_CXX_HAS_ISYSROOT 1 )
set( CMAKE_CXX_COMPILER_ABI ELF )
#force ASM compiler (required for CMake < 2.8.5)
# force ASM compiler (required for CMake < 2.8.5)
set( CMAKE_ASM_COMPILER_ID_RUN TRUE )
set( CMAKE_ASM_COMPILER_ID GNU )
set( CMAKE_ASM_COMPILER_WORKS TRUE )
@ -796,17 +801,17 @@ endif()
if( ANDROID_USE_STLPORT )
set( _CMAKE_CXX_FLAGS "${_CMAKE_CXX_FLAGS} -fno-rtti -fno-exceptions" )
set( _CMAKE_C_FLAGS "${_CMAKE_C_FLAGS} -fno-rtti -fno-exceptions" )
set( _CMAKE_C_FLAGS "${_CMAKE_C_FLAGS} -fno-exceptions" )
else()
set( _CMAKE_CXX_FLAGS "${_CMAKE_CXX_FLAGS} -frtti -fexceptions" )
set( _CMAKE_C_FLAGS "${_CMAKE_C_FLAGS} -fexceptions" )
endif()
#release and debug flags
# release and debug flags
if( ARMEABI OR ARMEABI_V7A )
if( NOT ANDROID_FORCE_ARM_BUILD AND NOT ARMEABI_V6 )
#It is recommended to use the -mthumb compiler flag to force the generation
#of 16-bit Thumb-1 instructions (the default being 32-bit ARM ones).
# It is recommended to use the -mthumb compiler flag to force the generation
# of 16-bit Thumb-1 instructions (the default being 32-bit ARM ones).
# O3 instead of O2/Os in release mode - like cmake sets for desktop gcc
set( _CMAKE_CXX_FLAGS_RELEASE "-mthumb -O3" )
set( _CMAKE_C_FLAGS_RELEASE "-mthumb -O3" )
@ -836,7 +841,7 @@ set( _CMAKE_C_FLAGS_RELEASE "${_CMAKE_C_FLAGS_RELEASE} -fomit-frame-pointer
set( _CMAKE_CXX_FLAGS_DEBUG "${_CMAKE_CXX_FLAGS_DEBUG} -fno-strict-aliasing -fno-omit-frame-pointer -DDEBUG -D_DEBUG" )
set( _CMAKE_C_FLAGS_DEBUG "${_CMAKE_C_FLAGS_DEBUG} -fno-strict-aliasing -fno-omit-frame-pointer -DDEBUG -D_DEBUG" )
#ABI-specific flags
# ABI-specific flags
if( ARMEABI_V7A )
set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS} -march=armv7-a -mfloat-abi=softfp" )
if( NEON )
@ -854,19 +859,18 @@ elseif( X86 )
set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS}" )#sse?
endif()
#linker flags
# linker flags
if( NOT DEFINED __ndklibspath )
set( __ndklibspath "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/ndklibs/${ANDROID_NDK_ABI_NAME}" )
endif()
list( APPEND ANDROID_SYSTEM_LIB_DIRS "${__ndklibspath}" "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )
list( APPEND ANDROID_SYSTEM_LIB_DIRS "${CMAKE_INSTALL_PREFIX}/libs/${ANDROID_NDK_ABI_NAME}" )
set( ANDROID_LINKER_FLAGS "" )
#STL
# STL
if( ANDROID_USE_STLPORT )
if( EXISTS "${__stlLibPath}/libstlport_static.a" )
__COPY_IF_DIFFERENT( "${__stlLibPath}/libstlport_static.a" "${__ndklibspath}/libstlport_static.a" )
endif()
if( EXISTS "${__ndklibspath}/libstlport_static.a" )
set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -Wl,--start-group -lstlport_static" )
set( CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES> \"${__stlLibPath}/libstlport_static.a\"")
set( CMAKE_CXX_CREATE_SHARED_MODULE "<CMAKE_CXX_COMPILER> <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES> \"${__stlLibPath}/libstlport_static.a\"")
endif()
else( ANDROID_USE_STLPORT )
if( EXISTS "${__stlLibPath}/libgnustl_static.a" )
@ -880,11 +884,6 @@ else( ANDROID_USE_STLPORT )
elseif( EXISTS "${__stlLibPath}/libstdc++.a" )
__COPY_IF_DIFFERENT( "${__stlLibPath}/libstdc++.a" "${__ndklibspath}/libstdc++.a" )
endif()
if( EXISTS "${__ndklibspath}/libstdc++.a" )
set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -lstdc++" )
endif()
#gcc exception & rtti support
if( EXISTS "${__stlLibPath}/libsupc++.a" )
__COPY_IF_DIFFERENT( "${__stlLibPath}/libsupc++.a" "${__ndklibspath}/libsupc++.a" )
elseif( ANDROID_ARCH_NAME STREQUAL "arm" AND EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/${CMAKE_SYSTEM_PROCESSOR}/thumb/libsupc++.a" )
@ -896,16 +895,14 @@ else( ANDROID_USE_STLPORT )
elseif( EXISTS "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" )
__COPY_IF_DIFFERENT( "${ANDROID_TOOLCHAIN_ROOT}/${ANDROID_TOOLCHAIN_MACHINE_NAME}/lib/libsupc++.a" "${__ndklibspath}/libsupc++.a" )
endif()
if( EXISTS "${__ndklibspath}/libsupc++.a" )
set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -lsupc++" )
endif()
list( APPEND ANDROID_SYSTEM_LIB_DIRS "${__ndklibspath}" )
endif( ANDROID_USE_STLPORT )
#cleanup for STL search
# cleanup for STL search
unset( __stlIncludePath )
unset( __stlLibPath )
#other linker flags
# other linker flags
__INIT_VARIABLE( ANDROID_NO_UNDEFINED OBSOLETE_NO_UNDEFINED VALUES ON )
set( ANDROID_NO_UNDEFINED ${ANDROID_NO_UNDEFINED} CACHE BOOL "Show all undefined symbols as linker errors" FORCE )
mark_as_advanced( ANDROID_NO_UNDEFINED )
@ -914,7 +911,7 @@ if( ANDROID_NO_UNDEFINED )
endif()
if (ANDROID_NDK MATCHES "-r[56].?$")
#libGLESv2.so in NDKs prior to r7 refers to external symbols. So this flag option is required for all projects using OpenGL from native.
# libGLESv2.so in NDKs prior to r7 refers to external symbols. So this flag option is required for all projects using OpenGL from native.
__INIT_VARIABLE( ANDROID_SO_UNDEFINED VALUES ON )
else()
__INIT_VARIABLE( ANDROID_SO_UNDEFINED VALUES OFF )
@ -940,7 +937,7 @@ if( ARMEABI_V7A )
set( ANDROID_LINKER_FLAGS "-Wl,--fix-cortex-a8 ${ANDROID_LINKER_FLAGS}" )
endif()
#cache flags
# cache flags
set( CMAKE_CXX_FLAGS "${_CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags" )
set( CMAKE_C_FLAGS "${_CMAKE_C_FLAGS}" CACHE STRING "c flags" )
set( CMAKE_CXX_FLAGS_RELEASE "${_CMAKE_CXX_FLAGS_RELEASE}" CACHE STRING "c++ Release flags" )
@ -954,7 +951,7 @@ set( CMAKE_EXE_LINKER_FLAGS "-Wl,-z,nocopyreloc" CACHE STRING "linker flags" )
include_directories( SYSTEM ${ANDROID_SYSTEM_INCLUDE_DIRS} )
link_directories( ${ANDROID_SYSTEM_LIB_DIRS} )
#finish flags
# finish flags
set( ANDROID_CXX_FLAGS "${ANDROID_CXX_FLAGS}" CACHE INTERNAL "Extra Android compiler flags")
set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}" CACHE INTERNAL "Extra Android linker flags")
set( CMAKE_CXX_FLAGS "${ANDROID_CXX_FLAGS} ${CMAKE_CXX_FLAGS}" )
@ -969,7 +966,7 @@ else()
set( CMAKE_EXE_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" )
endif()
#set these global flags for cmake client scripts to change behavior
# set these global flags for cmake client scripts to change behavior
set( ANDROID True )
set( BUILD_ANDROID True )
@ -982,7 +979,7 @@ set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY )
set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY )
#macro to find packages on the host OS
# macro to find packages on the host OS
macro( find_host_package )
set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER )
@ -1004,7 +1001,7 @@ macro( find_host_package )
endmacro()
#macro to find programs on the host OS
# macro to find programs on the host OS
macro( find_host_program )
set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER )
@ -1044,7 +1041,11 @@ if( NOT PROJECT_NAME STREQUAL "CMAKE_TRY_COMPILE" )
set( __toolchain_config "")
foreach( __var ANDROID_ABI ANDROID_FORCE_ARM_BUILD ANDROID_NATIVE_API_LEVEL ANDROID_NO_UNDEFINED ANDROID_SO_UNDEFINED ANDROID_SET_OBSOLETE_VARIABLES LIBRARY_OUTPUT_PATH_ROOT ANDROID_USE_STLPORT ANDROID_FORBID_SYGWIN ANDROID_NDK ANDROID_STANDALONE_TOOLCHAIN ANDROID_FUNCTION_LEVEL_LINKING __ndklibspath )
if( DEFINED ${__var} )
set( __toolchain_config "${__toolchain_config}set( ${__var} \"${${__var}}\" )\n" )
if( "${__var}" MATCHES " ")
set( __toolchain_config "${__toolchain_config}set( ${__var} \"${${__var}}\" CACHE INTERNAL \"\" )\n" )
else()
set( __toolchain_config "${__toolchain_config}set( ${__var} ${${__var}} CACHE INTERNAL \"\" )\n" )
endif()
endif()
endforeach()
file( WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/android.toolchain.config.cmake" "${__toolchain_config}" )

View File

@ -0,0 +1,138 @@
# CMAKE_PARSE_ARGUMENTS(<prefix> <options> <one_value_keywords> <multi_value_keywords> args...)
#
# CMAKE_PARSE_ARGUMENTS() is intended to be used in macros or functions for
# parsing the arguments given to that macro or function.
# It processes the arguments and defines a set of variables which hold the
# values of the respective options.
#
# The <options> argument contains all options for the respective macro,
# i.e. keywords which can be used when calling the macro without any value
# following, like e.g. the OPTIONAL keyword of the install() command.
#
# The <one_value_keywords> argument contains all keywords for this macro
# which are followed by one value, like e.g. DESTINATION keyword of the
# install() command.
#
# The <multi_value_keywords> argument contains all keywords for this macro
# which can be followed by more than one value, like e.g. the TARGETS or
# FILES keywords of the install() command.
#
# When done, CMAKE_PARSE_ARGUMENTS() will have defined for each of the
# keywords listed in <options>, <one_value_keywords> and
# <multi_value_keywords> a variable composed of the given <prefix>
# followed by "_" and the name of the respective keyword.
# These variables will then hold the respective value from the argument list.
# For the <options> keywords this will be TRUE or FALSE.
#
# All remaining arguments are collected in a variable
# <prefix>_UNPARSED_ARGUMENTS, this can be checked afterwards to see whether
# your macro was called with unrecognized parameters.
#
# As an example, here is a my_install() macro, which takes similar arguments to the
# real install() command:
#
# function(MY_INSTALL)
# set(options OPTIONAL FAST)
# set(oneValueArgs DESTINATION RENAME)
# set(multiValueArgs TARGETS CONFIGURATIONS)
# cmake_parse_arguments(MY_INSTALL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} )
# ...
#
# Assume my_install() has been called like this:
# my_install(TARGETS foo bar DESTINATION bin OPTIONAL blub)
#
# After the cmake_parse_arguments() call the macro will have set the following
# variables:
# MY_INSTALL_OPTIONAL = TRUE
# MY_INSTALL_FAST = FALSE (this option was not used when calling my_install())
# MY_INSTALL_DESTINATION = "bin"
# MY_INSTALL_RENAME = "" (was not used)
# MY_INSTALL_TARGETS = "foo;bar"
# MY_INSTALL_CONFIGURATIONS = "" (was not used)
# MY_INSTALL_UNPARSED_ARGUMENTS = "blub" (no value expected after "OPTIONAL")
#
# You can then continue and process these variables.
#
# Keywords terminate lists of values, e.g. if directly after a one_value_keyword
# another recognized keyword follows, this is interpreted as the beginning of
# the new option.
# E.g. my_install(TARGETS foo DESTINATION OPTIONAL) would not set
# MY_INSTALL_DESTINATION to "OPTIONAL"; instead MY_INSTALL_DESTINATION would
# be empty and MY_INSTALL_OPTIONAL would be set to TRUE, because OPTIONAL is
# itself a recognized keyword.
#=============================================================================
# Copyright 2010 Alexander Neundorf <neundorf@kde.org>
#
# Distributed under the OSI-approved BSD License (the "License");
# see accompanying file Copyright.txt for details.
#
# This software is distributed WITHOUT ANY WARRANTY; without even the
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the License for more information.
#=============================================================================
# (To distribute this file outside of CMake, substitute the full
# License text for the above reference.)
if(__CMAKE_PARSE_ARGUMENTS_INCLUDED)
return()
endif()
set(__CMAKE_PARSE_ARGUMENTS_INCLUDED TRUE)
function(CMAKE_PARSE_ARGUMENTS prefix _optionNames _singleArgNames _multiArgNames)
# first set all result variables to empty/FALSE
foreach(arg_name ${_singleArgNames} ${_multiArgNames})
set(${prefix}_${arg_name})
endforeach(arg_name)
foreach(option ${_optionNames})
set(${prefix}_${option} FALSE)
endforeach(option)
set(${prefix}_UNPARSED_ARGUMENTS)
set(insideValues FALSE)
set(currentArgName)
# now iterate over all arguments and fill the result variables
foreach(currentArg ${ARGN})
list(FIND _optionNames "${currentArg}" optionIndex) # ... then this marks the end of the arguments belonging to this keyword
list(FIND _singleArgNames "${currentArg}" singleArgIndex) # ... then this marks the end of the arguments belonging to this keyword
list(FIND _multiArgNames "${currentArg}" multiArgIndex) # ... then this marks the end of the arguments belonging to this keyword
if(${optionIndex} EQUAL -1 AND ${singleArgIndex} EQUAL -1 AND ${multiArgIndex} EQUAL -1)
if(insideValues)
if("${insideValues}" STREQUAL "SINGLE")
set(${prefix}_${currentArgName} ${currentArg})
set(insideValues FALSE)
elseif("${insideValues}" STREQUAL "MULTI")
list(APPEND ${prefix}_${currentArgName} ${currentArg})
endif()
else(insideValues)
list(APPEND ${prefix}_UNPARSED_ARGUMENTS ${currentArg})
endif(insideValues)
else()
if(NOT ${optionIndex} EQUAL -1)
set(${prefix}_${currentArg} TRUE)
set(insideValues FALSE)
elseif(NOT ${singleArgIndex} EQUAL -1)
set(currentArgName ${currentArg})
set(${prefix}_${currentArgName})
set(insideValues "SINGLE")
elseif(NOT ${multiArgIndex} EQUAL -1)
set(currentArgName ${currentArg})
set(${prefix}_${currentArgName})
set(insideValues "MULTI")
endif()
endif()
endforeach(currentArg)
# propagate the result variables to the caller:
foreach(arg_name ${_singleArgNames} ${_multiArgNames} ${_optionNames})
set(${prefix}_${arg_name} ${${prefix}_${arg_name}} PARENT_SCOPE)
endforeach(arg_name)
set(${prefix}_UNPARSED_ARGUMENTS ${${prefix}_UNPARSED_ARGUMENTS} PARENT_SCOPE)
endfunction(CMAKE_PARSE_ARGUMENTS _options _singleArgs _multiArgs)

View File

@ -88,7 +88,11 @@ if(CUDA_FOUND)
if(APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
endif()
string(REPLACE "-Wsign-promo" "" CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
# disabled because of multiple warnings during building nvcc auto generated files
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_GCC_REGEX_VERSION VERSION_GREATER "4.6.0")
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wunused-but-set-variable)
endif()
# we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})

View File

@ -21,7 +21,12 @@ elseif(UNIX AND NOT APPLE)
endif()
if(NOT HAVE_TBB)
set(TBB_DEFAULT_INCLUDE_DIRS "/opt/intel/tbb" "/usr/local/include" "/usr/include" "C:/Program Files/Intel/TBB" "C:/Program Files (x86)/Intel/TBB" "C:/Program Files (x86)/TBB" "${CMAKE_INSTALL_PREFIX}/include")
set(TBB_DEFAULT_INCLUDE_DIRS
"/opt/intel/tbb" "/usr/local/include" "/usr/include"
"C:/Program Files/Intel/TBB" "C:/Program Files (x86)/Intel/TBB"
"C:/Program Files (x86)/tbb/include"
"${CMAKE_INSTALL_PREFIX}/include")
find_path(TBB_INCLUDE_DIRS "tbb/tbb.h" PATHS ${TBB_INCLUDE_DIR} ${TBB_DEFAULT_INCLUDE_DIRS} DOC "The path to TBB headers")
if(TBB_INCLUDE_DIRS)

View File

@ -64,9 +64,14 @@ macro(ocv_generate_dependencies_map_configcmake suffix configuration)
string(REGEX REPLACE "${CMAKE_SHARED_LIBRARY_SUFFIX}$" "${OPENCV_LINK_LIBRARY_SUFFIX}" __libname "${__libname}")
endif()
string(REPLACE " " "\\ " __mod_deps "${${__ocv_lib}_MODULE_DEPS_${suffix}}")
string(REPLACE " " "\\ " __ext_deps "${${__ocv_lib}_EXTRA_DEPS_${suffix}}")
string(REPLACE "\"" "\\\"" __mod_deps "${__mod_deps}")
string(REPLACE "\"" "\\\"" __ext_deps "${__ext_deps}")
set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_LIBNAME_${suffix} \"${__libname}\")\n")
set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_DEPS_${suffix} ${${__ocv_lib}_MODULE_DEPS_${suffix}})\n")
set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_EXTRA_DEPS_${suffix} ${${__ocv_lib}_EXTRA_DEPS_${suffix}})\n")
set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_DEPS_${suffix} ${__mod_deps})\n")
set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_EXTRA_DEPS_${suffix} ${__ext_deps})\n")
list(APPEND OPENCV_PROCESSED_LIBS ${__ocv_lib})
list(APPEND OPENCV_LIBS_TO_PROCESS ${${__ocv_lib}_MODULE_DEPS_${suffix}})

View File

@ -509,8 +509,6 @@ endmacro()
macro(ocv_add_precompiled_headers the_target)
if("${the_target}" MATCHES "^opencv_test_.*$")
SET(pch_path "test/test_")
elseif("${the_target}" MATCHES "opencv_perf_gpu_cpu")
SET(pch_path "perf_cpu/perf_cpu_")
elseif("${the_target}" MATCHES "^opencv_perf_.*$")
SET(pch_path "perf/perf_")
else()

View File

@ -501,3 +501,12 @@ macro(ocv_parse_header2 LIBNAME HDR_PATH VARNAME)
endif()
endif()
endmacro()
################################################################################################
# short command to setup source group
function(ocv_source_group group)
cmake_parse_arguments(OCV_SOURCE_GROUP "" "" "GLOB" ${ARGN})
file(GLOB srcs ${OCV_SOURCE_GROUP_GLOB})
source_group(${group} FILES ${srcs})
endfunction()

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -175,6 +175,8 @@ a:hover {
div.body p, div.body dd, div.body li {
text-align: justify;
line-height: 130%;
margin-top: 1em;
margin-bottom: 1em;
}
div.body h1,
@ -327,9 +329,9 @@ table.field-list {
margin-top: 20px;
}
ul.simple {
/*ul.simple {
list-style: none;
}
}*/
em.menuselection, em.guilabel {
font-family: {{ theme_guifont }};
@ -384,3 +386,8 @@ margin-top: 0px;
div.body ul.search li {
text-align: left;
}
div.linenodiv {
min-width: 1em;
text-align: right;
}

View File

@ -202,7 +202,7 @@ Open OpenCV library and samples in Eclipse
Sometimes more advanced manipulations are required:
* The provided projects are configured for ``API 11`` target (and ``API 9`` for the library) that can be missing platform in your Android SDK.
The provided projects are configured for ``API 11`` target (and ``API 9`` for the library) that can be missing platform in your Android SDK.
After right click on any project select :guilabel:`Properties` and then :guilabel:`Android` on the left pane.
Click some target with `API Level` 11 or higher:

View File

@ -75,7 +75,7 @@ You need the following software to be installed in order to develop for Android
sudo update-java-alternatives --set java-6-sun
**TODO:** add a note on Sun/Oracle Java installation on Ubuntu 12.
.. **TODO:** add a note on Sun/Oracle Java installation on Ubuntu 12.
#. **Android SDK**
@ -241,6 +241,7 @@ where:
The script :file:`Android.mk` usually has the following structure:
.. code-block:: make
:linenos:
LOCAL_PATH := $(call my-dir)
@ -258,6 +259,7 @@ This is the minimal file :file:`Android.mk`, which builds C++ source code of an
Usually the file :file:`Application.mk` is optional, but in case of project using OpenCV, when STL and exceptions are used in C++, it also should be created. Example of the file :file:`Application.mk`:
.. code-block:: make
:linenos:
APP_STL := gnustl_static
APP_CPPFLAGS := -frtti -fexceptions
@ -337,7 +339,7 @@ We recommend the approach based on Eclipse :abbr:`CDT(C/C++ Development Tooling)
:alt: Configure CDT
:align: center
` `
And:
.. image:: images/eclipse_cdt_cfg2.png
:alt: Configure CDT
@ -350,6 +352,7 @@ We recommend the approach based on Eclipse :abbr:`CDT(C/C++ Development Tooling)
:align: center
#. Open :guilabel:`Project Properties -> C/C++ Build`, uncheck ``Use default build command``, replace "Build command" text from ``"make"`` to
``"${NDKROOT}/ndk-build.cmd"`` on Windows,
``"${NDKROOT}/ndk-build"`` on Linux and MacOS.
@ -393,7 +396,7 @@ We recommend the approach based on Eclipse :abbr:`CDT(C/C++ Development Tooling)
:alt: Configure CDT
:align: center
.. note:: The latest Android NDK **r8b** has a bit different STL headers path. So if you use this NDK version please use the following modified **Include** paths list:
.. note:: The latest Android NDK **r8b** uses a different STL headers path. So if you use this NDK release, add the following **Include** paths list instead:
::
@ -412,12 +415,16 @@ AVD
AVD (*Android Virtual Device*) is probably not the most convenient way to test an OpenCV-dependent application, but surely the most uncomplicated one to configure.
#. Assuming you already have *Android SDK* and *Eclipse IDE* installed, in Eclipse go :guilabel:`Window -> AVD Manager`.
**TBD:** how to start AVD Manager without Eclipse...
.. **TBD:** how to start AVD Manager without Eclipse...
#. Press the :guilabel:`New` button in :guilabel:`AVD Manager` window.
#. :guilabel:`Create new Android Virtual Device` window will let you select some properties for your new device, like target API level, size of SD-card and other.
.. image:: images/AVD_create.png
:alt: Configure builders
:align: center
#. When you click the :guilabel:`Create AVD` button, your new AVD will be available in :guilabel:`AVD Manager`.
#. Press :guilabel:`Start` to launch the device. Be aware that any AVD (a.k.a. Emulator) is usually much slower than a hardware Android device, so it may take up to several minutes to start.
#. Go :guilabel:`Run -> Run/Debug` in Eclipse IDE to run your application in regular or debugging mode. :guilabel:`Device Chooser` will let you choose among the running devices or to start a new one.
@ -435,22 +442,31 @@ Windows host computer
#. Attach the Android device to your PC with a USB cable.
#. Go to :guilabel:`Start Menu` and **right-click** on :guilabel:`Computer`. Select :guilabel:`Manage` in the context menu. You may be asked for Administrative permissions.
#. Select :guilabel:`Device Manager` in the left pane and find an unknown device in the list. You may try unplugging it and then plugging it back in to check whether it is your device that appears in the list.
.. image:: images/usb_device_connect_01.png
:alt: Unknown device
:align: center
#. Try your luck installing `Google USB drivers` without any modifications: **right-click** on the unknown device, select :guilabel:`Properties` menu item --> :guilabel:`Details` tab --> :guilabel:`Update Driver` button.
.. image:: images/usb_device_connect_05.png
:alt: Device properties
:align: center
#. Select :guilabel:`Browse computer for driver software`.
.. image:: images/usb_device_connect_06.png
:alt: Browse for driver
:align: center
#. Specify the path to :file:`<Android SDK folder>/extras/google/usb_driver/` folder.
.. image:: images/usb_device_connect_07.png
:alt: Browse for driver
:align: center
#. If you get a prompt about installing unverified drivers and then a success report, you have finished the USB driver installation.
.. image:: images/usb_device_connect_08.png
:alt: Install prompt
:align: center
@ -460,23 +476,33 @@ Windows host computer
.. image:: images/usb_device_connect_09.png
:alt: Installed OK
:align: center
#. Otherwise (if you get a failure like the one shown below), follow the next steps.
.. image:: images/usb_device_connect_12.png
:alt: No driver
:align: center
#. Again **right-click** on the unknown device, select :guilabel:`Properties --> Details --> Hardware Ids` and copy the line like ``USB\VID_XXXX&PID_XXXX&MI_XX``.
.. image:: images/usb_device_connect_02.png
:alt: Device properties details
:align: center
#. Now open file :file:`<Android SDK folder>/extras/google/usb_driver/android_winusb.inf`. Select either ``Google.NTx86`` or ``Google.NTamd64`` section depending on your host system architecture.
.. image:: images/usb_device_connect_03.png
:alt: "android_winusb.inf"
:align: center
#. There should be a record for your device similar to the existing ones; you need to add it manually.
.. image:: images/usb_device_connect_04.png
:alt: "android_winusb.inf"
:align: center
#. Save the :file:`android_winusb.inf` file and try to install the USB driver again.
.. image:: images/usb_device_connect_05.png
:alt: Device properties
:align: center
@ -492,7 +518,9 @@ Windows host computer
.. image:: images/usb_device_connect_07.png
:alt: Browse for driver
:align: center
#. This time installation should go successfully.
.. image:: images/usb_device_connect_08.png
:alt: Install prompt
:align: center
@ -502,11 +530,15 @@ Windows host computer
.. image:: images/usb_device_connect_09.png
:alt: Installed OK
:align: center
#. And an unknown device is now recognized as an Android phone.
.. image:: images/usb_device_connect_10.png
:alt: "Known" device
:align: center
#. Successful device USB connection can be verified in console via ``adb devices`` command.
.. image:: images/usb_device_connect_11.png
:alt: "adb devices"
:align: center
@ -523,7 +555,7 @@ By default Linux doesn't recognize Android devices, but it's easy to fix this is
Then restart your adb server (even better to restart the system), plug in your Android device and execute :command:`adb devices` command. You will see the list of attached devices:
.. image:: images/usb_device_connect_ubuntu.png
.. image:: images/usb_device_connect_ubuntu.png
:alt: List of attached devices
:align: center

View File

@ -57,7 +57,7 @@ Using async initialization is a **recommended** way for application development.
To run an OpenCV Manager-based application for the first time you need to install packages with the `OpenCV Manager` and `OpenCV binary pack` for your platform.
You can do it using Google Play Market or manually with ``adb`` tool:
.. code-block:: sh
.. code-block:: sh
:linenos:
<Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.2_Manager.apk
@ -266,12 +266,12 @@ It will be capable of accessing camera output, processing it and displaying the
#. Set name, target, package and minSDKVersion accordingly.
#. Create a new class (*File -> New -> Class*). Name it for example: *HelloOpenCVView*.
.. image:: images/dev_OCV_new_class.png
:alt: Add a new class.
:align: center
* It should extend *SurfaceView* class.
* It also should implement *SurfaceHolder.Callback*, *Runnable*.
#. Edit *HelloOpenCVView* class.
@ -279,7 +279,9 @@ It will be capable of accessing camera output, processing it and displaying the
* Add an *import* line for *android.content.context*.
* Modify autogenerated stubs: *HelloOpenCVView*, *surfaceCreated*, *surfaceDestroyed* and *surfaceChanged*.
.. code-block:: java
:linenos:
package com.hello.opencv.test;
@ -300,16 +302,18 @@ It will be capable of accessing camera output, processing it and displaying the
cameraRelease();
}
public void surfaceChanged(SurfaceHolder holder, int format, int width,
int height) {
public void surfaceChanged(SurfaceHolder holder, int format, int width, int height) {
cameraSetup(width, height);
}
//...
* Add *cameraOpen*, *cameraRelease* and *cameraSetup* voids as shown below.
* Also, don't forget to add the public void *run()* as follows:
.. code-block:: java
:linenos:
public void run() {
// TODO: loop { getFrame(), processFrame(), drawFrame() }
@ -327,11 +331,10 @@ It will be capable of accessing camera output, processing it and displaying the
// TODO setup camera
}
..
#. Create a new *Activity* (*New -> Other -> Android -> Android Activity*) and name it, for example: *HelloOpenCVActivity*. For this activity define *onCreate*, *onResume* and *onPause* voids.
.. code-block:: java
:linenos:
public void onCreate (Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
@ -359,11 +362,12 @@ It will be capable of accessing camera output, processing it and displaying the
});
ad.show();
}
}
#. Add the following permissions to the AndroidManifest.xml file:
.. code-block:: xml
:linenos:
</application>
@ -372,12 +376,15 @@ It will be capable of accessing camera output, processing it and displaying the
<uses-feature android:name="android.hardware.camera.autofocus" />
#. Reference OpenCV library within your project properties.
.. image:: images/dev_OCV_reference.png
:alt: Reference OpenCV library.
:align: center
#. We now need some code to handle the camera. Update the *HelloOpenCVView* class as follows:
.. code-block:: java
:linenos:
private VideoCapture mCamera;
@ -394,6 +401,7 @@ It will be capable of accessing camera output, processing it and displaying the
}
return true;
}
public void cameraRelease() {
synchronized(this) {
if (mCamera != null) {
@ -402,6 +410,7 @@ It will be capable of accessing camera output, processing it and displaying the
}
}
}
private void cameraSetup(int width, int height) {
synchronized (this) {
if (mCamera != null && mCamera.isOpened()) {
@ -425,7 +434,9 @@ It will be capable of accessing camera output, processing it and displaying the
}
#. The last step would be to update the *run()* void in *HelloOpenCVView* class as follows:
.. code-block:: java
:linenos:
public void run() {
while (true) {
@ -465,5 +476,3 @@ It will be capable of accessing camera output, processing it and displaying the
}
return bmp;
}

View File

@ -42,6 +42,8 @@ set (CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
set (CMAKE_C_FLAGS "")
set (CMAKE_CXX_FLAGS "-headerpad_max_install_names -fvisibility=hidden -fvisibility-inlines-hidden")
set (CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -fomit-frame-pointer -ffast-math")
if (HAVE_FLAG_SEARCH_PATHS_FIRST)
set (CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
set (CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")

View File

@ -416,10 +416,10 @@ void cv::triangulatePoints( InputArray _projMatr1, InputArray _projMatr2,
Mat points1 = _projPoints1.getMat(), points2 = _projPoints2.getMat();
if((points1.rows == 1 || points1.cols == 1) && points1.channels() == 2)
points1 = points1.reshape(1, points1.total()).t();
points1 = points1.reshape(1, static_cast<int>(points1.total())).t();
if((points2.rows == 1 || points2.cols == 1) && points2.channels() == 2)
points2 = points2.reshape(1, points2.total()).t();
points2 = points2.reshape(1, static_cast<int>(points2.total())).t();
CvMat cvMatr1 = matr1, cvMatr2 = matr2;
CvMat cvPoints1 = points1, cvPoints2 = points2;

View File

@ -48,6 +48,7 @@ using namespace std;
#include <string>
#include <iostream>
#include <fstream>
#include <functional>
#include <iterator>
#include <limits>
#include <numeric>

View File

@ -42,6 +42,7 @@
#include "test_precomp.hpp"
#include "test_chessboardgenerator.hpp"
#include <functional>
#include <limits>
#include <numeric>

View File

@ -3,13 +3,14 @@ ocv_add_module(core ${ZLIB_LIBRARIES})
ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
if(HAVE_CUDA)
file(GLOB lib_cuda "src/cuda/*.cu")
source_group("Cuda" FILES "${lib_cuda}")
ocv_source_group("Src\\Cuda" GLOB "src/cuda/*.cu")
ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/src" "${OpenCV_SOURCE_DIR}/modules/gpu/src/cuda" ${CUDA_INCLUDE_DIRS})
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
file(GLOB lib_cuda "src/cuda/*.cu")
ocv_cuda_compile(cuda_objs ${lib_cuda})
set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
else()
set(lib_cuda "")

View File

@ -440,7 +440,7 @@ template<typename _Tp, int m, int n> class CV_EXPORTS Matx
{
public:
typedef _Tp value_type;
typedef Matx<_Tp, MIN(m, n), 1> diag_type;
typedef Matx<_Tp, (m < n ? m : n), 1> diag_type;
typedef Matx<_Tp, m, n> mat_type;
enum { depth = DataDepth<_Tp>::value, rows = m, cols = n, channels = rows*cols,
type = CV_MAKETYPE(depth, channels) };
@ -4620,6 +4620,34 @@ public:
CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body);
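A hedged usage sketch for the parallel_for_ API declared above (not part of this commit): a made-up ParallelLoopBody subclass that squares the rows of a CV_32FC1 matrix, each worker handling the sub-range it receives.

#include <opencv2/core/core.hpp>

// Made-up example body: squares every element of the rows in the given sub-range.
struct SquareRowsBody : public cv::ParallelLoopBody
{
    cv::Mat& data; // assumed to be CV_32FC1
    explicit SquareRowsBody(cv::Mat& m) : data(m) {}

    virtual void operator()(const cv::Range& range) const
    {
        for (int r = range.start; r < range.end; ++r)
        {
            float* row = data.ptr<float>(r);
            for (int c = 0; c < data.cols; ++c)
                row[c] *= row[c];
        }
    }
};

void squareRowsInParallel(cv::Mat& m)
{
    SquareRowsBody body(m);
    cv::parallel_for_(cv::Range(0, m.rows), body); // the range is split across the selected backend
}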
/////////////////////////// Synchronization Primitives ///////////////////////////////
class CV_EXPORTS Mutex
{
public:
Mutex();
~Mutex();
Mutex(const Mutex& m);
Mutex& operator = (const Mutex& m);
void lock();
bool trylock();
void unlock();
struct Impl;
protected:
Impl* impl;
};
class CV_EXPORTS AutoLock
{
public:
AutoLock(Mutex& m) : mutex(&m) { mutex->lock(); }
~AutoLock() { mutex->unlock(); }
protected:
Mutex* mutex;
};
}
#endif // __cplusplus
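A hedged sketch of how the cv::Mutex / cv::AutoLock pair declared above is intended to be used (the shared counter is a made-up example, not part of this commit):

#include <opencv2/core/core.hpp>

static cv::Mutex g_counterMutex; // protects g_counter
static int g_counter = 0;

void incrementCounter()
{
    cv::AutoLock lock(g_counterMutex); // lock() runs in the constructor
    ++g_counter;                       // critical section
}                                      // unlock() runs when 'lock' leaves scope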

View File

@ -72,9 +72,11 @@ namespace cv { namespace gpu
FEATURE_SET_COMPUTE_13 = 13,
FEATURE_SET_COMPUTE_20 = 20,
FEATURE_SET_COMPUTE_21 = 21,
FEATURE_SET_COMPUTE_30 = 30,
GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13
NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
};
// Gives information about what GPU archs this OpenCV GPU module was

View File

@ -282,6 +282,11 @@ namespace cv { namespace gpu { namespace device
cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, WithOutMask(), stream);
}
#if defined __clang__
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wmissing-declarations"
#endif
void convert_gpu(DevMem2Db src, int sdepth, DevMem2Db dst, int ddepth, double alpha, double beta, cudaStream_t stream)
{
typedef void (*caller_t)(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream);
@ -318,4 +323,8 @@ namespace cv { namespace gpu { namespace device
func(src, dst, alpha, beta, stream);
}
#if defined __clang__
# pragma clang diagnostic pop
#endif
}}} // namespace cv { namespace gpu { namespace device

View File

@ -1199,10 +1199,6 @@ namespace
void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
{
NppiSize sz;
sz.width = m.cols;
sz.height = m.rows;
if (mask.empty())
{
if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)

View File

@ -42,6 +42,16 @@
#include "precomp.hpp"
#if !defined HAVE_TBB && !defined HAVE_OPENMP && !defined HAVE_GCD && !defined HAVE_CONCURRENCY
#ifdef __APPLE__
#define HAVE_GCD
#elif defined _MSC_VER && _MSC_VER >= 1600
#define HAVE_CONCURRENCY
#endif
#endif
#ifdef HAVE_CONCURRENCY
# include <ppl.h>
#elif defined HAVE_OPENMP
@ -106,7 +116,22 @@ namespace cv
#elif defined HAVE_CONCURRENCY
Concurrency::parallel_for(range.start, range.end, body);
class ConcurrencyProxyLoopBody
{
public:
ConcurrencyProxyLoopBody(const ParallelLoopBody& body) : _body(body) {}
void operator ()(int i) const
{
_body(Range(i, i + 1));
}
private:
const ParallelLoopBody& _body;
ConcurrencyProxyLoopBody& operator=(const ConcurrencyProxyLoopBody&) {return *this;}
} proxy(body);
Concurrency::parallel_for(range.start, range.end, proxy);
#elif defined HAVE_OPENMP

View File

@ -930,4 +930,104 @@ BOOL WINAPI DllMain( HINSTANCE, DWORD fdwReason, LPVOID )
}
#endif
namespace cv
{
#if defined WIN32 || defined _WIN32 || defined WINCE
struct Mutex::Impl
{
Impl() { InitializeCriticalSection(&cs); refcount = 1; }
~Impl() { DeleteCriticalSection(&cs); }
void lock() { EnterCriticalSection(&cs); }
bool trylock() { return TryEnterCriticalSection(&cs) != 0; }
void unlock() { LeaveCriticalSection(&cs); }
CRITICAL_SECTION cs;
int refcount;
};
#elif defined __APPLE__
#include <libkern/OSAtomic.h>
struct Mutex::Impl
{
Impl() { sl = OS_SPINLOCK_INIT; refcount = 1; }
~Impl() {}
void lock() { OSSpinLockLock(&sl); }
bool trylock() { return OSSpinLockTry(&sl); }
void unlock() { OSSpinLockUnlock(&sl); }
OSSpinLock sl;
int refcount;
};
#elif defined __linux__ && !defined ANDROID
struct Mutex::Impl
{
Impl() { pthread_spin_init(&sl, 0); refcount = 1; }
~Impl() { pthread_spin_destroy(&sl); }
void lock() { pthread_spin_lock(&sl); }
bool trylock() { return pthread_spin_trylock(&sl) == 0; }
void unlock() { pthread_spin_unlock(&sl); }
pthread_spinlock_t sl;
int refcount;
};
#else
struct Mutex::Impl
{
Impl() { pthread_mutex_init(&sl, 0); refcount = 1; }
~Impl() { pthread_mutex_destroy(&sl); }
void lock() { pthread_mutex_lock(&sl); }
bool trylock() { return pthread_mutex_trylock(&sl) == 0; }
void unlock() { pthread_mutex_unlock(&sl); }
pthread_mutex_t sl;
int refcount;
};
#endif
Mutex::Mutex()
{
impl = new Mutex::Impl;
}
Mutex::~Mutex()
{
if( CV_XADD(&impl->refcount, -1) == 1 )
delete impl;
impl = 0;
}
Mutex::Mutex(const Mutex& m)
{
impl = m.impl;
CV_XADD(&impl->refcount, 1);
}
Mutex& Mutex::operator = (const Mutex& m)
{
CV_XADD(&m.impl->refcount, 1);
if( CV_XADD(&impl->refcount, -1) == 1 )
delete impl;
impl = m.impl;
return *this;
}
void Mutex::lock() { impl->lock(); }
void Mutex::unlock() { impl->unlock(); }
bool Mutex::trylock() { return impl->trylock(); }
}
/* End of file. */

View File

@ -59,7 +59,7 @@ CV_INIT_ALGORITHM(BriefDescriptorExtractor, "Feature2D.BRIEF",
CV_INIT_ALGORITHM(FastFeatureDetector, "Feature2D.FAST",
obj.info()->addParam(obj, "threshold", obj.threshold);
obj.info()->addParam(obj, "nonmaxSuppression", obj.nonmaxSuppression);
obj.info()->addParam(obj, "type", obj.type, FastFeatureDetector::TYPE_9_16));
obj.info()->addParam(obj, "type", obj.type));
///////////////////////////////////////////////////////////////////////////////////////////////////////////
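The registration above stops pinning the "type" parameter to TYPE_9_16, and the test in the next file now passes the detector type to FAST() explicitly. A hedged sketch of that five-argument FAST overload (assumed from the updated test; the image path is a placeholder):

#include <opencv2/core/core.hpp>
#include <opencv2/features2d/features2d.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <vector>

void detectFastKeypoints()
{
    cv::Mat gray = cv::imread("board.png", 0); // placeholder path, loaded as grayscale (CV_8U)
    std::vector<cv::KeyPoint> keypoints;
    // threshold = 30, non-maximum suppression enabled, explicit 9/16 pattern
    cv::FAST(gray, keypoints, 30, true, cv::FastFeatureDetector::TYPE_9_16);
}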

View File

@ -75,8 +75,8 @@ void CV_FastTest::run( int )
vector<KeyPoint> keypoints1;
vector<KeyPoint> keypoints2;
FAST(gray1, keypoints1, 30, type);
FAST(gray2, keypoints2, 30, type);
FAST(gray1, keypoints1, 30, true, type);
FAST(gray2, keypoints2, 30, true, type);
for(size_t i = 0; i < keypoints1.size(); ++i)
{

View File

@ -200,7 +200,7 @@ int CV_KDTreeTest_CPP::checkGetPoins( const Mat& data )
int CV_KDTreeTest_CPP::checkFindBoxed()
{
vector<float> min( dims, minValue), max(dims, maxValue);
vector<float> min( dims, static_cast<float>(minValue)), max(dims, static_cast<float>(maxValue));
vector<int> indices;
tr->findOrthoRange( min, max, indices );
// TODO check indices
@ -214,8 +214,8 @@ int CV_KDTreeTest_CPP::findNeighbors( Mat& points, Mat& neighbors )
const int emax = 20;
Mat neighbors2( neighbors.size(), CV_32SC1 );
int j;
vector<float> min(points.cols, minValue);
vector<float> max(points.cols, maxValue);
vector<float> min(points.cols, static_cast<float>(minValue));
vector<float> max(points.cols, static_cast<float>(maxValue));
for( int pi = 0; pi < points.rows; pi++ )
{
// 1st way

View File

@ -54,7 +54,7 @@ static
Mat generateHomography(float angle)
{
// angle - rotation around Oz in degrees
float angleRadian = angle * CV_PI / 180.;
float angleRadian = static_cast<float>(angle * CV_PI / 180);
Mat H = Mat::eye(3, 3, CV_32FC1);
H.at<float>(0,0) = H.at<float>(1,1) = std::cos(angleRadian);
H.at<float>(0,1) = -std::sin(angleRadian);
@ -69,8 +69,8 @@ Mat rotateImage(const Mat& srcImage, float angle, Mat& dstImage, Mat& dstMask)
// angle - rotation around Oz in degrees
float diag = std::sqrt(static_cast<float>(srcImage.cols * srcImage.cols + srcImage.rows * srcImage.rows));
Mat LUShift = Mat::eye(3, 3, CV_32FC1); // left up
LUShift.at<float>(0,2) = -srcImage.cols/2;
LUShift.at<float>(1,2) = -srcImage.rows/2;
LUShift.at<float>(0,2) = static_cast<float>(-srcImage.cols/2);
LUShift.at<float>(1,2) = static_cast<float>(-srcImage.rows/2);
Mat RDShift = Mat::eye(3, 3, CV_32FC1); // right down
RDShift.at<float>(0,2) = diag/2;
RDShift.at<float>(1,2) = diag/2;
@ -114,7 +114,7 @@ void scaleKeyPoints(const vector<KeyPoint>& src, vector<KeyPoint>& dst, float sc
static
float calcCirclesIntersectArea(const Point2f& p0, float r0, const Point2f& p1, float r1)
{
float c = norm(p0 - p1), sqr_c = c * c;
float c = static_cast<float>(norm(p0 - p1)), sqr_c = c * c;
float sqr_r0 = r0 * r0;
float sqr_r1 = r1 * r1;
@ -125,7 +125,7 @@ float calcCirclesIntersectArea(const Point2f& p0, float r0, const Point2f& p1, f
float minR = std::min(r0, r1);
float maxR = std::max(r0, r1);
if(c + minR <= maxR)
return CV_PI * minR * minR;
return static_cast<float>(CV_PI * minR * minR);
float cos_halfA0 = (sqr_r0 + sqr_c - sqr_r1) / (2 * r0 * c);
float cos_halfA1 = (sqr_r1 + sqr_c - sqr_r0) / (2 * r1 * c);
@ -133,15 +133,15 @@ float calcCirclesIntersectArea(const Point2f& p0, float r0, const Point2f& p1, f
float A0 = 2 * acos(cos_halfA0);
float A1 = 2 * acos(cos_halfA1);
return 0.5 * sqr_r0 * (A0 - sin(A0)) +
0.5 * sqr_r1 * (A1 - sin(A1));
return 0.5f * sqr_r0 * (A0 - sin(A0)) +
0.5f * sqr_r1 * (A1 - sin(A1));
}
static
float calcIntersectRatio(const Point2f& p0, float r0, const Point2f& p1, float r1)
{
float intersectArea = calcCirclesIntersectArea(p0, r0, p1, r1);
float unionArea = CV_PI * (r0 * r0 + r1 * r1) - intersectArea;
float unionArea = static_cast<float>(CV_PI) * (r0 * r0 + r1 * r1) - intersectArea;
return intersectArea / unionArea;
}
@ -160,7 +160,7 @@ void matchKeyPoints(const vector<KeyPoint>& keypoints0, const Mat& H,
matches.clear();
vector<uchar> usedMask(keypoints1.size(), 0);
for(size_t i0 = 0; i0 < keypoints0.size(); i0++)
for(int i0 = 0; i0 < static_cast<int>(keypoints0.size()); i0++)
{
int nearestPointIndex = -1;
float maxIntersectRatio = 0.f;
@ -176,7 +176,7 @@ void matchKeyPoints(const vector<KeyPoint>& keypoints0, const Mat& H,
if(intersectRatio > maxIntersectRatio)
{
maxIntersectRatio = intersectRatio;
nearestPointIndex = i1;
nearestPointIndex = static_cast<int>(i1);
}
}
@ -222,7 +222,7 @@ protected:
const int maxAngle = 360, angleStep = 15;
for(int angle = 0; angle < maxAngle; angle += angleStep)
{
Mat H = rotateImage(image0, angle, image1, mask1);
Mat H = rotateImage(image0, static_cast<float>(angle), image1, mask1);
vector<KeyPoint> keypoints1;
featureDetector->detect(image1, keypoints1, mask1);
@ -339,10 +339,10 @@ protected:
const int maxAngle = 360, angleStep = 15;
for(int angle = 0; angle < maxAngle; angle += angleStep)
{
Mat H = rotateImage(image0, angle, image1, mask1);
Mat H = rotateImage(image0, static_cast<float>(angle), image1, mask1);
vector<KeyPoint> keypoints1;
rotateKeyPoints(keypoints0, H, angle, keypoints1);
rotateKeyPoints(keypoints0, H, static_cast<float>(angle), keypoints1);
Mat descriptors1;
descriptorExtractor->compute(image1, keypoints1, descriptors1);
@ -457,7 +457,7 @@ protected:
keyPointMatchesCount++;
// Check does this inlier have consistent sizes
const float maxSizeDiff = 0.8;//0.9f; // grad
const float maxSizeDiff = 0.8f;//0.9f; // grad
float size0 = keypoints0[matches[m].trainIdx].size;
float size1 = osiKeypoints1[matches[m].queryIdx].size;
CV_Assert(size0 > 0 && size1 > 0);
@ -545,7 +545,7 @@ protected:
resize(image0, image1, Size(), 1./scale, 1./scale);
vector<KeyPoint> keypoints1;
scaleKeyPoints(keypoints0, keypoints1, 1./scale);
scaleKeyPoints(keypoints0, keypoints1, 1.0f/scale);
Mat descriptors1;
descriptorExtractor->compute(image1, keypoints1, descriptors1);

View File

@ -111,43 +111,3 @@ ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
FILES "Src" ${test_srcs}
${nvidia})
ocv_add_perf_tests()
set(perf_cpu_path "${CMAKE_CURRENT_SOURCE_DIR}/perf_cpu")
if(BUILD_PERF_TESTS AND EXISTS "${perf_cpu_path}")
# opencv_highgui is required for imread/imwrite
set(perf_deps ${the_module} opencv_ts opencv_highgui opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree)
ocv_check_dependencies(${perf_deps})
if(OCV_DEPENDENCIES_FOUND)
set(the_target "opencv_perf_gpu_cpu")
ocv_module_include_directories(${perf_deps} "${perf_cpu_path}")
if(NOT OPENCV_PERF_${the_module}_CPU_SOURCES)
file(GLOB perf_srcs "${perf_cpu_path}/*.cpp")
file(GLOB perf_hdrs "${perf_cpu_path}/*.hpp" "${perf_cpu_path}/*.h")
source_group("Src" FILES ${perf_srcs})
source_group("Include" FILES ${perf_hdrs})
set(OPENCV_PERF_${the_module}_CPU_SOURCES ${perf_srcs} ${perf_hdrs})
endif()
add_executable(${the_target} ${OPENCV_PERF_${the_module}_CPU_SOURCES})
target_link_libraries(${the_target} ${OPENCV_MODULE_${the_module}_DEPS} ${perf_deps} ${OPENCV_LINKER_LIBS})
# Additional target properties
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
RUNTIME_OUTPUT_DIRECTORY "${EXECUTABLE_OUTPUT_PATH}"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "tests performance")
endif()
ocv_add_precompiled_headers(${the_target})
else(OCV_DEPENDENCIES_FOUND)
#TODO: warn about unsatisfied dependencies
endif(OCV_DEPENDENCIES_FOUND)
endif()

View File

@ -204,7 +204,7 @@ gpu::CascadeClassifier_GPU
--------------------------
.. ocv:class:: gpu::CascadeClassifier_GPU
Cascade classifier class used for object detection. ::
Cascade classifier class used for object detection. Supports HAAR and LBP cascades. ::
class CV_EXPORTS CascadeClassifier_GPU
{
@ -219,6 +219,7 @@ Cascade classifier class used for object detection. ::
/* Returns number of detected objects */
int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size());
int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
/* Finds only the largest object. Special mode if training is required.*/
bool findLargestObject;
@ -233,11 +234,11 @@ Cascade classifier class used for object detection. ::
gpu::CascadeClassifier_GPU::CascadeClassifier_GPU
-----------------------------------------------------
Loads the classifier from a file.
Loads the classifier from a file. The cascade type is detected automatically from the constructor parameter.
.. ocv:function:: gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const string& filename)
:param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported.
:param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported for HAAR; only the new type of OpenCV XML cascade is supported for LBP.
@ -255,8 +256,7 @@ Loads the classifier from a file. The previous content is destroyed.
.. ocv:function:: bool gpu::CascadeClassifier_GPU::load(const string& filename)
:param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported.
:param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported for HAAR; only the new type of OpenCV XML cascade is supported for LBP.
gpu::CascadeClassifier_GPU::release
@ -273,13 +273,17 @@ Detects objects of different sizes in the input image.
.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size())
.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4)
:param image: Matrix of type ``CV_8U`` containing an image where objects should be detected.
:param objectsBuf: Buffer to store detected objects (rectangles). If it is empty, it is allocated with the default size. If not empty, the function searches for no more than N objects, where ``N = sizeof(objectsBuffer's data)/sizeof(cv::Rect)``.
:param scaleFactor: Value to specify how much the image size is reduced at each image scale.
:param maxObjectSize: Maximum possible object size. Objects larger than that are ignored. Used only by the second signature and supported only for LBP cascades.
:param minNeighbors: Value to specify how many neighbours each candidate rectangle has to retain.
:param scaleFactor: Parameter specifying how much the image size is reduced at each image scale.
:param minNeighbors: Parameter specifying how many neighbors each candidate rectangle should have to retain it.
:param minSize: Minimum possible object size. Objects smaller than that are ignored.
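A minimal usage sketch of the interface documented above (the cascade file name, the input image and the result handling are illustrative assumptions, not part of the documented API):
#include <opencv2/gpu/gpu.hpp>
#include <opencv2/highgui/highgui.hpp>
cv::Mat grayImage = cv::imread("image.png", 0);   // placeholder input, loaded as grayscale
cv::gpu::CascadeClassifier_GPU cascade("haarcascade_frontalface_alt.xml");
cv::gpu::GpuMat d_img(grayImage);
cv::gpu::GpuMat objectsBuf;
int n = cascade.detectMultiScale(d_img, objectsBuf, 1.2, 4);
cv::Mat objectsHost;
if (n > 0)
{
    objectsBuf.colRange(0, n).download(objectsHost);   // the first n entries hold cv::Rect results
    const cv::Rect* objects = objectsHost.ptr<cv::Rect>();
    // ... use objects[0..n-1] ...
}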

View File

@ -653,7 +653,7 @@ gpu::GMG_GPU
------------
.. ocv:class:: gpu::GMG_GPU
Class used for background/foreground segmentation. ::
Class used for background/foreground segmentation. ::
class GMG_GPU_GPU
{
@ -677,9 +677,9 @@ Class used for background/foreground segmentation. ::
...
};
The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements the algorithm described in [GMG2012]_.
The class discriminates between foreground and background pixels by building and maintaining a model of the background. Any pixel which does not fit this model is then deemed to be foreground. The class implements the algorithm described in [GMG2012]_.
Here are important members of the class that control the algorithm, which you can set after constructing the class instance:
Here are important members of the class that control the algorithm, which you can set after constructing the class instance:
.. ocv:member:: int maxFeatures
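A minimal per-frame usage sketch (the ``initialize``/``operator()`` calls and their signatures are assumptions taken from the 2.4 gpu module; they are not shown in this excerpt of the class listing):
cv::gpu::GMG_GPU gmg;
gmg.maxFeatures = 64;                  // member documented above
gmg.initialize(firstFrame.size());     // firstFrame: a host cv::Mat defining the video resolution
cv::gpu::GpuMat d_frame, d_fgmask;
cv::Mat frame;
for (;;)
{
    // ... grab the next host frame into 'frame' (e.g. from cv::VideoCapture) ...
    d_frame.upload(frame);
    gmg(d_frame, d_fgmask);            // per-pixel foreground mask computed on the GPU
}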

View File

@ -820,6 +820,7 @@ private:
int nLayers_;
};
//! HoughLines
CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
CV_EXPORTS void HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta);
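A hedged usage sketch for the first HoughLines overload above (the edge-map input and the (rho, theta) layout of the downloaded buffer are assumptions, not guaranteed by these declarations):
cv::gpu::GpuMat d_edges(edges);       // edges: binary CV_8UC1 edge map, e.g. produced by Canny
cv::gpu::GpuMat d_lines;
cv::gpu::HoughLines(d_edges, d_lines, 1.0f, static_cast<float>(CV_PI / 180.0), 100);
cv::Mat h_lines;
d_lines.download(h_lines);            // each detected line is assumed to be stored as a (rho, theta) pair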

View File

@ -1,255 +1,234 @@
import sys, re
spaces = '[\s]*'
symbols = '[\s\w\d,.=:|]*'
symbols = '[\s\w\d,.:|]*'
def pattern1(prefix, test):
return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + '\)' + spaces)
return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + '\)' + spaces)
def pattern2(prefix, test, cvtype):
return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + cvtype + symbols + '\)' + spaces)
def pattern2(prefix, test, param1):
return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + param1 + symbols + '\)' + spaces)
def pattern3(prefix, test, cvtype, param1):
return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + cvtype + symbols + param1 + symbols + '\)' + spaces)
def pattern3(prefix, test, param1, param2):
return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + param1 + symbols + param2 + symbols + '\)' + spaces)
def pattern4(prefix, test, cvtype, param1, param2):
return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + cvtype + symbols + param1 + symbols + param2 + symbols + '\)' + spaces)
def pattern4(prefix, test, param1, param2, param3):
return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + param1 + symbols + param2 + symbols + param3 + symbols + '\)' + spaces)
def pattern5(prefix, test, param1, param2, param3, param4):
return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + param1 + symbols + param2 + symbols + param3 + symbols + param4 + symbols + '\)' + spaces)
npp_patterns = [
##############################################################
# Core
# Core/Add_Mat (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'Add_Mat', '8U'),
pattern2('Core', 'Add_Mat', '16U'),
pattern2('Core', 'Add_Mat', '32F'),
# Core_AddMat (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'AddMat', '8U'),
pattern2('Core', 'AddMat', '16U'),
pattern2('Core', 'AddMat', '32F'),
# Core/Add_Scalar (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'Add_Scalar', '8U'),
pattern2('Core', 'Add_Scalar', '16U'),
pattern2('Core', 'Add_Scalar', '32F'),
# Core_AddScalar (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'AddScalar', '8U'),
pattern2('Core', 'AddScalar', '16U'),
pattern2('Core', 'AddScalar', '32F'),
# Core/Subtract_Mat (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'Subtract_Mat', '8U'),
pattern2('Core', 'Subtract_Mat', '16U'),
pattern2('Core', 'Subtract_Mat', '32F'),
# Core_SubtractMat (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'SubtractMat', '8U'),
pattern2('Core', 'SubtractMat', '16U'),
pattern2('Core', 'SubtractMat', '32F'),
# Core/Subtract_Scalar (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'Subtract_Scalar', '8U'),
pattern2('Core', 'Subtract_Scalar', '16U'),
pattern2('Core', 'Subtract_Scalar', '32F'),
# Core_SubtractScalar (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'SubtractScalar', '8U'),
pattern2('Core', 'SubtractScalar', '16U'),
pattern2('Core', 'SubtractScalar', '32F'),
# Core/Multiply_Mat (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'Multiply_Mat', '8U'),
pattern2('Core', 'Multiply_Mat', '16U'),
pattern2('Core', 'Multiply_Mat', '32F'),
# Core_MultiplyMat (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'MultiplyMat', '8U'),
pattern2('Core', 'MultiplyMat', '16U'),
pattern2('Core', 'MultiplyMat', '32F'),
# Core/Multiply_Scalar (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'Multiply_Scalar', '8U'),
pattern2('Core', 'Multiply_Scalar', '16U'),
pattern2('Core', 'Multiply_Scalar', '32F'),
# Core_MultiplyScalar (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'MultiplyScalar', '8U'),
pattern2('Core', 'MultiplyScalar', '16U'),
pattern2('Core', 'MultiplyScalar', '32F'),
# Core/Divide_Mat (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'Divide_Mat', '8U'),
pattern2('Core', 'Divide_Mat', '16U'),
pattern2('Core', 'Divide_Mat', '32F'),
# Core_DivideMat (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'DivideMat', '8U'),
pattern2('Core', 'DivideMat', '16U'),
pattern2('Core', 'DivideMat', '32F'),
# Core/Divide_Scalar (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'Divide_Scalar', '8U'),
pattern2('Core', 'Divide_Scalar', '16U'),
pattern2('Core', 'Divide_Scalar', '32F'),
# Core_Divide_Scalar (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'DivideScalar', '8U'),
pattern2('Core', 'DivideScalar', '16U'),
pattern2('Core', 'DivideScalar', '32F'),
# Core/AbsDiff_Mat (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'AbsDiff_Mat', '8U'),
pattern2('Core', 'AbsDiff_Mat', '16U'),
pattern2('Core', 'AbsDiff_Mat', '32F'),
# Core_AbsDiff_Mat (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'AbsDiffMat', '8U'),
pattern2('Core', 'AbsDiffMat', '16U'),
pattern2('Core', 'AbsDiffMat', '32F'),
# Core/AbsDiff_Scalar (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'AbsDiff_Scalar', '8U'),
pattern2('Core', 'AbsDiff_Scalar', '16U'),
pattern2('Core', 'AbsDiff_Scalar', '32F'),
# Core_AbsDiffScalar (CV_8U | CV_16U | CV_32F)
pattern2('Core', 'AbsDiffScalar', '8U'),
pattern2('Core', 'AbsDiffScalar', '16U'),
pattern2('Core', 'AbsDiffScalar', '32F'),
# Core/Abs
# Core_Abs
pattern1('Core', 'Abs'),
# Core/Sqr
# Core_Sqr
pattern1('Core', 'Sqr'),
# Core/Sqrt
# Core_Sqrt
pattern1('Core', 'Sqrt'),
# Core/Log
# Core_Log
pattern1('Core', 'Log'),
# Core/Exp
# Core_Exp
pattern1('Core', 'Exp'),
# Core/Bitwise_And_Scalar
pattern1('Core', 'Bitwise_And_Scalar'),
# Core_BitwiseAndScalar
pattern1('Core', 'BitwiseAndScalar'),
# Core/Bitwise_Or_Scalar
pattern1('Core', 'Bitwise_Or_Scalar'),
# Core_BitwiseOrScalar
pattern1('Core', 'BitwiseOrScalar'),
# Core/Bitwise_Xor_Scalar
pattern1('Core', 'Bitwise_Xor_Scalar'),
# Core_BitwiseXorScalar
pattern1('Core', 'BitwiseXorScalar'),
# Core/RShift
# Core_RShift
pattern1('Core', 'RShift'),
# Core/LShift
# Core_LShift
pattern1('Core', 'LShift'),
# Core/Transpose
# Core_Transpose
pattern1('Core', 'Transpose'),
# Core/Flip
# Core_Flip
pattern1('Core', 'Flip'),
# Core/LUT_OneChannel
pattern1('Core', 'LUT_OneChannel'),
# Core_LutOneChannel
pattern1('Core', 'LutOneChannel'),
# Core/LUT_MultiChannel
pattern1('Core', 'LUT_MultiChannel'),
# Core_LutMultiChannel
pattern1('Core', 'LutMultiChannel'),
# Core/Magnitude_Complex
pattern1('Core', 'Magnitude_Complex'),
# Core_MagnitudeComplex
pattern1('Core', 'MagnitudeComplex'),
# Core/Magnitude_Sqr_Complex
pattern1('Core', 'Magnitude_Sqr_Complex'),
# Core_MagnitudeSqrComplex
pattern1('Core', 'MagnitudeSqrComplex'),
# Core/MeanStdDev
# Core_MeanStdDev
pattern1('Core', 'MeanStdDev'),
# Core/NormDiff
# Core_NormDiff
pattern1('Core', 'NormDiff'),
##############################################################
# Filters
# Filters/Blur
# Filters_Blur
pattern1('Filters', 'Blur'),
# Filters/Erode
# Filters_Erode
pattern1('Filters', 'Erode'),
# Filters/Dilate
# Filters_Dilate
pattern1('Filters', 'Dilate'),
# Filters/MorphologyEx
# Filters_MorphologyEx
pattern1('Filters', 'MorphologyEx'),
##############################################################
# ImgProc
# ImgProc/Resize (8UC1 | 8UC4, INTER_NEAREST | INTER_LINEAR)
pattern3('ImgProc', 'Resize', '8UC1', 'INTER_NEAREST'),
pattern3('ImgProc', 'Resize', '8UC4', 'INTER_NEAREST'),
pattern3('ImgProc', 'Resize', '8UC1', 'INTER_LINEAR'),
pattern3('ImgProc', 'Resize', '8UC4', 'INTER_LINEAR'),
# ImgProc_Resize (8U, 1 | 4, INTER_NEAREST | INTER_LINEAR)
pattern4('ImgProc', 'Resize', '8U', '1', 'INTER_NEAREST'),
pattern4('ImgProc', 'Resize', '8U', '4', 'INTER_NEAREST'),
pattern4('ImgProc', 'Resize', '8U', '1', 'INTER_LINEAR'),
pattern4('ImgProc', 'Resize', '8U', '4', 'INTER_LINEAR'),
# ImgProc/Resize (8UC4, INTER_CUBIC)
pattern3('ImgProc', 'Resize', '8UC4', 'INTER_CUBIC'),
# ImgProc_Resize (8U, 4, INTER_CUBIC)
pattern4('ImgProc', 'Resize', '8U', '4', 'INTER_CUBIC'),
# ImgProc/WarpAffine (8UC1 | 8UC3 | 8UC4 | 32FC1 | 32FC3 | 32FC4, INTER_NEAREST | INTER_LINEAR | INTER_CUBIC, BORDER_CONSTANT)
pattern4('ImgProc', 'WarpAffine', '8UC1', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '8UC1', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '8UC1', 'INTER_CUBIC', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '8UC3', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '8UC3', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '8UC3', 'INTER_CUBIC', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '8UC4', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '8UC4', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '8UC4', 'INTER_CUBIC', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '32FC1', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '32FC1', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '32FC1', 'INTER_CUBIC', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '32FC3', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '32FC3', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '32FC3', 'INTER_CUBIC', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '32FC4', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '32FC4', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '32FC4', 'INTER_CUBIC', 'BORDER_CONSTANT'),
# ImgProc_WarpAffine (8U | 32F, INTER_NEAREST | INTER_LINEAR | INTER_CUBIC, BORDER_CONSTANT)
pattern4('ImgProc', 'WarpAffine', '8U' , 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '8U' , 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '8U' , 'INTER_CUBIC', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '32F', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '32F', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpAffine', '32F', 'INTER_CUBIC', 'BORDER_CONSTANT'),
# ImgProc/WarpPerspective (8UC1 | 8UC3 | 8UC4 | 32FC1 | 32FC3 | 32FC4, INTER_NEAREST | INTER_LINEAR | INTER_CUBIC, BORDER_CONSTANT)
pattern4('ImgProc', 'WarpPerspective', '8UC1', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '8UC1', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '8UC1', 'INTER_CUBIC', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '8UC3', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '8UC3', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '8UC3', 'INTER_CUBIC', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '8UC4', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '8UC4', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '8UC4', 'INTER_CUBIC', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '32FC1', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '32FC1', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '32FC1', 'INTER_CUBIC', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '32FC3', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '32FC3', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '32FC3', 'INTER_CUBIC', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '32FC4', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '32FC4', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '32FC4', 'INTER_CUBIC', 'BORDER_CONSTANT'),
# ImgProc_WarpPerspective (8U | 32F, INTER_NEAREST | INTER_LINEAR | INTER_CUBIC, BORDER_CONSTANT)
pattern4('ImgProc', 'WarpPerspective', '8U' , 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '8U' , 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '8U' , 'INTER_CUBIC', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '32F', 'INTER_NEAREST', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '32F', 'INTER_LINEAR', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'WarpPerspective', '32F', 'INTER_CUBIC', 'BORDER_CONSTANT'),
# ImgProc/CopyMakeBorder (8UC1 | 8UC4 | 32SC1 | 32FC1, BORDER_CONSTANT)
pattern3('ImgProc', 'CopyMakeBorder', '8UC1', 'BORDER_CONSTANT'),
pattern3('ImgProc', 'CopyMakeBorder', '8UC4', 'BORDER_CONSTANT'),
pattern3('ImgProc', 'CopyMakeBorder', '32SC1', 'BORDER_CONSTANT'),
pattern3('ImgProc', 'CopyMakeBorder', '32FC1', 'BORDER_CONSTANT'),
# ImgProc_CopyMakeBorder (8UC1 | 8UC4 | 32SC1 | 32FC1, BORDER_CONSTANT)
pattern4('ImgProc', 'CopyMakeBorder', '8U' , '1', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'CopyMakeBorder', '8U' , '4', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'CopyMakeBorder', '32S', '1', 'BORDER_CONSTANT'),
pattern4('ImgProc', 'CopyMakeBorder', '32F', '1', 'BORDER_CONSTANT'),
# ImgProc/Threshold (32F, THRESH_TRUNC)
# ImgProc_Threshold (32F, THRESH_TRUNC)
pattern3('ImgProc', 'Threshold', '32F', 'THRESH_TRUNC'),
# ImgProc/Integral_Sqr
pattern1('ImgProc', 'Integral_Sqr'),
# ImgProc_IntegralSqr
pattern1('ImgProc', 'IntegralSqr'),
# ImgProc/HistEven_OneChannel
pattern1('ImgProc', 'HistEven_OneChannel'),
# ImgProc_HistEven_OneChannel
pattern1('ImgProc', 'HistEvenOneChannel'),
# ImgProc/HistEven_FourChannel
pattern1('ImgProc', 'HistEven_FourChannel'),
# ImgProc_HistEven_FourChannel
pattern1('ImgProc', 'HistEvenFourChannel'),
# ImgProc/Rotate
# ImgProc_Rotate
pattern1('ImgProc', 'Rotate'),
# ImgProc/SwapChannels
# ImgProc_SwapChannels
pattern1('ImgProc', 'SwapChannels'),
# ImgProc/AlphaComp
# ImgProc_AlphaComp
pattern1('ImgProc', 'AlphaComp'),
# ImgProc/ImagePyramid_build
pattern1('ImgProc', 'ImagePyramid_build'),
# ImgProc_ImagePyramidBuild
pattern1('ImgProc', 'ImagePyramidBuild'),
# ImgProc/ImagePyramid_getLayer
pattern1('ImgProc', 'ImagePyramid_getLayer'),
# ImgProc_ImagePyramid_getLayer
pattern1('ImgProc', 'ImagePyramidGetLayer'),
##############################################################
# MatOp
# MatOp/SetTo (8UC4 | 16UC1 | 16UC4 | 32FC1 | 32FC4)
pattern2('MatOp', 'SetTo', '8UC4'),
pattern2('MatOp', 'SetTo', '16UC1'),
pattern2('MatOp', 'SetTo', '16UC4'),
pattern2('MatOp', 'SetTo', '32FC1'),
pattern2('MatOp', 'SetTo', '32FC4'),
# MatOp_SetTo (8UC4 | 16UC1 | 16UC4 | 32FC1 | 32FC4)
pattern3('MatOp', 'SetTo', '8U' , '4'),
pattern3('MatOp', 'SetTo', '16U', '1'),
pattern3('MatOp', 'SetTo', '16U', '4'),
pattern3('MatOp', 'SetTo', '32F', '1'),
pattern3('MatOp', 'SetTo', '32F', '4'),
# MatOp/SetToMasked (8UC4 | 16UC1 | 16UC4 | 32FC1 | 32FC4)
pattern2('MatOp', 'SetToMasked', '8UC4'),
pattern2('MatOp', 'SetToMasked', '16UC1'),
pattern2('MatOp', 'SetToMasked', '16UC4'),
pattern2('MatOp', 'SetToMasked', '32FC1'),
pattern2('MatOp', 'SetToMasked', '32FC4'),
# MatOp_SetToMasked (8UC4 | 16UC1 | 16UC4 | 32FC1 | 32FC4)
pattern3('MatOp', 'SetToMasked', '8U' , '4'),
pattern3('MatOp', 'SetToMasked', '16U', '1'),
pattern3('MatOp', 'SetToMasked', '16U', '4'),
pattern3('MatOp', 'SetToMasked', '32F', '1'),
pattern3('MatOp', 'SetToMasked', '32F', '4'),
# MatOp/CopyToMasked (8UC1 | 8UC3 |8UC4 | 16UC1 | 16UC3 | 16UC4 | 32FC1 | 32FC3 | 32FC4)
pattern2('MatOp', 'CopyToMasked', '8UC1'),
pattern2('MatOp', 'CopyToMasked', '8UC3'),
pattern2('MatOp', 'CopyToMasked', '8UC4'),
pattern2('MatOp', 'CopyToMasked', '16UC1'),
pattern2('MatOp', 'CopyToMasked', '16UC3'),
pattern2('MatOp', 'CopyToMasked', '16UC4'),
pattern2('MatOp', 'CopyToMasked', '32FC1'),
pattern2('MatOp', 'CopyToMasked', '32FC3'),
pattern2('MatOp', 'CopyToMasked', '32FC4'),
# MatOp_CopyToMasked (8UC1 | 8UC3 |8UC4 | 16UC1 | 16UC3 | 16UC4 | 32FC1 | 32FC3 | 32FC4)
pattern3('MatOp', 'CopyToMasked', '8U' , '1'),
pattern3('MatOp', 'CopyToMasked', '8U' , '3'),
pattern3('MatOp', 'CopyToMasked', '8U' , '4'),
pattern3('MatOp', 'CopyToMasked', '16U', '1'),
pattern3('MatOp', 'CopyToMasked', '16U', '3'),
pattern3('MatOp', 'CopyToMasked', '16U', '4'),
pattern3('MatOp', 'CopyToMasked', '32F', '1'),
pattern3('MatOp', 'CopyToMasked', '32F', '3'),
pattern3('MatOp', 'CopyToMasked', '32F', '4'),
]
cublasPattern = pattern1('Core', 'GEMM')

modules/gpu/perf/main.cpp Normal file
View File

@ -0,0 +1,125 @@
#include "perf_precomp.hpp"
using namespace std;
using namespace cv;
using namespace cv::gpu;
using namespace cvtest;
using namespace testing;
void printOsInfo()
{
#if defined _WIN32
# if defined _WIN64
cout << "OS: Windows x64 \n" << endl;
# else
cout << "OS: Windows x32 \n" << endl;
# endif
#elif defined linux
# if defined _LP64
cout << "OS: Linux x64 \n" << endl;
# else
cout << "OS: Linux x32 \n" << endl;
# endif
#elif defined __APPLE__
# if defined _LP64
cout << "OS: Apple x64 \n" << endl;
# else
cout << "OS: Apple x32 \n" << endl;
# endif
#endif
}
void printCudaInfo()
{
#ifndef HAVE_CUDA
cout << "OpenCV was built without CUDA support \n" << endl;
#else
int driver;
cudaDriverGetVersion(&driver);
cout << "CUDA Driver version: " << driver << '\n';
cout << "CUDA Runtime version: " << CUDART_VERSION << '\n';
cout << endl;
cout << "GPU module was compiled for the following GPU archs:" << endl;
cout << " BIN: " << CUDA_ARCH_BIN << '\n';
cout << " PTX: " << CUDA_ARCH_PTX << '\n';
cout << endl;
int deviceCount = getCudaEnabledDeviceCount();
cout << "CUDA device count: " << deviceCount << '\n';
cout << endl;
for (int i = 0; i < deviceCount; ++i)
{
DeviceInfo info(i);
cout << "Device [" << i << "] \n";
cout << "\t Name: " << info.name() << '\n';
cout << "\t Compute capability: " << info.majorVersion() << '.' << info.minorVersion()<< '\n';
cout << "\t Multi Processor Count: " << info.multiProcessorCount() << '\n';
cout << "\t Total memory: " << static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0) << " Mb \n";
cout << "\t Free memory: " << static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0) << " Mb \n";
if (!info.isCompatible())
cout << "\t !!! This device is NOT compatible with current GPU module build \n";
cout << endl;
}
#endif
}
int main(int argc, char** argv)
{
CommandLineParser cmd(argc, (const char**) argv,
"{ print_info_only | print_info_only | false | Print information about system and exit }"
"{ device | device | 0 | Device on which tests will be executed }"
"{ cpu | cpu | false | Run tests on cpu }"
);
printOsInfo();
printCudaInfo();
if (cmd.get<bool>("print_info_only"))
return 0;
int device = cmd.get<int>("device");
bool cpu = cmd.get<bool>("cpu");
#ifndef HAVE_CUDA
cpu = true;
#endif
if (cpu)
{
runOnGpu = false;
cout << "Run tests on CPU \n" << endl;
}
else
{
runOnGpu = true;
if (device < 0 || device >= getCudaEnabledDeviceCount())
{
cerr << "Incorrect device index - " << device << endl;
return -1;
}
DeviceInfo info(device);
if (!info.isCompatible())
{
cerr << "Device " << device << " [" << info.name() << "] is NOT compatible with current GPU module build" << endl;
return -1;
}
setDevice(device);
cout << "Run tests on device " << device << " [" << info.name() << "] \n" << endl;
}
InitGoogleTest(&argc, argv);
perf::TestBase::Init(argc, argv);
return RUN_ALL_TESTS();
}
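For reference, a hypothetical invocation of the resulting binary (the executable name and the exact --key=value syntax are assumptions; they depend on the build and on how this version of CommandLineParser parses arguments):
// Hypothetical command lines, matching the parser keys defined above:
//   opencv_perf_gpu --device=1              run the GPU code paths on CUDA device 1
//   opencv_perf_gpu --cpu=true              run the CPU reference code paths instead
//   opencv_perf_gpu --print_info_only=true  print OS/CUDA information and exit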

View File

@ -1,219 +1,263 @@
#include "perf_precomp.hpp"
#ifdef HAVE_CUDA
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// StereoBM
GPU_PERF_TEST_1(StereoBM, cv::gpu::DeviceInfo)
typedef pair<string, string> pair_string;
DEF_PARAM_TEST_1(ImagePair, pair_string);
PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(make_pair<string, string>("gpu/perf/aloe.jpg", "gpu/perf/aloeR.jpg")))
{
cv::gpu::DeviceInfo devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat img_l_host = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_l_host.empty());
cv::Mat img_r_host = readImage("gpu/perf/aloeR.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_r_host.empty());
cv::gpu::StereoBM_GPU bm(0, 256);
cv::gpu::GpuMat img_l(img_l_host);
cv::gpu::GpuMat img_r(img_r_host);
cv::gpu::GpuMat dst;
bm(img_l, img_r, dst);
declare.time(5.0);
const cv::Mat imgLeft = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgRight.empty());
const int preset = 0;
const int ndisp = 256;
if (runOnGpu)
{
cv::gpu::StereoBM_GPU d_bm(preset, ndisp);
cv::gpu::GpuMat d_imgLeft(imgLeft);
cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat d_dst;
d_bm(d_imgLeft, d_imgRight, d_dst);
TEST_CYCLE()
{
bm(img_l, img_r, dst);
d_bm(d_imgLeft, d_imgRight, d_dst);
}
}
else
{
cv::StereoBM bm(preset, ndisp);
cv::Mat dst;
bm(imgLeft, imgRight, dst);
TEST_CYCLE()
{
bm(imgLeft, imgRight, dst);
}
}
}
INSTANTIATE_TEST_CASE_P(Calib3D, StereoBM, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////
// StereoBeliefPropagation
GPU_PERF_TEST_1(StereoBeliefPropagation, cv::gpu::DeviceInfo)
PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, Values(make_pair<string, string>("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png")))
{
cv::gpu::DeviceInfo devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat img_l_host = readImage("gpu/stereobp/aloe-L.png");
ASSERT_FALSE(img_l_host.empty());
cv::Mat img_r_host = readImage("gpu/stereobp/aloe-R.png");
ASSERT_FALSE(img_r_host.empty());
cv::gpu::StereoBeliefPropagation bp(64);
cv::gpu::GpuMat img_l(img_l_host);
cv::gpu::GpuMat img_r(img_r_host);
cv::gpu::GpuMat dst;
bp(img_l, img_r, dst);
declare.time(10.0);
const cv::Mat imgLeft = readImage(GetParam().first);
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GetParam().second);
ASSERT_FALSE(imgRight.empty());
const int ndisp = 64;
if (runOnGpu)
{
cv::gpu::StereoBeliefPropagation d_bp(ndisp);
cv::gpu::GpuMat d_imgLeft(imgLeft);
cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat d_dst;
d_bp(d_imgLeft, d_imgRight, d_dst);
TEST_CYCLE()
{
bp(img_l, img_r, dst);
d_bp(d_imgLeft, d_imgRight, d_dst);
}
}
else
{
FAIL();
}
}
INSTANTIATE_TEST_CASE_P(Calib3D, StereoBeliefPropagation, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////
// StereoConstantSpaceBP
GPU_PERF_TEST_1(StereoConstantSpaceBP, cv::gpu::DeviceInfo)
PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP, Values(make_pair<string, string>("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png")))
{
cv::gpu::DeviceInfo devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat img_l_host = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_l_host.empty());
cv::Mat img_r_host = readImage("gpu/stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_r_host.empty());
cv::gpu::StereoConstantSpaceBP csbp(128);
cv::gpu::GpuMat img_l(img_l_host);
cv::gpu::GpuMat img_r(img_r_host);
cv::gpu::GpuMat dst;
csbp(img_l, img_r, dst);
declare.time(10.0);
const cv::Mat imgLeft = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgRight.empty());
const int ndisp = 128;
if (runOnGpu)
{
cv::gpu::StereoConstantSpaceBP d_csbp(ndisp);
cv::gpu::GpuMat d_imgLeft(imgLeft);
cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat d_dst;
d_csbp(d_imgLeft, d_imgRight, d_dst);
TEST_CYCLE()
{
csbp(img_l, img_r, dst);
d_csbp(d_imgLeft, d_imgRight, d_dst);
}
}
else
{
FAIL();
}
}
INSTANTIATE_TEST_CASE_P(Calib3D, StereoConstantSpaceBP, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////
// DisparityBilateralFilter
GPU_PERF_TEST_1(DisparityBilateralFilter, cv::gpu::DeviceInfo)
PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter, Values(make_pair<string, string>("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-disp.png")))
{
cv::gpu::DeviceInfo devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
cv::Mat img_host = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_host.empty());
const cv::Mat disp = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(disp.empty());
cv::Mat disp_host = readImage("gpu/stereobm/aloe-disp.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(disp_host.empty());
const int ndisp = 128;
cv::gpu::DisparityBilateralFilter f(128);
cv::gpu::GpuMat img(img_host);
cv::gpu::GpuMat disp(disp_host);
cv::gpu::GpuMat dst;
if (runOnGpu)
{
cv::gpu::DisparityBilateralFilter d_filter(ndisp);
f(disp, img, dst);
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_disp(disp);
cv::gpu::GpuMat d_dst;
d_filter(d_disp, d_img, d_dst);
TEST_CYCLE()
{
f(disp, img, dst);
d_filter(d_disp, d_img, d_dst);
}
}
else
{
FAIL();
}
}
INSTANTIATE_TEST_CASE_P(Calib3D, DisparityBilateralFilter, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////
// TransformPoints
IMPLEMENT_PARAM_CLASS(Count, int)
DEF_PARAM_TEST_1(Count, int);
GPU_PERF_TEST(TransformPoints, cv::gpu::DeviceInfo, Count)
PERF_TEST_P(Count, Calib3D_TransformPoints, Values(5000, 10000, 20000))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
const int count = GetParam();
int count = GET_PARAM(1);
cv::Mat src(1, count, CV_32FC3);
fillRandom(src, -100, 100);
cv::Mat src_host(1, count, CV_32FC3);
fill(src_host, -100, 100);
const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
cv::gpu::GpuMat src(src_host);
cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
cv::gpu::GpuMat dst;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::transformPoints(src, rvec, tvec, dst);
cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);
TEST_CYCLE()
{
cv::gpu::transformPoints(src, rvec, tvec, dst);
cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);
}
}
else
{
FAIL();
}
}
INSTANTIATE_TEST_CASE_P(Calib3D, TransformPoints, testing::Combine(
ALL_DEVICES,
testing::Values<Count>(5000, 10000, 20000)));
//////////////////////////////////////////////////////////////////////
// ProjectPoints
GPU_PERF_TEST(ProjectPoints, cv::gpu::DeviceInfo, Count)
PERF_TEST_P(Count, Calib3D_ProjectPoints, Values(5000, 10000, 20000))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
const int count = GetParam();
int count = GET_PARAM(1);
cv::Mat src(1, count, CV_32FC3);
fillRandom(src, -100, 100);
cv::Mat src_host(1, count, CV_32FC3);
fill(src_host, -100, 100);
const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);
cv::gpu::GpuMat src(src_host);
cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);
cv::gpu::GpuMat dst;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::projectPoints(src, rvec, tvec, camera_mat, cv::Mat(), dst);
cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);
TEST_CYCLE()
{
cv::gpu::projectPoints(src, rvec, tvec, camera_mat, cv::Mat(), dst);
cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);
}
}
else
{
cv::Mat dst;
cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
TEST_CYCLE()
{
cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
}
}
}
INSTANTIATE_TEST_CASE_P(Calib3D, ProjectPoints, testing::Combine(
ALL_DEVICES,
testing::Values<Count>(5000, 10000, 20000)));
//////////////////////////////////////////////////////////////////////
// SolvePnPRansac
GPU_PERF_TEST(SolvePnPRansac, cv::gpu::DeviceInfo, Count)
PERF_TEST_P(Count, Calib3D_SolvePnPRansac, Values(5000, 10000, 20000))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
declare.time(10.0);
int count = GET_PARAM(1);
const int count = GetParam();
cv::Mat object(1, count, CV_32FC3);
fill(object, -100, 100);
fillRandom(object, -100, 100);
cv::Mat camera_mat(3, 3, CV_32FC1);
fill(camera_mat, 0.5, 1);
fillRandom(camera_mat, 0.5, 1);
camera_mat.at<float>(0, 1) = 0.f;
camera_mat.at<float>(1, 0) = 0.f;
camera_mat.at<float>(2, 0) = 0.f;
camera_mat.at<float>(2, 1) = 0.f;
cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
const cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
std::vector<cv::Point2f> image_vec;
cv::Mat rvec_gold(1, 3, CV_32FC1);
fill(rvec_gold, 0, 1);
fillRandom(rvec_gold, 0, 1);
cv::Mat tvec_gold(1, 3, CV_32FC1);
fill(tvec_gold, 0, 1);
fillRandom(tvec_gold, 0, 1);
cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, dist_coef, image_vec);
cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
@ -221,82 +265,92 @@ GPU_PERF_TEST(SolvePnPRansac, cv::gpu::DeviceInfo, Count)
cv::Mat rvec;
cv::Mat tvec;
if (runOnGpu)
{
cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
declare.time(3.0);
TEST_CYCLE()
{
cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
}
}
}
else
{
cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
INSTANTIATE_TEST_CASE_P(Calib3D, SolvePnPRansac, testing::Combine(
ALL_DEVICES,
testing::Values<Count>(5000, 10000, 20000)));
TEST_CYCLE()
{
cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
}
}
}
//////////////////////////////////////////////////////////////////////
// ReprojectImageTo3D
GPU_PERF_TEST(ReprojectImageTo3D, cv::gpu::DeviceInfo, cv::Size, MatDepth)
PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
cv::Size size = GET_PARAM(1);
int depth = GET_PARAM(2);
cv::Mat src_host(size, depth);
fill(src_host, 5.0, 30.0);
cv::Mat src(size, depth);
fillRandom(src, 5.0, 30.0);
cv::Mat Q(4, 4, CV_32FC1);
fill(Q, 0.1, 1.0);
fillRandom(Q, 0.1, 1.0);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::reprojectImageTo3D(src, dst, Q);
cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);
TEST_CYCLE()
{
cv::gpu::reprojectImageTo3D(src, dst, Q);
cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);
}
}
else
{
cv::Mat dst;
cv::reprojectImageTo3D(src, dst, Q);
TEST_CYCLE()
{
cv::reprojectImageTo3D(src, dst, Q);
}
}
}
INSTANTIATE_TEST_CASE_P(Calib3D, ReprojectImageTo3D, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values<MatDepth>(CV_8U, CV_16S)));
//////////////////////////////////////////////////////////////////////
// DrawColorDisp
GPU_PERF_TEST(DrawColorDisp, cv::gpu::DeviceInfo, cv::Size, MatDepth)
PERF_TEST_P(Sz_Depth, Calib3D_DrawColorDisp, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src, 0, 255);
cv::Mat src_host(size, type);
fill(src_host, 0, 255);
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
cv::gpu::drawColorDisp(src, dst, 255);
cv::gpu::drawColorDisp(d_src, d_dst, 255);
TEST_CYCLE()
{
cv::gpu::drawColorDisp(src, dst, 255);
cv::gpu::drawColorDisp(d_src, d_dst, 255);
}
}
else
{
FAIL();
}
}
INSTANTIATE_TEST_CASE_P(Calib3D, DrawColorDisp, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_16S))));
#endif
} // namespace

File diff suppressed because it is too large

View File

@ -1,209 +1,278 @@
#include "perf_precomp.hpp"
#ifdef HAVE_CUDA
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// SURF
GPU_PERF_TEST_1(SURF, cv::gpu::DeviceInfo)
DEF_PARAM_TEST_1(Image, string);
PERF_TEST_P(Image, Features2D_SURF, Values<string>("gpu/perf/aloe.jpg"))
{
cv::gpu::DeviceInfo devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
declare.time(50.0);
cv::Mat img_host = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_host.empty());
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
cv::gpu::SURF_GPU surf;
if (runOnGpu)
{
cv::gpu::SURF_GPU d_surf;
cv::gpu::GpuMat img(img_host);
cv::gpu::GpuMat keypoints, descriptors;
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints, d_descriptors;
surf(img, cv::gpu::GpuMat(), keypoints, descriptors);
declare.time(2.0);
d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
TEST_CYCLE()
{
surf(img, cv::gpu::GpuMat(), keypoints, descriptors);
d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
}
}
else
{
cv::SURF surf;
std::vector<cv::KeyPoint> keypoints;
cv::Mat descriptors;
surf(img, cv::noArray(), keypoints, descriptors);
TEST_CYCLE()
{
keypoints.clear();
surf(img, cv::noArray(), keypoints, descriptors);
}
}
}
INSTANTIATE_TEST_CASE_P(Features2D, SURF, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////
// FAST
GPU_PERF_TEST_1(FAST, cv::gpu::DeviceInfo)
PERF_TEST_P(Image, Features2D_FAST, Values<string>("gpu/perf/aloe.jpg"))
{
cv::gpu::DeviceInfo devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
cv::Mat img_host = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_host.empty());
if (runOnGpu)
{
cv::gpu::FAST_GPU d_fast(20);
cv::gpu::FAST_GPU fast(20);
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints;
cv::gpu::GpuMat img(img_host);
cv::gpu::GpuMat keypoints;
fast(img, cv::gpu::GpuMat(), keypoints);
d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
TEST_CYCLE()
{
fast(img, cv::gpu::GpuMat(), keypoints);
d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
}
}
else
{
std::vector<cv::KeyPoint> keypoints;
cv::FAST(img, keypoints, 20);
TEST_CYCLE()
{
keypoints.clear();
cv::FAST(img, keypoints, 20);
}
}
}
INSTANTIATE_TEST_CASE_P(Features2D, FAST, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////
// ORB
GPU_PERF_TEST_1(ORB, cv::gpu::DeviceInfo)
PERF_TEST_P(Image, Features2D_ORB, Values<string>("gpu/perf/aloe.jpg"))
{
cv::gpu::DeviceInfo devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
cv::Mat img_host = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_host.empty());
if (runOnGpu)
{
cv::gpu::ORB_GPU d_orb(4000);
cv::gpu::ORB_GPU orb(4000);
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints, d_descriptors;
cv::gpu::GpuMat img(img_host);
cv::gpu::GpuMat keypoints, descriptors;
d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
TEST_CYCLE()
{
orb(img, cv::gpu::GpuMat(), keypoints, descriptors);
d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
}
}
else
{
cv::ORB orb(4000);
std::vector<cv::KeyPoint> keypoints;
cv::Mat descriptors;
orb(img, cv::noArray(), keypoints, descriptors);
TEST_CYCLE()
{
keypoints.clear();
orb(img, cv::noArray(), keypoints, descriptors);
}
}
}
INSTANTIATE_TEST_CASE_P(Features2D, ORB, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////
// BFMatch
DEF_PARAM_TEST(DescSize_Norm, int, NormType);
PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
{
declare.time(20.0);
int desc_size = GET_PARAM(0);
int normType = GET_PARAM(1);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
fillRandom(query);
cv::Mat train(3000, desc_size, type);
fillRandom(train);
if (runOnGpu)
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
cv::gpu::GpuMat d_query(query);
cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_distance;
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
TEST_CYCLE()
{
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
}
}
else
{
cv::BFMatcher matcher(normType);
std::vector<cv::DMatch> matches;
matcher.match(query, train, matches);
TEST_CYCLE()
{
matcher.match(query, train, matches);
}
}
}
//////////////////////////////////////////////////////////////////////
// BruteForceMatcher_match
// BFKnnMatch
IMPLEMENT_PARAM_CLASS(DescriptorSize, int)
DEF_PARAM_TEST(DescSize_K_Norm, int, int, NormType);
GPU_PERF_TEST(BruteForceMatcher_match, cv::gpu::DeviceInfo, DescriptorSize, NormType)
PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
Values(64, 128, 256),
Values(2, 3),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
declare.time(30.0);
int desc_size = GET_PARAM(1);
int desc_size = GET_PARAM(0);
int k = GET_PARAM(1);
int normType = GET_PARAM(2);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query_host(3000, desc_size, type);
fill(query_host, 0.0, 10.0);
cv::Mat query(3000, desc_size, type);
fillRandom(query);
cv::Mat train_host(3000, desc_size, type);
fill(train_host, 0.0, 10.0);
cv::Mat train(3000, desc_size, type);
fillRandom(train);
cv::gpu::BFMatcher_GPU matcher(normType);
if (runOnGpu)
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
cv::gpu::GpuMat query(query_host);
cv::gpu::GpuMat train(train_host);
cv::gpu::GpuMat trainIdx, distance;
cv::gpu::GpuMat d_query(query);
cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_distance, d_allDist;
matcher.matchSingle(query, train, trainIdx, distance);
declare.time(3.0);
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
TEST_CYCLE()
{
matcher.matchSingle(query, train, trainIdx, distance);
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
}
}
else
{
cv::BFMatcher matcher(normType);
std::vector< std::vector<cv::DMatch> > matches;
matcher.knnMatch(query, train, matches, k);
TEST_CYCLE()
{
matcher.knnMatch(query, train, matches, k);
}
}
}
INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_match, testing::Combine(
ALL_DEVICES,
testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
//////////////////////////////////////////////////////////////////////
// BruteForceMatcher_knnMatch
// BFRadiusMatch
IMPLEMENT_PARAM_CLASS(K, int)
GPU_PERF_TEST(BruteForceMatcher_knnMatch, cv::gpu::DeviceInfo, DescriptorSize, K, NormType)
PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
declare.time(30.0);
int desc_size = GET_PARAM(1);
int k = GET_PARAM(2);
int normType = GET_PARAM(3);
int desc_size = GET_PARAM(0);
int normType = GET_PARAM(1);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query_host(3000, desc_size, type);
fill(query_host, 0.0, 10.0);
cv::Mat query(3000, desc_size, type);
fillRandom(query, 0.0, 1.0);
cv::Mat train_host(3000, desc_size, type);
fill(train_host, 0.0, 10.0);
cv::Mat train(3000, desc_size, type);
fillRandom(train, 0.0, 1.0);
cv::gpu::BFMatcher_GPU matcher(normType);
if (runOnGpu)
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
cv::gpu::GpuMat query(query_host);
cv::gpu::GpuMat train(train_host);
cv::gpu::GpuMat trainIdx, distance, allDist;
cv::gpu::GpuMat d_query(query);
cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_nMatches, d_distance;
matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, k);
declare.time(3.0);
d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0);
TEST_CYCLE()
{
matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, k);
d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0);
}
}
}
else
{
cv::BFMatcher matcher(normType);
INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_knnMatch, testing::Combine(
ALL_DEVICES,
testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
testing::Values(K(2), K(3)),
testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
std::vector< std::vector<cv::DMatch> > matches;
//////////////////////////////////////////////////////////////////////
// BruteForceMatcher_radiusMatch
GPU_PERF_TEST(BruteForceMatcher_radiusMatch, cv::gpu::DeviceInfo, DescriptorSize, NormType)
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
int desc_size = GET_PARAM(1);
int normType = GET_PARAM(2);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query_host(3000, desc_size, type);
fill(query_host, 0.0, 1.0);
cv::Mat train_host(3000, desc_size, type);
fill(train_host, 0.0, 1.0);
cv::gpu::BFMatcher_GPU matcher(normType);
cv::gpu::GpuMat query(query_host);
cv::gpu::GpuMat train(train_host);
cv::gpu::GpuMat trainIdx, nMatches, distance;
matcher.radiusMatchSingle(query, train, trainIdx, distance, nMatches, 2.0);
declare.time(3.0);
matcher.radiusMatch(query, train, matches, 2.0);
TEST_CYCLE()
{
matcher.radiusMatchSingle(query, train, trainIdx, distance, nMatches, 2.0);
matcher.radiusMatch(query, train, matches, 2.0);
}
}
}
INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_radiusMatch, testing::Combine(
ALL_DEVICES,
testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
#endif
} // namespace

View File

@ -1,308 +1,379 @@
#include "perf_precomp.hpp"
#ifdef HAVE_CUDA
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// Blur
IMPLEMENT_PARAM_CLASS(KernelSize, int)
DEF_PARAM_TEST(Sz_Type_KernelSz, cv::Size, MatType, int);
GPU_PERF_TEST(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), Values(3, 5, 7)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
declare.time(20.0);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int ksize = GET_PARAM(3);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src_host(size, type);
fill(src_host, 0.0, 255.0);
cv::Mat src(size, type);
fillRandom(src);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::blur(src, dst, cv::Size(ksize, ksize));
cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize));
TEST_CYCLE()
{
cv::gpu::blur(src, dst, cv::Size(ksize, ksize));
cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize));
}
}
else
{
cv::Mat dst;
cv::blur(src, dst, cv::Size(ksize, ksize));
TEST_CYCLE()
{
cv::blur(src, dst, cv::Size(ksize, ksize));
}
}
}
INSTANTIATE_TEST_CASE_P(Filters, Blur, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
testing::Values(KernelSize(3), KernelSize(5), KernelSize(7))));
//////////////////////////////////////////////////////////////////////
// Sobel
GPU_PERF_TEST(Sobel, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
declare.time(20.0);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int ksize = GET_PARAM(3);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src_host(size, type);
fill(src_host, 0.0, 255.0);
cv::Mat src(size, type);
fillRandom(src);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat buf;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::Sobel(src, dst, -1, 1, 1, buf, ksize);
cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize);
TEST_CYCLE()
{
cv::gpu::Sobel(src, dst, -1, 1, 1, buf, ksize);
cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize);
}
}
else
{
cv::Mat dst;
cv::Sobel(src, dst, -1, 1, 1, ksize);
TEST_CYCLE()
{
cv::Sobel(src, dst, -1, 1, 1, ksize);
}
}
}
INSTANTIATE_TEST_CASE_P(Filters, Sobel, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
//////////////////////////////////////////////////////////////////////
// Scharr
GPU_PERF_TEST(Scharr, cv::gpu::DeviceInfo, cv::Size, MatType)
PERF_TEST_P(Sz_Type, Filters_Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
declare.time(20.0);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
cv::Mat src_host(size, type);
fill(src_host, 0.0, 255.0);
cv::Mat src(size, type);
fillRandom(src);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat buf;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::Scharr(src, dst, -1, 1, 0, buf);
cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf);
TEST_CYCLE()
{
cv::gpu::Scharr(src, dst, -1, 1, 0, buf);
cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf);
}
}
else
{
cv::Mat dst;
cv::Scharr(src, dst, -1, 1, 0);
TEST_CYCLE()
{
cv::Scharr(src, dst, -1, 1, 0);
}
}
}
INSTANTIATE_TEST_CASE_P(Filters, Scharr, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1))));
//////////////////////////////////////////////////////////////////////
// GaussianBlur
GPU_PERF_TEST(GaussianBlur, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
declare.time(20.0);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int ksize = GET_PARAM(3);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src_host(size, type);
fill(src_host, 0.0, 255.0);
cv::Mat src(size, type);
fillRandom(src);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat buf;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::GaussianBlur(src, dst, cv::Size(ksize, ksize), buf, 0.5);
cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5);
TEST_CYCLE()
{
cv::gpu::GaussianBlur(src, dst, cv::Size(ksize, ksize), buf, 0.5);
cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5);
}
}
else
{
cv::Mat dst;
cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
TEST_CYCLE()
{
cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
}
}
}
INSTANTIATE_TEST_CASE_P(Filters, GaussianBlur, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
//////////////////////////////////////////////////////////////////////
// Laplacian
GPU_PERF_TEST(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
PERF_TEST_P(Sz_Type_KernelSz, Filters_Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
declare.time(20.0);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int ksize = GET_PARAM(3);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src_host(size, type);
fill(src_host, 0.0, 255.0);
cv::Mat src(size, type);
fillRandom(src);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::Laplacian(src, dst, -1, ksize);
cv::gpu::Laplacian(d_src, d_dst, -1, ksize);
TEST_CYCLE()
{
cv::gpu::Laplacian(src, dst, -1, ksize);
cv::gpu::Laplacian(d_src, d_dst, -1, ksize);
}
}
else
{
cv::Mat dst;
cv::Laplacian(src, dst, -1, ksize);
TEST_CYCLE()
{
cv::Laplacian(src, dst, -1, ksize);
}
}
}
INSTANTIATE_TEST_CASE_P(Filters, Laplacian, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
testing::Values(KernelSize(1), KernelSize(3))));
//////////////////////////////////////////////////////////////////////
// Erode
GPU_PERF_TEST(Erode, cv::gpu::DeviceInfo, cv::Size, MatType)
PERF_TEST_P(Sz_Type, Filters_Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
declare.time(20.0);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
cv::Mat src_host(size, type);
fill(src_host, 0.0, 255.0);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat buf;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::erode(src, dst, ker, buf);
cv::gpu::erode(d_src, d_dst, ker, d_buf);
TEST_CYCLE()
{
cv::gpu::erode(src, dst, ker, buf);
cv::gpu::erode(d_src, d_dst, ker, d_buf);
}
}
else
{
cv::Mat dst;
cv::erode(src, dst, ker);
TEST_CYCLE()
{
cv::erode(src, dst, ker);
}
}
}
INSTANTIATE_TEST_CASE_P(Filters, Erode, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
//////////////////////////////////////////////////////////////////////
// Dilate
GPU_PERF_TEST(Dilate, cv::gpu::DeviceInfo, cv::Size, MatType)
PERF_TEST_P(Sz_Type, Filters_Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
declare.time(20.0);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
cv::Mat src_host(size, type);
fill(src_host, 0.0, 255.0);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat buf;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::dilate(src, dst, ker, buf);
cv::gpu::dilate(d_src, d_dst, ker, d_buf);
TEST_CYCLE()
{
cv::gpu::dilate(src, dst, ker, buf);
cv::gpu::dilate(d_src, d_dst, ker, d_buf);
}
}
else
{
cv::Mat dst;
cv::dilate(src, dst, ker);
TEST_CYCLE()
{
cv::dilate(src, dst, ker);
}
}
}
INSTANTIATE_TEST_CASE_P(Filters, Dilate, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
//////////////////////////////////////////////////////////////////////
// MorphologyEx
CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)
#define ALL_MORPH_OPS testing::Values(MorphOp(cv::MORPH_OPEN), MorphOp(cv::MORPH_CLOSE), MorphOp(cv::MORPH_GRADIENT), MorphOp(cv::MORPH_TOPHAT), MorphOp(cv::MORPH_BLACKHAT))
#define ALL_MORPH_OPS ValuesIn(MorphOp::all())
GPU_PERF_TEST(MorphologyEx, cv::gpu::DeviceInfo, cv::Size, MatType, MorphOp)
DEF_PARAM_TEST(Sz_Type_Op, cv::Size, MatType, MorphOp);
PERF_TEST_P(Sz_Type_Op, Filters_MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), ALL_MORPH_OPS))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
declare.time(20.0);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int morphOp = GET_PARAM(3);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int morphOp = GET_PARAM(2);
cv::Mat src_host(size, type);
fill(src_host, 0.0, 255.0);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat buf1;
cv::gpu::GpuMat buf2;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf1;
cv::gpu::GpuMat d_buf2;
cv::gpu::morphologyEx(src, dst, morphOp, ker, buf1, buf2);
cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2);
TEST_CYCLE()
{
cv::gpu::morphologyEx(src, dst, morphOp, ker, buf1, buf2);
cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2);
}
}
else
{
cv::Mat dst;
cv::morphologyEx(src, dst, morphOp, ker);
TEST_CYCLE()
{
cv::morphologyEx(src, dst, morphOp, ker);
}
}
}
INSTANTIATE_TEST_CASE_P(Filters, MorphologyEx, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
ALL_MORPH_OPS));
//////////////////////////////////////////////////////////////////////
// Filter2D
GPU_PERF_TEST(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
PERF_TEST_P(Sz_Type_KernelSz, Filters_Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
declare.time(20.0);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int ksize = GET_PARAM(3);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src_host(size, type);
fill(src_host, 0.0, 255.0);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat kernel(ksize, ksize, CV_32FC1);
fill(kernel, 0.0, 1.0);
fillRandom(kernel, 0.0, 1.0);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::filter2D(src, dst, -1, kernel);
cv::gpu::filter2D(d_src, d_dst, -1, kernel);
TEST_CYCLE()
{
cv::gpu::filter2D(src, dst, -1, kernel);
cv::gpu::filter2D(d_src, d_dst, -1, kernel);
}
}
else
{
cv::Mat dst;
cv::filter2D(src, dst, -1, kernel);
TEST_CYCLE()
{
cv::filter2D(src, dst, -1, kernel);
}
}
}
INSTANTIATE_TEST_CASE_P(Filters, Filter2D, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
#endif
} // namespace

File diff suppressed because it is too large

View File

@ -1,57 +1,113 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//M*/
#include "perf_precomp.hpp"
#ifdef HAVE_CUDA
using namespace std;
using namespace testing;
GPU_PERF_TEST(ConnectedComponents, cv::gpu::DeviceInfo, cv::Size)
namespace {
DEF_PARAM_TEST_1(Image, string);
struct GreedyLabeling
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
struct dot
{
int x;
int y;
cv::Mat image = readImage("gpu/labeling/aloe-disp.png", cv::IMREAD_GRAYSCALE);
static dot make(int i, int j)
{
dot d; d.x = i; d.y = j;
return d;
}
};
// cv::threshold(image, image, 150, 255, CV_THRESH_BINARY);
struct InInterval
{
InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {};
const int lo, hi;
bool operator() (const unsigned char a, const unsigned char b) const
{
int d = a - b;
return lo <= d && d <= hi;
}
private:
InInterval& operator=(const InInterval&);
};
GreedyLabeling(cv::Mat img)
: image(img), _labels(image.size(), CV_32SC1, cv::Scalar::all(-1)) {stack = new dot[image.cols * image.rows];}
~GreedyLabeling(){delete[] stack;}
void operator() (cv::Mat labels) const
{
labels.setTo(cv::Scalar::all(-1));
InInterval inInt(0, 2);
int cc = -1;
int* dist_labels = (int*)labels.data;
int pitch = static_cast<int>(labels.step1());
unsigned char* source = (unsigned char*)image.data;
int width = image.cols;
int height = image.rows;
for (int j = 0; j < image.rows; ++j)
for (int i = 0; i < image.cols; ++i)
{
if (dist_labels[j * pitch + i] != -1) continue;
dot* top = stack;
dot p = dot::make(i, j);
cc++;
dist_labels[j * pitch + i] = cc;
while (top >= stack)
{
int* dl = &dist_labels[p.y * pitch + p.x];
unsigned char* sp = &source[p.y * image.step1() + p.x];
dl[0] = cc;
//right
if( p.x < (width - 1) && dl[ +1] == -1 && inInt(sp[0], sp[+1]))
*top++ = dot::make(p.x + 1, p.y);
//left
if( p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1]))
*top++ = dot::make(p.x - 1, p.y);
//bottom
if( p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+image.step1()]))
*top++ = dot::make(p.x, p.y + 1);
//top
if( p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-static_cast<int>(image.step1())]))
*top++ = dot::make(p.x, p.y - 1);
p = *--top;
}
}
}
cv::Mat image;
cv::Mat _labels;
dot* stack;
};
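// GreedyLabeling above is the CPU reference for the benchmark below: a stack-based flood fill that assigns one
// label per 4-connected region, merging neighbouring pixels whose intensity difference stays inside the
// [lo, hi] window checked by InInterval.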
PERF_TEST_P(Image, Labeling_ConnectedComponents, Values<string>("gpu/labeling/aloe-disp.png"))
{
declare.time(1.0);
cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
if (runOnGpu)
{
cv::gpu::GpuMat mask;
mask.create(image.rows, image.cols, CV_8UC1);
@ -62,14 +118,24 @@ GPU_PERF_TEST(ConnectedComponents, cv::gpu::DeviceInfo, cv::Size)
ASSERT_NO_THROW(cv::gpu::labelComponents(mask, components));
declare.time(1.0);
TEST_CYCLE()
{
cv::gpu::labelComponents(mask, components);
}
}
else
{
GreedyLabeling host(image);
host(host._labels);
declare.time(1.0);
TEST_CYCLE()
{
host(host._labels);
}
}
}
INSTANTIATE_TEST_CASE_P(Labeling, ConnectedComponents, testing::Combine(ALL_DEVICES, testing::Values(cv::Size(261, 262))));
#endif
} // namespace

View File

@ -1,20 +0,0 @@
#include "perf_precomp.hpp"
#ifdef HAVE_CUDA
int main(int argc, char **argv)
{
testing::InitGoogleTest(&argc, argv);
perf::TestBase::Init(argc, argv);
return RUN_ALL_TESTS();
}
#else
int main()
{
printf("OpenCV was built without CUDA support\n");
return 0;
}
#endif

View File

@ -1,94 +1,122 @@
#include "perf_precomp.hpp"
#ifdef HAVE_CUDA
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// SetTo
GPU_PERF_TEST(SetTo, cv::gpu::DeviceInfo, cv::Size, MatType)
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(1, 3, 4)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int type = CV_MAKE_TYPE(depth, channels);
cv::gpu::GpuMat src(size, type);
cv::Scalar val(1, 2, 3, 4);
if (runOnGpu)
{
cv::gpu::GpuMat d_src(size, type);
d_src.setTo(val);
TEST_CYCLE()
{
d_src.setTo(val);
}
}
else
{
cv::Mat src(size, type);
src.setTo(val);
TEST_CYCLE()
{
src.setTo(val);
}
}
}
INSTANTIATE_TEST_CASE_P(MatOp, SetTo, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
//////////////////////////////////////////////////////////////////////
// SetToMasked
GPU_PERF_TEST(SetToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(1, 3, 4)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src_host(size, type);
fill(src_host, 0, 255);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat mask_host(size, CV_8UC1);
fill(mask_host, 0, 2);
cv::Mat mask(size, CV_8UC1);
fillRandom(mask, 0, 2);
cv::gpu::GpuMat src(src_host);
cv::Scalar val(1, 2, 3, 4);
cv::gpu::GpuMat mask(mask_host);
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_mask(mask);
d_src.setTo(val, d_mask);
TEST_CYCLE()
{
d_src.setTo(val, d_mask);
}
}
else
{
src.setTo(val, mask);
TEST_CYCLE()
{
src.setTo(val, mask);
}
}
}
INSTANTIATE_TEST_CASE_P(MatOp, SetToMasked, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
//////////////////////////////////////////////////////////////////////
// CopyToMasked
GPU_PERF_TEST(CopyToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(1, 3, 4)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src_host(size, type);
fill(src_host, 0, 255);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat mask_host(size, CV_8UC1);
fill(mask_host, 0, 2);
cv::Mat mask(size, CV_8UC1);
fillRandom(mask, 0, 2);
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat mask(mask_host);
cv::gpu::GpuMat dst;
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_mask(mask);
cv::gpu::GpuMat d_dst;
d_src.copyTo(d_dst, d_mask);
TEST_CYCLE()
{
d_src.copyTo(d_dst, d_mask);
}
}
else
{
cv::Mat dst;
src.copyTo(dst, mask);
@ -96,33 +124,38 @@ GPU_PERF_TEST(CopyToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
{
src.copyTo(dst, mask);
}
}
}
INSTANTIATE_TEST_CASE_P(MatOp, CopyToMasked, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
//////////////////////////////////////////////////////////////////////
// ConvertTo
GPU_PERF_TEST(ConvertTo, cv::gpu::DeviceInfo, cv::Size, MatDepth, MatDepth)
DEF_PARAM_TEST(Sz_2Depth, cv::Size, MatDepth, MatDepth);
PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(CV_8U, CV_16U, CV_32F, CV_64F)))
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
cv::Size size = GET_PARAM(0);
int depth1 = GET_PARAM(1);
int depth2 = GET_PARAM(2);
cv::Size size = GET_PARAM(1);
int depth1 = GET_PARAM(2);
int depth2 = GET_PARAM(3);
cv::Mat src(size, depth1);
fillRandom(src);
cv::Mat src_host(size, depth1);
fill(src_host, 0, 255);
if (runOnGpu)
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat src(src_host);
cv::gpu::GpuMat dst;
d_src.convertTo(d_dst, depth2, 0.5, 1.0);
TEST_CYCLE()
{
d_src.convertTo(d_dst, depth2, 0.5, 1.0);
}
}
else
{
cv::Mat dst;
src.convertTo(dst, depth2, 0.5, 1.0);
@ -130,12 +163,7 @@ GPU_PERF_TEST(ConvertTo, cv::gpu::DeviceInfo, cv::Size, MatDepth, MatDepth)
{
src.convertTo(dst, depth2, 0.5, 1.0);
}
}
}
INSTANTIATE_TEST_CASE_P(MatOp, ConvertTo, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F)),
testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F))));
#endif
} // namespace

View File

@ -1,22 +1,39 @@
#include "perf_precomp.hpp"
#ifdef HAVE_CUDA
using namespace std;
using namespace testing;
namespace {
///////////////////////////////////////////////////////////////
// HOG
GPU_PERF_TEST_1(HOG, cv::gpu::DeviceInfo)
DEF_PARAM_TEST_1(Image, string);
PERF_TEST_P(Image, ObjDetect_HOG, Values<string>("gpu/hog/road.png"))
{
cv::gpu::DeviceInfo devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
cv::Mat img_host = readImage("gpu/hog/road.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_host.empty());
cv::gpu::GpuMat img(img_host);
std::vector<cv::Rect> found_locations;
cv::gpu::HOGDescriptor hog;
if (runOnGpu)
{
cv::gpu::GpuMat d_img(img);
cv::gpu::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
d_hog.detectMultiScale(d_img, found_locations);
TEST_CYCLE()
{
d_hog.detectMultiScale(d_img, found_locations);
}
}
else
{
cv::HOGDescriptor hog;
hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
hog.detectMultiScale(img, found_locations);
@ -25,61 +42,90 @@ GPU_PERF_TEST_1(HOG, cv::gpu::DeviceInfo)
{
hog.detectMultiScale(img, found_locations);
}
}
}
INSTANTIATE_TEST_CASE_P(ObjDetect, HOG, ALL_DEVICES);
///////////////////////////////////////////////////////////////
// HaarClassifier
GPU_PERF_TEST_1(HaarClassifier, cv::gpu::DeviceInfo)
typedef pair<string, string> pair_string;
DEF_PARAM_TEST_1(ImageAndCascade, pair_string);
PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/perf/haarcascade_frontalface_alt.xml")))
{
cv::gpu::DeviceInfo devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
cv::Mat img_host = readImage("gpu/haarcascade/group_1_640x480_VGA.pgm", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_host.empty());
if (runOnGpu)
{
cv::gpu::CascadeClassifier_GPU d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
cv::gpu::CascadeClassifier_GPU cascade;
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_objects_buffer;
d_cascade.detectMultiScale(d_img, d_objects_buffer);
TEST_CYCLE()
{
d_cascade.detectMultiScale(d_img, d_objects_buffer);
}
}
else
{
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));
cv::gpu::GpuMat img(img_host);
cv::gpu::GpuMat objects_buffer;
std::vector<cv::Rect> rects;
cascade.detectMultiScale(img, objects_buffer);
cascade.detectMultiScale(img, rects);
TEST_CYCLE()
{
cascade.detectMultiScale(img, objects_buffer);
cascade.detectMultiScale(img, rects);
}
}
}
INSTANTIATE_TEST_CASE_P(ObjDetect, HaarClassifier, ALL_DEVICES);
///////////////////////////////////////////////////////////////
// LBP cascade
//===================== LBP cascade ==========================//
GPU_PERF_TEST_1(LBPClassifier, cv::gpu::DeviceInfo)
PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/lbpcascade/lbpcascade_frontalface.xml")))
{
cv::gpu::DeviceInfo devInfo = GetParam();
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
cv::Mat img_host = readImage("gpu/haarcascade/group_1_640x480_VGA.pgm", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_host.empty());
if (runOnGpu)
{
cv::gpu::CascadeClassifier_GPU d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_gpu_rects;
d_cascade.detectMultiScale(d_img, d_gpu_rects);
cv::gpu::GpuMat img(img_host);
cv::gpu::GpuMat gpu_rects;
cv::gpu::CascadeClassifier_GPU cascade;
TEST_CYCLE()
{
d_cascade.detectMultiScale(d_img, d_gpu_rects);
}
}
else
{
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
cascade.detectMultiScale(img, gpu_rects);
std::vector<cv::Rect> rects;
cascade.detectMultiScale(img, rects);
TEST_CYCLE()
{
cascade.detectMultiScale(img, gpu_rects);
cascade.detectMultiScale(img, rects);
}
}
}
INSTANTIATE_TEST_CASE_P(ObjDetect, LBPClassifier, ALL_DEVICES);
#endif
} // namespace

View File

@ -11,6 +11,10 @@
#include "cvconfig.h"
#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#endif
#include "opencv2/ts/ts.hpp"
#include "opencv2/ts/ts_perf.hpp"
@ -18,8 +22,12 @@
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/calib3d/calib3d.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/video/video.hpp"
#include "opencv2/nonfree/nonfree.hpp"
#include "opencv2/legacy/legacy.hpp"
#include "perf_utility.hpp"
#include "utility.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined

View File

@ -1,77 +0,0 @@
#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__
#define __OPENCV_PERF_GPU_UTILITY_HPP__
void fill(cv::Mat& m, double a, double b);
using perf::MatType;
using perf::MatDepth;
CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING)
struct CvtColorInfo
{
int scn;
int dcn;
int code;
explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
};
void PrintTo(const CvtColorInfo& info, std::ostream* os);
#define IMPLEMENT_PARAM_CLASS(name, type) \
class name \
{ \
public: \
name ( type arg = type ()) : val_(arg) {} \
operator type () const {return val_;} \
private: \
type val_; \
}; \
inline void PrintTo( name param, std::ostream* os) \
{ \
*os << #name << " = " << testing::PrintToString(static_cast< type >(param)); \
}
IMPLEMENT_PARAM_CLASS(Channels, int)
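// IMPLEMENT_PARAM_CLASS wraps a plain value in a small named type purely so that Google Test parameter listings
// read e.g. "Channels = 3" instead of an anonymous int (see the PrintTo overload generated by the macro above).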
namespace cv { namespace gpu
{
void PrintTo(const cv::gpu::DeviceInfo& info, std::ostream* os);
}}
#define GPU_PERF_TEST(name, ...) \
struct name : perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > \
{ \
public: \
name() {} \
protected: \
void PerfTestBody(); \
}; \
TEST_P(name, perf){ RunPerfTestBody(); } \
void name :: PerfTestBody()
#define GPU_PERF_TEST_1(name, param_type) \
struct name : perf::TestBaseWithParam< param_type > \
{ \
public: \
name() {} \
protected: \
void PerfTestBody(); \
}; \
TEST_P(name, perf){ RunPerfTestBody(); } \
void name :: PerfTestBody()
#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::szSXGA, perf::sz1080p, cv::Size(1800, 1500))
cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
const std::vector<cv::gpu::DeviceInfo>& devices();
#define ALL_DEVICES testing::ValuesIn(devices())
#define GET_PARAM(k) std::tr1::get< k >(GetParam())
#endif // __OPENCV_PERF_GPU_UTILITY_HPP__

File diff suppressed because it is too large

View File

@ -4,12 +4,19 @@ using namespace std;
using namespace cv;
using namespace cv::gpu;
void fill(Mat& m, double a, double b)
bool runOnGpu = true;
void fillRandom(Mat& m, double a, double b)
{
RNG rng(123456789);
rng.fill(m, RNG::UNIFORM, Scalar::all(a), Scalar::all(b));
}
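// fillRandom seeds the RNG with a fixed constant, so repeated benchmark runs (and the GPU/CPU variants of the
// same test) operate on identical input data.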
Mat readImage(const string& fileName, int flags)
{
return imread(perf::TestBase::getDataPath(fileName), flags);
}
void PrintTo(const CvtColorInfo& info, ostream* os)
{
static const char* str[] =
@ -184,37 +191,3 @@ void PrintTo(const CvtColorInfo& info, ostream* os)
*os << str[info.code];
}
void cv::gpu::PrintTo(const DeviceInfo& info, ostream* os)
{
*os << info.name();
}
Mat readImage(const string& fileName, int flags)
{
return imread(perf::TestBase::getDataPath(fileName), flags);
}
const vector<DeviceInfo>& devices()
{
static vector<DeviceInfo> devs;
static bool first = true;
if (first)
{
int deviceCount = getCudaEnabledDeviceCount();
devs.reserve(deviceCount);
for (int i = 0; i < deviceCount; ++i)
{
DeviceInfo info(i);
if (info.isCompatible())
devs.push_back(info);
}
first = false;
}
return devs;
}

View File

@ -0,0 +1,45 @@
#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__
#define __OPENCV_PERF_GPU_UTILITY_HPP__
#include "opencv2/core/core.hpp"
#include "opencv2/core/gpumat.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/ts/ts_perf.hpp"
extern bool runOnGpu;
void fillRandom(cv::Mat& m, double a = 0.0, double b = 255.0);
cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
using perf::MatType;
using perf::MatDepth;
CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
#define ALL_BORDER_MODES testing::ValuesIn(BorderMode::all())
CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
#define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all())
CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING)
struct CvtColorInfo
{
int scn;
int dcn;
int code;
explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
};
void PrintTo(const CvtColorInfo& info, std::ostream* os);
#define GET_PARAM(k) std::tr1::get< k >(GetParam())
#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name
#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name
DEF_PARAM_TEST_1(Sz, cv::Size);
typedef perf::Size_MatType Sz_Type;
DEF_PARAM_TEST(Sz_Depth, cv::Size, MatDepth);
DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, int);
#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::szSXGA, perf::sz720p, perf::sz1080p)
#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
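Taken together, the helpers above define the shape of every merged benchmark in this patch: a DEF_PARAM_TEST fixture without the old cv::gpu::DeviceInfo element (which is why every GET_PARAM index in the rewritten tests shifts down by one) and a runOnGpu branch inside the test body. The sketch below illustrates that pattern only; the Filters_BoxSketch name and the box-filter calls are hypothetical stand-ins, not part of the patch.
PERF_TEST_P(Sz, Filters_BoxSketch, GPU_TYPICAL_MAT_SIZES)
{
    cv::Size size = GetParam();        // single-parameter fixture, so no tuple indexing is needed
    cv::Mat src(size, CV_8UC1);
    fillRandom(src);
    if (runOnGpu)
    {
        cv::gpu::GpuMat d_src(src);
        cv::gpu::GpuMat d_dst;
        cv::gpu::boxFilter(d_src, d_dst, -1, cv::Size(3, 3));   // first call outside the timed loop, as in the tests above
        TEST_CYCLE()
        {
            cv::gpu::boxFilter(d_src, d_dst, -1, cv::Size(3, 3));
        }
    }
    else
    {
        cv::Mat dst;
        cv::blur(src, dst, cv::Size(3, 3));
        TEST_CYCLE()
        {
            cv::blur(src, dst, cv::Size(3, 3));
        }
    }
}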

View File

@ -1,136 +0,0 @@
#include "perf_cpu_precomp.hpp"
#ifdef HAVE_CUDA
//////////////////////////////////////////////////////////////////////
// StereoBM
GPU_PERF_TEST_1(StereoBM, cv::gpu::DeviceInfo)
{
cv::Mat img_l = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_l.empty());
cv::Mat img_r = readImage("gpu/perf/aloeR.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img_r.empty());
cv::StereoBM bm(0, 256);
cv::Mat dst;
bm(img_l, img_r, dst);
declare.time(5.0);
TEST_CYCLE()
{
bm(img_l, img_r, dst);
}
}
INSTANTIATE_TEST_CASE_P(Calib3D, StereoBM, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////
// ProjectPoints
IMPLEMENT_PARAM_CLASS(Count, int)
GPU_PERF_TEST(ProjectPoints, cv::gpu::DeviceInfo, Count)
{
int count = GET_PARAM(1);
cv::Mat src(1, count, CV_32FC3);
fill(src, -100, 100);
cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);
cv::Mat dst;
cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
TEST_CYCLE()
{
cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
}
}
INSTANTIATE_TEST_CASE_P(Calib3D, ProjectPoints, testing::Combine(
ALL_DEVICES,
testing::Values<Count>(5000, 10000, 20000)));
//////////////////////////////////////////////////////////////////////
// SolvePnPRansac
GPU_PERF_TEST(SolvePnPRansac, cv::gpu::DeviceInfo, Count)
{
int count = GET_PARAM(1);
cv::Mat object(1, count, CV_32FC3);
fill(object, -100, 100);
cv::Mat camera_mat(3, 3, CV_32FC1);
fill(camera_mat, 0.5, 1);
camera_mat.at<float>(0, 1) = 0.f;
camera_mat.at<float>(1, 0) = 0.f;
camera_mat.at<float>(2, 0) = 0.f;
camera_mat.at<float>(2, 1) = 0.f;
cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
std::vector<cv::Point2f> image_vec;
cv::Mat rvec_gold(1, 3, CV_32FC1);
fill(rvec_gold, 0, 1);
cv::Mat tvec_gold(1, 3, CV_32FC1);
fill(tvec_gold, 0, 1);
cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, dist_coef, image_vec);
cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
cv::Mat rvec;
cv::Mat tvec;
cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
declare.time(10.0);
TEST_CYCLE()
{
cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
}
}
INSTANTIATE_TEST_CASE_P(Calib3D, SolvePnPRansac, testing::Combine(
ALL_DEVICES,
testing::Values<Count>(5000, 10000, 20000)));
//////////////////////////////////////////////////////////////////////
// ReprojectImageTo3D
GPU_PERF_TEST(ReprojectImageTo3D, cv::gpu::DeviceInfo, cv::Size, MatDepth)
{
cv::Size size = GET_PARAM(1);
int depth = GET_PARAM(2);
cv::Mat src(size, depth);
fill(src, 5.0, 30.0);
cv::Mat Q(4, 4, CV_32FC1);
fill(Q, 0.1, 1.0);
cv::Mat dst;
cv::reprojectImageTo3D(src, dst, Q);
TEST_CYCLE()
{
cv::reprojectImageTo3D(src, dst, Q);
}
}
INSTANTIATE_TEST_CASE_P(Calib3D, ReprojectImageTo3D, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values<MatDepth>(CV_8U, CV_16S)));
#endif

File diff suppressed because it is too large

View File

@ -1 +0,0 @@
#include "perf_cpu_precomp.hpp"

View File

@ -1,32 +0,0 @@
#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wmissing-declarations"
# pragma GCC diagnostic ignored "-Wmissing-prototypes" //OSX
#endif
#ifndef __OPENCV_PERF_CPU_PRECOMP_HPP__
#define __OPENCV_PERF_CPU_PRECOMP_HPP__
#include <cstdio>
#include <iostream>
#include "cvconfig.h"
#include "opencv2/ts/ts.hpp"
#include "opencv2/ts/ts_perf.hpp"
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/video/video.hpp"
#include "opencv2/calib3d/calib3d.hpp"
#include "opencv2/nonfree/nonfree.hpp"
#include "opencv2/legacy/legacy.hpp"
#include "perf_utility.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
#endif
#endif

View File

@ -1,187 +0,0 @@
#include "perf_cpu_precomp.hpp"
#ifdef HAVE_CUDA
//////////////////////////////////////////////////////////////////////
// SURF
GPU_PERF_TEST_1(SURF, cv::gpu::DeviceInfo)
{
cv::Mat img = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
cv::SURF surf;
std::vector<cv::KeyPoint> keypoints;
cv::Mat descriptors;
surf(img, cv::noArray(), keypoints, descriptors);
declare.time(50.0);
TEST_CYCLE()
{
keypoints.clear();
surf(img, cv::noArray(), keypoints, descriptors);
}
}
INSTANTIATE_TEST_CASE_P(Features2D, SURF, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////
// FAST
GPU_PERF_TEST_1(FAST, cv::gpu::DeviceInfo)
{
cv::Mat img = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
std::vector<cv::KeyPoint> keypoints;
cv::FAST(img, keypoints, 20);
TEST_CYCLE()
{
keypoints.clear();
cv::FAST(img, keypoints, 20);
}
}
INSTANTIATE_TEST_CASE_P(Features2D, FAST, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////
// ORB
GPU_PERF_TEST_1(ORB, cv::gpu::DeviceInfo)
{
cv::Mat img = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
cv::ORB orb(4000);
std::vector<cv::KeyPoint> keypoints;
cv::Mat descriptors;
orb(img, cv::noArray(), keypoints, descriptors);
TEST_CYCLE()
{
keypoints.clear();
orb(img, cv::noArray(), keypoints, descriptors);
}
}
INSTANTIATE_TEST_CASE_P(Features2D, ORB, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////
// BruteForceMatcher_match
IMPLEMENT_PARAM_CLASS(DescriptorSize, int)
GPU_PERF_TEST(BruteForceMatcher_match, cv::gpu::DeviceInfo, DescriptorSize, NormType)
{
int desc_size = GET_PARAM(1);
int normType = GET_PARAM(2);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
fill(query, 0.0, 10.0);
cv::Mat train(3000, desc_size, type);
fill(train, 0.0, 10.0);
cv::BFMatcher matcher(normType);
std::vector<cv::DMatch> matches;
matcher.match(query, train, matches);
declare.time(20.0);
TEST_CYCLE()
{
matcher.match(query, train, matches);
}
}
INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_match, testing::Combine(
ALL_DEVICES,
testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
//////////////////////////////////////////////////////////////////////
// BruteForceMatcher_knnMatch
IMPLEMENT_PARAM_CLASS(K, int)
GPU_PERF_TEST(BruteForceMatcher_knnMatch, cv::gpu::DeviceInfo, DescriptorSize, K, NormType)
{
int desc_size = GET_PARAM(1);
int k = GET_PARAM(2);
int normType = GET_PARAM(3);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
fill(query, 0.0, 10.0);
cv::Mat train(3000, desc_size, type);
fill(train, 0.0, 10.0);
cv::BFMatcher matcher(normType);
std::vector< std::vector<cv::DMatch> > matches;
matcher.knnMatch(query, train, matches, k);
declare.time(30.0);
TEST_CYCLE()
{
matcher.knnMatch(query, train, matches, k);
}
}
INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_knnMatch, testing::Combine(
ALL_DEVICES,
testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
testing::Values(K(2), K(3)),
testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
//////////////////////////////////////////////////////////////////////
// BruteForceMatcher_radiusMatch
GPU_PERF_TEST(BruteForceMatcher_radiusMatch, cv::gpu::DeviceInfo, DescriptorSize, NormType)
{
int desc_size = GET_PARAM(1);
int normType = GET_PARAM(2);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
fill(query, 0.0, 1.0);
cv::Mat train(3000, desc_size, type);
fill(train, 0.0, 1.0);
cv::BFMatcher matcher(normType);
std::vector< std::vector<cv::DMatch> > matches;
matcher.radiusMatch(query, train, matches, 2.0);
declare.time(30.0);
TEST_CYCLE()
{
matcher.radiusMatch(query, train, matches, 2.0);
}
}
INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_radiusMatch, testing::Combine(
ALL_DEVICES,
testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
#endif

View File

@ -1,283 +0,0 @@
#include "perf_cpu_precomp.hpp"
#ifdef HAVE_CUDA
IMPLEMENT_PARAM_CLASS(KernelSize, int)
//////////////////////////////////////////////////////////////////////
// Blur
GPU_PERF_TEST(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int ksize = GET_PARAM(3);
cv::Mat src(size, type);
fill(src, 0.0, 255.0);
cv::Mat dst;
cv::blur(src, dst, cv::Size(ksize, ksize));
declare.time(20.0);
TEST_CYCLE()
{
cv::blur(src, dst, cv::Size(ksize, ksize));
}
}
INSTANTIATE_TEST_CASE_P(Filters, Blur, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
testing::Values(KernelSize(3), KernelSize(5), KernelSize(7))));
//////////////////////////////////////////////////////////////////////
// Sobel
GPU_PERF_TEST(Sobel, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int ksize = GET_PARAM(3);
cv::Mat src(size, type);
fill(src, 0.0, 255.0);
cv::Mat dst;
cv::Sobel(src, dst, -1, 1, 1, ksize);
declare.time(20.0);
TEST_CYCLE()
{
cv::Sobel(src, dst, -1, 1, 1, ksize);
}
}
INSTANTIATE_TEST_CASE_P(Filters, Sobel, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
//////////////////////////////////////////////////////////////////////
// Scharr
GPU_PERF_TEST(Scharr, cv::gpu::DeviceInfo, cv::Size, MatType)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::Mat src(size, type);
fill(src, 0.0, 255.0);
cv::Mat dst;
cv::Scharr(src, dst, -1, 1, 0);
declare.time(20.0);
TEST_CYCLE()
{
cv::Scharr(src, dst, -1, 1, 0);
}
}
INSTANTIATE_TEST_CASE_P(Filters, Scharr, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1))));
//////////////////////////////////////////////////////////////////////
// GaussianBlur
GPU_PERF_TEST(GaussianBlur, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int ksize = GET_PARAM(3);
cv::Mat src(size, type);
fill(src, 0.0, 255.0);
cv::Mat dst;
cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
declare.time(20.0);
TEST_CYCLE()
{
cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
}
}
INSTANTIATE_TEST_CASE_P(Filters, GaussianBlur, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
//////////////////////////////////////////////////////////////////////
// Laplacian
GPU_PERF_TEST(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int ksize = GET_PARAM(3);
cv::Mat src(size, type);
fill(src, 0.0, 255.0);
cv::Mat dst;
cv::Laplacian(src, dst, -1, ksize);
declare.time(20.0);
TEST_CYCLE()
{
cv::Laplacian(src, dst, -1, ksize);
}
}
INSTANTIATE_TEST_CASE_P(Filters, Laplacian, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
testing::Values(KernelSize(1), KernelSize(3))));
//////////////////////////////////////////////////////////////////////
// Erode
GPU_PERF_TEST(Erode, cv::gpu::DeviceInfo, cv::Size, MatType)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::Mat src(size, type);
fill(src, 0.0, 255.0);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
cv::Mat dst;
cv::erode(src, dst, ker);
declare.time(20.0);
TEST_CYCLE()
{
cv::erode(src, dst, ker);
}
}
INSTANTIATE_TEST_CASE_P(Filters, Erode, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
//////////////////////////////////////////////////////////////////////
// Dilate
GPU_PERF_TEST(Dilate, cv::gpu::DeviceInfo, cv::Size, MatType)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::Mat src(size, type);
fill(src, 0.0, 255.0);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
cv::Mat dst;
cv::dilate(src, dst, ker);
declare.time(20.0);
TEST_CYCLE()
{
cv::dilate(src, dst, ker);
}
}
INSTANTIATE_TEST_CASE_P(Filters, Dilate, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
//////////////////////////////////////////////////////////////////////
// MorphologyEx
CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)
#define ALL_MORPH_OPS testing::Values(MorphOp(cv::MORPH_OPEN), MorphOp(cv::MORPH_CLOSE), MorphOp(cv::MORPH_GRADIENT), MorphOp(cv::MORPH_TOPHAT), MorphOp(cv::MORPH_BLACKHAT))
GPU_PERF_TEST(MorphologyEx, cv::gpu::DeviceInfo, cv::Size, MatType, MorphOp)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int morphOp = GET_PARAM(3);
cv::Mat src(size, type);
fill(src, 0.0, 255.0);
cv::Mat dst;
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
cv::morphologyEx(src, dst, morphOp, ker);
declare.time(20.0);
TEST_CYCLE()
{
cv::morphologyEx(src, dst, morphOp, ker);
}
}
INSTANTIATE_TEST_CASE_P(Filters, MorphologyEx, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
ALL_MORPH_OPS));
//////////////////////////////////////////////////////////////////////
// Filter2D
GPU_PERF_TEST(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int ksize = GET_PARAM(3);
cv::Mat src(size, type);
fill(src, 0.0, 255.0);
cv::Mat kernel(ksize, ksize, CV_32FC1);
fill(kernel, 0.0, 1.0);
cv::Mat dst;
cv::filter2D(src, dst, -1, kernel);
declare.time(20.0);
TEST_CYCLE()
{
cv::filter2D(src, dst, -1, kernel);
}
}
INSTANTIATE_TEST_CASE_P(Filters, Filter2D, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
#endif

View File

@ -1,771 +0,0 @@
#include "perf_cpu_precomp.hpp"
#ifdef HAVE_CUDA
//////////////////////////////////////////////////////////////////////
// Remap
GPU_PERF_TEST(Remap, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, BorderMode)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int interpolation = GET_PARAM(3);
int borderMode = GET_PARAM(4);
cv::Mat src(size, type);
fill(src, 0, 255);
cv::Mat xmap(size, CV_32FC1);
fill(xmap, 0, size.width);
cv::Mat ymap(size, CV_32FC1);
fill(ymap, 0, size.height);
cv::Mat dst;
cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
declare.time(20.0);
TEST_CYCLE()
{
cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, Remap, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT), BorderMode(cv::BORDER_WRAP))));
//////////////////////////////////////////////////////////////////////
// Resize
IMPLEMENT_PARAM_CLASS(Scale, double)
GPU_PERF_TEST(Resize, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, Scale)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int interpolation = GET_PARAM(3);
double f = GET_PARAM(4);
cv::Mat src(size, type);
fill(src, 0, 255);
cv::Mat dst;
cv::resize(src, dst, cv::Size(), f, f, interpolation);
declare.time(20.0);
TEST_CYCLE()
{
cv::resize(src, dst, cv::Size(), f, f, interpolation);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR),
Interpolation(cv::INTER_CUBIC), Interpolation(cv::INTER_AREA)),
testing::Values(Scale(0.5), Scale(0.3), Scale(2.0))));
GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int interpolation = cv::INTER_AREA;
double f = GET_PARAM(3);
cv::Mat src_host(size, type);
fill(src_host, 0, 255);
cv::Mat src(src_host);
cv::Mat dst;
cv::resize(src, dst, cv::Size(), f, f, interpolation);
declare.time(1.0);
TEST_CYCLE()
{
cv::resize(src, dst, cv::Size(), f, f, interpolation);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
ALL_DEVICES,
testing::Values(perf::sz1080p, cv::Size(4096, 2048)),
testing::Values(MatType(CV_8UC1)/*, MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
testing::Values(Scale(0.2),Scale(0.1),Scale(0.05))));
//////////////////////////////////////////////////////////////////////
// WarpAffine
GPU_PERF_TEST(WarpAffine, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, BorderMode)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int interpolation = GET_PARAM(3);
int borderMode = GET_PARAM(4);
cv::Mat src(size, type);
fill(src, 0, 255);
cv::Mat dst;
const double aplha = CV_PI / 4;
double mat[2][3] = { {std::cos(aplha), -std::sin(aplha), src.cols / 2},
{std::sin(aplha), std::cos(aplha), 0}};
cv::Mat M(2, 3, CV_64F, (void*) mat);
cv::warpAffine(src, dst, M, size, interpolation, borderMode);
declare.time(20.0);
TEST_CYCLE()
{
cv::warpAffine(src, dst, M, size, interpolation, borderMode);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, WarpAffine, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT), BorderMode(cv::BORDER_WRAP))));
//////////////////////////////////////////////////////////////////////
// WarpPerspective
GPU_PERF_TEST(WarpPerspective, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, BorderMode)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int interpolation = GET_PARAM(3);
int borderMode = GET_PARAM(4);
cv::Mat src(size, type);
fill(src, 0, 255);
cv::Mat dst;
const double aplha = CV_PI / 4;
double mat[3][3] = { {std::cos(aplha), -std::sin(aplha), src.cols / 2},
{std::sin(aplha), std::cos(aplha), 0},
{0.0, 0.0, 1.0}};
cv::Mat M(3, 3, CV_64F, (void*) mat);
cv::warpPerspective(src, dst, M, size, interpolation, borderMode);
declare.time(20.0);
TEST_CYCLE()
{
cv::warpPerspective(src, dst, M, size, interpolation, borderMode);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, WarpPerspective, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT), BorderMode(cv::BORDER_WRAP))));
//////////////////////////////////////////////////////////////////////
// CopyMakeBorder
GPU_PERF_TEST(CopyMakeBorder, cv::gpu::DeviceInfo, cv::Size, MatType, BorderMode)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
int borderType = GET_PARAM(3);
cv::Mat src(size, type);
fill(src, 0, 255);
cv::Mat dst;
cv::copyMakeBorder(src, dst, 5, 5, 5, 5, borderType);
TEST_CYCLE()
{
cv::copyMakeBorder(src, dst, 5, 5, 5, 5, borderType);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, CopyMakeBorder, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT), BorderMode(cv::BORDER_WRAP))));
//////////////////////////////////////////////////////////////////////
// Threshold
CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
#define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))
GPU_PERF_TEST(Threshold, cv::gpu::DeviceInfo, cv::Size, MatDepth, ThreshOp)
{
cv::Size size = GET_PARAM(1);
int depth = GET_PARAM(2);
int threshOp = GET_PARAM(3);
cv::Mat src(size, depth);
fill(src, 0, 255);
cv::Mat dst;
cv::threshold(src, dst, 100.0, 255.0, threshOp);
TEST_CYCLE()
{
cv::threshold(src, dst, 100.0, 255.0, threshOp);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, Threshold, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F)),
ALL_THRESH_OPS));
//////////////////////////////////////////////////////////////////////
// Integral
GPU_PERF_TEST(Integral, cv::gpu::DeviceInfo, cv::Size)
{
cv::Size size = GET_PARAM(1);
cv::Mat src(size, CV_8UC1);
fill(src, 0, 255);
cv::Mat dst;
cv::integral(src, dst);
TEST_CYCLE()
{
cv::integral(src, dst);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, Integral, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES));
//////////////////////////////////////////////////////////////////////
// HistEven_OneChannel
GPU_PERF_TEST(HistEven_OneChannel, cv::gpu::DeviceInfo, cv::Size, MatDepth)
{
cv::Size size = GET_PARAM(1);
int depth = GET_PARAM(2);
cv::Mat src(size, depth);
fill(src, 0, 255);
int hbins = 30;
float hranges[] = {0.0f, 180.0f};
cv::Mat hist;
int histSize[] = {hbins};
const float* ranges[] = {hranges};
int channels[] = {0};
cv::calcHist(&src, 1, channels, cv::Mat(), hist, 1, histSize, ranges);
TEST_CYCLE()
{
cv::calcHist(&src, 1, channels, cv::Mat(), hist, 1, histSize, ranges);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, HistEven_OneChannel, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S))));
//////////////////////////////////////////////////////////////////////
// EqualizeHist
GPU_PERF_TEST(EqualizeHist, cv::gpu::DeviceInfo, cv::Size)
{
cv::Size size = GET_PARAM(1);
cv::Mat src(size, CV_8UC1);
fill(src, 0, 255);
cv::Mat dst;
cv::equalizeHist(src, dst);
TEST_CYCLE()
{
cv::equalizeHist(src, dst);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, EqualizeHist, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES));
//////////////////////////////////////////////////////////////////////
// Canny
IMPLEMENT_PARAM_CLASS(AppertureSize, int)
IMPLEMENT_PARAM_CLASS(L2gradient, bool)
GPU_PERF_TEST(Canny, cv::gpu::DeviceInfo, AppertureSize, L2gradient)
{
int apperture_size = GET_PARAM(1);
bool useL2gradient = GET_PARAM(2);
cv::Mat image = readImage("perf/1280x1024.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty());
cv::Mat dst;
cv::Canny(image, dst, 50.0, 100.0, apperture_size, useL2gradient);
TEST_CYCLE()
{
cv::Canny(image, dst, 50.0, 100.0, apperture_size, useL2gradient);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, Canny, testing::Combine(
ALL_DEVICES,
testing::Values(AppertureSize(3), AppertureSize(5)),
testing::Values(L2gradient(false), L2gradient(true))));
//////////////////////////////////////////////////////////////////////
// MeanShiftFiltering
GPU_PERF_TEST_1(MeanShiftFiltering, cv::gpu::DeviceInfo)
{
cv::Mat img = readImage("gpu/meanshift/cones.png");
ASSERT_FALSE(img.empty());
cv::Mat dst;
cv::pyrMeanShiftFiltering(img, dst, 50, 50);
declare.time(15.0);
TEST_CYCLE()
{
cv::pyrMeanShiftFiltering(img, dst, 50, 50);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, MeanShiftFiltering, ALL_DEVICES);
//////////////////////////////////////////////////////////////////////
// Convolve
IMPLEMENT_PARAM_CLASS(KSize, int)
IMPLEMENT_PARAM_CLASS(Ccorr, bool)
GPU_PERF_TEST(Convolve, cv::gpu::DeviceInfo, cv::Size, KSize, Ccorr)
{
cv::Size size = GET_PARAM(1);
int templ_size = GET_PARAM(2);
bool ccorr = GET_PARAM(3);
ASSERT_FALSE(ccorr);
cv::Mat image(size, CV_32FC1);
image.setTo(1.0);
cv::Mat templ(templ_size, templ_size, CV_32FC1);
templ.setTo(1.0);
cv::Mat dst;
cv::filter2D(image, dst, image.depth(), templ);
declare.time(10.0);
TEST_CYCLE()
{
cv::filter2D(image, dst, image.depth(), templ);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, Convolve, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(KSize(3), KSize(9), KSize(17), KSize(27), KSize(32), KSize(64)),
testing::Values(Ccorr(false), Ccorr(true))));
////////////////////////////////////////////////////////////////////////////////
// MatchTemplate_8U
CV_ENUM(TemplateMethod, cv::TM_SQDIFF, cv::TM_SQDIFF_NORMED, cv::TM_CCORR, cv::TM_CCORR_NORMED, cv::TM_CCOEFF, cv::TM_CCOEFF_NORMED)
#define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_CCOEFF_NORMED))
IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size)
GPU_PERF_TEST(MatchTemplate_8U, cv::gpu::DeviceInfo, cv::Size, TemplateSize, Channels, TemplateMethod)
{
cv::Size size = GET_PARAM(1);
cv::Size templ_size = GET_PARAM(2);
int cn = GET_PARAM(3);
int method = GET_PARAM(4);
cv::Mat image(size, CV_MAKE_TYPE(CV_8U, cn));
fill(image, 0, 255);
cv::Mat templ(templ_size, CV_MAKE_TYPE(CV_8U, cn));
fill(templ, 0, 255);
cv::Mat dst;
cv::matchTemplate(image, templ, dst, method);
TEST_CYCLE()
{
cv::matchTemplate(image, templ, dst, method);
}
};
INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate_8U, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
testing::Values(Channels(1), Channels(3), Channels(4)),
ALL_TEMPLATE_METHODS));
////////////////////////////////////////////////////////////////////////////////
// MatchTemplate_32F
GPU_PERF_TEST(MatchTemplate_32F, cv::gpu::DeviceInfo, cv::Size, TemplateSize, Channels, TemplateMethod)
{
cv::Size size = GET_PARAM(1);
cv::Size templ_size = GET_PARAM(2);
int cn = GET_PARAM(3);
int method = GET_PARAM(4);
cv::Mat image(size, CV_MAKE_TYPE(CV_32F, cn));
fill(image, 0, 255);
cv::Mat templ(templ_size, CV_MAKE_TYPE(CV_32F, cn));
fill(templ, 0, 255);
cv::Mat dst;
cv::matchTemplate(image, templ, dst, method);
TEST_CYCLE()
{
cv::matchTemplate(image, templ, dst, method);
}
};
INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate_32F, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
testing::Values(Channels(1), Channels(3), Channels(4)),
testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
//////////////////////////////////////////////////////////////////////
// MulSpectrums
CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)
GPU_PERF_TEST(MulSpectrums, cv::gpu::DeviceInfo, cv::Size, DftFlags)
{
cv::Size size = GET_PARAM(1);
int flag = GET_PARAM(2);
cv::Mat a(size, CV_32FC2);
fill(a, 0, 100);
cv::Mat b(size, CV_32FC2);
fill(b, 0, 100);
cv::Mat dst;
cv::mulSpectrums(a, b, dst, flag);
TEST_CYCLE()
{
cv::mulSpectrums(a, b, dst, flag);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, MulSpectrums, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(DftFlags(0), DftFlags(cv::DFT_ROWS))));
//////////////////////////////////////////////////////////////////////
// Dft
GPU_PERF_TEST(Dft, cv::gpu::DeviceInfo, cv::Size, DftFlags)
{
cv::Size size = GET_PARAM(1);
int flag = GET_PARAM(2);
cv::Mat src(size, CV_32FC2);
fill(src, 0, 100);
cv::Mat dst;
cv::dft(src, dst, flag);
declare.time(10.0);
TEST_CYCLE()
{
cv::dft(src, dst, flag);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, Dft, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(DftFlags(0), DftFlags(cv::DFT_ROWS), DftFlags(cv::DFT_INVERSE))));
//////////////////////////////////////////////////////////////////////
// CornerHarris
IMPLEMENT_PARAM_CLASS(BlockSize, int)
IMPLEMENT_PARAM_CLASS(ApertureSize, int)
GPU_PERF_TEST(CornerHarris, cv::gpu::DeviceInfo, MatType, BorderMode, BlockSize, ApertureSize)
{
int type = GET_PARAM(1);
int borderType = GET_PARAM(2);
int blockSize = GET_PARAM(3);
int apertureSize = GET_PARAM(4);
cv::Mat img = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
img.convertTo(img, type, type == CV_32F ? 1.0 / 255.0 : 1.0);
cv::Mat dst;
double k = 0.5;
cv::cornerHarris(img, dst, blockSize, apertureSize, k, borderType);
TEST_CYCLE()
{
cv::cornerHarris(img, dst, blockSize, apertureSize, k, borderType);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, CornerHarris, testing::Combine(
ALL_DEVICES,
testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_REFLECT)),
testing::Values(BlockSize(3), BlockSize(5), BlockSize(7)),
testing::Values(ApertureSize(0), ApertureSize(3), ApertureSize(5), ApertureSize(7))));
//////////////////////////////////////////////////////////////////////
// CornerMinEigenVal
GPU_PERF_TEST(CornerMinEigenVal, cv::gpu::DeviceInfo, MatType, BorderMode, BlockSize, ApertureSize)
{
int type = GET_PARAM(1);
int borderType = GET_PARAM(2);
int blockSize = GET_PARAM(3);
int apertureSize = GET_PARAM(4);
cv::Mat img = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
img.convertTo(img, type, type == CV_32F ? 1.0 / 255.0 : 1.0);
cv::Mat dst;
cv::cornerMinEigenVal(img, dst, blockSize, apertureSize, borderType);
TEST_CYCLE()
{
cv::cornerMinEigenVal(img, dst, blockSize, apertureSize, borderType);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, CornerMinEigenVal, testing::Combine(
ALL_DEVICES,
testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_REFLECT)),
testing::Values(BlockSize(3), BlockSize(5), BlockSize(7)),
testing::Values(ApertureSize(0), ApertureSize(3), ApertureSize(5), ApertureSize(7))));
//////////////////////////////////////////////////////////////////////
// PyrDown
GPU_PERF_TEST(PyrDown, cv::gpu::DeviceInfo, cv::Size, MatType)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::Mat src(size, type);
fill(src, 0, 255);
cv::Mat dst;
cv::pyrDown(src, dst);
TEST_CYCLE()
{
cv::pyrDown(src, dst);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, PyrDown, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4))));
//////////////////////////////////////////////////////////////////////
// PyrUp
GPU_PERF_TEST(PyrUp, cv::gpu::DeviceInfo, cv::Size, MatType)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::Mat src(size, type);
fill(src, 0, 255);
cv::Mat dst;
cv::pyrUp(src, dst);
TEST_CYCLE()
{
cv::pyrUp(src, dst);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, PyrUp, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4))));
//////////////////////////////////////////////////////////////////////
// CvtColor
GPU_PERF_TEST(CvtColor, cv::gpu::DeviceInfo, cv::Size, MatDepth, CvtColorInfo)
{
cv::Size size = GET_PARAM(1);
int depth = GET_PARAM(2);
CvtColorInfo info = GET_PARAM(3);
cv::Mat src(size, CV_MAKETYPE(depth, info.scn));
fill(src, 0, 255);
cv::Mat dst;
cv::cvtColor(src, dst, info.code, info.dcn);
TEST_CYCLE()
{
cv::cvtColor(src, dst, info.code, info.dcn);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)),
testing::Values(CvtColorInfo(4, 4, cv::COLOR_RGBA2BGRA),
CvtColorInfo(4, 1, cv::COLOR_BGRA2GRAY),
CvtColorInfo(1, 4, cv::COLOR_GRAY2BGRA),
CvtColorInfo(3, 3, cv::COLOR_BGR2XYZ),
CvtColorInfo(3, 3, cv::COLOR_XYZ2BGR),
CvtColorInfo(3, 3, cv::COLOR_BGR2YCrCb),
CvtColorInfo(3, 3, cv::COLOR_YCrCb2BGR),
CvtColorInfo(3, 3, cv::COLOR_BGR2YUV),
CvtColorInfo(3, 3, cv::COLOR_YUV2BGR),
CvtColorInfo(3, 3, cv::COLOR_BGR2HSV),
CvtColorInfo(3, 3, cv::COLOR_HSV2BGR),
CvtColorInfo(3, 3, cv::COLOR_BGR2HLS),
CvtColorInfo(3, 3, cv::COLOR_HLS2BGR),
CvtColorInfo(3, 3, cv::COLOR_BGR2Lab),
CvtColorInfo(3, 3, cv::COLOR_RGB2Lab),
CvtColorInfo(3, 3, cv::COLOR_BGR2Luv),
CvtColorInfo(3, 3, cv::COLOR_RGB2Luv),
CvtColorInfo(3, 3, cv::COLOR_Lab2BGR),
CvtColorInfo(3, 3, cv::COLOR_Lab2RGB),
CvtColorInfo(3, 3, cv::COLOR_Luv2BGR),
CvtColorInfo(3, 3, cv::COLOR_Luv2RGB),
CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR),
CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR),
CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR),
CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR),
CvtColorInfo(4, 4, cv::COLOR_RGBA2mRGBA))));
//////////////////////////////////////////////////////////////////////
// HoughLines
IMPLEMENT_PARAM_CLASS(DoSort, bool)
GPU_PERF_TEST(HoughLines, cv::gpu::DeviceInfo, cv::Size, DoSort)
{
declare.time(30.0);
const cv::Size size = GET_PARAM(1);
const float rho = 1.0f;
const float theta = CV_PI / 180.0f;
const int threshold = 300;
cv::RNG rng(123456789);
cv::Mat src(size, CV_8UC1, cv::Scalar::all(0));
const int numLines = rng.uniform(500, 2000);
for (int i = 0; i < numLines; ++i)
{
cv::Point p1(rng.uniform(0, src.cols), rng.uniform(0, src.rows));
cv::Point p2(rng.uniform(0, src.cols), rng.uniform(0, src.rows));
cv::line(src, p1, p2, cv::Scalar::all(255), 2);
}
std::vector<cv::Vec2f> lines;
cv::HoughLines(src, lines, rho, theta, threshold);
TEST_CYCLE()
{
cv::HoughLines(src, lines, rho, theta, threshold);
}
}
INSTANTIATE_TEST_CASE_P(ImgProc, HoughLines, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(DoSort(false), DoSort(true))));
#endif

View File

@ -1,158 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//M*/
#include "perf_precomp.hpp"
#ifdef HAVE_CUDA
namespace {
struct GreedyLabeling
{
struct dot
{
int x;
int y;
static dot make(int i, int j)
{
dot d; d.x = i; d.y = j;
return d;
}
};
struct InInterval
{
InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {};
const int lo, hi;
bool operator() (const unsigned char a, const unsigned char b) const
{
int d = a - b;
return lo <= d && d <= hi;
}
};
GreedyLabeling(cv::Mat img)
: image(img), _labels(image.size(), CV_32SC1, cv::Scalar::all(-1)) {stack = new dot[image.cols * image.rows];}
~GreedyLabeling(){delete[] stack;}
void operator() (cv::Mat labels) const
{
labels.setTo(cv::Scalar::all(-1));
InInterval inInt(0, 2);
int cc = -1;
int* dist_labels = (int*)labels.data;
int pitch = labels.step1();
unsigned char* source = (unsigned char*)image.data;
int width = image.cols;
int height = image.rows;
for (int j = 0; j < image.rows; ++j)
for (int i = 0; i < image.cols; ++i)
{
if (dist_labels[j * pitch + i] != -1) continue;
dot* top = stack;
dot p = dot::make(i, j);
cc++;
dist_labels[j * pitch + i] = cc;
while (top >= stack)
{
int* dl = &dist_labels[p.y * pitch + p.x];
unsigned char* sp = &source[p.y * image.step1() + p.x];
dl[0] = cc;
//right
if( p.x < (width - 1) && dl[ +1] == -1 && inInt(sp[0], sp[+1]))
*top++ = dot::make(p.x + 1, p.y);
//left
if( p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1]))
*top++ = dot::make(p.x - 1, p.y);
//bottom
if( p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+image.step1()]))
*top++ = dot::make(p.x, p.y + 1);
//top
if( p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-image.step1()]))
*top++ = dot::make(p.x, p.y - 1);
p = *--top;
}
}
}
cv::Mat image;
cv::Mat _labels;
dot* stack;
};
}
GPU_PERF_TEST(ConnectedComponents, cv::gpu::DeviceInfo, cv::Size)
{
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
cv::gpu::setDevice(devInfo.deviceID());
cv::Mat image = readImage("gpu/labeling/aloe-disp.png", cv::IMREAD_GRAYSCALE);
GreedyLabeling host(image);
host(host._labels);
declare.time(1.0);
TEST_CYCLE()
{
host(host._labels);
}
}
INSTANTIATE_TEST_CASE_P(Labeling, ConnectedComponents, testing::Combine(ALL_DEVICES, testing::Values(cv::Size(261, 262))));
#endif

View File

@ -1,20 +0,0 @@
#include "perf_cpu_precomp.hpp"
#ifdef HAVE_CUDA
int main(int argc, char **argv)
{
testing::InitGoogleTest(&argc, argv);
perf::TestBase::Init(argc, argv);
return RUN_ALL_TESTS();
}
#else
int main()
{
printf("OpenCV was built without CUDA support\n");
return 0;
}
#endif

View File

@ -1,124 +0,0 @@
#include "perf_cpu_precomp.hpp"
#ifdef HAVE_CUDA
//////////////////////////////////////////////////////////////////////
// SetTo
GPU_PERF_TEST(SetTo, cv::gpu::DeviceInfo, cv::Size, MatType)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::Mat src(size, type);
cv::Scalar val(1, 2, 3, 4);
src.setTo(val);
TEST_CYCLE()
{
src.setTo(val);
}
}
INSTANTIATE_TEST_CASE_P(MatOp, SetTo, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
//////////////////////////////////////////////////////////////////////
// SetToMasked
GPU_PERF_TEST(SetToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::Mat src(size, type);
fill(src, 0, 255);
cv::Mat mask(size, CV_8UC1);
fill(mask, 0, 2);
cv::Scalar val(1, 2, 3, 4);
src.setTo(val, mask);
TEST_CYCLE()
{
src.setTo(val, mask);
}
}
INSTANTIATE_TEST_CASE_P(MatOp, SetToMasked, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
//////////////////////////////////////////////////////////////////////
// CopyToMasked
GPU_PERF_TEST(CopyToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
{
cv::Size size = GET_PARAM(1);
int type = GET_PARAM(2);
cv::Mat src(size, type);
fill(src, 0, 255);
cv::Mat mask(size, CV_8UC1);
fill(mask, 0, 2);
cv::Mat dst;
src.copyTo(dst, mask);
TEST_CYCLE()
{
src.copyTo(dst, mask);
}
}
INSTANTIATE_TEST_CASE_P(MatOp, CopyToMasked, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
//////////////////////////////////////////////////////////////////////
// ConvertTo
GPU_PERF_TEST(ConvertTo, cv::gpu::DeviceInfo, cv::Size, MatDepth, MatDepth)
{
cv::Size size = GET_PARAM(1);
int depth1 = GET_PARAM(2);
int depth2 = GET_PARAM(3);
cv::Mat src(size, depth1);
fill(src, 0, 255);
cv::Mat dst;
src.convertTo(dst, depth2, 0.5, 1.0);
TEST_CYCLE()
{
src.convertTo(dst, depth2, 0.5, 1.0);
}
}
INSTANTIATE_TEST_CASE_P(MatOp, ConvertTo, testing::Combine(
ALL_DEVICES,
GPU_TYPICAL_MAT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F)),
testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F))));
#endif

View File

@ -1,74 +0,0 @@
#include "perf_cpu_precomp.hpp"
#ifdef HAVE_CUDA
///////////////////////////////////////////////////////////////
// HOG
GPU_PERF_TEST_1(HOG, cv::gpu::DeviceInfo)
{
cv::Mat img = readImage("gpu/hog/road.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
std::vector<cv::Rect> found_locations;
cv::HOGDescriptor hog;
hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
hog.detectMultiScale(img, found_locations);
TEST_CYCLE()
{
hog.detectMultiScale(img, found_locations);
}
}
INSTANTIATE_TEST_CASE_P(ObjDetect, HOG, ALL_DEVICES);
///////////////////////////////////////////////////////////////
// HaarClassifier
GPU_PERF_TEST_1(HaarClassifier, cv::gpu::DeviceInfo)
{
cv::Mat img = readImage("gpu/haarcascade/group_1_640x480_VGA.pgm", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));
std::vector<cv::Rect> rects;
cascade.detectMultiScale(img, rects);
TEST_CYCLE()
{
cascade.detectMultiScale(img, rects);
}
}
INSTANTIATE_TEST_CASE_P(ObjDetect, HaarClassifier, ALL_DEVICES);
//===================== LBP cascade ==========================//
GPU_PERF_TEST_1(LBPClassifier, cv::gpu::DeviceInfo)
{
cv::Mat img = readImage("gpu/haarcascade/group_1_640x480_VGA.pgm", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
std::vector<cv::Rect> rects;
cascade.detectMultiScale(img, rects);
TEST_CYCLE()
{
cascade.detectMultiScale(img, rects);
}
}
INSTANTIATE_TEST_CASE_P(ObjDetect, LBPClassifier, ALL_DEVICES);
#endif

View File

@ -1,220 +0,0 @@
#include "perf_cpu_precomp.hpp"
using namespace std;
using namespace cv;
using namespace cv::gpu;
void fill(Mat& m, double a, double b)
{
RNG rng(123456789);
rng.fill(m, RNG::UNIFORM, a, b);
}
void PrintTo(const CvtColorInfo& info, ostream* os)
{
static const char* str[] =
{
"BGR2BGRA",
"BGRA2BGR",
"BGR2RGBA",
"RGBA2BGR",
"BGR2RGB",
"BGRA2RGBA",
"BGR2GRAY",
"RGB2GRAY",
"GRAY2BGR",
"GRAY2BGRA",
"BGRA2GRAY",
"RGBA2GRAY",
"BGR2BGR565",
"RGB2BGR565",
"BGR5652BGR",
"BGR5652RGB",
"BGRA2BGR565",
"RGBA2BGR565",
"BGR5652BGRA",
"BGR5652RGBA",
"GRAY2BGR565",
"BGR5652GRAY",
"BGR2BGR555",
"RGB2BGR555",
"BGR5552BGR",
"BGR5552RGB",
"BGRA2BGR555",
"RGBA2BGR555",
"BGR5552BGRA",
"BGR5552RGBA",
"GRAY2BGR555",
"BGR5552GRAY",
"BGR2XYZ",
"RGB2XYZ",
"XYZ2BGR",
"XYZ2RGB",
"BGR2YCrCb",
"RGB2YCrCb",
"YCrCb2BGR",
"YCrCb2RGB",
"BGR2HSV",
"RGB2HSV",
"",
"",
"BGR2Lab",
"RGB2Lab",
"BayerBG2BGR",
"BayerGB2BGR",
"BayerRG2BGR",
"BayerGR2BGR",
"BGR2Luv",
"RGB2Luv",
"BGR2HLS",
"RGB2HLS",
"HSV2BGR",
"HSV2RGB",
"Lab2BGR",
"Lab2RGB",
"Luv2BGR",
"Luv2RGB",
"HLS2BGR",
"HLS2RGB",
"BayerBG2BGR_VNG",
"BayerGB2BGR_VNG",
"BayerRG2BGR_VNG",
"BayerGR2BGR_VNG",
"BGR2HSV_FULL",
"RGB2HSV_FULL",
"BGR2HLS_FULL",
"RGB2HLS_FULL",
"HSV2BGR_FULL",
"HSV2RGB_FULL",
"HLS2BGR_FULL",
"HLS2RGB_FULL",
"LBGR2Lab",
"LRGB2Lab",
"LBGR2Luv",
"LRGB2Luv",
"Lab2LBGR",
"Lab2LRGB",
"Luv2LBGR",
"Luv2LRGB",
"BGR2YUV",
"RGB2YUV",
"YUV2BGR",
"YUV2RGB",
"BayerBG2GRAY",
"BayerGB2GRAY",
"BayerRG2GRAY",
"BayerGR2GRAY",
//YUV 4:2:0 formats family
"YUV2RGB_NV12",
"YUV2BGR_NV12",
"YUV2RGB_NV21",
"YUV2BGR_NV21",
"YUV2RGBA_NV12",
"YUV2BGRA_NV12",
"YUV2RGBA_NV21",
"YUV2BGRA_NV21",
"YUV2RGB_YV12",
"YUV2BGR_YV12",
"YUV2RGB_IYUV",
"YUV2BGR_IYUV",
"YUV2RGBA_YV12",
"YUV2BGRA_YV12",
"YUV2RGBA_IYUV",
"YUV2BGRA_IYUV",
"YUV2GRAY_420",
//YUV 4:2:2 formats family
"YUV2RGB_UYVY",
"YUV2BGR_UYVY",
"YUV2RGB_VYUY",
"YUV2BGR_VYUY",
"YUV2RGBA_UYVY",
"YUV2BGRA_UYVY",
"YUV2RGBA_VYUY",
"YUV2BGRA_VYUY",
"YUV2RGB_YUY2",
"YUV2BGR_YUY2",
"YUV2RGB_YVYU",
"YUV2BGR_YVYU",
"YUV2RGBA_YUY2",
"YUV2BGRA_YUY2",
"YUV2RGBA_YVYU",
"YUV2BGRA_YVYU",
"YUV2GRAY_UYVY",
"YUV2GRAY_YUY2",
// alpha premultiplication
"RGBA2mRGBA",
"mRGBA2RGBA",
"COLORCVT_MAX"
};
*os << str[info.code];
}
void cv::gpu::PrintTo(const DeviceInfo& info, ostream* os)
{
*os << info.name();
}
Mat readImage(const string& fileName, int flags)
{
return imread(perf::TestBase::getDataPath(fileName), flags);
}
const vector<DeviceInfo>& devices()
{
static vector<DeviceInfo> devs;
static bool first = true;
if (first)
{
int deviceCount = getCudaEnabledDeviceCount();
devs.reserve(deviceCount);
for (int i = 0; i < deviceCount; ++i)
{
DeviceInfo info(i);
if (info.isCompatible())
devs.push_back(info);
}
first = false;
}
return devs;
}

View File

@ -1,77 +0,0 @@
#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__
#define __OPENCV_PERF_GPU_UTILITY_HPP__
void fill(cv::Mat& m, double a, double b);
using perf::MatType;
using perf::MatDepth;
CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING)
struct CvtColorInfo
{
int scn;
int dcn;
int code;
explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
};
void PrintTo(const CvtColorInfo& info, std::ostream* os);
#define IMPLEMENT_PARAM_CLASS(name, type) \
class name \
{ \
public: \
name ( type arg = type ()) : val_(arg) {} \
operator type () const {return val_;} \
private: \
type val_; \
}; \
inline void PrintTo( name param, std::ostream* os) \
{ \
*os << #name << " = " << testing::PrintToString(static_cast< type >(param)); \
}
IMPLEMENT_PARAM_CLASS(Channels, int)
namespace cv { namespace gpu
{
void PrintTo(const cv::gpu::DeviceInfo& info, std::ostream* os);
}}
#define GPU_PERF_TEST(name, ...) \
struct name : perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > \
{ \
public: \
name() {} \
protected: \
void PerfTestBody(); \
}; \
TEST_P(name, perf){ RunPerfTestBody(); } \
void name :: PerfTestBody()
#define GPU_PERF_TEST_1(name, param_type) \
struct name : perf::TestBaseWithParam< param_type > \
{ \
public: \
name() {} \
protected: \
void PerfTestBody(); \
}; \
TEST_P(name, perf){ RunPerfTestBody(); } \
void name :: PerfTestBody()
#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::szSXGA, perf::sz1080p, cv::Size(1800, 1500))
cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
const std::vector<cv::gpu::DeviceInfo>& devices();
#define ALL_DEVICES testing::ValuesIn(devices())
#define GET_PARAM(k) std::tr1::get< k >(GetParam())
#endif // __OPENCV_PERF_GPU_UTILITY_HPP__

View File

@ -1,466 +0,0 @@
#include "perf_cpu_precomp.hpp"
#ifdef HAVE_CUDA
//////////////////////////////////////////////////////
// GoodFeaturesToTrack
IMPLEMENT_PARAM_CLASS(MinDistance, double)
GPU_PERF_TEST(GoodFeaturesToTrack, cv::gpu::DeviceInfo, MinDistance)
{
double minDistance = GET_PARAM(1);
cv::Mat image = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty());
cv::Mat corners;
cv::goodFeaturesToTrack(image, corners, 8000, 0.01, minDistance);
TEST_CYCLE()
{
cv::goodFeaturesToTrack(image, corners, 8000, 0.01, minDistance);
}
}
INSTANTIATE_TEST_CASE_P(Video, GoodFeaturesToTrack, testing::Combine(
ALL_DEVICES,
testing::Values(MinDistance(0.0), MinDistance(3.0))));
//////////////////////////////////////////////////////
// PyrLKOpticalFlowSparse
IMPLEMENT_PARAM_CLASS(GraySource, bool)
IMPLEMENT_PARAM_CLASS(Points, int)
IMPLEMENT_PARAM_CLASS(WinSize, int)
IMPLEMENT_PARAM_CLASS(Levels, int)
IMPLEMENT_PARAM_CLASS(Iters, int)
GPU_PERF_TEST(PyrLKOpticalFlowSparse, cv::gpu::DeviceInfo, GraySource, Points, WinSize, Levels, Iters)
{
bool useGray = GET_PARAM(1);
int points = GET_PARAM(2);
int win_size = GET_PARAM(3);
int levels = GET_PARAM(4);
int iters = GET_PARAM(5);
cv::Mat frame0 = readImage("gpu/opticalflow/frame0.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("gpu/opticalflow/frame1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame1.empty());
cv::Mat gray_frame;
if (useGray)
gray_frame = frame0;
else
cv::cvtColor(frame0, gray_frame, cv::COLOR_BGR2GRAY);
cv::Mat pts;
cv::goodFeaturesToTrack(gray_frame, pts, points, 0.01, 0.0);
cv::Mat nextPts;
cv::Mat status;
cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, cv::noArray(),
cv::Size(win_size, win_size), levels - 1,
cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, iters, 0.01));
declare.time(20.0);
TEST_CYCLE()
{
cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, cv::noArray(),
cv::Size(win_size, win_size), levels - 1,
cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, iters, 0.01));
}
}
INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlowSparse, testing::Combine(
ALL_DEVICES,
testing::Values(GraySource(true), GraySource(false)),
testing::Values(Points(1000), Points(2000), Points(4000), Points(8000)),
testing::Values(WinSize(9), WinSize(13), WinSize(17), WinSize(21)),
testing::Values(Levels(1), Levels(2), Levels(3)),
testing::Values(Iters(1), Iters(10), Iters(30))));
//////////////////////////////////////////////////////
// FarnebackOpticalFlowTest
GPU_PERF_TEST_1(FarnebackOpticalFlowTest, cv::gpu::DeviceInfo)
{
cv::Mat frame0 = readImage("gpu/opticalflow/frame0.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("gpu/opticalflow/frame1.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(frame1.empty());
cv::Mat flow;
int numLevels = 5;
double pyrScale = 0.5;
int winSize = 13;
int numIters = 10;
int polyN = 5;
double polySigma = 1.1;
int flags = 0;
cv::calcOpticalFlowFarneback(frame0, frame1, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags);
declare.time(10);
TEST_CYCLE()
{
cv::calcOpticalFlowFarneback(frame0, frame1, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags);
}
}
INSTANTIATE_TEST_CASE_P(Video, FarnebackOpticalFlowTest, ALL_DEVICES);
//////////////////////////////////////////////////////
// FGDStatModel
namespace cv
{
template<> void Ptr<CvBGStatModel>::delete_obj()
{
cvReleaseBGStatModel(&obj);
}
}
GPU_PERF_TEST(FGDStatModel, cv::gpu::DeviceInfo, std::string)
{
std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
cv::VideoCapture cap(inputFile);
ASSERT_TRUE(cap.isOpened());
cv::Mat frame;
cap >> frame;
ASSERT_FALSE(frame.empty());
IplImage ipl_frame = frame;
cv::Ptr<CvBGStatModel> model(cvCreateFGDStatModel(&ipl_frame));
declare.time(60);
for (int i = 0; i < 10; ++i)
{
cap >> frame;
ASSERT_FALSE(frame.empty());
ipl_frame = frame;
startTimer();
next();
cvUpdateBGStatModel(&ipl_frame, model);
stopTimer();
}
}
INSTANTIATE_TEST_CASE_P(Video, FGDStatModel, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
//////////////////////////////////////////////////////
// MOG
IMPLEMENT_PARAM_CLASS(LearningRate, double)
GPU_PERF_TEST(MOG, cv::gpu::DeviceInfo, std::string, Channels, LearningRate)
{
std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
int cn = GET_PARAM(2);
double learningRate = GET_PARAM(3);
cv::VideoCapture cap(inputFile);
ASSERT_TRUE(cap.isOpened());
cv::Mat frame;
cv::BackgroundSubtractorMOG mog;
cv::Mat foreground;
cap >> frame;
ASSERT_FALSE(frame.empty());
if (cn != 3)
{
cv::Mat temp;
if (cn == 1)
cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
else
cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
cv::swap(temp, frame);
}
mog(frame, foreground, learningRate);
for (int i = 0; i < 10; ++i)
{
cap >> frame;
ASSERT_FALSE(frame.empty());
if (cn != 3)
{
cv::Mat temp;
if (cn == 1)
cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
else
cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
cv::swap(temp, frame);
}
startTimer(); next();
mog(frame, foreground, learningRate);
stopTimer();
}
}
INSTANTIATE_TEST_CASE_P(Video, MOG, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi")),
testing::Values(Channels(1), Channels(3)/*, Channels(4)*/),
testing::Values(LearningRate(0.0), LearningRate(0.01))));
//////////////////////////////////////////////////////
// MOG2
GPU_PERF_TEST(MOG2_update, cv::gpu::DeviceInfo, std::string, Channels)
{
std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
int cn = GET_PARAM(2);
cv::VideoCapture cap(inputFile);
ASSERT_TRUE(cap.isOpened());
cv::Mat frame;
cv::BackgroundSubtractorMOG2 mog2;
cv::Mat foreground;
cap >> frame;
ASSERT_FALSE(frame.empty());
if (cn != 3)
{
cv::Mat temp;
if (cn == 1)
cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
else
cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
cv::swap(temp, frame);
}
mog2(frame, foreground);
for (int i = 0; i < 10; ++i)
{
cap >> frame;
ASSERT_FALSE(frame.empty());
if (cn != 3)
{
cv::Mat temp;
if (cn == 1)
cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
else
cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
cv::swap(temp, frame);
}
startTimer(); next();
mog2(frame, foreground);
stopTimer();
}
}
INSTANTIATE_TEST_CASE_P(Video, MOG2_update, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi")),
testing::Values(Channels(1), Channels(3)/*, Channels(4)*/)));
GPU_PERF_TEST(MOG2_getBackgroundImage, cv::gpu::DeviceInfo, std::string, Channels)
{
std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
int cn = GET_PARAM(2);
cv::VideoCapture cap(inputFile);
ASSERT_TRUE(cap.isOpened());
cv::Mat frame;
cv::BackgroundSubtractorMOG2 mog2;
cv::Mat foreground;
for (int i = 0; i < 10; ++i)
{
cap >> frame;
ASSERT_FALSE(frame.empty());
if (cn != 3)
{
cv::Mat temp;
if (cn == 1)
cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
else
cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
cv::swap(temp, frame);
}
mog2(frame, foreground);
}
cv::Mat background;
mog2.getBackgroundImage(background);
TEST_CYCLE()
{
mog2.getBackgroundImage(background);
}
}
INSTANTIATE_TEST_CASE_P(Video, MOG2_getBackgroundImage, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi")),
testing::Values(/*Channels(1),*/ Channels(3)/*, Channels(4)*/)));
//////////////////////////////////////////////////////
// GMG
IMPLEMENT_PARAM_CLASS(MaxFeatures, int)
GPU_PERF_TEST(GMG, cv::gpu::DeviceInfo, std::string, Channels, MaxFeatures)
{
std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
int cn = GET_PARAM(2);
int maxFeatures = GET_PARAM(3);
cv::VideoCapture cap(inputFile);
ASSERT_TRUE(cap.isOpened());
cv::Mat frame;
cap >> frame;
ASSERT_FALSE(frame.empty());
if (cn != 3)
{
cv::Mat temp;
if (cn == 1)
cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
else
cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
cv::swap(temp, frame);
}
cv::Mat fgmask;
cv::Mat zeros(frame.size(), CV_8UC1, cv::Scalar::all(0));
cv::BackgroundSubtractorGMG gmg;
gmg.set("maxFeatures", maxFeatures);
gmg.initialize(frame.size(), 0.0, 255.0);
gmg(frame, fgmask);
for (int i = 0; i < 150; ++i)
{
cap >> frame;
if (frame.empty())
{
cap.open(inputFile);
cap >> frame;
}
if (cn != 3)
{
cv::Mat temp;
if (cn == 1)
cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
else
cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
cv::swap(temp, frame);
}
startTimer(); next();
gmg(frame, fgmask);
stopTimer();
}
}
INSTANTIATE_TEST_CASE_P(Video, GMG, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi")),
testing::Values(Channels(1), Channels(3), Channels(4)),
testing::Values(MaxFeatures(20), MaxFeatures(40), MaxFeatures(60))));
//////////////////////////////////////////////////////
// VideoWriter
#ifdef WIN32
GPU_PERF_TEST(VideoWriter, cv::gpu::DeviceInfo, std::string)
{
const double FPS = 25.0;
std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
std::string outputFile = cv::tempfile(".avi");
cv::VideoCapture reader(inputFile);
ASSERT_TRUE( reader.isOpened() );
cv::VideoWriter writer;
cv::Mat frame;
declare.time(30);
for (int i = 0; i < 10; ++i)
{
reader >> frame;
ASSERT_FALSE(frame.empty());
if (!writer.isOpened())
writer.open(outputFile, CV_FOURCC('X', 'V', 'I', 'D'), FPS, frame.size());
startTimer(); next();
writer.write(frame);
stopTimer();
}
}
INSTANTIATE_TEST_CASE_P(Video, VideoWriter, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
#endif // WIN32
//////////////////////////////////////////////////////
// VideoReader
GPU_PERF_TEST(VideoReader, cv::gpu::DeviceInfo, std::string)
{
std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
cv::VideoCapture reader(inputFile);
ASSERT_TRUE( reader.isOpened() );
cv::Mat frame;
reader >> frame;
declare.time(20);
TEST_CYCLE_N(10)
{
reader >> frame;
}
}
INSTANTIATE_TEST_CASE_P(Video, VideoReader, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
#endif

View File

@ -420,16 +420,16 @@ void cv::gpu::BFMatcher_GPU::matchConvert(const Mat& trainIdx, const Mat& imgIdx
const float* distance_ptr = distance.ptr<float>();
for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
{
int trainIdx = *trainIdx_ptr;
int _trainIdx = *trainIdx_ptr;
if (trainIdx == -1)
if (_trainIdx == -1)
continue;
int imgIdx = *imgIdx_ptr;
int _imgIdx = *imgIdx_ptr;
float distance = *distance_ptr;
float _distance = *distance_ptr;
DMatch m(queryIdx, trainIdx, imgIdx, distance);
DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
matches.push_back(m);
}
@ -558,13 +558,13 @@ void cv::gpu::BFMatcher_GPU::knnMatchConvert(const Mat& trainIdx, const Mat& dis
for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
{
int trainIdx = *trainIdx_ptr;
int _trainIdx = *trainIdx_ptr;
if (trainIdx != -1)
if (_trainIdx != -1)
{
float distance = *distance_ptr;
float _distance = *distance_ptr;
DMatch m(queryIdx, trainIdx, 0, distance);
DMatch m(queryIdx, _trainIdx, 0, _distance);
curMatches.push_back(m);
}
@ -680,15 +680,15 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Convert(const Mat& trainIdx, const Mat& im
for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
{
int trainIdx = *trainIdx_ptr;
int _trainIdx = *trainIdx_ptr;
if (trainIdx != -1)
if (_trainIdx != -1)
{
int imgIdx = *imgIdx_ptr;
int _imgIdx = *imgIdx_ptr;
float distance = *distance_ptr;
float _distance = *distance_ptr;
DMatch m(queryIdx, trainIdx, imgIdx, distance);
DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
curMatches.push_back(m);
}
@ -868,25 +868,25 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&
const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
const float* distance_ptr = distance.ptr<float>(queryIdx);
const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
if (nMatches == 0)
if (nMatched == 0)
{
if (!compactResult)
matches.push_back(vector<DMatch>());
continue;
}
matches.push_back(vector<DMatch>(nMatches));
matches.push_back(vector<DMatch>(nMatched));
vector<DMatch>& curMatches = matches.back();
for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)
for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++distance_ptr)
{
int trainIdx = *trainIdx_ptr;
int _trainIdx = *trainIdx_ptr;
float distance = *distance_ptr;
float _distance = *distance_ptr;
DMatch m(queryIdx, trainIdx, 0, distance);
DMatch m(queryIdx, _trainIdx, 0, _distance);
curMatches[i] = m;
}
@ -1009,9 +1009,9 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&
const int* imgIdx_ptr = imgIdx.ptr<int>(queryIdx);
const float* distance_ptr = distance.ptr<float>(queryIdx);
const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
if (nMatches == 0)
if (nMatched == 0)
{
if (!compactResult)
matches.push_back(vector<DMatch>());
@ -1020,9 +1020,9 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&
matches.push_back(vector<DMatch>());
vector<DMatch>& curMatches = matches.back();
curMatches.reserve(nMatches);
curMatches.reserve(nMatched);
for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
{
int _trainIdx = *trainIdx_ptr;
int _imgIdx = *imgIdx_ptr;

View File

@ -214,6 +214,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
int num_iters, float max_dist, int min_inlier_count,
vector<int>* inliers)
{
(void)min_inlier_count;
CV_Assert(object.rows == 1 && object.cols > 0 && object.type() == CV_32FC3);
CV_Assert(image.rows == 1 && image.cols > 0 && image.type() == CV_32FC2);
CV_Assert(object.cols == image.cols);

View File

@ -143,7 +143,7 @@ public:
}
unsigned int process(const GpuMat& image, GpuMat& objectsBuf, float scaleFactor, int minNeighbors,
bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size maxObjectSize)
bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size /*maxObjectSize*/)
{
CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);
@ -380,12 +380,12 @@ public:
LbpCascade(){}
virtual ~LbpCascade(){}
virtual unsigned int process(const GpuMat& image, GpuMat& objects, float scaleFactor, int groupThreshold, bool findLargestObject,
bool visualizeInPlace, cv::Size minObjectSize, cv::Size maxObjectSize)
virtual unsigned int process(const GpuMat& image, GpuMat& objects, float scaleFactor, int groupThreshold, bool /*findLargestObject*/,
bool /*visualizeInPlace*/, cv::Size minObjectSize, cv::Size maxObjectSize)
{
CV_Assert(scaleFactor > 1 && image.depth() == CV_8U);
const int defaultObjSearchNum = 100;
// const int defaultObjSearchNum = 100;
const float grouping_eps = 0.2f;
if( !objects.empty() && objects.depth() == CV_32S)

View File

@ -56,7 +56,7 @@ namespace cv { namespace gpu { namespace device
__global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if __CUDA_ARCH__ >= 110
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[];
@ -168,7 +168,7 @@ namespace cv { namespace gpu { namespace device
__global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if __CUDA_ARCH__ >= 110
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[];

View File

@ -261,7 +261,7 @@ namespace cv { namespace gpu { namespace device
__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
{
#if __CUDA_ARCH__ >= 120
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)
__shared__ int smem[18][18];
@ -358,7 +358,7 @@ namespace cv { namespace gpu { namespace device
__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
{
#if __CUDA_ARCH__ >= 120
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120
const int stack_size = 512;

View File

@ -316,7 +316,7 @@ namespace cv { namespace gpu { namespace device
}
}
changed = Emulation::sycthOr(changed);
changed = Emulation::syncthreadsOr(changed);
if (!changed)
break;
@ -474,7 +474,7 @@ namespace cv { namespace gpu { namespace device
}
}
}
} while (Emulation::sycthOr(changed));
} while (Emulation::syncthreadsOr(changed));
}
__global__ void flatten(const DevMem2D edges, DevMem2Di comps)

View File

@ -64,7 +64,7 @@ namespace cv { namespace gpu { namespace device
template <int KSIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
{
#if __CUDA_ARCH__ >= 200
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int BLOCK_DIM_X = 16;
const int BLOCK_DIM_Y = 16;
const int PATCH_PER_BLOCK = 4;

View File

@ -223,7 +223,7 @@ namespace cv { namespace gpu { namespace device
template <bool calcScore, class Mask>
__global__ void calcKeypoints(const DevMem2Db img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
{
#if __CUDA_ARCH__ >= 110
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
const int j = threadIdx.x + blockIdx.x * blockDim.x + 3;
const int i = threadIdx.y + blockIdx.y * blockDim.y + 3;
@ -325,7 +325,7 @@ namespace cv { namespace gpu { namespace device
__global__ void nonmaxSupression(const short2* kpLoc, int count, const DevMem2Di scoreMat, short2* locFinal, float* responseFinal)
{
#if __CUDA_ARCH__ >= 110
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
const int kpIdx = threadIdx.x + blockIdx.x * blockDim.x;

View File

@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
#define MERGE_THREADBLOCK_SIZE 256
#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
#define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120))
namespace hist
{

View File

@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
{
__shared__ int s_queues[4][32 * PIXELS_PER_THREAD];
__shared__ int s_qsize[4];
__shared__ int s_start[4];
__shared__ int s_globStart[4];
const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -73,9 +73,10 @@ namespace cv { namespace gpu { namespace device
__syncthreads();
// fill the queue
const uchar* srcRow = src.ptr(y);
for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
{
if (src(y, xx))
if (srcRow[xx])
{
const unsigned int val = (y << 16) | xx;
const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
@ -89,36 +90,34 @@ namespace cv { namespace gpu { namespace device
if (threadIdx.x == 0 && threadIdx.y == 0)
{
// find how many items are stored in each list
int total_size = 0;
int totalSize = 0;
for (int i = 0; i < blockDim.y; ++i)
{
s_start[i] = total_size;
total_size += s_qsize[i];
s_globStart[i] = totalSize;
totalSize += s_qsize[i];
}
// calculate the offset in the global list
const int global_offset = atomicAdd(&g_counter, total_size);
const int globalOffset = atomicAdd(&g_counter, totalSize);
for (int i = 0; i < blockDim.y; ++i)
s_start[i] += global_offset;
s_globStart[i] += globalOffset;
}
__syncthreads();
// copy local queues to global queue
const int qsize = s_qsize[threadIdx.y];
for(int i = threadIdx.x; i < qsize; i += blockDim.x)
{
const unsigned int val = s_queues[threadIdx.y][i];
list[s_start[threadIdx.y] + i] = val;
}
int gidx = s_globStart[threadIdx.y] + threadIdx.x;
for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
list[gidx] = s_queues[threadIdx.y][i];
}
int buildPointList_gpu(DevMem2Db src, unsigned int* list)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 4);
const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
@ -130,10 +129,10 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
int total_count;
cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
return total_count;
return totalCount;
}
////////////////////////////////////////////////////////////////////////
@ -144,24 +143,26 @@ namespace cv { namespace gpu { namespace device
const int n = blockIdx.x;
const float ang = n * theta;
float sin_ang;
float cos_ang;
sincosf(ang, &sin_ang, &cos_ang);
float sinVal;
float cosVal;
sincosf(ang, &sinVal, &cosVal);
sinVal *= irho;
cosVal *= irho;
const float tabSin = sin_ang * irho;
const float tabCos = cos_ang * irho;
const int shift = (numrho - 1) / 2;
int* accumRow = accum.ptr(n + 1);
for (int i = threadIdx.x; i < count; i += blockDim.x)
{
const unsigned int qvalue = list[i];
const unsigned int val = list[i];
const int x = (qvalue & 0x0000FFFF);
const int y = (qvalue >> 16) & 0x0000FFFF;
const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF;
int r = __float2int_rn(x * tabCos + y * tabSin);
r += (numrho - 1) / 2;
int r = __float2int_rn(x * cosVal + y * sinVal);
r += shift;
::atomicAdd(accum.ptr(n + 1) + r + 1, 1);
::atomicAdd(accumRow + r + 1, 1);
}
}
@ -177,30 +178,32 @@ namespace cv { namespace gpu { namespace device
const int n = blockIdx.x;
const float ang = n * theta;
float sin_ang;
float cos_ang;
sincosf(ang, &sin_ang, &cos_ang);
float sinVal;
float cosVal;
sincosf(ang, &sinVal, &cosVal);
sinVal *= irho;
cosVal *= irho;
const float tabSin = sin_ang * irho;
const float tabCos = cos_ang * irho;
const int shift = (numrho - 1) / 2;
for (int i = threadIdx.x; i < count; i += blockDim.x)
{
const unsigned int qvalue = list[i];
const unsigned int val = list[i];
const int x = (qvalue & 0x0000FFFF);
const int y = (qvalue >> 16) & 0x0000FFFF;
const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF;
int r = __float2int_rn(x * tabCos + y * tabSin);
r += (numrho - 1) / 2;
int r = __float2int_rn(x * cosVal + y * sinVal);
r += shift;
Emulation::smem::atomicAdd(&smem[r + 1], 1);
}
__syncthreads();
for (int i = threadIdx.x; i < numrho; i += blockDim.x)
accum(n + 1, i) = smem[i];
int* accumRow = accum.ptr(n + 1);
for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
accumRow[i] = smem[i];
}
void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
@ -225,21 +228,21 @@ namespace cv { namespace gpu { namespace device
////////////////////////////////////////////////////////////////////////
// linesGetResult
__global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float threshold, const float theta, const float rho, const int numrho)
__global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const float threshold, const int numrho)
{
__shared__ int smem[8][32];
int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
const int x = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
const int y = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
if (r >= accum.cols || n >= accum.rows)
if (x >= accum.cols || y >= accum.rows)
return;
smem[threadIdx.y][threadIdx.x] = accum(n, r);
smem[threadIdx.y][threadIdx.x] = accum(y, x);
__syncthreads();
r -= 1;
n -= 1;
const int r = x - 1;
const int n = y - 1;
if (threadIdx.x == 0 || threadIdx.x == blockDim.x - 1 || threadIdx.y == 0 || threadIdx.y == blockDim.y - 1 || r >= accum.cols - 2 || n >= accum.rows - 2)
return;
@ -264,32 +267,32 @@ namespace cv { namespace gpu { namespace device
int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2));
linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, threshold, theta, rho, accum.cols - 2);
linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int total_count;
cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
total_count = ::min(total_count, maxSize);
totalCount = ::min(totalCount, maxSize);
if (doSort && total_count > 0)
if (doSort && totalCount > 0)
{
thrust::device_ptr<float2> out_ptr(out);
thrust::device_ptr<int> votes_ptr(votes);
thrust::sort_by_key(votes_ptr, votes_ptr + total_count, out_ptr, thrust::greater<int>());
thrust::device_ptr<float2> outPtr(out);
thrust::device_ptr<int> votesPtr(votes);
thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
}
return total_count;
return totalCount;
}
}
}}}

View File

@ -0,0 +1,385 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
// Utility function to extract unsigned chars from an unsigned integer
__device__ uchar4 int_to_uchar4(unsigned int in)
{
uchar4 bytes;
bytes.x = (in & 0x000000ff) >> 0;
bytes.y = (in & 0x0000ff00) >> 8;
bytes.z = (in & 0x00ff0000) >> 16;
bytes.w = (in & 0xff000000) >> 24;
return bytes;
}
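// e.g. int_to_uchar4(0x04030201) returns (x, y, z, w) = (1, 2, 3, 4)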
__global__ void shfl_integral_horizontal(const PtrStep_<uint4> img, PtrStep_<uint4> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ int sums[128];
const int id = threadIdx.x;
const int lane_id = id % warpSize;
const int warp_id = id / warpSize;
const uint4 data = img(blockIdx.x, id);
const uchar4 a = int_to_uchar4(data.x);
const uchar4 b = int_to_uchar4(data.y);
const uchar4 c = int_to_uchar4(data.z);
const uchar4 d = int_to_uchar4(data.w);
int result[16];
result[0] = a.x;
result[1] = result[0] + a.y;
result[2] = result[1] + a.z;
result[3] = result[2] + a.w;
result[4] = result[3] + b.x;
result[5] = result[4] + b.y;
result[6] = result[5] + b.z;
result[7] = result[6] + b.w;
result[8] = result[7] + c.x;
result[9] = result[8] + c.y;
result[10] = result[9] + c.z;
result[11] = result[10] + c.w;
result[12] = result[11] + d.x;
result[13] = result[12] + d.y;
result[14] = result[13] + d.z;
result[15] = result[14] + d.w;
int sum = result[15];
// the prefix sum for each thread's 16 values is computed,
// now the final sums (result[15]) need to be shared
// with the other threads and added. To do this,
// the __shfl_up() instruction is used and a shuffle scan
// operation is performed to distribute the sums to the correct
// threads
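// For example, with four lanes holding per-thread totals {a, b, c, d}:
//   i = 1: lanes 1..3 add the value from one lane below  -> {a, a+b, b+c, c+d}
//   i = 2: lanes 2..3 add the value from two lanes below -> {a, a+b, a+b+c, a+b+c+d}
// leaving each lane with the inclusive prefix sum of the warp totals.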
#pragma unroll
for (int i = 1; i < 32; i *= 2)
{
const int n = __shfl_up(sum, i, 32);
if (lane_id >= i)
{
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += n;
sum += n;
}
}
// Now the final sum for the warp must be shared
// between warps. This is done by each warp
// having a thread store to shared memory, then
// having some other warp load the values and
// compute a prefix sum, again by using __shfl_up.
// The results are uniformly added back to the warps.
// The last thread in each warp, which holds the warp's sum,
// places it in shared memory
if (threadIdx.x % warpSize == warpSize - 1)
sums[warp_id] = result[15];
__syncthreads();
if (warp_id == 0)
{
int warp_sum = sums[lane_id];
#pragma unroll
for (int i = 1; i <= 32; i *= 2)
{
const int n = __shfl_up(warp_sum, i, 32);
if (lane_id >= i)
warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
int blockSum = 0;
// fold in the cumulative sum of the preceding warps
if (warp_id > 0)
{
blockSum = sums[warp_id - 1];
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += blockSum;
}
// assemble result
// Each thread has 16 values to write, which are
// now integer data (to avoid overflow). Instead of
// each thread writing consecutive uint4s, the
// approach shown here uses the shuffle command
// to reformat the data inside the registers so that
// each thread holds consecutive data to be written, so larger contiguous
// segments can be assembled for writing.
/*
For example data that needs to be written as
GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
but is stored in registers (r0..r3), in four threads (0..3) as:
threadId 0 1 2 3
r0 x0 y0 z0 w0
r1 x1 y1 z1 w1
r2 x2 y2 z2 w2
r3 x3 y3 z3 w3
after applying __shfl_xor operations to move data between registers r1..r3:
threadId 00 01 10 11
x0 y0 z0 w0
xor(01)->y1 x1 w1 z1
xor(10)->z2 w2 x2 y2
xor(11)->w3 z3 y3 x3
and now x0..x3, and z0..z3 can be written out in order by all threads.
In the current code, each register above is actually representing
four integers to be written as uint4's to GMEM.
*/
result[4] = __shfl_xor(result[4] , 1, 32);
result[5] = __shfl_xor(result[5] , 1, 32);
result[6] = __shfl_xor(result[6] , 1, 32);
result[7] = __shfl_xor(result[7] , 1, 32);
result[8] = __shfl_xor(result[8] , 2, 32);
result[9] = __shfl_xor(result[9] , 2, 32);
result[10] = __shfl_xor(result[10], 2, 32);
result[11] = __shfl_xor(result[11], 2, 32);
result[12] = __shfl_xor(result[12], 3, 32);
result[13] = __shfl_xor(result[13], 3, 32);
result[14] = __shfl_xor(result[14], 3, 32);
result[15] = __shfl_xor(result[15], 3, 32);
uint4* integral_row = integral.ptr(blockIdx.x);
uint4 output;
///////
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
// continuing from the above example,
// this use of __shfl_xor() places the y0..y3 and w0..w3 data
// in order.
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] = __shfl_xor(result[i], 1, 32);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
#endif
}
// This kernel computes columnwise prefix sums. When the data input is
// the row sums from above, this completes the integral image.
// The approach here is to have each block compute a local set of sums.
// First, the data covered by the block is loaded into shared memory,
// then instead of performing a sum in shared memory using __syncthreads
// between stages, the data is reformatted so that the necessary sums
// occur inside warps and the shuffle scan operation is used.
// The final set of sums from the block is then propagated, with the block
// computing "down" the image and adding the running sum to the local
// block sums.
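// For example, the 2x2 image [[1, 2], [3, 4]] has row sums [[1, 3], [3, 7]];
// columnwise prefix sums of those give [[1, 3], [4, 10]], the inclusive integral image.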
__global__ void shfl_integral_vertical(DevMem2D_<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
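// stepSum carries the running column total from one blockDim.y-high strip of rows to the next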
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = integral.ptr(y) + tidx;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM
// shfl-scan reduce the SMEM, reformatting so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
int partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
*p = sum;
}
#endif
}
void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream)
{
{
// each thread handles 16 values, use 1 block/row
const int block = img.cols / 16;
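// (this assumes img.cols is a multiple of 16, since each thread consumes one 16-byte uint4)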
// launch 1 block / row
const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((DevMem2D_<uint4>) img, (DevMem2D_<uint4>) integral);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}

View File

@ -279,7 +279,7 @@ namespace cv { namespace gpu { namespace device
rect.z = __float2int_rn(windowW * scale);
rect.w = __float2int_rn(windowH * scale);
int res = Emulation::smem::atomicInc(classified, (unsigned int)objects.cols);
int res = atomicInc(classified, (unsigned int)objects.cols);
objects(0, res) = rect;
}
}

View File

@ -215,7 +215,7 @@ namespace cv { namespace gpu { namespace device
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
}
#if __CUDA_ARCH__ >= 110
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
__shared__ bool is_last;
if (tid == 0)
@ -535,7 +535,7 @@ namespace cv { namespace gpu { namespace device
findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
#if __CUDA_ARCH__ >= 110
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
__shared__ bool is_last;
if (tid == 0)
@ -841,7 +841,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, uint>(scount, tid);
#if __CUDA_ARCH__ >= 110
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
__shared__ bool is_last;
if (tid == 0)
@ -1034,7 +1034,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, R>(smem, tid);
#if __CUDA_ARCH__ >= 110
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
__shared__ bool is_last;
if (tid == 0)
@ -1115,7 +1115,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, R>(smem, tid);
sumInSmem<nthreads, R>(smem + nthreads, tid);
#if __CUDA_ARCH__ >= 110
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
__shared__ bool is_last;
if (tid == 0)
@ -1222,7 +1222,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, R>(smem + nthreads, tid);
sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
#if __CUDA_ARCH__ >= 110
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
@ -1339,7 +1339,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
#if __CUDA_ARCH__ >= 110
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
__shared__ bool is_last;
if (tid == 0)
@ -1975,7 +1975,7 @@ namespace cv { namespace gpu { namespace device
for (int c = 0; c < cn; ++c)
myVal[c] = op.startValue();
#if __CUDA_ARCH__ >= 200
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
// For cc >= 2.0 prefer L1 cache
for (int x = threadIdx.x; x < src.cols; x += 256)

View File

@ -82,7 +82,7 @@ namespace cv { namespace gpu { namespace device
smem3[tid] = val3;
__syncthreads();
#if __CUDA_ARCH__ > 110
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
smem2[tid] = val2;
__syncthreads();
#if __CUDA_ARCH__ > 110
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
@ -184,7 +184,7 @@ namespace cv { namespace gpu { namespace device
smem1[tid] = val1;
__syncthreads();
#if __CUDA_ARCH__ > 110
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
if (tid < 128)
{
smem1[tid] = val1 += smem1[tid + 128];
@ -271,7 +271,7 @@ namespace cv { namespace gpu { namespace device
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
__global__ void lkSparse(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ <= 110)
__shared__ float smem1[128];
__shared__ float smem2[128];
__shared__ float smem3[128];

View File

@@ -64,7 +64,7 @@ namespace cv { namespace gpu { namespace device
template <int KSIZE, typename T, typename D, typename B>
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
{
#if __CUDA_ARCH__ >= 200
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 8;
const int PATCH_PER_BLOCK = 4;

View File

@@ -0,0 +1,92 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_GPU_TEXTURE_BINDER_HPP_
#define OPENCV_GPU_TEXTURE_BINDER_HPP_
#include "opencv2/gpu/devmem2d.hpp"
#include <safe_call.hpp>
namespace cv
{
namespace gpu
{
class TextureBinder
{
public:
template<class T, enum cudaTextureReadMode readMode>
TextureBinder(const PtrStepSz<T>& arr, const struct texture<T, 2, readMode>& tex) : texref(&tex)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) );
}
template<class T, enum cudaTextureReadMode readMode>
TextureBinder(const PtrSz<T>& arr, const struct texture<T, 1, readMode> &tex) : texref(&tex)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture(0, tex, arr.data, desc, arr.size * arr.elemSize()) );
}
template<class A, class T, enum cudaTextureReadMode readMode>
TextureBinder(const A& arr, const struct texture<T, 2, readMode>& tex, const cudaChannelFormatDesc& desc) : texref(&tex)
{
cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) );
}
~TextureBinder()
{
cudaSafeCall( cudaUnbindTexture(texref) );
}
private:
const struct textureReference *texref;
};
}
namespace device
{
using pcl::gpu::TextureBinder;
}
}
#endif /* OPENCV_GPU_TEXTURE_BINDER_HPP_*/
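
The new texture_binder.hpp above is an RAII helper: the constructor binds a texture reference to a device buffer with cudaBindTexture/cudaBindTexture2D and the destructor unbinds it, so the binding lives exactly as long as the binder object. A hedged usage sketch (tex_src, copyThroughTexture and runCopy are hypothetical names, not part of the header):

// Sketch only: bind a 2D texture around a kernel launch using the helper above.
#include "opencv2/gpu/devmem2d.hpp"
#include "texture_binder.hpp"   // the header added above

texture<uchar, 2, cudaReadModeElementType> tex_src;

__global__ void copyThroughTexture(cv::gpu::PtrStepSz<uchar> dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst(y, x) = tex2D(tex_src, x, y);
}

void runCopy(const cv::gpu::PtrStepSz<uchar>& src, cv::gpu::PtrStepSz<uchar>& dst)
{
    cv::gpu::TextureBinder binder(src, tex_src);   // binds tex_src to src
    const dim3 block(32, 8);
    const dim3 grid((dst.cols + block.x - 1) / block.x, (dst.rows + block.y - 1) / block.y);
    copyThroughTexture<<<grid, block>>>(dst);
    cudaSafeCall( cudaGetLastError() );
}   // tex_src is unbound here, when binder goes out of scope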

View File

@@ -48,7 +48,7 @@ void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Gpu
void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::connectivityMask(const GpuMat&, GpuMat&, const cv::Scalar&, const cv::Scalar&, Stream&) { throw_nogpu(); }
void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, int, Stream& stream) { throw_nogpu(); }
void cv::gpu::labelComponents(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */

View File

@@ -315,7 +315,7 @@ void cv::gpu::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, vect
double scale = 1.;
int levels = 0;
for (levels = 0; levels < conf_out.size(); levels++)
for (levels = 0; levels < (int)conf_out.size(); levels++)
{
scale = conf_out[levels].scale;
level_scale.push_back(scale);
@@ -332,8 +332,8 @@ void cv::gpu::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, vect
for (size_t i = 0; i < level_scale.size(); i++)
{
double scale = level_scale[i];
Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
double _scale = level_scale[i];
Size sz(cvRound(img.cols / _scale), cvRound(img.rows / _scale));
GpuMat smaller_img;
if (sz == img.size())

View File

@@ -57,11 +57,27 @@ namespace cv { namespace gpu { namespace device
namespace hough
{
int buildPointList_gpu(DevMem2Db src, unsigned int* list);
void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort);
}
}}}
//////////////////////////////////////////////////////////
// HoughLines
void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
{
GpuMat accum, buf;
HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines);
}
void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
{
HoughLinesTransform(src, accum, buf, rho, theta);
HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines);
}
void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta)
{
using namespace cv::gpu::device::hough;
@@ -80,23 +96,23 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf,
CV_Assert(numangle > 0 && numrho > 0);
ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum);
accum.setTo(cv::Scalar::all(0));
accum.setTo(Scalar::all(0));
cv::gpu::DeviceInfo devInfo;
DeviceInfo devInfo;
if (count > 0)
linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(cv::gpu::FEATURE_SET_COMPUTE_20));
linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
}
void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
{
using namespace cv::gpu::device;
using namespace cv::gpu::device::hough;
CV_Assert(accum.type() == CV_32SC1);
ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);
int count = hough::linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
int count = linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
if (count > 0)
lines.cols = count;
@@ -104,18 +120,6 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float
lines.release();
}
void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
{
cv::gpu::GpuMat accum, buf;
HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines);
}
void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
{
HoughLinesTransform(src, accum, buf, rho, theta);
HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines);
}
void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, OutputArray h_votes_)
{
if (d_lines.empty())
@@ -129,14 +133,14 @@ void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, Ou
CV_Assert(d_lines.rows == 2 && d_lines.type() == CV_32FC2);
h_lines_.create(1, d_lines.cols, CV_32FC2);
cv::Mat h_lines = h_lines_.getMat();
Mat h_lines = h_lines_.getMat();
d_lines.row(0).download(h_lines);
if (h_votes_.needed())
{
h_votes_.create(1, d_lines.cols, CV_32SC1);
cv::Mat h_votes = h_votes_.getMat();
cv::gpu::GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
Mat h_votes = h_votes_.getMat();
GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
d_votes.download(h_votes);
}
}
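
The reshuffled hough.cpp above exposes a small pipeline: HoughLinesTransform fills the accumulator, HoughLinesGet extracts (rho, theta) pairs (optionally sorted by votes), the two convenience HoughLines overloads wrap both steps, and HoughLinesDownload copies the packed result back to the host. A hedged end-to-end sketch (the input file name is a placeholder; the input is expected to be a binary 8-bit edge map):

// Sketch only: GPU Hough transform round trip with the API shown above.
#include <opencv2/gpu/gpu.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <iostream>
#include <vector>

int main()
{
    cv::Mat edges = cv::imread("edges.png", 0);          // hypothetical 8UC1 edge image
    cv::gpu::GpuMat d_edges(edges), d_lines;

    // rho/theta resolution, vote threshold, sort by votes, cap on returned lines
    cv::gpu::HoughLines(d_edges, d_lines, 1.0f, (float)(CV_PI / 180.0), 100, true, 4096);

    std::vector<cv::Vec2f> lines;
    cv::gpu::HoughLinesDownload(d_lines, lines);          // row 0 of d_lines holds (rho, theta)

    for (size_t i = 0; i < lines.size(); ++i)
        std::cout << "rho = " << lines[i][0] << ", theta = " << lines[i][1] << std::endl;
    return 0;
}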

View File

@@ -261,6 +261,12 @@ namespace
}
}
#if defined __GNUC__ && __GNUC__ > 2 && __GNUC_MINOR__ > 4
typedef Npp32s __attribute__((__may_alias__)) Npp32s_a;
#else
typedef Npp32s Npp32s_a;
#endif
void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value, Stream& s)
{
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
@@ -308,7 +314,7 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
case CV_32FC1:
{
Npp32f val = saturate_cast<Npp32f>(value[0]);
Npp32s nVal = *(reinterpret_cast<Npp32s*>(&val));
Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val));
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
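
The Npp32s_a typedef introduced above (and the matching Ncv32f_a / Ncv32u_a typedefs later in this commit) tags the integer type with GCC's __may_alias__ attribute, so reinterpreting the bits of a float through it no longer falls foul of strict-aliasing optimizations; on other compilers it degrades to a plain typedef. A small hedged sketch of the same idiom outside the NPP types (floatBits is a hypothetical helper):

// Sketch only: strict-aliasing-safe bit reinterpretation, mirroring Npp32s_a above.
#if defined __GNUC__ && __GNUC__ > 2 && __GNUC_MINOR__ > 4
typedef int __attribute__((__may_alias__)) int32_a;
#else
typedef int int32_a;
#endif

static int floatBits(float v)
{
    return *reinterpret_cast<int32_a*>(&v);   // int32_a is allowed to alias float
}

A memcpy of the four bytes into an int is the fully portable alternative when the attribute is unavailable.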
@@ -527,11 +533,65 @@ void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
integralBuffered(src, sum, buffer, s);
}
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream);
}
}}}
void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
{
CV_Assert(src.type() == CV_8UC1);
if (sum.cols != src.cols + 1 && sum.rows != src.rows + 1)
sum.create(src.rows + 1, src.cols + 1, CV_32S);
cudaStream_t stream = StreamAccessor::getStream(s);
DeviceInfo info;
if (info.supports(WARP_SHUFFLE_FUNCTIONS))
{
GpuMat src16;
if (src.cols % 16 == 0)
src16 = src;
else
{
ensureSizeIsEnough(src.rows, ((src.cols + 15) / 16) * 16, src.type(), buffer);
GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows));
if (s)
{
s.enqueueMemSet(buffer, Scalar::all(0));
s.enqueueCopy(src, inner);
}
else
{
buffer.setTo(Scalar::all(0));
src.copyTo(inner);
}
src16 = buffer;
}
sum.create(src16.rows + 1, src16.cols + 1, CV_32SC1);
if (s)
s.enqueueMemSet(sum, Scalar::all(0));
else
sum.setTo(Scalar::all(0));
GpuMat inner = sum(Rect(1, 1, src16.cols, src16.rows));
cv::gpu::device::imgproc::shfl_integral_gpu(src16, inner, stream);
if (src16.cols != src.cols)
sum = sum(Rect(0, 0, src.cols + 1, src.rows + 1));
}
else
{
sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
NcvSize32u roiSize;
roiSize.width = src.cols;
@@ -544,7 +604,6 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStStreamHandler h(stream);
@@ -553,6 +612,7 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
//////////////////////////////////////////////////////////////////////////////
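
The new integralBuffered branch above pads the source to a multiple of 16 columns, runs the warp-shuffle integral kernel when the device reports WARP_SHUFFLE_FUNCTIONS (compute capability 3.0 and up), and crops the (rows + 1) x (cols + 1) CV_32SC1 result back to the original size; older devices fall through to the NPP-staging path. A hedged host-side usage sketch (the file name is a placeholder):

// Sketch only: buffered GPU integral image, reusing the scratch buffer across calls.
#include <opencv2/gpu/gpu.hpp>
#include <opencv2/highgui/highgui.hpp>

int main()
{
    cv::Mat img = cv::imread("frame.png", 0);             // 8UC1 input expected
    cv::gpu::GpuMat d_img(img), d_sum, d_buf;

    cv::gpu::Stream stream;
    cv::gpu::integralBuffered(d_img, d_sum, d_buf, stream);
    stream.waitForCompletion();

    cv::Mat sum;
    d_sum.download(sum);                                   // (rows + 1) x (cols + 1), CV_32SC1
    return 0;
}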

View File

@@ -227,6 +227,7 @@ namespace
void matchTemplate_SQDIFF_32F(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
(void)buf;
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
}
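
The added (void)buf; above is the conventional no-op cast for a parameter kept only so the SQDIFF_32F variant matches the signature of the other matchTemplate helpers; it silences -Wunused-parameter on GCC and C4100 on MSVC without changing behaviour. A minimal sketch of the idiom (names are hypothetical):

// Sketch only: uniform signature, explicitly discarded argument.
static void processWithoutBuffer(const float* src, float* dst, int n, void* buf)
{
    (void)buf;                    // unused in this variant; the cast suppresses the warning
    for (int i = 0; i < n; ++i)
        dst[i] = src[i];
}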

View File

@@ -67,7 +67,11 @@
// Guaranteed size cross-platform classifier structures
//
//==============================================================================
#if defined __GNUC__ && __GNUC__ > 2 && __GNUC_MINOR__ > 4
typedef Ncv32f __attribute__((__may_alias__)) Ncv32f_a;
#else
typedef Ncv32f Ncv32f_a;
#endif
struct HaarFeature64
{
@@ -87,7 +91,7 @@ struct HaarFeature64
__host__ NCVStatus setWeight(Ncv32f weight)
{
((Ncv32f*)&(this->_ui2.y))[0] = weight;
((Ncv32f_a*)&(this->_ui2.y))[0] = weight;
return NCV_SUCCESS;
}
@@ -102,7 +106,7 @@ struct HaarFeature64
__device__ __host__ Ncv32f getWeight(void)
{
return *(Ncv32f*)(&this->_ui2.y);
return *(Ncv32f_a*)(&this->_ui2.y);
}
};
@@ -168,14 +172,13 @@ public:
}
};
struct HaarClassifierNodeDescriptor32
{
uint1 _ui1;
__host__ NCVStatus create(Ncv32f leafValue)
{
*(Ncv32f *)&this->_ui1 = leafValue;
*(Ncv32f_a *)&this->_ui1 = leafValue;
return NCV_SUCCESS;
}
@@ -187,7 +190,7 @@ struct HaarClassifierNodeDescriptor32
__host__ Ncv32f getLeafValueHost(void)
{
return *(Ncv32f *)&this->_ui1.x;
return *(Ncv32f_a *)&this->_ui1.x;
}
#ifdef __CUDACC__
@@ -203,6 +206,11 @@ struct HaarClassifierNodeDescriptor32
}
};
#if defined __GNUC__ && __GNUC__ > 2 && __GNUC_MINOR__ > 4
typedef Ncv32u __attribute__((__may_alias__)) Ncv32u_a;
#else
typedef Ncv32u Ncv32u_a;
#endif
struct HaarClassifierNode128
{
@@ -216,19 +224,19 @@ struct HaarClassifierNode128
__host__ NCVStatus setThreshold(Ncv32f t)
{
this->_ui4.y = *(Ncv32u *)&t;
this->_ui4.y = *(Ncv32u_a *)&t;
return NCV_SUCCESS;
}
__host__ NCVStatus setLeftNodeDesc(HaarClassifierNodeDescriptor32 nl)
{
this->_ui4.z = *(Ncv32u *)&nl;
this->_ui4.z = *(Ncv32u_a *)&nl;
return NCV_SUCCESS;
}
__host__ NCVStatus setRightNodeDesc(HaarClassifierNodeDescriptor32 nr)
{
this->_ui4.w = *(Ncv32u *)&nr;
this->_ui4.w = *(Ncv32u_a *)&nr;
return NCV_SUCCESS;
}
@@ -239,7 +247,7 @@ struct HaarClassifierNode128
__host__ __device__ Ncv32f getThreshold(void)
{
return *(Ncv32f*)&this->_ui4.y;
return *(Ncv32f_a*)&this->_ui4.y;
}
__host__ __device__ HaarClassifierNodeDescriptor32 getLeftNodeDesc(void)
@@ -264,7 +272,7 @@ struct HaarStage64
__host__ NCVStatus setStageThreshold(Ncv32f t)
{
this->_ui2.x = *(Ncv32u *)&t;
this->_ui2.x = *(Ncv32u_a *)&t;
return NCV_SUCCESS;
}
@@ -290,7 +298,7 @@ struct HaarStage64
__host__ __device__ Ncv32f getStageThreshold(void)
{
return *(Ncv32f*)&this->_ui2.x;
return *(Ncv32f_a*)&this->_ui2.x;
}
__host__ __device__ Ncv32u getStartClassifierRootNodeOffset(void)

View File

@@ -1423,7 +1423,7 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
(d_hierSums.ptr() + partSumOffsets[i],
partSumNums[i], NULL,
d_hierSums.ptr() + partSumOffsets[i+1],
NULL);
0);
}
else
{
@@ -1433,7 +1433,7 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
(d_hierSums.ptr() + partSumOffsets[i],
partSumNums[i], NULL,
NULL,
NULL);
0);
}
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
@@ -1557,16 +1557,21 @@ NCVStatus nppsStCompact_32s(Ncv32s *d_src, Ncv32u srcLen,
}
#if defined __GNUC__ && __GNUC__ > 2 && __GNUC_MINOR__ > 4
typedef Ncv32u __attribute__((__may_alias__)) Ncv32u_a;
#else
typedef Ncv32u Ncv32u_a;
#endif
NCVStatus nppsStCompact_32f(Ncv32f *d_src, Ncv32u srcLen,
Ncv32f *d_dst, Ncv32u *p_dstLen,
Ncv32f elemRemove, Ncv8u *pBuffer,
Ncv32u bufSize, cudaDeviceProp &devProp)
{
return nppsStCompact_32u((Ncv32u *)d_src, srcLen, (Ncv32u *)d_dst, p_dstLen,
*(Ncv32u *)&elemRemove, pBuffer, bufSize, devProp);
*(Ncv32u_a *)&elemRemove, pBuffer, bufSize, devProp);
}
NCVStatus nppsStCompact_32u_host(Ncv32u *h_src, Ncv32u srcLen,
Ncv32u *h_dst, Ncv32u *dstLen, Ncv32u elemRemove)
{
@@ -1602,17 +1607,16 @@ NCVStatus nppsStCompact_32u_host(Ncv32u *h_src, Ncv32u srcLen,
NCVStatus nppsStCompact_32s_host(Ncv32s *h_src, Ncv32u srcLen,
Ncv32s *h_dst, Ncv32u *dstLen, Ncv32s elemRemove)
{
return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u *)&elemRemove);
return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u_a *)&elemRemove);
}
NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
Ncv32f *h_dst, Ncv32u *dstLen, Ncv32f elemRemove)
{
return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u *)&elemRemove);
return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u_a *)&elemRemove);
}
//==============================================================================
//
// Filter.cu
@@ -2066,7 +2070,7 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState)
//==============================================================================
#if __CUDA_ARCH__ < 200
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
// FP32 atomic add
static __forceinline__ __device__ float _atomicAdd(float *addr, float val)
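
The guard above now compiles the FP32 _atomicAdd helper only in device passes targeting architectures below 2.0, where hardware atomicAdd on float is unavailable. A hedged sketch of the usual CAS-loop emulation such a helper relies on (the exact body in NPP_staging.cu may differ):

// Sketch only: emulate atomicAdd(float*, float) on pre-Fermi GPUs with atomicCAS.
static __forceinline__ __device__ float atomicAddEmulated(float* addr, float val)
{
    int* addr_as_int = (int*)addr;
    int old = *addr_as_int;
    int assumed;
    do
    {
        assumed = old;
        float updated = __int_as_float(assumed) + val;
        old = atomicCAS(addr_as_int, assumed, __float_as_int(updated));
    }
    while (assumed != old);       // retry if another thread updated the value first
    return __int_as_float(old);   // previous value, matching atomicAdd semantics
}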

View File

@@ -51,11 +51,11 @@ namespace cv { namespace gpu { namespace device
struct Emulation
{
static __device__ __forceinline__ int sycthOr(int pred)
static __device__ __forceinline__ int syncthreadsOr(int pred)
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
// just compilation stub
return false;
return 0;
#else
return __syncthreads_or(pred);
#endif

View File

@@ -119,7 +119,6 @@ namespace
int depth = src.depth();
int num_channels = src.channels();
Size size = src.size();
if (depth == CV_64F)
{

View File

@@ -49,36 +49,36 @@ void cv::gpu::detail::VideoDecoder::create(const VideoReader_GPU::FormatInfo& vi
{
release();
cudaVideoCodec codec = static_cast<cudaVideoCodec>(videoFormat.codec);
cudaVideoChromaFormat chromaFormat = static_cast<cudaVideoChromaFormat>(videoFormat.chromaFormat);
cudaVideoCodec _codec = static_cast<cudaVideoCodec>(videoFormat.codec);
cudaVideoChromaFormat _chromaFormat = static_cast<cudaVideoChromaFormat>(videoFormat.chromaFormat);
cudaVideoCreateFlags videoCreateFlags = (codec == cudaVideoCodec_JPEG || codec == cudaVideoCodec_MPEG2) ?
cudaVideoCreateFlags videoCreateFlags = (_codec == cudaVideoCodec_JPEG || _codec == cudaVideoCodec_MPEG2) ?
cudaVideoCreate_PreferCUDA :
cudaVideoCreate_PreferCUVID;
// Validate video format. These are the currently supported formats via NVCUVID
CV_Assert(cudaVideoCodec_MPEG1 == codec ||
cudaVideoCodec_MPEG2 == codec ||
cudaVideoCodec_MPEG4 == codec ||
cudaVideoCodec_VC1 == codec ||
cudaVideoCodec_H264 == codec ||
cudaVideoCodec_JPEG == codec ||
cudaVideoCodec_YUV420== codec ||
cudaVideoCodec_YV12 == codec ||
cudaVideoCodec_NV12 == codec ||
cudaVideoCodec_YUYV == codec ||
cudaVideoCodec_UYVY == codec );
CV_Assert(cudaVideoCodec_MPEG1 == _codec ||
cudaVideoCodec_MPEG2 == _codec ||
cudaVideoCodec_MPEG4 == _codec ||
cudaVideoCodec_VC1 == _codec ||
cudaVideoCodec_H264 == _codec ||
cudaVideoCodec_JPEG == _codec ||
cudaVideoCodec_YUV420== _codec ||
cudaVideoCodec_YV12 == _codec ||
cudaVideoCodec_NV12 == _codec ||
cudaVideoCodec_YUYV == _codec ||
cudaVideoCodec_UYVY == _codec );
CV_Assert(cudaVideoChromaFormat_Monochrome == chromaFormat ||
cudaVideoChromaFormat_420 == chromaFormat ||
cudaVideoChromaFormat_422 == chromaFormat ||
cudaVideoChromaFormat_444 == chromaFormat);
CV_Assert(cudaVideoChromaFormat_Monochrome == _chromaFormat ||
cudaVideoChromaFormat_420 == _chromaFormat ||
cudaVideoChromaFormat_422 == _chromaFormat ||
cudaVideoChromaFormat_444 == _chromaFormat);
// Fill the decoder-create-info struct from the given video-format struct.
std::memset(&createInfo_, 0, sizeof(CUVIDDECODECREATEINFO));
// Create video decoder
createInfo_.CodecType = codec;
createInfo_.CodecType = _codec;
createInfo_.ulWidth = videoFormat.width;
createInfo_.ulHeight = videoFormat.height;
createInfo_.ulNumDecodeSurfaces = FrameQueue::MaximumSize;
@@ -87,7 +87,7 @@ void cv::gpu::detail::VideoDecoder::create(const VideoReader_GPU::FormatInfo& vi
while (createInfo_.ulNumDecodeSurfaces * videoFormat.width * videoFormat.height > 16 * 1024 * 1024)
createInfo_.ulNumDecodeSurfaces--;
createInfo_.ChromaFormat = chromaFormat;
createInfo_.ChromaFormat = _chromaFormat;
createInfo_.OutputFormat = cudaVideoSurfaceFormat_NV12;
createInfo_.DeinterlaceMode = cudaVideoDeinterlaceMode_Adaptive;

View File

@@ -39,7 +39,7 @@
//
//M*/
#include "precomp.hpp"
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
@@ -49,76 +49,103 @@ using namespace cv::gpu;
using namespace cvtest;
using namespace testing;
void print_info()
void printOsInfo()
{
printf("\n");
#if defined _WIN32
# if defined _WIN64
puts("OS: Windows 64");
cout << "OS: Windows x64 \n" << endl;
# else
puts("OS: Windows 32");
cout << "OS: Windows x32 \n" << endl;
# endif
#elif defined linux
# if defined _LP64
puts("OS: Linux 64");
cout << "OS: Linux x64 \n" << endl;
# else
puts("OS: Linux 32");
cout << "OS: Linux x32 \n" << endl;
# endif
#elif defined __APPLE__
# if defined _LP64
puts("OS: Apple 64");
cout << "OS: Apple x64 \n" << endl;
# else
puts("OS: Apple 32");
cout << "OS: Apple x32 \n" << endl;
# endif
#endif
}
int deviceCount = getCudaEnabledDeviceCount();
void printCudaInfo()
{
#ifndef HAVE_CUDA
cout << "OpenCV was built without CUDA support \n" << endl;
#else
int driver;
cudaDriverGetVersion(&driver);
printf("CUDA Driver version: %d\n", driver);
printf("CUDA Runtime version: %d\n", CUDART_VERSION);
printf("CUDA device count: %d\n\n", deviceCount);
cout << "CUDA Driver version: " << driver << '\n';
cout << "CUDA Runtime version: " << CUDART_VERSION << '\n';
cout << endl;
cout << "GPU module was compiled for the following GPU archs:" << endl;
cout << " BIN: " << CUDA_ARCH_BIN << '\n';
cout << " PTX: " << CUDA_ARCH_PTX << '\n';
cout << endl;
int deviceCount = getCudaEnabledDeviceCount();
cout << "CUDA device count: " << deviceCount << '\n';
cout << endl;
for (int i = 0; i < deviceCount; ++i)
{
DeviceInfo info(i);
printf("Device %d:\n", i);
printf(" Name: %s\n", info.name().c_str());
printf(" Compute capability version: %d.%d\n", info.majorVersion(), info.minorVersion());
printf(" Total memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0));
printf(" Free memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0));
if (info.isCompatible())
puts(" This device is compatible with current GPU module build\n");
else
puts(" This device is NOT compatible with current GPU module build\n");
cout << "Device [" << i << "] \n";
cout << "\t Name: " << info.name() << '\n';
cout << "\t Compute capability: " << info.majorVersion() << '.' << info.minorVersion()<< '\n';
cout << "\t Multi Processor Count: " << info.multiProcessorCount() << '\n';
cout << "\t Total memory: " << static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0) << " Mb \n";
cout << "\t Free memory: " << static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0) << " Mb \n";
if (!info.isCompatible())
cout << "\t !!! This device is NOT compatible with current GPU module build \n";
cout << endl;
}
puts("GPU module was compiled for the following GPU archs:");
printf(" BIN: %s\n", CUDA_ARCH_BIN);
printf(" PTX: %s\n\n", CUDA_ARCH_PTX);
#endif
}
enum OutputLevel
{
OutputLevelNone,
OutputLevelCompact,
OutputLevelFull
};
extern OutputLevel nvidiaTestOutputLevel;
int main(int argc, char** argv)
{
TS::ptr()->init("gpu");
InitGoogleTest(&argc, argv);
try
{
CommandLineParser cmd(argc, (const char**)argv,
"{ print_info_only | print_info_only | false | Print information about system and exit }"
"{ device | device | -1 | Device on which tests will be executed (-1 means all devices) }"
"{ nvtest_output_level | nvtest_output_level | compact | NVidia test verbosity level }"
);
const char* keys ="{ nvtest_output_level | nvtest_output_level | compact | NVidia test verbosity level }";
printOsInfo();
printCudaInfo();
CommandLineParser parser(argc, (const char**)argv, keys);
if (cmd.get<bool>("print_info_only"))
return 0;
string outputLevel = parser.get<string>("nvtest_output_level", "none");
int device = cmd.get<int>("device");
if (device < 0)
{
DeviceManager::instance().loadAll();
cout << "Run tests on all supported devices \n" << endl;
}
else
{
DeviceManager::instance().load(device);
DeviceInfo info(device);
cout << "Run tests on device " << device << " [" << info.name() << "] \n" << endl;
}
string outputLevel = cmd.get<string>("nvtest_output_level");
if (outputLevel == "none")
nvidiaTestOutputLevel = OutputLevelNone;
@@ -127,9 +154,23 @@ int main(int argc, char** argv)
else if (outputLevel == "full")
nvidiaTestOutputLevel = OutputLevelFull;
print_info();
TS::ptr()->init("gpu");
InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
catch (const exception& e)
{
cerr << e.what() << endl;
return -1;
}
catch (...)
{
cerr << "Unknown error" << endl;
return -1;
}
return 0;
}
#else // HAVE_CUDA

View File

@@ -1,7 +1,7 @@
#ifndef __main_test_nvidia_h__
#define __main_test_nvidia_h__
#include<string>
#include <string>
enum OutputLevel
{
@@ -10,6 +10,8 @@ enum OutputLevel
OutputLevelFull
};
extern OutputLevel nvidiaTestOutputLevel;
bool nvidia_NPPST_Integral_Image(const std::string& test_data_path, OutputLevel outputLevel);
bool nvidia_NPPST_Squared_Integral_Image(const std::string& test_data_path, OutputLevel outputLevel);
bool nvidia_NPPST_RectStdDev(const std::string& test_data_path, OutputLevel outputLevel);

View File

@@ -245,8 +245,8 @@ bool TestHaarCascadeApplication::process()
int devId;
ncvAssertCUDAReturn(cudaGetDevice(&devId), false);
cudaDeviceProp devProp;
ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), false);
cudaDeviceProp _devProp;
ncvAssertCUDAReturn(cudaGetDeviceProperties(&_devProp, devId), false);
ncvStat = ncvApplyHaarClassifierCascade_device(
d_integralImage, d_rectStdDev, d_pixelMask,
@@ -254,7 +254,7 @@ bool TestHaarCascadeApplication::process()
haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
searchRoiU, 1, 1.0f,
*this->allocatorGPU.get(), *this->allocatorCPU.get(),
devProp, 0);
_devProp, 0);
ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
NCVMatrixAlloc<Ncv32u> h_pixelMask_d(*this->allocatorCPU.get(), this->width, this->height);

View File

@@ -1,4 +1,6 @@
#pragma warning (disable : 4408 4201 4100)
#if defined _MSC_VER && _MSC_VER >= 1200
# pragma warning (disable : 4408 4201 4100)
#endif
#include <cstdio>

View File

@@ -39,7 +39,9 @@
//
//M*/
#include "precomp.hpp"
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
namespace {
@@ -343,3 +345,5 @@ INSTANTIATE_TEST_CASE_P(GPU_Calib3D, ReprojectImageTo3D, testing::Combine(
WHOLE_SUBMAT));
} // namespace
#endif // HAVE_CUDA

View File

@@ -39,7 +39,7 @@
//
//M*/
#include "precomp.hpp"
#include "test_precomp.hpp"
#ifdef HAVE_CUDA

View File

@@ -39,7 +39,9 @@
//
//M*/
#include "precomp.hpp"
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
namespace {
@@ -98,3 +100,5 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
WHOLE_SUBMAT));
} // namespace
#endif // HAVE_CUDA

View File

@@ -39,7 +39,9 @@
//
//M*/
#include "precomp.hpp"
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
namespace {
@@ -3396,3 +3398,5 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
WHOLE_SUBMAT));
} // namespace
#endif // HAVE_CUDA

View File

@@ -39,7 +39,9 @@
//
//M*/
#include "precomp.hpp"
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
namespace {
@@ -984,3 +986,5 @@ INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
testing::Values(UseMask(false), UseMask(true))));
} // namespace
#endif // HAVE_CUDA

View File

@@ -39,7 +39,9 @@
//
//M*/
#include "precomp.hpp"
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
namespace {
@@ -552,3 +554,5 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
WHOLE_SUBMAT));
} // namespace
#endif // HAVE_CUDA

Some files were not shown because too many files have changed in this diff.