diff --git a/CMakeLists.txt b/CMakeLists.txt index 470c5520d..742d3e1fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -413,15 +413,6 @@ endif() # --- OpenCL --- if(WITH_OPENCL) include(cmake/OpenCVDetectOpenCL.cmake) - if(OPENCL_FOUND) - set(HAVE_OPENCL 1) - endif() - if(WITH_OPENCLAMDFFT AND CLAMDFFT_INCLUDE_DIR) - set(HAVE_CLAMDFFT 1) - endif() - if(WITH_OPENCLAMDBLAS AND CLAMDBLAS_INCLUDE_DIR) - set(HAVE_CLAMDBLAS 1) - endif() endif() # ---------------------------------------------------------------------------- @@ -799,11 +790,11 @@ if(HAVE_CUDA) status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO) endif() -if(HAVE_OPENCL AND BUILD_opencv_ocl) +if(HAVE_OPENCL) status("") status(" OpenCL") if(OPENCL_INCLUDE_DIR) - status(" Include:" ${OPENCL_INCLUDE_DIR}) + status(" Include path:" ${OPENCL_INCLUDE_DIRS}) endif() if(OPENCL_LIBRARIES) status(" libraries:" ${OPENCL_LIBRARIES}) diff --git a/android/service/engine/jni/BinderComponent/HardwareDetector.cpp b/android/service/engine/jni/BinderComponent/HardwareDetector.cpp index b5e0fa600..eab49ac5f 100644 --- a/android/service/engine/jni/BinderComponent/HardwareDetector.cpp +++ b/android/service/engine/jni/BinderComponent/HardwareDetector.cpp @@ -163,22 +163,13 @@ int DetectKnownPlatforms() { int tegra_status = DetectTegra(); - if (3 == tegra_status) + // All Tegra platforms since Tegra3 + if (2 < tegra_status) { - return PLATFORM_TEGRA3; + return PLATFORM_TEGRA + tegra_status - 1; } else { return PLATFORM_UNKNOWN; } - - // NOTE: Uncomment when all Tegras will be supported - /*if (tegra_status > 0) - * { - * return PLATFORM_TEGRA + tegra_status - 1; - } - else - { - return PLATFORM_UNKNOWN; - }*/ } \ No newline at end of file diff --git a/android/service/engine/jni/BinderComponent/HardwareDetector.h b/android/service/engine/jni/BinderComponent/HardwareDetector.h index e049db9c5..135684418 100644 --- a/android/service/engine/jni/BinderComponent/HardwareDetector.h +++ b/android/service/engine/jni/BinderComponent/HardwareDetector.h @@ -27,6 +27,7 @@ #define PLATFORM_TEGRA 1L #define PLATFORM_TEGRA2 2L #define PLATFORM_TEGRA3 3L +#define PLATFORM_TEGRA4 4L int DetectKnownPlatforms(); int GetProcessorCount(); diff --git a/android/service/engine/jni/BinderComponent/TegraDetector.cpp b/android/service/engine/jni/BinderComponent/TegraDetector.cpp index f7db1fa85..3933efe49 100644 --- a/android/service/engine/jni/BinderComponent/TegraDetector.cpp +++ b/android/service/engine/jni/BinderComponent/TegraDetector.cpp @@ -7,6 +7,7 @@ #define KERNEL_CONFIG_TEGRA_MAGIC "CONFIG_ARCH_TEGRA=y" #define KERNEL_CONFIG_TEGRA2_MAGIC "CONFIG_ARCH_TEGRA_2x_SOC=y" #define KERNEL_CONFIG_TEGRA3_MAGIC "CONFIG_ARCH_TEGRA_3x_SOC=y" +#define KERNEL_CONFIG_TEGRA4_MAGIC "CONFIG_ARCH_TEGRA_11x_SOC=y" #define MAX_DATA_LEN 4096 int DetectTegra() @@ -19,9 +20,11 @@ int DetectTegra() const char *tegra_config = KERNEL_CONFIG_TEGRA_MAGIC; const char *tegra2_config = KERNEL_CONFIG_TEGRA2_MAGIC; const char *tegra3_config = KERNEL_CONFIG_TEGRA3_MAGIC; + const char *tegra4_config = KERNEL_CONFIG_TEGRA4_MAGIC; int len = strlen(tegra_config); int len2 = strlen(tegra2_config); int len3 = strlen(tegra3_config); + int len4 = strlen(tegra4_config); while (0 != gzgets(kernelConfig, tmpbuf, KERNEL_CONFIG_MAX_LINE_WIDTH)) { if (0 == strncmp(tmpbuf, tegra_config, len)) @@ -41,6 +44,11 @@ int DetectTegra() break; } + if (0 == strncmp(tmpbuf, tegra4_config, len4)) + { + result = 4; + break; + } } gzclose(kernelConfig); } diff --git 
a/android/service/engine/jni/NativeService/CommonPackageManager.cpp b/android/service/engine/jni/NativeService/CommonPackageManager.cpp index dbcd8ff68..eaa03d4d8 100644 --- a/android/service/engine/jni/NativeService/CommonPackageManager.cpp +++ b/android/service/engine/jni/NativeService/CommonPackageManager.cpp @@ -197,6 +197,7 @@ std::vector<std::pair<int, int> > CommonPackageManager::InitArmRating() result.push_back(std::pair<int, int>(PLATFORM_UNKNOWN, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON)); result.push_back(std::pair<int, int>(PLATFORM_UNKNOWN, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_VFPv3d16 | FEATURES_HAS_NEON)); result.push_back(std::pair<int, int>(PLATFORM_TEGRA3, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON)); + result.push_back(std::pair<int, int>(PLATFORM_TEGRA4, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON)); return result; } @@ -218,8 +219,8 @@ std::vector<std::pair<int, int> > CommonPackageManager::InitMipsRating() } const std::vector<std::pair<int, int> > CommonPackageManager::ArchRatings[] = { - CommonPackageManager::InitArmRating(), - CommonPackageManager::InitIntelRating(), + CommonPackageManager::InitArmRating(), + CommonPackageManager::InitIntelRating(), CommonPackageManager::InitMipsRating() }; diff --git a/android/service/engine/jni/NativeService/PackageInfo.cpp b/android/service/engine/jni/NativeService/PackageInfo.cpp index 2eb823073..2f8dde043 100644 --- a/android/service/engine/jni/NativeService/PackageInfo.cpp +++ b/android/service/engine/jni/NativeService/PackageInfo.cpp @@ -18,6 +18,7 @@ map<int, string> PackageInfo::InitPlatformNameMap() result[PLATFORM_TEGRA] = PLATFORM_TEGRA_NAME; result[PLATFORM_TEGRA2] = PLATFORM_TEGRA2_NAME; result[PLATFORM_TEGRA3] = PLATFORM_TEGRA3_NAME; + result[PLATFORM_TEGRA4] = PLATFORM_TEGRA4_NAME; return result; } @@ -186,6 +187,10 @@ inline int SplitPlatfrom(const vector<string>& features) { result = PLATFORM_TEGRA3; } + else if (PLATFORM_TEGRA4_NAME == tmp) + { + result = PLATFORM_TEGRA4; + } } else { @@ -425,6 +430,10 @@ InstallPath(install_path) { CpuID = ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON; } break; + case PLATFORM_TEGRA4: + { + CpuID = ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON; + } break; } } else diff --git a/android/service/engine/jni/NativeService/PackageInfo.h b/android/service/engine/jni/NativeService/PackageInfo.h index b86ef7a92..2ce561e2f 100644 --- a/android/service/engine/jni/NativeService/PackageInfo.h +++ b/android/service/engine/jni/NativeService/PackageInfo.h @@ -12,7 +12,6 @@ #define ARCH_ARMv7_NAME "armv7a" #define ARCH_ARMv8_NAME "armv8" - #define FEATURES_HAS_VFPv3d16_NAME "vfpv3d16" #define FEATURES_HAS_VFPv3_NAME "vfpv3" #define FEATURES_HAS_NEON_NAME "neon" @@ -25,7 +24,7 @@ #define PLATFORM_TEGRA_NAME "tegra" #define PLATFORM_TEGRA2_NAME "tegra2" #define PLATFORM_TEGRA3_NAME "tegra3" - +#define PLATFORM_TEGRA4_NAME "tegra4" class PackageInfo { diff --git a/android/service/engine/jni/Tests/OpenCVEngineTest.cpp b/android/service/engine/jni/Tests/OpenCVEngineTest.cpp index 4e390386f..ce5159f81 100644 --- a/android/service/engine/jni/Tests/OpenCVEngineTest.cpp +++ b/android/service/engine/jni/Tests/OpenCVEngineTest.cpp @@ -201,6 +201,24 @@ TEST(OpenCVEngineTest, GetPathForCompatiblePackage2) #endif } +TEST(OpenCVEngineTest, GetPathForCompatiblePackage3) +{ + sp<IOpenCVEngine> Engine = InitConnect(); + Starter.PackageManager->InstalledPackages.clear(); + Starter.PackageManager->InstallVersion(2040400, PLATFORM_TEGRA4, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON); + EXPECT_FALSE(NULL == Engine.get()); + String16 result = Engine->GetLibPathByVersion(String16("2.4")); +
#ifdef __SUPPORT_TEGRA3 + EXPECT_STREQ("/data/data/org.opencv.lib_v24_tegra4/lib", String8(result).string()); + #else + #ifdef __SUPPORT_ARMEABI_V7A_FEATURES + EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a_neon/lib", String8(result).string()); + #else + EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a/lib", String8(result).string()); + #endif + #endif +} + TEST(OpenCVEngineTest, InstallAndGetVersion) { sp<IOpenCVEngine> Engine = InitConnect(); diff --git a/android/service/engine/jni/Tests/PackageInfoTest.cpp b/android/service/engine/jni/Tests/PackageInfoTest.cpp index 6bc84856c..6cbb06943 100644 --- a/android/service/engine/jni/Tests/PackageInfoTest.cpp +++ b/android/service/engine/jni/Tests/PackageInfoTest.cpp @@ -85,6 +85,21 @@ TEST(PackageInfo, FullNameTegra3) #endif } +TEST(PackageInfo, FullNameTegra4) +{ + PackageInfo info(2040400, PLATFORM_TEGRA4, ARCH_ARMv7 | FEATURES_HAS_NEON); + string name = info.GetFullName(); + #ifdef __SUPPORT_TEGRA3 + EXPECT_STREQ("org.opencv.lib_v24_tegra4", name.c_str()); + #else + #ifdef __SUPPORT_ARMEABI_V7A_FEATURES + EXPECT_STREQ("org.opencv.lib_v24_armv7a_neon", name.c_str()); + #else + EXPECT_STREQ("org.opencv.lib_v24_armv7a", name.c_str()); + #endif + #endif +} + TEST(PackageInfo, FullNameX86SSE2) { PackageInfo info(2030000, PLATFORM_UNKNOWN, ARCH_X86 | FEATURES_HAS_SSE2); @@ -148,6 +163,13 @@ TEST(PackageInfo, Tegra3FromFullName) EXPECT_EQ(PLATFORM_TEGRA3, info.GetPlatform()); } +TEST(PackageInfo, Tegra4FromFullName) +{ + PackageInfo info("org.opencv.lib_v24_tegra4", "/data/data/org.opencv.lib_v24_tegra4"); + EXPECT_EQ(2040000, info.GetVersion()); + EXPECT_EQ(PLATFORM_TEGRA4, info.GetPlatform()); +} + #ifdef __SUPPORT_MIPS TEST(PackageInfo, MipsFromFullName) { diff --git a/android/service/engine/jni/Tests/PackageManagmentTest.cpp b/android/service/engine/jni/Tests/PackageManagmentTest.cpp index f9ccb7484..e21dcf760 100644 --- a/android/service/engine/jni/Tests/PackageManagmentTest.cpp +++ b/android/service/engine/jni/Tests/PackageManagmentTest.cpp @@ -102,6 +102,22 @@ TEST(PackageManager, GetPackagePathForTegra3) #endif } +TEST(PackageManager, GetPackagePathForTegra4) +{ + PackageManagerStub pm; + EXPECT_TRUE(pm.InstallVersion(2040400, PLATFORM_TEGRA4, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON)); + string path = pm.GetPackagePathByVersion(2040400, PLATFORM_TEGRA4, ARCH_ARMv7 | FEATURES_HAS_VFPv3 | FEATURES_HAS_NEON); + #ifdef __SUPPORT_TEGRA3 + EXPECT_STREQ("/data/data/org.opencv.lib_v24_tegra4/lib", path.c_str()); + #else + #ifdef __SUPPORT_ARMEABI_V7A_FEATURES + EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a_neon/lib", path.c_str()); + #else + EXPECT_STREQ("/data/data/org.opencv.lib_v24_armv7a/lib", path.c_str()); + #endif + #endif +} + #ifdef __SUPPORT_MIPS TEST(PackageManager, GetPackagePathForMips) { diff --git a/android/service/engine/src/org/opencv/engine/HardwareDetector.java b/android/service/engine/src/org/opencv/engine/HardwareDetector.java index 7e2a6135f..67320865a 100644 --- a/android/service/engine/src/org/opencv/engine/HardwareDetector.java +++ b/android/service/engine/src/org/opencv/engine/HardwareDetector.java @@ -33,6 +33,8 @@ public class HardwareDetector public static final int PLATFORM_TEGRA = 1; public static final int PLATFORM_TEGRA2 = 2; public static final int PLATFORM_TEGRA3 = 3; + public static final int PLATFORM_TEGRA4 = 4; + public static final int PLATFORM_UNKNOWN = 0; diff --git a/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java 
b/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java index fad279772..5213d9149 100644 --- a/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java +++ b/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java @@ -83,10 +83,14 @@ public class ManagerActivity extends Activity { HardwarePlatformView.setText("Tegra 2"); } - else + else if (HardwareDetector.PLATFORM_TEGRA3 == Platfrom) { HardwarePlatformView.setText("Tegra 3"); } + else + { + HardwarePlatformView.setText("Tegra 4"); + } } else { @@ -367,10 +371,10 @@ public class ManagerActivity extends Activity temp.put("Version", NormalizeVersion(OpenCVersion, VersionName)); // HACK: OpenCV Manager for Armv7-a Neon already has Tegra3 optimizations // that is enabled on proper hardware - if (HardwareDetector.DetectKnownPlatforms() == HardwareDetector.PLATFORM_TEGRA3 && + if (HardwareDetector.DetectKnownPlatforms() >= HardwareDetector.PLATFORM_TEGRA3 && HardwareName.equals("armv7a neon ") && Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) { - temp.put("Hardware", "Tegra 3"); + temp.put("Hardware", "Tegra"); if (Tags == null) { Tags = "optimized"; diff --git a/cmake/OpenCVDetectAndroidSDK.cmake b/cmake/OpenCVDetectAndroidSDK.cmake index 0e0240ca8..b125561d4 100644 --- a/cmake/OpenCVDetectAndroidSDK.cmake +++ b/cmake/OpenCVDetectAndroidSDK.cmake @@ -264,13 +264,23 @@ macro(add_android_project target path) ocv_list_filterout(android_proj_jni_files "\\\\.svn") if(android_proj_jni_files AND EXISTS ${path}/jni/Android.mk AND NOT DEFINED JNI_LIB_NAME) + # find local module name in Android.mk file to build native lib file(STRINGS "${path}/jni/Android.mk" JNI_LIB_NAME REGEX "LOCAL_MODULE[ ]*:=[ ]*.*" ) string(REGEX REPLACE "LOCAL_MODULE[ ]*:=[ ]*([a-zA-Z_][a-zA-Z_0-9]*)[ ]*" "\\1" JNI_LIB_NAME "${JNI_LIB_NAME}") + # find using of native app glue to determine native activity + file(STRINGS "${path}/jni/Android.mk" NATIVE_APP_GLUE REGEX ".*(call import-module,android/native_app_glue)" ) + if(JNI_LIB_NAME) ocv_include_modules_recurse(${android_proj_NATIVE_DEPS}) ocv_include_directories("${path}/jni") + if (NATIVE_APP_GLUE) + include_directories(${ANDROID_NDK}/sources/android/native_app_glue) + list(APPEND android_proj_jni_files ${ANDROID_NDK}/sources/android/native_app_glue/android_native_app_glue.c) + set(android_proj_NATIVE_DEPS ${android_proj_NATIVE_DEPS} android) + endif() + add_library(${JNI_LIB_NAME} MODULE ${android_proj_jni_files}) target_link_libraries(${JNI_LIB_NAME} ${OPENCV_LINKER_LIBS} ${android_proj_NATIVE_DEPS}) diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake index 12ab9d3ea..76f76ebc1 100644 --- a/cmake/OpenCVDetectOpenCL.cmake +++ b/cmake/OpenCVDetectOpenCL.cmake @@ -1,154 +1,104 @@ if(APPLE) set(OPENCL_FOUND YES) - set(OPENCL_LIBRARIES "-framework OpenCL") -else() + set(OPENCL_LIBRARY "-framework OpenCL" CACHE STRING "OpenCL library") + set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory") + mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY) +else(APPLE) find_package(OpenCL QUIET) - if(WITH_OPENCLAMDFFT) - set(CLAMDFFT_SEARCH_PATH $ENV{CLAMDFFT_PATH}) - if(NOT CLAMDFFT_SEARCH_PATH) - if(WIN32) - set( CLAMDFFT_SEARCH_PATH "C:\\Program Files (x86)\\AMD\\clAmdFft" ) - endif() - endif() - set( CLAMDFFT_INCLUDE_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}/include ) - if(UNIX) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(CLAMDFFT_LIB_SEARCH_PATH /usr/lib) - else() - set(CLAMDFFT_LIB_SEARCH_PATH /usr/lib64) - endif() - else() - 
if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(CLAMDFFT_LIB_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}\\lib32\\import) - else() - set(CLAMDFFT_LIB_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}\\lib64\\import) - endif() - endif() - find_path(CLAMDFFT_INCLUDE_DIR - NAMES clAmdFft.h - PATHS ${CLAMDFFT_INCLUDE_SEARCH_PATH} - PATH_SUFFIXES clAmdFft - NO_DEFAULT_PATH) - find_library(CLAMDFFT_LIBRARY - NAMES clAmdFft.Runtime - PATHS ${CLAMDFFT_LIB_SEARCH_PATH} - NO_DEFAULT_PATH) - if(CLAMDFFT_LIBRARY) - set(CLAMDFFT_LIBRARIES ${CLAMDFFT_LIBRARY}) - else() - set(CLAMDFFT_LIBRARIES "") - endif() - endif() - if(WITH_OPENCLAMDBLAS) - set(CLAMDBLAS_SEARCH_PATH $ENV{CLAMDBLAS_PATH}) - if(NOT CLAMDBLAS_SEARCH_PATH) - if(WIN32) - set( CLAMDBLAS_SEARCH_PATH "C:\\Program Files (x86)\\AMD\\clAmdBlas" ) - endif() - endif() - set( CLAMDBLAS_INCLUDE_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}/include ) - if(UNIX) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(CLAMDBLAS_LIB_SEARCH_PATH /usr/lib) - else() - set(CLAMDBLAS_LIB_SEARCH_PATH /usr/lib64) - endif() - else() - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(CLAMDBLAS_LIB_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}\\lib32\\import) - else() - set(CLAMDBLAS_LIB_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}\\lib64\\import) - endif() - endif() - find_path(CLAMDBLAS_INCLUDE_DIR - NAMES clAmdBlas.h - PATHS ${CLAMDBLAS_INCLUDE_SEARCH_PATH} - PATH_SUFFIXES clAmdBlas - NO_DEFAULT_PATH) - find_library(CLAMDBLAS_LIBRARY - NAMES clAmdBlas - PATHS ${CLAMDBLAS_LIB_SEARCH_PATH} - NO_DEFAULT_PATH) - if(CLAMDBLAS_LIBRARY) - set(CLAMDBLAS_LIBRARIES ${CLAMDBLAS_LIBRARY}) - else() - set(CLAMDBLAS_LIBRARIES "") - endif() - endif() - # Try AMD/ATI Stream SDK + if (NOT OPENCL_FOUND) - set(ENV_AMDSTREAMSDKROOT $ENV{AMDAPPSDKROOT}) - set(ENV_AMDAPPSDKROOT $ENV{AMDAPPSDKROOT}) - set(ENV_OPENCLROOT $ENV{OPENCLROOT}) - set(ENV_CUDA_PATH $ENV{CUDA_PATH}) - set(ENV_INTELOCLSDKROOT $ENV{INTELOCLSDKROOT}) - if(ENV_AMDSTREAMSDKROOT) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDAPPSDKROOT}/include) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDAPPSDKROOT}/lib/x86) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDAPPSDKROOT}/lib/x86_64) - endif() - elseif(ENV_AMDSTREAMSDKROOT) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDSTREAMSDKROOT}/include) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86_64) - endif() - elseif(ENV_CUDA_PATH AND WIN32) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_CUDA_PATH}/include) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/Win32) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/x64) - endif() - elseif(ENV_OPENCLROOT AND UNIX) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_OPENCLROOT}/inc) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib64) - endif() - elseif(ENV_INTELOCLSDKROOT) - set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_INTELOCLSDKROOT}/include) - if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_INTELOCLSDKROOT}/lib/x86) - else() - set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_INTELOCLSDKROOT}/lib/x64) - endif() + find_path(OPENCL_ROOT_DIR + NAMES OpenCL/cl.h CL/cl.h include/CL/cl.h include/nvidia-current/CL/cl.h + PATHS ENV OCLROOT 
ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT + DOC "OpenCL root directory" + NO_DEFAULT_PATH) + + find_path(OPENCL_INCLUDE_DIR + NAMES OpenCL/cl.h CL/cl.h + HINTS ${OPENCL_ROOT_DIR} + PATH_SUFFIXES include include/nvidia-current + DOC "OpenCL include directory") + + if (X86_64) + set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win64 lib/x86_64 lib/x64) + elseif (X86) + set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win32 lib/x86) endif() - if(OPENCL_INCLUDE_SEARCH_PATH) - find_path(OPENCL_INCLUDE_DIR - NAMES CL/cl.h OpenCL/cl.h - PATHS ${OPENCL_INCLUDE_SEARCH_PATH} - NO_DEFAULT_PATH) - else() - find_path(OPENCL_INCLUDE_DIR - NAMES CL/cl.h OpenCL/cl.h) - endif() - - if(OPENCL_LIB_SEARCH_PATH) - find_library(OPENCL_LIBRARY NAMES OpenCL PATHS ${OPENCL_LIB_SEARCH_PATH} NO_DEFAULT_PATH) - else() - find_library(OPENCL_LIBRARY NAMES OpenCL) - endif() + find_library(OPENCL_LIBRARY + NAMES OpenCL + HINTS ${OPENCL_ROOT_DIR} + PATH_SUFFIXES ${OPENCL_POSSIBLE_LIB_SUFFIXES} + DOC "OpenCL library") + mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY) include(FindPackageHandleStandardArgs) - find_package_handle_standard_args( - OPENCL - DEFAULT_MSG - OPENCL_LIBRARY OPENCL_INCLUDE_DIR - ) + FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL DEFAULT_MSG OPENCL_LIBRARY OPENCL_INCLUDE_DIR ) + endif() endif(APPLE) - if(OPENCL_FOUND) - set(OPENCL_LIBRARIES ${OPENCL_LIBRARY}) - set(HAVE_OPENCL 1) - else() - set(OPENCL_LIBRARIES) +if(OPENCL_FOUND) + set(HAVE_OPENCL 1) + set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR}) + set(OPENCL_LIBRARIES ${OPENCL_LIBRARY}) + + if (X86_64) + set(CLAMD_POSSIBLE_LIB_SUFFIXES lib64/import) + elseif (X86) + set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import) + endif() + + if(WITH_OPENCLAMDFFT) + find_path(CLAMDFFT_ROOT_DIR + NAMES include/clAmdFft.h + PATHS ENV CLAMDFFT_PATH ENV ProgramFiles + PATH_SUFFIXES clAmdFft AMD/clAmdFft + DOC "AMD FFT root directory" + NO_DEFAULT_PATH) + + find_path(CLAMDFFT_INCLUDE_DIR + NAMES clAmdFft.h + HINTS ${CLAMDFFT_ROOT_DIR} + PATH_SUFFIXES include + DOC "clAmdFft include directory") + + find_library(CLAMDFFT_LIBRARY + NAMES clAmdFft.Runtime + HINTS ${CLAMDFFT_ROOT_DIR} + PATH_SUFFIXES ${CLAMD_POSSIBLE_LIB_SUFFIXES} + DOC "clAmdFft library") + + if(CLAMDFFT_LIBRARY AND CLAMDFFT_INCLUDE_DIR) + set(HAVE_CLAMDFFT 1) + list(APPEND OPENCL_INCLUDE_DIRS "${CLAMDFFT_INCLUDE_DIR}") + list(APPEND OPENCL_LIBRARIES "${CLAMDFFT_LIBRARY}") + endif() + endif() + + if(WITH_OPENCLAMDBLAS) + find_path(CLAMDBLAS_ROOT_DIR + NAMES include/clAmdBlas.h + PATHS ENV CLAMDBLAS_PATH ENV ProgramFiles + PATH_SUFFIXES clAmdBlas AMD/clAmdBlas + DOC "AMD BLAS root directory" + NO_DEFAULT_PATH) + + find_path(CLAMDBLAS_INCLUDE_DIR + NAMES clAmdBlas.h + HINTS ${CLAMDBLAS_ROOT_DIR} + PATH_SUFFIXES include + DOC "clAmdBlas include directory") + + find_library(CLAMDBLAS_LIBRARY + NAMES clAmdBlas + HINTS ${CLAMDBLAS_ROOT_DIR} + PATH_SUFFIXES ${CLAMD_POSSIBLE_LIB_SUFFIXES} + DOC "clAmdBlas library") + + if(CLAMDBLAS_LIBRARY AND CLAMDBLAS_INCLUDE_DIR) + set(HAVE_CLAMDBLAS 1) + list(APPEND OPENCL_INCLUDE_DIRS "${CLAMDBLAS_INCLUDE_DIR}") + list(APPEND OPENCL_LIBRARIES "${CLAMDBLAS_LIBRARY}") endif() - else() - set(HAVE_OPENCL 1) endif() endif() diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index 48aa7139a..75f91a1a2 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -444,6 +444,18 @@ macro(ocv_glob_module_sources) source_group("Src\\Cuda" FILES ${lib_device_srcs} ${lib_device_hdrs}) endif() + file(GLOB cl_kernels "src/opencl/*.cl") + + if(HAVE_OPENCL AND cl_kernels) + 
ocv_include_directories(${OPENCL_INCLUDE_DIRS}) + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" + COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake" + DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake") + source_group("Src\\OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp") + list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp") + endif() + ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} SOURCES ${lib_srcs} ${lib_int_hdrs} ${device_objs} ${lib_device_srcs} ${lib_device_hdrs}) @@ -465,6 +477,9 @@ macro(ocv_create_module) if (HAVE_CUDA) target_link_libraries(${the_module} ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) endif() + if(HAVE_OPENCL AND OPENCL_LIBRARIES) + target_link_libraries(${the_module} ${OPENCL_LIBRARIES}) + endif() endif() add_dependencies(opencv_modules ${the_module}) diff --git a/modules/ocl/cl2cpp.cmake b/cmake/cl2cpp.cmake similarity index 100% rename from modules/ocl/cl2cpp.cmake rename to cmake/cl2cpp.cmake diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index 158881bd9..0955a1a3a 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -195,9 +195,9 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero ) d1 = _mm_sub_epi16(d1, c1); __m128i c2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z); - __m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z); + __m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x - 1)), z); __m128i d2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z); - __m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z); + __m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x + 1)), z); d2 = _mm_sub_epi16(d2, c2); d3 = _mm_sub_epi16(d3, c3); diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index bf376c875..53ea2d40e 100644 --- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -531,12 +531,12 @@ template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double #endif template<typename _Tp> void -JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, int m, int n, int n1, double minval) +JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, + int m, int n, int n1, double minval, _Tp eps) { VBLAS<_Tp> vblas; AutoBuffer<double> Wbuf(n); double* W = Wbuf; - _Tp eps = DBL_EPSILON*10; int i, j, k, iter, max_iter = std::max(m, 30); _Tp c, s; double sd; @@ -729,12 +729,12 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, int m, int static void JacobiSVD(float* At, size_t astep, float* W, float* Vt, size_t vstep, int m, int n, int n1=-1) { - JacobiSVDImpl_(At, astep, W, Vt, vstep, m, n, !Vt ? 0 : n1 < 0 ? n : n1, FLT_MIN); + JacobiSVDImpl_(At, astep, W, Vt, vstep, m, n, !Vt ? 0 : n1 < 0 ? n : n1, FLT_MIN, FLT_EPSILON*2); } static void JacobiSVD(double* At, size_t astep, double* W, double* Vt, size_t vstep, int m, int n, int n1=-1) { - JacobiSVDImpl_(At, astep, W, Vt, vstep, m, n, !Vt ? 0 : n1 < 0 ? 
n : n1, DBL_MIN, DBL_EPSILON*10); } /* y[0:m,0:n] += diag(a[0:1,0:m]) * x[0:m,0:n] */ diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index e02f78c54..bbe754b33 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -2599,6 +2599,35 @@ TEST(Core_Trace, accuracy) { Core_TraceTest test; test.safe_run(); } TEST(Core_SolvePoly, accuracy) { Core_SolvePolyTest test; test.safe_run(); } TEST(Core_Phase, accuracy) { Core_PhaseTest test; test.safe_run(); } + +TEST(Core_SVD, flt) +{ + float a[] = { + 1.23377746e+011f, -7.05490125e+010f, -4.18380882e+010f, -11693456.f, + -39091328.f, 77492224.f, -7.05490125e+010f, 2.36211143e+011f, + -3.51093473e+010f, 70773408.f, -4.83386156e+005f, -129560368.f, + -4.18380882e+010f, -3.51093473e+010f, 9.25311222e+010f, -49052424.f, + 43922752.f, 12176842.f, -11693456.f, 70773408.f, -49052424.f, 8.40836094e+004f, + 5.17475293e+003f, -1.16122949e+004f, -39091328.f, -4.83386156e+005f, + 43922752.f, 5.17475293e+003f, 5.16047969e+004f, 5.68887842e+003f, 77492224.f, + -129560368.f, 12176842.f, -1.16122949e+004f, 5.68887842e+003f, + 1.28060578e+005f + }; + + float b[] = { + 283751232.f, 2.61604198e+009f, -745033216.f, 2.31125625e+005f, + -4.52429188e+005f, -1.37596525e+006f + }; + + Mat A(6, 6, CV_32F, a); + Mat B(6, 1, CV_32F, b); + Mat X, B1; + solve(A, B, X, DECOMP_SVD); + B1 = A*X; + EXPECT_LE(norm(B1, B, NORM_L2 + NORM_RELATIVE), FLT_EPSILON*10); +} + + // TODO: eigenvv, invsqrt, cbrt, fastarctan, (round, floor, ceil(?)), diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index 26f806fb8..9db4e5f09 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -3,7 +3,7 @@ if(ANDROID OR IOS) endif() set(the_description "GPU-accelerated Computer Vision") -ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree opencv_photo opencv_legacy) +ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy) ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda") diff --git a/modules/gpu/doc/feature_detection_and_description.rst b/modules/gpu/doc/feature_detection_and_description.rst index 0c4caf77c..8a0288e15 100644 --- a/modules/gpu/doc/feature_detection_and_description.rst +++ b/modules/gpu/doc/feature_detection_and_description.rst @@ -5,109 +5,6 @@ Feature Detection and Description -gpu::SURF_GPU ------------- -.. ocv:class:: gpu::SURF_GPU -Class used for extracting Speeded Up Robust Features (SURF) from an image. :: - class SURF_GPU - { - public: - enum KeypointLayout - { - X_ROW = 0, - Y_ROW, - LAPLACIAN_ROW, - OCTAVE_ROW, - SIZE_ROW, - ANGLE_ROW, - HESSIAN_ROW, - ROWS_COUNT - }; - - //! the default constructor - SURF_GPU(); - //! the full constructor taking all the necessary parameters - explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4, - int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f); - - //! returns the descriptor size in float's (64 or 128) - int descriptorSize() const; - - //! upload host keypoints to device memory - void uploadKeypoints(const vector<KeyPoint>& keypoints, - GpuMat& keypointsGPU); - //! download keypoints from device to host memory - void downloadKeypoints(const GpuMat& keypointsGPU, - vector<KeyPoint>& keypoints); - - //! 
download descriptors from device to host memory - void downloadDescriptors(const GpuMat& descriptorsGPU, - vector<float>& descriptors); - - void operator()(const GpuMat& img, const GpuMat& mask, - GpuMat& keypoints); - - void operator()(const GpuMat& img, const GpuMat& mask, - GpuMat& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints = false, - bool calcOrientation = true); - - void operator()(const GpuMat& img, const GpuMat& mask, - std::vector<KeyPoint>& keypoints); - - void operator()(const GpuMat& img, const GpuMat& mask, - std::vector<KeyPoint>& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints = false, - bool calcOrientation = true); - - void operator()(const GpuMat& img, const GpuMat& mask, - std::vector<KeyPoint>& keypoints, - std::vector<float>& descriptors, - bool useProvidedKeypoints = false, - bool calcOrientation = true); - - void releaseMemory(); - - // SURF parameters - double hessianThreshold; - int nOctaves; - int nOctaveLayers; - bool extended; - bool upright; - - //! max keypoints = keypointsRatio * img.size().area() - float keypointsRatio; - - GpuMat sum, mask1, maskSum, intBuffer; - - GpuMat det, trace; - - GpuMat maxPosBuffer; - }; - - -The class ``SURF_GPU`` implements Speeded Up Robust Features descriptor. There is a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option). But the descriptors can also be computed for the user-specified keypoints. Only 8-bit grayscale images are supported. - -The class ``SURF_GPU`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``GpuMat``. The ``keypoints`` matrix is :math:`\texttt{nFeatures} \times 7` matrix with the ``CV_32FC1`` type. - -* ``keypoints.ptr<float>(X_ROW)[i]`` contains x coordinate of the i-th feature. -* ``keypoints.ptr<float>(Y_ROW)[i]`` contains y coordinate of the i-th feature. -* ``keypoints.ptr<float>(LAPLACIAN_ROW)[i]`` contains the laplacian sign of the i-th feature. -* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i-th feature. -* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i-th feature. -* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contain orientation of the i-th feature. -* ``keypoints.ptr<float>(HESSIAN_ROW)[i]`` contains the response of the i-th feature. -The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type. -The class ``SURF_GPU`` uses some buffers and provides access to it. All buffers can be safely released between function calls. -.. seealso:: :ocv:class:`SURF` - - - gpu::FAST_GPU ------------- .. ocv:class:: gpu::FAST_GPU diff --git a/modules/gpu/doc/video.rst b/modules/gpu/doc/video.rst index fc5b1fb6c..284bb17fa 100644 --- a/modules/gpu/doc/video.rst +++ b/modules/gpu/doc/video.rst @@ -579,76 +579,6 @@ Releases all inner buffer's memory. -gpu::VIBE_GPU ------------- .. ocv:class:: gpu::VIBE_GPU -Class used for background/foreground segmentation. :: - class VIBE_GPU - { - public: - explicit VIBE_GPU(unsigned long rngSeed = 1234567); - - void initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null()); - - void operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null()); - - void release(); - - ... - }; - -The class discriminates between foreground and background pixels by building and maintaining a model of the background. 
Any pixel which does not fit this model is then deemed to be foreground. The class implements algorithm described in [VIBE2011]_. - - - -gpu::VIBE_GPU::VIBE_GPU ------------------------ -The constructor. - -.. ocv:function:: gpu::VIBE_GPU::VIBE_GPU(unsigned long rngSeed = 1234567) - - :param rngSeed: Value used to initiate a random sequence. - -Default constructor sets all parameters to default values. - - - -gpu::VIBE_GPU::initialize -------------------------- -Initialize background model and allocates all inner buffers. - -.. ocv:function:: void gpu::VIBE_GPU::initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null()) - - :param firstFrame: First frame from video sequence. - - :param stream: Stream for the asynchronous version. - - - -gpu::VIBE_GPU::operator() -------------------------- -Updates the background model and returns the foreground mask - -.. ocv:function:: void gpu::VIBE_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null()) - - :param frame: Next video frame. - - :param fgmask: The output foreground mask as an 8-bit binary image. - - :param stream: Stream for the asynchronous version. - - - -gpu::VIBE_GPU::release ----------------------- -Releases all inner buffer's memory. - -.. ocv:function:: void gpu::VIBE_GPU::release() - - - gpu::GMG_GPU ------------ .. ocv:class:: gpu::GMG_GPU @@ -1209,5 +1139,4 @@ Parse next video frame. Implementation must call this method after new frame was .. [MOG2001] P. KadewTraKuPong and R. Bowden. *An improved adaptive background mixture model for real-time tracking with shadow detection*. Proc. 2nd European Workshop on Advanced Video-Based Surveillance Systems, 2001 .. [MOG2004] Z. Zivkovic. *Improved adaptive Gausian mixture model for background subtraction*. International Conference Pattern Recognition, UK, August, 2004 .. [ShadowDetect2003] Prati, Mikic, Trivedi and Cucchiarra. *Detecting Moving Shadows...*. IEEE PAMI, 2003 -.. [VIBE2011] O. Barnich and M. Van D Roogenbroeck. *ViBe: A universal background subtraction algorithm for video sequences*. IEEE Transactions on Image Processing, 20(6) :1709-1724, June 2011 .. [GMG2012] A. Godbehere, A. Matsukawa and K. Goldberg. *Visual Tracking of Human Visitors under Variable-Lighting Conditions for a Responsive Audio Art Installation*. American Control Conference, Montreal, June 2012 diff --git a/modules/gpu/include/opencv2/gpu.hpp b/modules/gpu/include/opencv2/gpu.hpp index 21a03dc20..e0933342b 100644 --- a/modules/gpu/include/opencv2/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu.hpp @@ -491,6 +491,26 @@ CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& //! converts image from one color space to another CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null()); +enum +{ + // Bayer Demosaicing (Malvar, He, and Cutler) + COLOR_BayerBG2BGR_MHT = 256, + COLOR_BayerGB2BGR_MHT = 257, + COLOR_BayerRG2BGR_MHT = 258, + COLOR_BayerGR2BGR_MHT = 259, + + COLOR_BayerBG2RGB_MHT = COLOR_BayerRG2BGR_MHT, + COLOR_BayerGB2RGB_MHT = COLOR_BayerGR2BGR_MHT, + COLOR_BayerRG2RGB_MHT = COLOR_BayerBG2BGR_MHT, + COLOR_BayerGR2RGB_MHT = COLOR_BayerGB2BGR_MHT, + + COLOR_BayerBG2GRAY_MHT = 260, + COLOR_BayerGB2GRAY_MHT = 261, + COLOR_BayerRG2GRAY_MHT = 262, + COLOR_BayerGR2GRAY_MHT = 263 +}; +CV_EXPORTS void demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn = -1, Stream& stream = Stream::Null()); + //! swap channels //! 
dstOrder - Integer array describing how channel values are permutated. The n-th entry +//! of the array contains the number of the channel that is stored in the n-th channel of @@ -894,9 +914,11 @@ CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels //! Calculates histogram for 8u one channel image //! Output hist will have one row, 256 cols and CV32SC1 type. CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null()); +CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); //! normalizes the grayscale image brightness and contrast by normalizing its histogram CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); +CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null()); CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); //////////////////////////////// StereoBM_GPU //////////////////////////////// @@ -1386,82 +1408,6 @@ private: friend class CascadeClassifier_GPU_LBP; }; -////////////////////////////////// SURF ////////////////////////////////////////// -class CV_EXPORTS SURF_GPU -{ -public: - enum KeypointLayout - { - X_ROW = 0, - Y_ROW, - LAPLACIAN_ROW, - OCTAVE_ROW, - SIZE_ROW, - ANGLE_ROW, - HESSIAN_ROW, - ROWS_COUNT - }; - - //! the default constructor - SURF_GPU(); - //! the full constructor taking all the necessary parameters - explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4, - int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false); - - //! returns the descriptor size in float's (64 or 128) - int descriptorSize() const; - - //! upload host keypoints to device memory - static void uploadKeypoints(const std::vector<KeyPoint>& keypoints, GpuMat& keypointsGPU); - //! download keypoints from device to host memory - static void downloadKeypoints(const GpuMat& keypointsGPU, std::vector<KeyPoint>& keypoints); - - //! download descriptors from device to host memory - static void downloadDescriptors(const GpuMat& descriptorsGPU, std::vector<float>& descriptors); - - //! finds the keypoints using fast hessian detector used in SURF - //! supports CV_8UC1 images - //! keypoints will have nFeature cols and 6 rows - //! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature - //! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature - //! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature - //! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature - //! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature - //! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature - //! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature - void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints); - //! finds the keypoints and computes their descriptors. - //! 
Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction - void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints = false); - - void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints); - void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints = false); - - void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors, - bool useProvidedKeypoints = false); - - void releaseMemory(); - - // SURF parameters - double hessianThreshold; - int nOctaves; - int nOctaveLayers; - bool extended; - bool upright; - - //! max keypoints = min(keypointsRatio * img.size().area(), 65535) - float keypointsRatio; - - GpuMat sum, mask1, maskSum, intBuffer; - - GpuMat det, trace; - - GpuMat maxPosBuffer; -}; - ////////////////////////////////// FAST ////////////////////////////////////////// class CV_EXPORTS FAST_GPU @@ -2129,41 +2075,6 @@ private: GpuMat bgmodelUsedModes_; //keep track of number of modes per pixel }; -/*! - * The class implements the following algorithm: - * "ViBe: A universal background subtraction algorithm for video sequences" - * O. Barnich and M. Van D Roogenbroeck - * IEEE Transactions on Image Processing, 20(6) :1709-1724, June 2011 - */ -class CV_EXPORTS VIBE_GPU -{ -public: - //! the default constructor - explicit VIBE_GPU(unsigned long rngSeed = 1234567); - - //! re-initiaization method - void initialize(const GpuMat& firstFrame, Stream& stream = Stream::Null()); - - //! the update operator - void operator()(const GpuMat& frame, GpuMat& fgmask, Stream& stream = Stream::Null()); - - //! releases all inner buffers - void release(); - - int nbSamples; // number of samples per pixel - int reqMatches; // #_min - int radius; // R - int subsamplingFactor; // amount of random subsampling - -private: - Size frameSize_; - - unsigned long rngSeed_; - GpuMat randStates_; - - GpuMat samples_; -}; - /** * Background Subtractor module. Takes a series of images and returns a sequence of mask (8UC1) * images of the same size, where 255 indicates Foreground and 0 represents Background. diff --git a/modules/gpu/include/opencv2/gpu/device/simd_functions.hpp b/modules/gpu/include/opencv2/gpu/device/simd_functions.hpp new file mode 100644 index 000000000..55b1f247f --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/simd_functions.hpp @@ -0,0 +1,910 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2010-2013, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +/* + * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * Neither the name of NVIDIA Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __OPENCV_GPU_SIMD_FUNCTIONS_HPP__ +#define __OPENCV_GPU_SIMD_FUNCTIONS_HPP__ + +#include "common.hpp" + +/* + This header file contains inline functions that implement intra-word SIMD + operations, that are hardware accelerated on sm_3x (Kepler) GPUs. Efficient + emulation code paths are provided for earlier architectures (sm_1x, sm_2x) + to make the code portable across all GPUs supported by CUDA. 
The following + functions are currently implemented: + + vadd2(a,b) per-halfword unsigned addition, with wrap-around: a + b + vsub2(a,b) per-halfword unsigned subtraction, with wrap-around: a - b + vabsdiff2(a,b) per-halfword unsigned absolute difference: |a - b| + vavg2(a,b) per-halfword unsigned average: (a + b) / 2 + vavrg2(a,b) per-halfword unsigned rounded average: (a + b + 1) / 2 + vseteq2(a,b) per-halfword unsigned comparison: a == b ? 1 : 0 + vcmpeq2(a,b) per-halfword unsigned comparison: a == b ? 0xffff : 0 + vsetge2(a,b) per-halfword unsigned comparison: a >= b ? 1 : 0 + vcmpge2(a,b) per-halfword unsigned comparison: a >= b ? 0xffff : 0 + vsetgt2(a,b) per-halfword unsigned comparison: a > b ? 1 : 0 + vcmpgt2(a,b) per-halfword unsigned comparison: a > b ? 0xffff : 0 + vsetle2(a,b) per-halfword unsigned comparison: a <= b ? 1 : 0 + vcmple2(a,b) per-halfword unsigned comparison: a <= b ? 0xffff : 0 + vsetlt2(a,b) per-halfword unsigned comparison: a < b ? 1 : 0 + vcmplt2(a,b) per-halfword unsigned comparison: a < b ? 0xffff : 0 + vsetne2(a,b) per-halfword unsigned comparison: a != b ? 1 : 0 + vcmpne2(a,b) per-halfword unsigned comparison: a != b ? 0xffff : 0 + vmax2(a,b) per-halfword unsigned maximum: max(a, b) + vmin2(a,b) per-halfword unsigned minimum: min(a, b) + + vadd4(a,b) per-byte unsigned addition, with wrap-around: a + b + vsub4(a,b) per-byte unsigned subtraction, with wrap-around: a - b + vabsdiff4(a,b) per-byte unsigned absolute difference: |a - b| + vavg4(a,b) per-byte unsigned average: (a + b) / 2 + vavrg4(a,b) per-byte unsigned rounded average: (a + b + 1) / 2 + vseteq4(a,b) per-byte unsigned comparison: a == b ? 1 : 0 + vcmpeq4(a,b) per-byte unsigned comparison: a == b ? 0xff : 0 + vsetge4(a,b) per-byte unsigned comparison: a >= b ? 1 : 0 + vcmpge4(a,b) per-byte unsigned comparison: a >= b ? 0xff : 0 + vsetgt4(a,b) per-byte unsigned comparison: a > b ? 1 : 0 + vcmpgt4(a,b) per-byte unsigned comparison: a > b ? 0xff : 0 + vsetle4(a,b) per-byte unsigned comparison: a <= b ? 1 : 0 + vcmple4(a,b) per-byte unsigned comparison: a <= b ? 0xff : 0 + vsetlt4(a,b) per-byte unsigned comparison: a < b ? 1 : 0 + vcmplt4(a,b) per-byte unsigned comparison: a < b ? 0xff : 0 + vsetne4(a,b) per-byte unsigned comparison: a != b ? 1: 0 + vcmpne4(a,b) per-byte unsigned comparison: a != b ? 
0xff: 0 + vmax4(a,b) per-byte unsigned maximum: max(a, b) + vmin4(a,b) per-byte unsigned minimum: min(a, b) +*/ + +namespace cv { namespace gpu { namespace device +{ + // 2 + + static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = a ^ b; // sum bits + r = a + b; // actual sum + s = s ^ r; // determine carry-ins for each bit position + s = s & 0x00010000; // carry-in to high word (= carry-out from low word) + r = r - s; // subtract out carry-out from low word + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = a ^ b; // sum bits + r = a - b; // actual sum + s = s ^ r; // determine carry-ins for each bit position + s = s & 0x00010000; // borrow to high word + r = r + s; // compensate for borrow from low word + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t, u, v; + s = a & 0x0000ffff; // extract low halfword + r = b & 0x0000ffff; // extract low halfword + u = ::max(r, s); // maximum of low halfwords + v = ::min(r, s); // minimum of low halfwords + s = a & 0xffff0000; // extract high halfword + r = b & 0xffff0000; // extract high halfword + t = ::max(r, s); // maximum of high halfwords + s = ::min(r, s); // minimum of high halfwords + r = u | t; // maximum of both halfwords + s = v | s; // minimum of both halfwords + r = r - s; // |a - b| = max(a,b) - min(a,b); + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b) + { + unsigned int r, s; + + // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==> + // (a + b) / 2 = (a & b) + ((a ^ b) >> 1) + s = a ^ b; + r = a & b; + s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries + s = s >> 1; + s = r + s; + + return s; + } + + static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==> + // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1) + unsigned int s; + s = a ^ b; + r = a | b; + s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries + s = s >> 1; + r = r - s; + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, 
unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + unsigned int c; + r = a ^ b; // 0x0000 if a == b + c = r | 0x80008000; // set msbs, to catch carry out + r = r ^ c; // extract msbs, msb = 1 if r < 0x8000 + c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000 + c = r & ~c; // msb = 1, if r was 0x0000 + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vseteq2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + r = a ^ b; // 0x0000 if a == b + c = r | 0x80008000; // set msbs, to catch carry out + r = r ^ c; // extract msbs, msb = 1 if r < 0x8000 + c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000 + c = r & ~c; // msb = 1, if r was 0x0000 + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2 + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetge2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavrg2(a, b); // (a + ~b + 1) / 2 = (a - b) / 2 + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down] + c = c & 0x80008000; // msbs = carry-outs + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetgt2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavg2(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down] + c = c & 0x80008000; // msbs = carry-outs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2 + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert to bool + #endif + + 
return r; + } + + static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetle2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavrg2(a, b); // (b + ~a + 1) / 2 = (b - a) / 2 + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down] + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetlt2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavg2(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down] + c = c & 0x80008000; // msb = carry-outs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + unsigned int c; + r = a ^ b; // 0x0000 if a == b + c = r | 0x80008000; // set msbs, to catch carry out + c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000 + c = r | c; // msb = 1, if r was not 0x0000 + c = c & 0x80008000; // extract msbs + r = c >> 15; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetne2(a, b); + c = r << 16; // convert bool + r = c - r; // into mask + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + r = a ^ b; // 0x0000 if a == b + c = r | 0x80008000; // set msbs, to catch carry out + c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000 + c = r | c; // msb = 1, if r was not 0x0000 + c = c & 0x80008000; // extract msbs + r = c >> 15; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t, u; + r = a & 0x0000ffff; // extract low halfword + s = b & 0x0000ffff; // extract low halfword + t = ::max(r, s); // maximum of low halfwords + r = a & 0xffff0000; // extract high halfword + s = b & 0xffff0000; // extract high halfword + u = ::max(r, s); // maximum of high halfwords + r = t | u; // combine halfword maximums + #endif + + return r; + } + 
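+    // A short usage sketch (illustrative only; the kernel below is a
+    // hypothetical example, not something this header provides): each 32-bit
+    // word packs two unsigned halfwords, so one thread can combine two
+    // 16-bit elements per call.
+    //
+    //   __global__ void max2Kernel(const unsigned int* a, const unsigned int* b,
+    //                              unsigned int* dst, int n)
+    //   {
+    //       const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    //       if (i < n)
+    //           dst[i] = cv::gpu::device::vmax2(a[i], b[i]); // per-halfword max
+    //   }
+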
+ static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t, u; + r = a & 0x0000ffff; // extract low halfword + s = b & 0x0000ffff; // extract low halfword + t = ::min(r, s); // minimum of low halfwords + r = a & 0xffff0000; // extract high halfword + s = b & 0xffff0000; // extract high halfword + u = ::min(r, s); // minimum of high halfwords + r = t | u; // combine halfword minimums + #endif + + return r; + } + + // 4 + + static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t; + s = a ^ b; // sum bits + r = a & 0x7f7f7f7f; // clear msbs + t = b & 0x7f7f7f7f; // clear msbs + s = s & 0x80808080; // msb sum bits + r = r + t; // add without msbs, record carry-out in msbs + r = r ^ s; // sum of msb sum and carry-in bits, w/o carry-out + #endif /* __CUDA_ARCH__ >= 300 */ + + return r; + } + + static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s, t; + s = a ^ ~b; // inverted sum bits + r = a | 0x80808080; // set msbs + t = b & 0x7f7f7f7f; // clear msbs + s = s & 0x80808080; // inverted msb sum bits + r = r - t; // subtract w/o msbs, record inverted borrows in msb + r = r ^ s; // combine inverted msb sum bits and borrows + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b) + { + unsigned int r, s; + + // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==> + // (a + b) / 2 = (a & b) + ((a ^ b) >> 1) + s = a ^ b; + r = a & b; + s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries + s = s >> 1; + s = r + s; + + return s; + } + + static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==> + // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1) + unsigned int c; + c = a ^ b; + r = a | b; + c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries + c = c >> 
1; + r = r - c; + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + unsigned int c; + r = a ^ b; // 0x00 if a == b + c = r | 0x80808080; // set msbs, to catch carry out + r = r ^ c; // extract msbs, msb = 1 if r < 0x80 + c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80 + c = r & ~c; // msb = 1, if r was 0x00 + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b) + { + unsigned int r, t; + + #if __CUDA_ARCH__ >= 300 + r = vseteq4(a, b); + t = r << 8; // convert bool + r = t - r; // to mask + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + t = a ^ b; // 0x00 if a == b + r = t | 0x80808080; // set msbs, to catch carry out + t = t ^ r; // extract msbs, msb = 1 if t < 0x80 + r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80 + r = t & ~r; // msb = 1, if t was 0x00 + t = r >> 7; // build mask + t = r - t; // from + r = t | r; // msbs + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2 + c = c & 0x80808080; // msb = carry-outs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetle4(a, b); + c = r << 8; // convert bool + r = c - r; // to mask + #else + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavrg4(a, b); // (b + ~a + 1) / 2 = (b - a) / 2 + c = c & 0x80808080; // msbs = carry-outs + r = c >> 7; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down] + c = c & 0x80808080; // msb = carry-outs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetlt4(a, b); + c = r << 8; // convert bool + r = c - r; // to mask + #else + asm("not.b32 %0, %0;" : "+r"(a)); + c = vavg4(a, b); // (b + ~a) / 2 = (b - a) / 2 [rounded down] + c = c & 0x80808080; // msbs = carry-outs + r = c >> 7; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavrg4(a, b); // (a + ~b + 1) / 2 = (a - b) / 2 + c = c & 
0x80808080; // msb = carry-outs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b) + { + unsigned int r, s; + + #if __CUDA_ARCH__ >= 300 + r = vsetge4(a, b); + s = r << 8; // convert bool + r = s - r; // to mask + #else + asm ("not.b32 %0,%0;" : "+r"(b)); + r = vavrg4 (a, b); // (a + ~b + 1) / 2 = (a - b) / 2 + r = r & 0x80808080; // msb = carry-outs + s = r >> 7; // build mask + s = r - s; // from + r = s | r; // msbs + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int c; + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down] + c = c & 0x80808080; // msb = carry-outs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetgt4(a, b); + c = r << 8; // convert bool + r = c - r; // to mask + #else + asm("not.b32 %0, %0;" : "+r"(b)); + c = vavg4(a, b); // (a + ~b) / 2 = (a - b) / 2 [rounded down] + c = c & 0x80808080; // msb = carry-outs + r = c >> 7; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + unsigned int c; + r = a ^ b; // 0x00 if a == b + c = r | 0x80808080; // set msbs, to catch carry out + c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80 + c = r | c; // msb = 1, if r was not 0x00 + c = c & 0x80808080; // extract msbs + r = c >> 7; // convert to bool + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b) + { + unsigned int r, c; + + #if __CUDA_ARCH__ >= 300 + r = vsetne4(a, b); + c = r << 8; // convert bool + r = c - r; // to mask + #else + // inspired by Alan Mycroft's null-byte detection algorithm: + // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080)) + r = a ^ b; // 0x00 if a == b + c = r | 0x80808080; // set msbs, to catch carry out + c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80 + c = r | c; // msb = 1, if r was not 0x00 + c = c & 0x80808080; // extract msbs + r = c >> 7; // convert + r = c - r; // msbs to + r = c | r; // mask + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = vcmpge4(a, b); // mask = 0xff if a >= b + r = a ^ b; // + s = (r & s) ^ b; // select a 
when a >= b, else select b => max(a,b) + r = s ^ r; // select a when b >= a, else select b => min(a,b) + r = s - r; // |a - b| = max(a,b) - min(a,b); + #endif + + return r; + } + + static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = vcmpge4(a, b); // mask = 0xff if a >= b + r = a & s; // select a when b >= a + s = b & ~s; // select b when b < a + r = r | s; // combine byte selections + #endif + + return r; // byte-wise unsigned maximum + } + + static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b) + { + unsigned int r = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #elif __CUDA_ARCH__ >= 200 + asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); + #else + unsigned int s; + s = vcmpge4(b, a); // mask = 0xff if a >= b + r = a & s; // select a when b >= a + s = b & ~s; // select b when b < a + r = r | s; // combine byte selections + #endif + + return r; + } +}}} + +#endif // __OPENCV_GPU_SIMD_FUNCTIONS_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index d82211bf3..159f4b968 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -7,7 +7,7 @@ // copy or use the software. // // -// License Agreement +// License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 
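Editorial aside: the header closed above exposes vadd4 and friends as plain device functions in cv::gpu::device, so a kernel can process four packed 8-bit pixels per 32-bit word. The following is a minimal usage sketch, not part of the patch; the kernel name, launch shape, and the assumption that the byte buffers are 4-byte aligned and padded to whole words are all illustrative.

    #include "opencv2/gpu/device/simd_functions.hpp"

    // Hypothetical kernel: per-byte saturating add of two CV_8UC1 buffers that
    // have been reinterpreted as arrays of n4 32-bit words.
    __global__ void addBytesSat(const unsigned int* a, const unsigned int* b,
                                unsigned int* dst, int n4)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n4)
            dst[i] = cv::gpu::device::vadd4(a[i], b[i]); // 4 uchar lanes per call
    }
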
diff --git a/modules/gpu/perf/perf_calib3d.cpp b/modules/gpu/perf/perf_calib3d.cpp index b174d9a12..8019c0349 100644 --- a/modules/gpu/perf/perf_calib3d.cpp +++ b/modules/gpu/perf/perf_calib3d.cpp @@ -2,6 +2,7 @@ using namespace std; using namespace testing; +using namespace perf; ////////////////////////////////////////////////////////////////////// // StereoBM @@ -12,7 +13,7 @@ DEF_PARAM_TEST_1(ImagePair, pair_string); PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png"))) { - declare.time(5.0); + declare.time(300.0); const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE); ASSERT_FALSE(imgLeft.empty()); @@ -53,7 +54,7 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBM, PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, Values(pair_string("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png"))) { - declare.time(10.0); + declare.time(300.0); const cv::Mat imgLeft = readImage(GET_PARAM(0)); ASSERT_FALSE(imgLeft.empty()); @@ -87,7 +88,7 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP, Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png"))) { - declare.time(10.0); + declare.time(300.0); const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE); ASSERT_FALSE(imgLeft.empty());
diff --git a/modules/gpu/perf/perf_core.cpp b/modules/gpu/perf/perf_core.cpp index 22840f9f4..70bb8f24f 100644 --- a/modules/gpu/perf/perf_core.cpp +++ b/modules/gpu/perf/perf_core.cpp @@ -1748,7 +1748,10 @@ PERF_TEST_P(Sz_Depth_Norm, Core_Norm, const int normType = GET_PARAM(2); cv::Mat src(size, depth); - declare.in(src, WARMUP_RNG); + if (depth == CV_8U) + cv::randu(src, 0, 254); + else + declare.in(src, WARMUP_RNG); if (PERF_RUN_GPU()) { @@ -1923,7 +1926,10 @@ PERF_TEST_P(Sz_Depth, Core_MinMax, const int depth = GET_PARAM(1); cv::Mat src(size, depth); - declare.in(src, WARMUP_RNG); + if (depth == CV_8U) + cv::randu(src, 0, 254); + else + declare.in(src, WARMUP_RNG); if (PERF_RUN_GPU()) { @@ -1958,7 +1964,10 @@ PERF_TEST_P(Sz_Depth, Core_MinMaxLoc, const int depth = GET_PARAM(1); cv::Mat src(size, depth); - declare.in(src, WARMUP_RNG); + if (depth == CV_8U) + cv::randu(src, 0, 254); + else + declare.in(src, WARMUP_RNG); if (PERF_RUN_GPU()) {
diff --git a/modules/gpu/perf/perf_denoising.cpp b/modules/gpu/perf/perf_denoising.cpp index 6f03994bd..970122568 100644 --- a/modules/gpu/perf/perf_denoising.cpp +++ b/modules/gpu/perf/perf_denoising.cpp @@ -2,6 +2,7 @@ using namespace std; using namespace testing; +using namespace perf; #define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::sz720p) @@ -63,7 +64,7 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans, Values(21), Values(5))) { - declare.time(60.0); + declare.time(600.0); const cv::Size size = GET_PARAM(0); const int depth = GET_PARAM(1);
diff --git a/modules/gpu/perf/perf_features2d.cpp b/modules/gpu/perf/perf_features2d.cpp index 480f58238..e5a6ef3c8 100644 --- a/modules/gpu/perf/perf_features2d.cpp +++ b/modules/gpu/perf/perf_features2d.cpp @@ -2,105 +2,7 @@ using namespace std; using namespace testing; - -struct KeypointIdxCompare - { - std::vector<cv::KeyPoint>* keypoints; - - explicit KeypointIdxCompare(std::vector<cv::KeyPoint>* _keypoints) : keypoints(_keypoints) {} - - bool operator ()(size_t i1, size_t i2) const - { - cv::KeyPoint kp1 = (*keypoints)[i1]; - cv::KeyPoint kp2 = (*keypoints)[i2]; - if (kp1.pt.x != kp2.pt.x) - return kp1.pt.x < kp2.pt.x; - if (kp1.pt.y != kp2.pt.y) - return kp1.pt.y < kp2.pt.y; - if (kp1.response != kp2.response) - return kp1.response < kp2.response; - return kp1.octave < kp2.octave; - } -}; - -static void sortKeyPoints(std::vector<cv::KeyPoint>& keypoints, cv::InputOutputArray _descriptors = cv::noArray()) -{ - std::vector<size_t> indexies(keypoints.size()); - for (size_t i = 0; i < indexies.size(); ++i) - indexies[i] = i; - - std::sort(indexies.begin(), indexies.end(), KeypointIdxCompare(&keypoints)); - - std::vector<cv::KeyPoint> new_keypoints; - cv::Mat new_descriptors; - - new_keypoints.resize(keypoints.size()); - - cv::Mat descriptors; - if (_descriptors.needed()) - { - descriptors = _descriptors.getMat(); - new_descriptors.create(descriptors.size(), descriptors.type()); - } - - for (size_t i = 0; i < indexies.size(); ++i) - { - size_t new_idx = indexies[i]; - new_keypoints[i] = keypoints[new_idx]; - if (!new_descriptors.empty()) - descriptors.row((int) new_idx).copyTo(new_descriptors.row((int) i)); - } - - keypoints.swap(new_keypoints); - if (_descriptors.needed()) - new_descriptors.copyTo(_descriptors); -} - -////////////////////////////////////////////////////////////////////// -// SURF - -DEF_PARAM_TEST_1(Image, string); - -PERF_TEST_P(Image, Features2D_SURF, - Values("gpu/perf/aloe.png")) -{ - declare.time(50.0); - - const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE); - ASSERT_FALSE(img.empty()); - - if (PERF_RUN_GPU()) - { - cv::gpu::SURF_GPU d_surf; - - const cv::gpu::GpuMat d_img(img); - cv::gpu::GpuMat d_keypoints, d_descriptors; - - TEST_CYCLE() d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors); - - std::vector<cv::KeyPoint> gpu_keypoints; - d_surf.downloadKeypoints(d_keypoints, gpu_keypoints); - - cv::Mat gpu_descriptors(d_descriptors); - - sortKeyPoints(gpu_keypoints, gpu_descriptors); - - SANITY_CHECK_KEYPOINTS(gpu_keypoints); - SANITY_CHECK(gpu_descriptors, 1e-3); - } - else - { - cv::SURF surf; - - std::vector<cv::KeyPoint> cpu_keypoints; - cv::Mat cpu_descriptors; - - TEST_CYCLE() surf(img, cv::noArray(), cpu_keypoints, cpu_descriptors); - - SANITY_CHECK_KEYPOINTS(cpu_keypoints); - SANITY_CHECK(cpu_descriptors); - } -} +using namespace perf; ////////////////////////////////////////////////////////////////////// // FAST @@ -153,6 +55,8 @@ PERF_TEST_P(Image_NFeatures, Features2D_ORB, Combine(Values("gpu/perf/aloe.png"), Values(4000))) { + declare.time(300.0); + const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE); ASSERT_FALSE(img.empty());
diff --git a/modules/gpu/perf/perf_filters.cpp b/modules/gpu/perf/perf_filters.cpp index 3516954a6..a343d1057 100644 --- a/modules/gpu/perf/perf_filters.cpp +++ b/modules/gpu/perf/perf_filters.cpp @@ -2,6 +2,7 @@ using namespace std; using namespace testing; +using namespace perf; ////////////////////////////////////////////////////////////////////// // Blur
diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index be6eb4877..84cb0e200 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -632,7 +632,7 @@ DEF_PARAM_TEST_1(Image, string); PERF_TEST_P(Image, ImgProc_MeanShiftFiltering, Values("gpu/meanshift/cones.png")) { - declare.time(15.0); + declare.time(300.0); const cv::Mat img = readImage(GetParam()); ASSERT_FALSE(img.empty()); @@ -668,7 +668,7 @@ PERF_TEST_P(Image, ImgProc_MeanShiftFiltering, PERF_TEST_P(Image, ImgProc_MeanShiftProc, Values("gpu/meanshift/cones.png")) { - declare.time(5.0); + declare.time(300.0); const cv::Mat img = readImage(GetParam()); ASSERT_FALSE(img.empty()); @@ -702,7 +702,7 @@ PERF_TEST_P(Image, ImgProc_MeanShiftProc,
PERF_TEST_P(Image, ImgProc_MeanShiftSegmentation, Values("gpu/meanshift/cones.png")) { - declare.time(5.0); + declare.time(300.0); const cv::Mat img = readImage(GetParam()); ASSERT_FALSE(img.empty()); @@ -830,6 +830,8 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, ImgProc_MatchTemplate8U, GPU_CHANNELS_1_3_4, ALL_TEMPLATE_METHODS)) { + declare.time(300.0); + const cv::Size size = GET_PARAM(0); const cv::Size templ_size = GET_PARAM(1); const int cn = GET_PARAM(2); @@ -868,6 +870,8 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, ImgProc_MatchTemplate32F, GPU_CHANNELS_1_3_4, Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR)))) { + declare.time(300.0); + const cv::Size size = GET_PARAM(0); const cv::Size templ_size = GET_PARAM(1); const int cn = GET_PARAM(2); @@ -1034,7 +1038,7 @@ PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, ImgProc_CornerHarris, TEST_CYCLE() cv::gpu::cornerHarris(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, k, borderMode); - GPU_SANITY_CHECK(dst); + GPU_SANITY_CHECK(dst, 1e-4); } else { @@ -1077,7 +1081,7 @@ PERF_TEST_P(Image_Type_Border_BlockSz_ApertureSz, ImgProc_CornerMinEigenVal, TEST_CYCLE() cv::gpu::cornerMinEigenVal(d_img, dst, d_Dx, d_Dy, d_buf, blockSize, apertureSize, borderMode); - GPU_SANITY_CHECK(dst); + GPU_SANITY_CHECK(dst, 1e-4); } else { @@ -1341,7 +1345,12 @@ PERF_TEST_P(Sz_Depth_Code, ImgProc_CvtColorBayer, Values(CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR), CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR), CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR), - CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR)))) + CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR), + + CvtColorInfo(1, 1, cv::COLOR_BayerBG2GRAY), + CvtColorInfo(1, 1, cv::COLOR_BayerGB2GRAY), + CvtColorInfo(1, 1, cv::COLOR_BayerRG2GRAY), + CvtColorInfo(1, 1, cv::COLOR_BayerGR2GRAY)))) { const cv::Size size = GET_PARAM(0); const int depth = GET_PARAM(1); @@ -1369,6 +1378,50 @@ PERF_TEST_P(Sz_Depth_Code, ImgProc_CvtColorBayer, } } +CV_ENUM(DemosaicingCode, + cv::COLOR_BayerBG2BGR, cv::COLOR_BayerGB2BGR, cv::COLOR_BayerRG2BGR, cv::COLOR_BayerGR2BGR, + cv::COLOR_BayerBG2GRAY, cv::COLOR_BayerGB2GRAY, cv::COLOR_BayerRG2GRAY, cv::COLOR_BayerGR2GRAY, + cv::gpu::COLOR_BayerBG2BGR_MHT, cv::gpu::COLOR_BayerGB2BGR_MHT, cv::gpu::COLOR_BayerRG2BGR_MHT, cv::gpu::COLOR_BayerGR2BGR_MHT, + cv::gpu::COLOR_BayerBG2GRAY_MHT, cv::gpu::COLOR_BayerGB2GRAY_MHT, cv::gpu::COLOR_BayerRG2GRAY_MHT, cv::gpu::COLOR_BayerGR2GRAY_MHT) + +DEF_PARAM_TEST(Sz_Code, cv::Size, DemosaicingCode); + +PERF_TEST_P(Sz_Code, ImgProc_Demosaicing, + Combine(GPU_TYPICAL_MAT_SIZES, + ValuesIn(DemosaicingCode::all()))) +{ + const cv::Size size = GET_PARAM(0); + const int code = GET_PARAM(1); + + cv::Mat src(size, CV_8UC1); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::demosaicing(d_src, dst, code); + + GPU_SANITY_CHECK(dst); + } + else + { + if (code >= cv::COLOR_COLORCVT_MAX) + { + FAIL_NO_CPU(); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::cvtColor(src, dst, code); + + CPU_SANITY_CHECK(dst); + } + } +} + ////////////////////////////////////////////////////////////////////// // SwapChannels diff --git a/modules/gpu/perf/perf_labeling.cpp b/modules/gpu/perf/perf_labeling.cpp index f3ad12c94..cbc9ff0a2 100644 --- a/modules/gpu/perf/perf_labeling.cpp +++ b/modules/gpu/perf/perf_labeling.cpp @@ -2,6 +2,7 @@ using namespace std; using namespace testing; +using namespace perf; DEF_PARAM_TEST_1(Image, string); diff --git a/modules/gpu/perf/perf_main.cpp 
b/modules/gpu/perf/perf_main.cpp index 312b74448..07c1b519c 100644 --- a/modules/gpu/perf/perf_main.cpp +++ b/modules/gpu/perf/perf_main.cpp @@ -1,70 +1,5 @@ #include "perf_precomp.hpp" -static void printOsInfo() -{ -#if defined _WIN32 -# if defined _WIN64 - printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"), fflush(stdout); -# else - printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"), fflush(stdout); -# endif -#elif defined linux -# if defined _LP64 - printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"), fflush(stdout); -# else - printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"), fflush(stdout); -# endif -#elif defined __APPLE__ -# if defined _LP64 - printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"), fflush(stdout); -# else - printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"), fflush(stdout); -# endif -#endif - -} - -static void printCudaInfo() -{ - printOsInfo(); -#ifndef HAVE_CUDA - printf("[----------]\n[ GPU INFO ] \tOpenCV was built without CUDA support.\n[----------]\n"), fflush(stdout); -#else - int driver; - cudaDriverGetVersion(&driver); - - printf("[----------]\n"), fflush(stdout); - printf("[ GPU INFO ] \tCUDA Driver version: %d.\n", driver), fflush(stdout); - printf("[ GPU INFO ] \tCUDA Runtime version: %d.\n", CUDART_VERSION), fflush(stdout); - printf("[----------]\n"), fflush(stdout); - - printf("[----------]\n"), fflush(stdout); - printf("[ GPU INFO ] \tGPU module was compiled for the following GPU archs.\n"), fflush(stdout); - printf("[ BIN ] \t%s.\n", CUDA_ARCH_BIN), fflush(stdout); - printf("[ PTX ] \t%s.\n", CUDA_ARCH_PTX), fflush(stdout); - printf("[----------]\n"), fflush(stdout); - - printf("[----------]\n"), fflush(stdout); - int deviceCount = cv::gpu::getCudaEnabledDeviceCount(); - printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount), fflush(stdout); - printf("[----------]\n"), fflush(stdout); - - for (int i = 0; i < deviceCount; ++i) - { - cv::gpu::DeviceInfo info(i); - - printf("[----------]\n"), fflush(stdout); - printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()), fflush(stdout); - printf("[ ] \tCompute capability: %d.%d\n", (int)info.majorVersion(), (int)info.minorVersion()), fflush(stdout); - printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()), fflush(stdout); - printf("[ ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)), fflush(stdout); - printf("[ ] \tFree memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0)), fflush(stdout); - if (!info.isCompatible()) - printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n"); - printf("[----------]\n"), fflush(stdout); - } - -#endif -} +using namespace perf; CV_PERF_TEST_MAIN(gpu, printCudaInfo())
diff --git a/modules/gpu/perf/perf_matop.cpp b/modules/gpu/perf/perf_matop.cpp index 1696e3a7e..f2803f0f2 100644 --- a/modules/gpu/perf/perf_matop.cpp +++ b/modules/gpu/perf/perf_matop.cpp @@ -2,6 +2,7 @@ using namespace std; using namespace testing; +using namespace perf; ////////////////////////////////////////////////////////////////////// // SetTo
diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index 969ac1076..4f8e56853 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -2,6 +2,7 @@ using namespace std; using namespace testing; +using namespace perf; /////////////////////////////////////////////////////////////// // HOG @@ -18,6 +19,8 @@ PERF_TEST_P(Image, ObjDetect_HOG, "gpu/caltech/image_00000527_0.png", "gpu/caltech/image_00000574_0.png")) { + declare.time(300.0); + const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE); ASSERT_FALSE(img.empty());
diff --git a/modules/gpu/perf/perf_precomp.hpp b/modules/gpu/perf/perf_precomp.hpp index 71fe9e7d0..322cac094 100644 --- a/modules/gpu/perf/perf_precomp.hpp +++ b/modules/gpu/perf/perf_precomp.hpp @@ -19,6 +19,7 @@ #endif #include "opencv2/ts.hpp" +#include "opencv2/ts/gpu_perf.hpp" #include "opencv2/core.hpp" #include "opencv2/highgui.hpp" @@ -26,12 +27,9 @@ #include "opencv2/calib3d.hpp" #include "opencv2/imgproc.hpp" #include "opencv2/video.hpp" -#include "opencv2/nonfree.hpp" #include "opencv2/legacy.hpp" #include "opencv2/photo.hpp" -#include "utility.hpp" - #ifdef GTEST_CREATE_SHARED_LIBRARY #error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined #endif
diff --git a/modules/gpu/perf/perf_video.cpp b/modules/gpu/perf/perf_video.cpp index 61c2687ca..b998ff95f 100644 --- a/modules/gpu/perf/perf_video.cpp +++ b/modules/gpu/perf/perf_video.cpp @@ -4,6 +4,18 @@ using namespace std; using namespace testing; using namespace perf; +#if defined(HAVE_XINE) || \ + defined(HAVE_GSTREAMER) || \ + defined(HAVE_QUICKTIME) || \ + defined(HAVE_AVFOUNDATION) || \ + defined(HAVE_FFMPEG) || \ + defined(WIN32) /* assume that we have ffmpeg */ + +# define BUILD_WITH_VIDEO_INPUT_SUPPORT 1 +#else +# define BUILD_WITH_VIDEO_INPUT_SUPPORT 0 +#endif + namespace cv { template<> void Ptr<CvBGStatModel>::delete_obj() @@ -142,7 +154,7 @@ PERF_TEST_P(Image_MinDistance, Video_GoodFeaturesToTrack, PERF_TEST_P(ImagePair, Video_BroxOpticalFlow, Values(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png"))) { - declare.time(10); + declare.time(300); cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); ASSERT_FALSE(frame0.empty()); @@ -372,8 +384,8 @@ PERF_TEST_P(ImagePair, Video_OpticalFlowDual_TVL1, TEST_CYCLE() d_alg(d_frame0, d_frame1, u, v); - GPU_SANITY_CHECK(u, 1e-4); - GPU_SANITY_CHECK(v, 1e-4); + GPU_SANITY_CHECK(u, 1e-2); + GPU_SANITY_CHECK(v, 1e-2); } else { @@ -482,6 +494,8 @@ PERF_TEST_P(ImagePair, Video_FastOpticalFlowBM, ////////////////////////////////////////////////////// // FGDStatModel +#if BUILD_WITH_VIDEO_INPUT_SUPPORT + DEF_PARAM_TEST_1(Video, string); PERF_TEST_P(Video, Video_FGDStatModel, @@ -548,9 +562,13 @@ PERF_TEST_P(Video, Video_FGDStatModel, } } +#endif + ////////////////////////////////////////////////////// // MOG +#if BUILD_WITH_VIDEO_INPUT_SUPPORT + DEF_PARAM_TEST(Video_Cn_LearningRate, string, MatCn, double); PERF_TEST_P(Video_Cn_LearningRate, Video_MOG, @@ -643,9 +661,13 @@ PERF_TEST_P(Video_Cn_LearningRate, Video_MOG, } } +#endif + ////////////////////////////////////////////////////// // MOG2 +#if BUILD_WITH_VIDEO_INPUT_SUPPORT + DEF_PARAM_TEST(Video_Cn, string, int); PERF_TEST_P(Video_Cn, Video_MOG2, @@ -740,9 +762,13 @@ PERF_TEST_P(Video_Cn, Video_MOG2, } } +#endif + ////////////////////////////////////////////////////// // MOG2GetBackgroundImage +#if BUILD_WITH_VIDEO_INPUT_SUPPORT + PERF_TEST_P(Video_Cn, Video_MOG2GetBackgroundImage, Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"), GPU_CHANNELS_1_3_4)) @@ -818,74 +844,13 @@ PERF_TEST_P(Video_Cn, Video_MOG2GetBackgroundImage, } } -////////////////////////////////////////////////////// -// VIBE - -PERF_TEST_P(Video_Cn, Video_VIBE, - 
Combine(Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi"), - GPU_CHANNELS_1_3_4)) -{ - const string inputFile = perf::TestBase::getDataPath(GET_PARAM(0)); - const int cn = GET_PARAM(1); - - cv::VideoCapture cap(inputFile); - ASSERT_TRUE(cap.isOpened()); - - cv::Mat frame; - cap >> frame; - ASSERT_FALSE(frame.empty()); - - if (cn != 3) - { - cv::Mat temp; - if (cn == 1) - cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY); - else - cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA); - cv::swap(temp, frame); - } - - if (PERF_RUN_GPU()) - { - cv::gpu::GpuMat d_frame(frame); - cv::gpu::VIBE_GPU vibe; - cv::gpu::GpuMat foreground; - - vibe(d_frame, foreground); - - for (int i = 0; i < 10; ++i) - { - cap >> frame; - ASSERT_FALSE(frame.empty()); - - if (cn != 3) - { - cv::Mat temp; - if (cn == 1) - cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY); - else - cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA); - cv::swap(temp, frame); - } - - d_frame.upload(frame); - - startTimer(); next(); - vibe(d_frame, foreground); - stopTimer(); - } - - GPU_SANITY_CHECK(foreground); - } - else - { - FAIL_NO_CPU(); - } -} +#endif ////////////////////////////////////////////////////// // GMG +#if BUILD_WITH_VIDEO_INPUT_SUPPORT + DEF_PARAM_TEST(Video_Cn_MaxFeatures, string, MatCn, int); PERF_TEST_P(Video_Cn_MaxFeatures, Video_GMG, @@ -993,11 +958,13 @@ PERF_TEST_P(Video_Cn_MaxFeatures, Video_GMG, } } -#ifdef HAVE_NVCUVID +#endif ////////////////////////////////////////////////////// // VideoReader +#if defined(HAVE_NVCUVID) && BUILD_WITH_VIDEO_INPUT_SUPPORT + PERF_TEST_P(Video, Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi")) { declare.time(20); @@ -1028,10 +995,12 @@ PERF_TEST_P(Video, Video_VideoReader, Values("gpu/video/768x576.avi", "gpu/video } } +#endif + ////////////////////////////////////////////////////// // VideoWriter -#ifdef WIN32 +#if defined(HAVE_NVCUVID) && defined(WIN32) PERF_TEST_P(Video, Video_VideoWriter, Values("gpu/video/768x576.avi", "gpu/video/1920x1080.avi")) { @@ -1089,6 +1058,4 @@ PERF_TEST_P(Video, Video_VideoWriter, Values("gpu/video/768x576.avi", "gpu/video SANITY_CHECK(frame); } -#endif // WIN32 - -#endif // HAVE_NVCUVID +#endif diff --git a/modules/gpu/perf/utility.cpp b/modules/gpu/perf/utility.cpp deleted file mode 100644 index 16c61e0c7..000000000 --- a/modules/gpu/perf/utility.cpp +++ /dev/null @@ -1,184 +0,0 @@ -#include "perf_precomp.hpp" - -using namespace std; -using namespace cv; - -Mat readImage(const string& fileName, int flags) -{ - return imread(perf::TestBase::getDataPath(fileName), flags); -} - -void PrintTo(const CvtColorInfo& info, ostream* os) -{ - static const char* str[] = - { - "BGR2BGRA", - "BGRA2BGR", - "BGR2RGBA", - "RGBA2BGR", - "BGR2RGB", - "BGRA2RGBA", - - "BGR2GRAY", - "RGB2GRAY", - "GRAY2BGR", - "GRAY2BGRA", - "BGRA2GRAY", - "RGBA2GRAY", - - "BGR2BGR565", - "RGB2BGR565", - "BGR5652BGR", - "BGR5652RGB", - "BGRA2BGR565", - "RGBA2BGR565", - "BGR5652BGRA", - "BGR5652RGBA", - - "GRAY2BGR565", - "BGR5652GRAY", - - "BGR2BGR555", - "RGB2BGR555", - "BGR5552BGR", - "BGR5552RGB", - "BGRA2BGR555", - "RGBA2BGR555", - "BGR5552BGRA", - "BGR5552RGBA", - - "GRAY2BGR555", - "BGR5552GRAY", - - "BGR2XYZ", - "RGB2XYZ", - "XYZ2BGR", - "XYZ2RGB", - - "BGR2YCrCb", - "RGB2YCrCb", - "YCrCb2BGR", - "YCrCb2RGB", - - "BGR2HSV", - "RGB2HSV", - - "", - "", - - "BGR2Lab", - "RGB2Lab", - - "BayerBG2BGR", - "BayerGB2BGR", - "BayerRG2BGR", - "BayerGR2BGR", - - "BGR2Luv", - "RGB2Luv", - - "BGR2HLS", - "RGB2HLS", - - "HSV2BGR", - "HSV2RGB", - - "Lab2BGR", - "Lab2RGB", - 
"Luv2BGR", - "Luv2RGB", - - "HLS2BGR", - "HLS2RGB", - - "BayerBG2BGR_VNG", - "BayerGB2BGR_VNG", - "BayerRG2BGR_VNG", - "BayerGR2BGR_VNG", - - "BGR2HSV_FULL", - "RGB2HSV_FULL", - "BGR2HLS_FULL", - "RGB2HLS_FULL", - - "HSV2BGR_FULL", - "HSV2RGB_FULL", - "HLS2BGR_FULL", - "HLS2RGB_FULL", - - "LBGR2Lab", - "LRGB2Lab", - "LBGR2Luv", - "LRGB2Luv", - - "Lab2LBGR", - "Lab2LRGB", - "Luv2LBGR", - "Luv2LRGB", - - "BGR2YUV", - "RGB2YUV", - "YUV2BGR", - "YUV2RGB", - - "BayerBG2GRAY", - "BayerGB2GRAY", - "BayerRG2GRAY", - "BayerGR2GRAY", - - //YUV 4:2:0 formats family - "YUV2RGB_NV12", - "YUV2BGR_NV12", - "YUV2RGB_NV21", - "YUV2BGR_NV21", - - "YUV2RGBA_NV12", - "YUV2BGRA_NV12", - "YUV2RGBA_NV21", - "YUV2BGRA_NV21", - - "YUV2RGB_YV12", - "YUV2BGR_YV12", - "YUV2RGB_IYUV", - "YUV2BGR_IYUV", - - "YUV2RGBA_YV12", - "YUV2BGRA_YV12", - "YUV2RGBA_IYUV", - "YUV2BGRA_IYUV", - - "YUV2GRAY_420", - - //YUV 4:2:2 formats family - "YUV2RGB_UYVY", - "YUV2BGR_UYVY", - "YUV2RGB_VYUY", - "YUV2BGR_VYUY", - - "YUV2RGBA_UYVY", - "YUV2BGRA_UYVY", - "YUV2RGBA_VYUY", - "YUV2BGRA_VYUY", - - "YUV2RGB_YUY2", - "YUV2BGR_YUY2", - "YUV2RGB_YVYU", - "YUV2BGR_YVYU", - - "YUV2RGBA_YUY2", - "YUV2BGRA_YUY2", - "YUV2RGBA_YVYU", - "YUV2BGRA_YVYU", - - "YUV2GRAY_UYVY", - "YUV2GRAY_YUY2", - - // alpha premultiplication - "RGBA2mRGBA", - "mRGBA2RGBA", - - "COLORCVT_MAX" - }; - - *os << str[info.code]; -} diff --git a/modules/gpu/perf/utility.hpp b/modules/gpu/perf/utility.hpp deleted file mode 100644 index 18c85854a..000000000 --- a/modules/gpu/perf/utility.hpp +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__ -#define __OPENCV_PERF_GPU_UTILITY_HPP__ - -#include "opencv2/core.hpp" -#include "opencv2/imgproc.hpp" -#include "opencv2/ts/ts_perf.hpp" - -cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR); - -using perf::MatType; -using perf::MatDepth; - -CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP) -#define ALL_BORDER_MODES testing::ValuesIn(BorderMode::all()) - -CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA) -#define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all()) - -CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING, cv::NORM_MINMAX) - -enum { Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4 }; -CV_ENUM(MatCn, Gray, TwoChannel, BGR, BGRA) -#define GPU_CHANNELS_1_3_4 testing::Values(MatCn(Gray), MatCn(BGR), MatCn(BGRA)) -#define GPU_CHANNELS_1_3 testing::Values(MatCn(Gray), MatCn(BGR)) - -struct CvtColorInfo -{ - int scn; - int dcn; - int code; - - CvtColorInfo() {} - explicit CvtColorInfo(int scn_, int dcn_, int code_) : scn(scn_), dcn(dcn_), code(code_) {} -}; -void PrintTo(const CvtColorInfo& info, std::ostream* os); - -#define GET_PARAM(k) std::tr1::get< k >(GetParam()) - -#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name -#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name - -DEF_PARAM_TEST_1(Sz, cv::Size); -typedef perf::Size_MatType Sz_Type; -DEF_PARAM_TEST(Sz_Depth, cv::Size, MatDepth); -DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, MatCn); - -#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::sz720p, perf::szSXGA, perf::sz1080p) - -#define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy" - -#define GPU_SANITY_CHECK(mat, ...) 
\ - do{ \ - cv::Mat gpu_##mat(mat); \ - SANITY_CHECK(gpu_##mat, ## __VA_ARGS__); \ - } while(0) - -#define CPU_SANITY_CHECK(mat, ...) \ - do{ \ - cv::Mat cpu_##mat(mat); \ - SANITY_CHECK(cpu_##mat, ## __VA_ARGS__); \ - } while(0) - -#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
diff --git a/modules/gpu/perf4au/main.cpp b/modules/gpu/perf4au/main.cpp index f4a04432b..162a15b2f 100644 --- a/modules/gpu/perf4au/main.cpp +++ b/modules/gpu/perf4au/main.cpp @@ -8,69 +8,19 @@ #include "opencv2/video.hpp" #include "opencv2/legacy.hpp" #include "opencv2/ts.hpp" - -static void printOsInfo() -{ -#if defined _WIN32 -# if defined _WIN64 - printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"); fflush(stdout); -# else - printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"); fflush(stdout); -# endif -#elif defined linux -# if defined _LP64 - printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"); fflush(stdout); -# else - printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"); fflush(stdout); -# endif -#elif defined __APPLE__ -# if defined _LP64 - printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"); fflush(stdout); -# else - printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"); fflush(stdout); -# endif -#endif -} - -static void printCudaInfo() -{ - const int deviceCount = cv::gpu::getCudaEnabledDeviceCount(); - - printf("[----------]\n"); fflush(stdout); - printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount); fflush(stdout); - printf("[----------]\n"); fflush(stdout); - - for (int i = 0; i < deviceCount; ++i) - { - cv::gpu::DeviceInfo info(i); - - printf("[----------]\n"); fflush(stdout); - printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()); fflush(stdout); - printf("[ ] \tCompute capability: %d.%d\n", info.majorVersion(), info.minorVersion()); fflush(stdout); - printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()); fflush(stdout); - printf("[ ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)); fflush(stdout); - printf("[ ] \tFree memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0)); fflush(stdout); - if (!info.isCompatible()) - printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n"); - printf("[----------]\n"); fflush(stdout); - } -} +#include "opencv2/ts/gpu_perf.hpp" int main(int argc, char* argv[]) { - printOsInfo(); - printCudaInfo(); + perf::printCudaInfo(); - perf::Regression::Init("nv_perf_test"); + perf::Regression::Init("gpu_perf4au"); perf::TestBase::Init(argc, argv); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } -#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name -#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name - ////////////////////////////////////////////////////////// // HoughLinesP
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 851ac938e..47a29881f 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -318,40 +318,14 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream) void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s) { - class LevelsInit - { - public: - Npp32s pLevels[256]; - const Npp32s* pLevels3[3]; - int nValues3[3]; + const int cn = src.channels(); -#if (CUDA_VERSION > 4020) - GpuMat d_pLevels; -#endif + CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 ); + CV_Assert( lut.depth() == CV_8U ); + CV_Assert( lut.channels() == 1 || lut.channels() == cn ); + CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() ); - LevelsInit() - { - nValues3[0] = nValues3[1] = nValues3[2] = 256; - for (int i = 0; i < 256; ++i) - pLevels[i] = i; - - -#if (CUDA_VERSION <= 4020) - pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels; -#else - d_pLevels.upload(Mat(1, 256, CV_32S, pLevels)); - pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>(); -#endif - } - }; - static LevelsInit lvls; - - int cn = src.channels(); - - CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3); - CV_Assert(lut.depth() == CV_8U && (lut.channels() == 1 || lut.channels() == cn) && lut.rows * lut.cols == 256 && lut.isContinuous()); - - dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn)); + dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn)); NppiSize sz; sz.height = src.rows; @@ -360,19 +334,34 @@ Mat nppLut; lut.convertTo(nppLut, CV_32S); - cudaStream_t stream = StreamAccessor::getStream(s); + int nValues3[] = {256, 256, 256}; + Npp32s pLevels[256]; + for (int i = 0; i < 256; ++i) + pLevels[i] = i; + + const Npp32s* pLevels3[3]; + +#if (CUDA_VERSION <= 4020) + pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels; +#else + GpuMat d_pLevels; + d_pLevels.upload(Mat(1, 256, CV_32S, pLevels)); + pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>(); +#endif + + cudaStream_t stream = StreamAccessor::getStream(s); NppStreamHandler h(stream); if (src.type() == CV_8UC1) { #if (CUDA_VERSION <= 4020) nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) ); + dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) ); #else GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data)); nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), lvls.d_pLevels.ptr<Npp32s>(), 256) ); + dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) ); #endif } else @@ -409,7 +398,7 @@ } nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step), - dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, lvls.pLevels3, lvls.nValues3) ); + dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) ); } if (stream == 0)
diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp index 05d460900..76793d520 100644 --- a/modules/gpu/src/color.cpp +++ b/modules/gpu/src/color.cpp @@ -48,6 +48,7 @@ using namespace cv::gpu; #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::demosaicing(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); } void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_nogpu(); } void cv::gpu::gammaCorrection(const GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); } @@ -62,6 +63,9 @@ namespace cv { namespace gpu { void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); template <int cn> void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); + + template <int cn> + void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream); } }} @@ -1620,26 +1624,56 @@ namespace funcs[src.depth()][dcn - 1](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream)); } - void bayerBG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream) { bayer_to_bgr(src, dst, dcn, false, false, stream); } - void bayerGB_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream) { bayer_to_bgr(src, dst, dcn, false, true, stream); } - void bayerRG_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream) { bayer_to_bgr(src, dst, dcn, true, false, stream); } - void bayerGR_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream) { bayer_to_bgr(src, dst, dcn, true, true, stream); } + + void bayer_to_gray(const GpuMat& src, GpuMat& dst, bool blue_last, bool start_with_green, Stream& stream) + { + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); + static const func_t funcs[3] = + { + Bayer2BGR_8u_gpu<1>, + 0, + Bayer2BGR_16u_gpu<1>, + }; + + CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1); + CV_Assert(src.rows > 2 && src.cols > 2); + + dst.create(src.size(), CV_MAKETYPE(src.depth(), 1)); + + funcs[src.depth()](src, dst, blue_last, start_with_green, StreamAccessor::getStream(stream)); + } + void bayerBG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream) + { + bayer_to_gray(src, dst, false, false, stream); + } + void bayerGB_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream) + { + bayer_to_gray(src, dst, false, true, stream); + } + void bayerRG_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream) + { + bayer_to_gray(src, dst, true, false, stream); + } + void bayerGR_to_gray(const GpuMat& src, GpuMat& dst, int /*dcn*/, Stream& stream) + { + bayer_to_gray(src, dst, true, true, stream); + } } void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream) @@ -1756,10 +1790,10 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream yuv_to_bgr, // CV_YUV2BGR = 84 yuv_to_rgb, // CV_YUV2RGB = 85 - 0, // CV_BayerBG2GRAY = 86 - 0, // CV_BayerGB2GRAY = 87 - 0, // CV_BayerRG2GRAY = 88 - 0, // CV_BayerGR2GRAY = 89 + bayerBG_to_gray, // CV_BayerBG2GRAY = 86 + bayerGB_to_gray, // CV_BayerGB2GRAY = 87 + bayerRG_to_gray, // CV_BayerRG2GRAY = 88 + bayerGR_to_gray, // CV_BayerGR2GRAY = 89 //YUV 4:2:0 formats family 0, // CV_YUV2RGB_NV12 = 90, @@ -1825,6 +1859,74 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream func(src, dst, dcn, stream); } +void cv::gpu::demosaicing(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream) +{ + const int depth = src.depth(); + + CV_Assert( src.channels() == 1 ); + + switch (code) + { + case 
CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY: + bayer_to_gray(src, dst, code == CV_BayerBG2GRAY || code == CV_BayerGB2GRAY, code == CV_BayerGB2GRAY || code == CV_BayerGR2GRAY, stream); + break; + + case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR: + bayer_to_bgr(src, dst, dcn, code == CV_BayerBG2BGR || code == CV_BayerGB2BGR, code == CV_BayerGB2BGR || code == CV_BayerGR2BGR, stream); + break; + + case COLOR_BayerBG2BGR_MHT: case COLOR_BayerGB2BGR_MHT: case COLOR_BayerRG2BGR_MHT: case COLOR_BayerGR2BGR_MHT: + { + if (dcn <= 0) + dcn = 3; + + CV_Assert( depth == CV_8U ); + CV_Assert( dcn == 3 || dcn == 4 ); + + dst.create(src.size(), CV_MAKETYPE(depth, dcn)); + dst.setTo(Scalar::all(0)); + + Size wholeSize; + Point ofs; + src.locateROI(wholeSize, ofs); + PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step); + + const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1, + code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1); + + if (dcn == 3) + device::MHCdemosaic<3>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream)); + else + device::MHCdemosaic<4>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream)); + + break; + } + + case COLOR_BayerBG2GRAY_MHT: case COLOR_BayerGB2GRAY_MHT: case COLOR_BayerRG2GRAY_MHT: case COLOR_BayerGR2GRAY_MHT: + { + CV_Assert( depth == CV_8U ); + + dst.create(src.size(), CV_MAKETYPE(depth, 1)); + dst.setTo(Scalar::all(0)); + + Size wholeSize; + Point ofs; + src.locateROI(wholeSize, ofs); + PtrStepSzb srcWhole(wholeSize.height, wholeSize.width, src.datastart, src.step); + + const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1, + code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 
0 : 1); + + device::MHCdemosaic<1>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream)); + + break; + } + + default: + CV_Error( CV_StsBadFlag, "Unknown / unsupported color conversion code" ); + } +} + void cv::gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& s) { CV_Assert(image.type() == CV_8UC4); diff --git a/modules/gpu/src/cuda/debayer.cu b/modules/gpu/src/cuda/debayer.cu index 57322ed81..1d2f18e7a 100644 --- a/modules/gpu/src/cuda/debayer.cu +++ b/modules/gpu/src/cuda/debayer.cu @@ -42,42 +42,38 @@ #if !defined CUDA_DISABLER -#include -#include -#include -#include +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/vec_traits.hpp" +#include "opencv2/gpu/device/vec_math.hpp" +#include "opencv2/gpu/device/limits.hpp" +#include "opencv2/gpu/device/color.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" -namespace cv { namespace gpu { - namespace device +namespace cv { namespace gpu { namespace device +{ + template struct Bayer2BGR; + + template <> struct Bayer2BGR { - template - __global__ void Bayer2BGR_8u(const PtrStepb src, PtrStepSz dst, const bool blue_last, const bool start_with_green) + uchar3 res0; + uchar3 res1; + uchar3 res2; + uchar3 res3; + + __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green) { - const int s_x = blockIdx.x * blockDim.x + threadIdx.x; - int s_y = blockIdx.y * blockDim.y + threadIdx.y; - - if (s_y >= dst.rows || (s_x << 2) >= dst.cols) - return; - - s_y = ::min(::max(s_y, 1), dst.rows - 2); - uchar4 patch[3][3]; patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x]; patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)]; - patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)]; + patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)]; patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x]; patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)]; - patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)]; + patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)]; patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x]; patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)]; - patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((dst.cols + 3) >> 2) - 1)]; - - D res0 = VecTraits::all(numeric_limits::max()); - D res1 = VecTraits::all(numeric_limits::max()); - D res2 = VecTraits::all(numeric_limits::max()); - D res3 = VecTraits::all(numeric_limits::max()); + patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)]; if ((s_y & 1) ^ start_with_green) { @@ -181,45 +177,69 @@ namespace cv { namespace gpu { res3.z = t7; } } - - const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2; - const int d_y = blockIdx.y * blockDim.y + threadIdx.y; - - dst(d_y, d_x) = res0; - if (d_x + 1 < dst.cols) - dst(d_y, d_x + 1) = res1; - if (d_x + 2 < dst.cols) - dst(d_y, d_x + 2) = res2; - if (d_x + 3 < dst.cols) - dst(d_y, d_x + 3) = res3; } + }; - template - __global__ void Bayer2BGR_16u(const PtrStepb src, PtrStepSz dst, const bool blue_last, const bool start_with_green) + template __device__ __forceinline__ D toDst(const uchar3& pix); + template <> __device__ __forceinline__ uchar toDst(const uchar3& pix) + { + typename bgr_to_gray_traits::functor_type f = bgr_to_gray_traits::create_functor(); + return f(pix); + } + template <> __device__ 
__forceinline__ uchar3 toDst(const uchar3& pix) + { + return pix; + } + template <> __device__ __forceinline__ uchar4 toDst(const uchar3& pix) + { + return make_uchar4(pix.x, pix.y, pix.z, 255); + } + + template + __global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep dst, const bool blue_last, const bool start_with_green) + { + const int s_x = blockIdx.x * blockDim.x + threadIdx.x; + int s_y = blockIdx.y * blockDim.y + threadIdx.y; + + if (s_y >= src.rows || (s_x << 2) >= src.cols) + return; + + s_y = ::min(::max(s_y, 1), src.rows - 2); + + Bayer2BGR bayer; + bayer.apply(src, s_x, s_y, blue_last, start_with_green); + + const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2; + const int d_y = blockIdx.y * blockDim.y + threadIdx.y; + + dst(d_y, d_x) = toDst(bayer.res0); + if (d_x + 1 < src.cols) + dst(d_y, d_x + 1) = toDst(bayer.res1); + if (d_x + 2 < src.cols) + dst(d_y, d_x + 2) = toDst(bayer.res2); + if (d_x + 3 < src.cols) + dst(d_y, d_x + 3) = toDst(bayer.res3); + } + + template <> struct Bayer2BGR + { + ushort3 res0; + ushort3 res1; + + __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green) { - const int s_x = blockIdx.x * blockDim.x + threadIdx.x; - int s_y = blockIdx.y * blockDim.y + threadIdx.y; - - if (s_y >= dst.rows || (s_x << 1) >= dst.cols) - return; - - s_y = ::min(::max(s_y, 1), dst.rows - 2); - ushort2 patch[3][3]; patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x]; patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)]; - patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)]; + patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)]; patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x]; patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)]; - patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)]; + patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)]; patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x]; patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)]; - patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((dst.cols + 1) >> 1) - 1)]; - - D res0 = VecTraits::all(numeric_limits::max()); - D res1 = VecTraits::all(numeric_limits::max()); + patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)]; if ((s_y & 1) ^ start_with_green) { @@ -279,53 +299,246 @@ namespace cv { namespace gpu { res1.z = t3; } } - - const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1; - const int d_y = blockIdx.y * blockDim.y + threadIdx.y; - - dst(d_y, d_x) = res0; - if (d_x + 1 < dst.cols) - dst(d_y, d_x + 1) = res1; } + }; - template - void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream) - { - typedef typename TypeVec::vec_type dst_t; - - const dim3 block(32, 8); - const dim3 grid(divUp(dst.cols, 4 * block.x), divUp(dst.rows, block.y)); - - cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u, cudaFuncCachePreferL1) ); - - Bayer2BGR_8u<<>>(src, (PtrStepSz)dst, blue_last, start_with_green); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - template - void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream) - { - typedef typename TypeVec::vec_type dst_t; - - const dim3 block(32, 8); - const dim3 grid(divUp(dst.cols, 2 * 
block.x), divUp(dst.rows, block.y)); - - cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u, cudaFuncCachePreferL1) ); - - Bayer2BGR_16u<<>>(src, (PtrStepSz)dst, blue_last, start_with_green); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - - template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); - template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); - template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); - template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); + template __device__ __forceinline__ D toDst(const ushort3& pix); + template <> __device__ __forceinline__ ushort toDst(const ushort3& pix) + { + typename bgr_to_gray_traits::functor_type f = bgr_to_gray_traits::create_functor(); + return f(pix); + } + template <> __device__ __forceinline__ ushort3 toDst(const ushort3& pix) + { + return pix; + } + template <> __device__ __forceinline__ ushort4 toDst(const ushort3& pix) + { + return make_ushort4(pix.x, pix.y, pix.z, numeric_limits::max()); } -}} -#endif /* CUDA_DISABLER */ \ No newline at end of file + template + __global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep dst, const bool blue_last, const bool start_with_green) + { + const int s_x = blockIdx.x * blockDim.x + threadIdx.x; + int s_y = blockIdx.y * blockDim.y + threadIdx.y; + + if (s_y >= src.rows || (s_x << 1) >= src.cols) + return; + + s_y = ::min(::max(s_y, 1), src.rows - 2); + + Bayer2BGR bayer; + bayer.apply(src, s_x, s_y, blue_last, start_with_green); + + const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1; + const int d_y = blockIdx.y * blockDim.y + threadIdx.y; + + dst(d_y, d_x) = toDst(bayer.res0); + if (d_x + 1 < src.cols) + dst(d_y, d_x + 1) = toDst(bayer.res1); + } + + template + void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream) + { + typedef typename TypeVec::vec_type dst_t; + + const dim3 block(32, 8); + const dim3 grid(divUp(src.cols, 4 * block.x), divUp(src.rows, block.y)); + + cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u, cudaFuncCachePreferL1) ); + + Bayer2BGR_8u<<>>(src, (PtrStepSz)dst, blue_last, start_with_green); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template + void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream) + { + typedef typename TypeVec::vec_type dst_t; + + const dim3 block(32, 8); + const dim3 grid(divUp(src.cols, 2 * block.x), divUp(src.rows, block.y)); + + cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u, cudaFuncCachePreferL1) ); + + Bayer2BGR_16u<<>>(src, (PtrStepSz)dst, blue_last, start_with_green); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + + template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); + template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); + template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); + + template void 
+
+        //////////////////////////////////////////////////////////////
+        // Bayer Demosaicing (Malvar, He, and Cutler)
+        //
+        // by Morgan McGuire, Williams College
+        // http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
+        //
+        // ported to CUDA
+
+        texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
+
+        template <typename DstType>
+        __global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
+        {
+            const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/;
+            const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ;
+            const float kCx = 4.0f / 8.0f, kCy = 6.0f / 8.0f, kCz = 5.0f / 8.0f /*kCw = 5.0f / 8.0f*/;
+            const float /*kDx = 0.0f / 8.0f,*/ kDy = 2.0f / 8.0f, kDz = -1.0f / 8.0f /*kDw = -1.0f / 8.0f*/;
+            const float kEx = -1.0f / 8.0f, kEy = -1.5f / 8.0f, /*kEz = -1.0f / 8.0f,*/ kEw = 0.5f / 8.0f ;
+            const float kFx = 2.0f / 8.0f, /*kFy = 0.0f / 8.0f,*/ kFz = 4.0f / 8.0f /*kFw = 0.0f / 8.0f*/;
+
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1)
+                return;
+
+            int2 center;
+            center.x = x + sourceOffset.x;
+            center.y = y + sourceOffset.y;
+
+            int4 xCoord;
+            xCoord.x = center.x - 2;
+            xCoord.y = center.x - 1;
+            xCoord.z = center.x + 1;
+            xCoord.w = center.x + 2;
+
+            int4 yCoord;
+            yCoord.x = center.y - 2;
+            yCoord.y = center.y - 1;
+            yCoord.z = center.y + 1;
+            yCoord.w = center.y + 2;
+
+            float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
+
+            float4 Dvec;
+            Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
+            Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
+            Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
+            Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
+
+            float4 value;
+            value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
+            value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
+            value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
+            value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
+
+            // (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
+            value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
+            value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
+            value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
+            value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
+
+            float4 PATTERN;
+            PATTERN.x = kCx * C;
+            PATTERN.y = kCy * C;
+            PATTERN.z = kCz * C;
+            PATTERN.w = PATTERN.z;
+
+            float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w;
+
+            // There are five filter patterns (identity, cross, checker,
+            // theta, phi). Precompute the terms from all of them and then
+            // use swizzles to assign to color channels.
+            //
+            // Channel Matches
+            //   x     cross   (e.g., EE G)
+            //   y     checker (e.g., EE B)
+            //   z     theta   (e.g., EO R)
+            //   w     phi     (e.g., EO B)
+
+            #define A value.x // A0 + A1
+            #define B value.y // B0 + B1
+            #define E value.z // E0 + E1
+            #define F value.w // F0 + F1
+
+            float3 temp;
+
+            // PATTERN.yzw += (kD.yz * D).xyy;
+            temp.x = kDy * D;
+            temp.y = kDz * D;
+            PATTERN.y += temp.x;
+            PATTERN.z += temp.y;
+            PATTERN.w += temp.y;
+
+            // PATTERN += (kA.xyz * A).xyzx;
+            temp.x = kAx * A;
+            temp.y = kAy * A;
+            temp.z = kAz * A;
+            PATTERN.x += temp.x;
+            PATTERN.y += temp.y;
+            PATTERN.z += temp.z;
+            PATTERN.w += temp.x;
+
+            // PATTERN += (kE.xyw * E).xyxz;
+            temp.x = kEx * E;
+            temp.y = kEy * E;
+            temp.z = kEw * E;
+            PATTERN.x += temp.x;
+            PATTERN.y += temp.y;
+            PATTERN.z += temp.x;
+            PATTERN.w += temp.z;
+
+            // PATTERN.xw += kB.xw * B;
+            PATTERN.x += kBx * B;
+            PATTERN.w += kBw * B;
+
+            // PATTERN.xz += kF.xz * F;
+            PATTERN.x += kFx * F;
+            PATTERN.z += kFz * F;
+
+            // Determine which of four types of pixels we are on.
+            int2 alternate;
+            alternate.x = (x + firstRed.x) % 2;
+            alternate.y = (y + firstRed.y) % 2;
+
+            // in BGR sequence;
+            uchar3 pixelColor =
+                (alternate.y == 0) ?
+                    ((alternate.x == 0) ?
+                        make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
+                        make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
+                    ((alternate.x == 0) ?
+                        make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
+                        make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));
+
+            dst(y, x) = toDst<DstType>(pixelColor);
+        }
+
+        template <int cn>
+        void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
+        {
+            typedef typename TypeVec<uchar, cn>::vec_type dst_t;
+
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+
+            bindTexture(&sourceTex, src);
+
+            MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+        template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+        template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
+}}}
+
+#endif /* CUDA_DISABLER */
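
[editor's note] The new MHCdemosaic path samples the raw Bayer plane through sourceTex (point filtering, clamped addressing, so the 5x5 taps never read outside the image) and rebuilds the two missing channels per pixel from the fixed 1/8-denominator filter bank; the kernel leaves the one-pixel border unwritten. A hypothetical host-side call; the GpuMat plumbing below is illustrative only and not part of this patch, and it assumes the launcher is reachable as cv::gpu::device::MHCdemosaic:

    // Illustrative only: demosaic an 8-bit Bayer plane to BGR on `stream`.
    void demosaicMHC(const cv::gpu::GpuMat& bayer, cv::gpu::GpuMat& bgr, cudaStream_t stream)
    {
        bgr.create(bayer.size(), CV_8UC3);
        const int2 sourceOffset = make_int2(0, 0); // ROI offset into the source plane
        const int2 firstRed     = make_int2(0, 0); // position of the first red sample in the pattern
        cv::gpu::device::MHCdemosaic<3>(bayer, sourceOffset, bgr, firstRed, stream);
    }
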
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index 5165b352a..e9397e534 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -48,6 +48,7 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/simd_functions.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -154,170 +155,28 @@ namespace arithm
 
 namespace arithm
 {
-    template <typename T, typename D> struct VAdd4;
-    template <> struct VAdd4<uint, uint> : binary_function<uint, uint, uint>
+    struct VAdd4 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vadd4(a, b);
         }
 
         __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
-    };
-    template <> struct VAdd4<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
-    };
-    template <> struct VAdd4<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
-    };
-    template <> struct VAdd4<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
+        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
     };
 
     ////////////////////////////////////
 
-    template <typename T, typename D> struct VAdd2;
-    template <> struct VAdd2<uint, uint> : binary_function<uint, uint, uint>
+    struct VAdd2 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vadd2(a, b);
         }
 
         __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
-    };
-    template <> struct VAdd2<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
-    };
-    template <> struct VAdd2<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
-    };
-    template <> struct VAdd2<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
+        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
     };
 
     ////////////////////////////////////
@@ -336,13 +195,13 @@ namespace arithm
 
 namespace cv { namespace gpu { namespace device
 {
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
 
    ////////////////////////////////////
 
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
@@ -355,28 +214,16 @@ namespace cv { namespace gpu { namespace device
 
 namespace arithm
 {
-    template <typename T, typename D>
-    void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd4<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
    }
 
-    template void vadd4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd2<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
     }
 
-    template void vadd2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
     template <typename T, typename D>
     void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
     {
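
[editor's note] The hunk above collapses four per-signature VAdd4/VAdd2 specializations, each with hand-repeated inline PTX, into a single functor per operation over packed 32-bit words; the PTX moves behind reusable intrinsics in simd_functions.hpp, and addMat_v4/addMat_v2 now take PtrStepSz<uint> directly instead of reinterpret-casting inside the wrapper. Roughly what such an intrinsic looks like, as a sketch assembled from the deleted lines rather than the verbatim header (the real simd_functions.hpp also carries fallbacks for devices without the vadd4/vadd instructions):

    // Per-byte saturating add on a packed 32-bit word.
    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;
    #if __CUDA_ARCH__ >= 300
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #endif
        return r;
    }
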
@@ -543,170 +390,28 @@ namespace arithm
 
 namespace arithm
 {
-    template <typename T, typename D> struct VSub4;
-    template <> struct VSub4<uint, uint> : binary_function<uint, uint, uint>
+    struct VSub4 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vsub4(a, b);
         }
 
         __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
-    };
-    template <> struct VSub4<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
-    };
-    template <> struct VSub4<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
-    };
-    template <> struct VSub4<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4& other) {}
+        __device__ __forceinline__ VSub4(const VSub4& other) {}
     };
 
     ////////////////////////////////////
 
-    template <typename T, typename D> struct VSub2;
-    template <> struct VSub2<uint, uint> : binary_function<uint, uint, uint>
+    struct VSub2 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vsub2(a, b);
         }
 
         __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
-    };
-    template <> struct VSub2<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
-    };
-    template <> struct VSub2<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
-    };
-    template <> struct VSub2<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2& other) {}
+        __device__ __forceinline__ VSub2(const VSub2& other) {}
     };
 
     ////////////////////////////////////
@@ -725,13 +430,13 @@ namespace arithm
 
 namespace cv { namespace gpu { namespace device
 {
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
 
     ////////////////////////////////////
 
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
@@ -744,28 +449,16 @@ namespace cv { namespace gpu { namespace device
 
 namespace arithm
 {
-    template <typename T, typename D>
-    void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub4<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
     }
 
-    template void vsub4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub2<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
     }
 
-    template void vsub2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
     template <typename T, typename D>
     void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
     {
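
[editor's note] subMat_v4/subMat_v2 follow the same pattern as addMat_v4/_v2: the word-wise kernels only make sense when every row of all three matrices starts 4-byte aligned and spans a whole number of 32-bit words. An illustrative host-side guard; the actual selection logic lives in the host wrappers (element_operations.cpp) and may differ:

    // Hypothetical check before routing to a *_v4 path.
    static bool canUseV4(const PtrStepSzb& a, const PtrStepSzb& b, const PtrStepSzb& d)
    {
        const size_t word = sizeof(unsigned int);
        return reinterpret_cast<size_t>(a.data) % word == 0 && a.step % word == 0
            && reinterpret_cast<size_t>(b.data) % word == 0 && b.step % word == 0
            && reinterpret_cast<size_t>(d.data) % word == 0 && d.step % word == 0
            && a.cols % word == 0; // cols is in bytes for PtrStepSzb
    }
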
@@ -1496,90 +1189,28 @@ namespace arithm
 
 namespace arithm
 {
-    template <typename T> struct VAbsDiff4;
-    template <> struct VAbsDiff4<uint> : binary_function<uint, uint, uint>
+    struct VAbsDiff4 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vabsdiff4(a, b);
         }
 
         __device__ __forceinline__ VAbsDiff4() {}
-        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
-    };
-    template <> struct VAbsDiff4<int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAbsDiff4() {}
-        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
+        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
     };
 
     ////////////////////////////////////
 
-    template <typename T> struct VAbsDiff2;
-    template <> struct VAbsDiff2<uint> : binary_function<uint, uint, uint>
+    struct VAbsDiff2 : binary_function<uint, uint, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vabsdiff2(a, b);
         }
 
         __device__ __forceinline__ VAbsDiff2() {}
-        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
-    };
-    template <> struct VAbsDiff2<int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAbsDiff2() {}
-        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
+        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
     };
 
     ////////////////////////////////////
@@ -1611,13 +1242,13 @@ namespace arithm
 
 namespace cv { namespace gpu { namespace device
 {
-    template <typename T> struct TransformFunctorTraits< arithm::VAbsDiff4<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
 
    ////////////////////////////////////
 
-    template <typename T> struct TransformFunctorTraits< arithm::VAbsDiff2<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
     {
     };
@@ -1630,24 +1261,16 @@ namespace cv { namespace gpu { namespace device
 
 namespace arithm
 {
-    template <typename T>
-    void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
     {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff4<T>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
     }
 
-    template void vabsDiff4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vabsDiff4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T>
-    void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff2<T>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
     }
 
-    template void vabsDiff2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vabsDiff2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
     template <typename T>
     void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
@@ -1877,6 +1500,49 @@ namespace arithm
 
 namespace arithm
 {
+    struct VCmpEq4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmpeq4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpEq4() {}
+        __device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
+    };
+    struct VCmpNe4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmpne4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpNe4() {}
+        __device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
+    };
+    struct VCmpLt4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmplt4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpLt4() {}
+        __device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
+    };
+    struct VCmpLe4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmple4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpLe4() {}
+        __device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
+    };
+
+    ////////////////////////////////////
+
     template <class Op, typename T>
     struct Cmp : binary_function<T, T, uchar>
     {
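
[editor's note] The vcmpXX4 primitives return a per-byte mask: 0xff in every byte lane where the predicate holds, 0x00 elsewhere, which matches OpenCV's 8-bit "true == 255" convention, so the packed result can be stored straight into the destination. A host-side reference emulation of vcmpeq4, for intuition only (the device code reaches the PTX compare instructions via simd_functions.hpp):

    static inline unsigned int vcmpeq4_ref(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;
        for (int i = 0; i < 4; ++i)
        {
            const unsigned int byteA = (a >> (8 * i)) & 0xffu;
            const unsigned int byteB = (b >> (8 * i)) & 0xffu;
            if (byteA == byteB)
                r |= 0xffu << (8 * i); // 255 marks "true"
        }
        return r;
    }
    // e.g. vcmpeq4_ref(0x10ff3041, 0x10003042) == 0xff00ff00
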
@@ -1890,6 +1556,21 @@ namespace cv { namespace gpu { namespace device
 {
+    template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+    template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+    template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+    template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+
+    ////////////////////////////////////
+
     template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
     {
     };
@@ -1897,6 +1578,23 @@ namespace cv { namespace gpu { namespace device
 
 namespace arithm
 {
+    void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
+    }
+    void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
+    }
+    void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
+    }
+    void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
+    }
+
     template