modified according to CUDA 4.0 API updates

Vladislav Vinogradov
2011-05-31 08:31:10 +00:00
parent 98d663e7e0
commit 926a6bba00
40 changed files with 1134 additions and 1818 deletions
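
Across every file below the pattern is the same: each pair of a synchronous declaration and a separate "async version" overload taking a trailing const Stream& is collapsed into a single declaration whose last parameter defaults to Stream::Null(). A minimal sketch of what call sites look like after the change, using cv::gpu::add from the header diff below; the function name sketch, the variable names, and the assumption that Stream::waitForCompletion() is available in this version of the header are illustrative, not part of the commit:

    #include <opencv2/gpu/gpu.hpp>

    void sketch(const cv::gpu::GpuMat& a, const cv::gpu::GpuMat& b, cv::gpu::GpuMat& c)
    {
        // Old API (removed by this commit): two overloads per operation.
        //   cv::gpu::add(a, b, c);            // blocking call
        //   cv::gpu::add(a, b, c, stream);    // separate async overload, const Stream&

        // New API: one signature; omitting the argument selects Stream::Null()
        // and keeps the old synchronous behaviour.
        cv::gpu::add(a, b, c);

        cv::gpu::Stream stream;
        cv::gpu::add(a, b, c, stream);   // work is enqueued on the user stream
        stream.waitForCompletion();      // assumed synchronization point on gpu::Stream
    }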


@@ -1,236 +0,0 @@
set(name "gpu")
set(the_target "opencv_${name}")
project(${the_target})
set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/src/cuda"
"${CMAKE_CURRENT_SOURCE_DIR}/src"
"${CMAKE_CURRENT_BINARY_DIR}")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_int_hdrs "src/*.h*")
file(GLOB lib_cuda "src/cuda/*.cu*")
file(GLOB lib_cuda_hdrs "src/cuda/*.h*")
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
file(GLOB lib_hdrs "include/opencv2/${name}/*.h*")
source_group("Include" FILES ${lib_hdrs})
#file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.h*")
file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h")
source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
include_directories("src/nvidia/core" "src/nvidia/NPP_staging")
endif()
if (HAVE_CUDA)
get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp})
find_package(NPP 3.2.16 REQUIRED)
message(STATUS "NPP detected: " ${NPP_VERSION})
include_directories(${CUDA_INCLUDE_DIRS} ${CUDA_NPP_INCLUDES})
if (UNIX OR APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" "-fPIC")
endif()
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
if(MSVC)
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
endif()
if (OPENCV_BUILD_SHARED_LIB)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS")
endif()
CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
foreach(d ${DEPS})
if(${d} MATCHES "opencv_")
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endforeach()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_target}_pch "src/precomp.cpp")
endif()
add_native_precompiled_header(${the_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_target} ${pch_header})
endif()
endif()
# For dynamic link numbering conventions
set_target_properties(${the_target} PROPERTIES
VERSION ${OPENCV_VERSION}
SOVERSION ${OPENCV_SOVERSION}
OUTPUT_NAME "${the_target}${OPENCV_DLLVERSION}"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "modules")
endif()
if (OPENCV_BUILD_SHARED_LIB)
if (MSVC)
set_target_properties(${the_target} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS)
else()
add_definitions(-DCVAPI_EXPORTS)
endif()
endif()
# Additional target properties
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib/"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib"
)
# Add the required libraries for linking:
target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES})
CUDA_ADD_CUFFT_TO_TARGET(${the_target})
endif()
if(MSVC)
if(CMAKE_CROSSCOMPILING)
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:secchk")
endif()
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:libc")
endif()
# Dependencies of this target:
add_dependencies(${the_target} ${DEPS})
install(TARGETS ${the_target}
RUNTIME DESTINATION bin COMPONENT main
LIBRARY DESTINATION lib COMPONENT main
ARCHIVE DESTINATION lib COMPONENT main)
install(FILES ${lib_hdrs}
DESTINATION include/opencv2/${name}
COMPONENT main)
install(FILES src/nvidia/NPP_staging/NPP_staging.hpp src/nvidia/core/NCV.hpp
DESTINATION include/opencv2/${name}
COMPONENT main)
#install(FILES ${lib_device_hdrs}
# DESTINATION include/opencv2/${name}/device
# COMPONENT main)
################################################################################################################
################################ GPU Module Tests #####################################################
################################################################################################################
# Test file processing is kept in a separate directory to avoid 'Src' source
# filter creation in Visual Studio
if(BUILD_TESTS AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/test)
set(the_test_target "opencv_test_${name}")
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/test"
"${CMAKE_CURRENT_BINARY_DIR}")
set(test_deps opencv_${name} opencv_ts opencv_highgui opencv_calib3d ${DEPS})
foreach(d ${test_deps})
if(${d} MATCHES "opencv_")
if(${d} MATCHES "opencv_lapack")
else()
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endif()
endforeach()
file(GLOB test_srcs "test/*.cpp")
file(GLOB test_hdrs "test/*.h*")
source_group("Src" FILES ${test_hdrs} ${test_srcs})
if(HAVE_CUDA)
include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/NPP_staging)
file(GLOB nvidia "test/nvidia/*.cpp" "test/nvidia/*.h*")
source_group("Src\\NVidia" FILES ${nvidia})
endif()
add_executable(${the_test_target} ${test_srcs} ${test_hdrs} ${nvidia})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/test/test_precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_test_target}_pch "test/test_precomp.cpp")
endif()
add_native_precompiled_header(${the_test_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_test_target} ${pch_header})
endif()
endif()
# Additional target properties
set_target_properties(${the_test_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_test_target} PROPERTIES FOLDER "tests")
endif()
add_dependencies(${the_test_target} ${test_deps})
# Add the required libraries for linking:
target_link_libraries(${the_test_target} ${OPENCV_LINKER_LIBS} ${test_deps})
enable_testing()
get_target_property(LOC ${the_test_target} LOCATION)
add_test(${the_test_target} "${LOC}")
if(WIN32)
install(TARGETS ${the_test_target} RUNTIME DESTINATION bin COMPONENT main)
endif()
endif()


@@ -1,240 +0,0 @@
set(name "gpu")
set(the_target "opencv_${name}")
project(${the_target})
set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/src/cuda"
"${CMAKE_CURRENT_SOURCE_DIR}/src"
"${CMAKE_CURRENT_BINARY_DIR}")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_int_hdrs "src/*.h*")
file(GLOB lib_cuda "src/cuda/*.cu*")
file(GLOB lib_cuda_hdrs "src/cuda/*.h*")
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
file(GLOB lib_hdrs "include/opencv2/${name}/*.h*")
source_group("Include" FILES ${lib_hdrs})
#file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.h*")
file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h")
source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
include_directories("src/nvidia/core" "src/nvidia/NPP_staging")
endif()
if (HAVE_CUDA)
#get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH)
#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp})
#find_package(NPP 3.2.16 REQUIRED)
#message(STATUS "NPP detected: " ${NPP_VERSION})
include_directories(${CUDA_INCLUDE_DIRS})
if (UNIX OR APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" "-fPIC")
endif()
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
if(MSVC)
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
endif()
if (OPENCV_BUILD_SHARED_LIB)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS")
endif()
CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
foreach(d ${DEPS})
if(${d} MATCHES "opencv_")
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endforeach()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_target}_pch "src/precomp.cpp")
endif()
add_native_precompiled_header(${the_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_target} ${pch_header})
endif()
endif()
# For dynamic link numbering conventions
set_target_properties(${the_target} PROPERTIES
VERSION ${OPENCV_VERSION}
SOVERSION ${OPENCV_SOVERSION}
OUTPUT_NAME "${the_target}${OPENCV_DLLVERSION}"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "modules")
endif()
if (OPENCV_BUILD_SHARED_LIB)
if (MSVC)
set_target_properties(${the_target} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS)
else()
add_definitions(-DCVAPI_EXPORTS)
endif()
endif()
# Additional target properties
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib/"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib"
)
# Add the required libraries for linking:
target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES})
CUDA_ADD_CUFFT_TO_TARGET(${the_target})
unset(CUDA_npp_LIBRARY CACHE)
find_cuda_helper_libs(npp)
target_link_libraries(${the_target} ${CUDA_npp_LIBRARY})
endif()
if(MSVC)
if(CMAKE_CROSSCOMPILING)
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:secchk")
endif()
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:libc")
endif()
# Dependencies of this target:
add_dependencies(${the_target} ${DEPS})
install(TARGETS ${the_target}
RUNTIME DESTINATION bin COMPONENT main
LIBRARY DESTINATION lib COMPONENT main
ARCHIVE DESTINATION lib COMPONENT main)
install(FILES ${lib_hdrs}
DESTINATION include/opencv2/${name}
COMPONENT main)
install(FILES src/nvidia/NPP_staging/NPP_staging.hpp src/nvidia/core/NCV.hpp
DESTINATION include/opencv2/${name}
COMPONENT main)
#install(FILES ${lib_device_hdrs}
# DESTINATION include/opencv2/${name}/device
# COMPONENT main)
################################################################################################################
################################ GPU Module Tests #####################################################
################################################################################################################
# Test file processing is kept in a separate directory to avoid 'Src' source
# filter creation in Visual Studio
if(BUILD_TESTS AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/test)
set(the_test_target "opencv_test_${name}")
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/test"
"${CMAKE_CURRENT_BINARY_DIR}")
set(test_deps opencv_${name} opencv_ts opencv_highgui opencv_calib3d ${DEPS})
foreach(d ${test_deps})
if(${d} MATCHES "opencv_")
if(${d} MATCHES "opencv_lapack")
else()
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endif()
endforeach()
file(GLOB test_srcs "test/*.cpp")
file(GLOB test_hdrs "test/*.h*")
source_group("Src" FILES ${test_hdrs} ${test_srcs})
if(HAVE_CUDA)
include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/NPP_staging)
file(GLOB nvidia "test/nvidia/*.cpp" "test/nvidia/*.h*")
source_group("Src\\NVidia" FILES ${nvidia})
endif()
add_executable(${the_test_target} ${test_srcs} ${test_hdrs} ${nvidia})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/test/test_precomp.hpp)
if(${CMAKE_GENERATOR} MATCHES "Visual*" OR ${CMAKE_GENERATOR} MATCHES "Xcode*")
if(${CMAKE_GENERATOR} MATCHES "Visual*")
set(${the_test_target}_pch "test/test_precomp.cpp")
endif()
add_native_precompiled_header(${the_test_target} ${pch_header})
elseif(CMAKE_COMPILER_IS_GNUCXX AND ${CMAKE_GENERATOR} MATCHES ".*Makefiles")
add_precompiled_header(${the_test_target} ${pch_header})
endif()
endif()
# Additional target properties
set_target_properties(${the_test_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/"
)
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_test_target} PROPERTIES FOLDER "tests")
endif()
add_dependencies(${the_test_target} ${test_deps})
# Add the required libraries for linking:
target_link_libraries(${the_test_target} ${OPENCV_LINKER_LIBS} ${test_deps})
enable_testing()
get_target_property(LOC ${the_test_target} LOCATION)
add_test(${the_test_target} "${LOC}")
if(WIN32)
install(TARGETS ${the_test_target} RUNTIME DESTINATION bin COMPONENT main)
endif()
endif()


@@ -1,125 +0,0 @@
###############################################################################
#
# FindNPP.cmake
#
# CUDA_NPP_LIBRARY_ROOT_DIR -- Path to the NPP directory.
# CUDA_NPP_INCLUDES -- NPP Include directories.
# CUDA_NPP_LIBRARIES -- NPP libraries.
# NPP_VERSION -- NPP version in format "major.minor.build".
#
# If not found automatically, please set CUDA_NPP_LIBRARY_ROOT_DIR
# in CMake or set the environment variable $CUDA_NPP_ROOT
#
# Author: Anatoly Baksheev, Itseez Ltd.
#
# The MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
cmake_policy(PUSH)
cmake_minimum_required(VERSION 2.8.0)
cmake_policy(POP)
if(NOT "${CUDA_NPP_LIBRARY_ROOT_DIR}" STREQUAL "${CUDA_NPP_LIBRARY_ROOT_DIR_INTERNAL}")
unset(CUDA_NPP_INCLUDES CACHE)
unset(CUDA_NPP_LIBRARIES CACHE)
endif()
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
if (UNIX OR APPLE)
set(NPP_SUFFIX "32")
else()
set(NPP_SUFFIX "-mt")
endif()
else(CMAKE_SIZEOF_VOID_P EQUAL 4)
if (UNIX OR APPLE)
set(NPP_SUFFIX "64")
else()
set(NPP_SUFFIX "-mt-x64")
endif()
endif(CMAKE_SIZEOF_VOID_P EQUAL 4)
if(NOT CUDA_NPP_LIBRARY_ROOT_DIR OR CUDA_NPP_LIBRARY_ROOT_DIR STREQUAL "")
unset(CUDA_NPP_LIBRARY_ROOT_DIR CACHE)
find_path(CUDA_NPP_LIBRARY_ROOT_DIR "common/npp/include/npp.h" PATHS ENV CUDA_NPP_ROOT DOC "NPP root directory.")
MESSAGE(STATUS "NPP root directory: " ${CUDA_NPP_LIBRARY_ROOT_DIR})
endif()
# Search includes in our own paths.
find_path(CUDA_NPP_INCLUDES npp.h PATHS "${CUDA_NPP_LIBRARY_ROOT_DIR}/common/npp/include")
# Search default search paths, after we search our own set of paths.
find_path(CUDA_NPP_INCLUDES device_functions.h)
mark_as_advanced(CUDA_NPP_INCLUDES)
# Find NPP library
find_library(CUDA_NPP_LIBRARIES
NAMES "npp" "npp${NPP_SUFFIX}" "libnpp${NPP_SUFFIX}"
PATHS "${CUDA_NPP_LIBRARY_ROOT_DIR}"
PATH_SUFFIXES "common/lib" "common/npp/lib"
DOC "NPP library"
)
# Search default search paths, after we search our own set of paths.
find_library(CUDA_NPP_LIBRARIES NAMES npp${NPP_SUFFIX} libnpp${NPP_SUFFIX} DOC "NPP library")
mark_as_advanced(CUDA_NPP_LIBRARIES)
if(EXISTS ${CUDA_NPP_INCLUDES}/nppversion.h)
file( STRINGS ${CUDA_NPP_INCLUDES}/nppversion.h npp_major REGEX "#define NPP_VERSION_MAJOR.*")
file( STRINGS ${CUDA_NPP_INCLUDES}/nppversion.h npp_minor REGEX "#define NPP_VERSION_MINOR.*")
file( STRINGS ${CUDA_NPP_INCLUDES}/nppversion.h npp_build REGEX "#define NPP_VERSION_BUILD.*")
string( REGEX REPLACE "#define NPP_VERSION_MAJOR[ \t]+|//.*" "" npp_major ${npp_major})
string( REGEX REPLACE "#define NPP_VERSION_MINOR[ \t]+|//.*" "" npp_minor ${npp_minor})
string( REGEX REPLACE "#define NPP_VERSION_BUILD[ \t]+|//.*" "" npp_build ${npp_build})
string( REGEX MATCH "[0-9]+" npp_major ${npp_major} )
string( REGEX MATCH "[0-9]+" npp_minor ${npp_minor} )
string( REGEX MATCH "[0-9]+" npp_build ${npp_build} )
set( NPP_VERSION "${npp_major}.${npp_minor}.${npp_build}")
endif()
if(NOT EXISTS ${CUDA_NPP_LIBRARIES} OR NOT EXISTS ${CUDA_NPP_INCLUDES}/npp.h)
set(CUDA_NPP_FOUND FALSE)
message(FATAL_ERROR "NPP headers/libraries are not found. Please specify CUDA_NPP_LIBRARY_ROOT_DIR in CMake or set $CUDA_NPP_ROOT.")
endif()
include( FindPackageHandleStandardArgs )
find_package_handle_standard_args( NPP
REQUIRED_VARS
CUDA_NPP_INCLUDES
CUDA_NPP_LIBRARIES
#Need cmake 2.8.3 to uncomment this.
#VERSION_VAR
NPP_VERSION)
if(APPLE)
# We need to add the path to the NPP library to the linker using rpath, since the library names for the CUDA libraries are prepended with @rpath.
get_filename_component(_cuda_path_to_npp "${CUDA_NPP_LIBRARIES}" PATH)
if(_cuda_path_to_npp)
list(APPEND CUDA_NPP_LIBRARIES "-Wl,-rpath,${_cuda_path_to_npp}")
endif()
endif()
set(CUDA_NPP_FOUND TRUE)
set(CUDA_NPP_LIBRARY_ROOT_DIR_INTERNAL "${CUDA_NPP_LIBRARY_ROOT_DIR}" CACHE INTERNAL "This is the value of the last time CUDA_NPP_LIBRARY_ROOT_DIR was set successfully." FORCE)


@@ -447,12 +447,21 @@ namespace cv
 // converts matrix type, ex from float to uchar depending on type
 void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);
+static Stream& Null();
+operator bool() const;
 private:
 void create();
 void release();
 struct Impl;
 Impl *impl;
 friend struct StreamAccessor;
+explicit Stream(Impl* impl);
 };
@@ -460,168 +469,130 @@ namespace cv
 //! transposes the matrix
 //! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc)
-CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst);
+CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null());
 //! reverses the order of the rows, columns or both in a matrix
 //! supports CV_8UC1, CV_8UC4 types
-CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode);
+CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null());
 //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
 //! destination array will have the depth type as lut and the same channels number as source
 //! supports CV_8UC1, CV_8UC3 types
-CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst);
+CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null());
 //! makes multi-channel array out of several single-channel arrays
-CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst);
+CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null());
 //! makes multi-channel array out of several single-channel arrays
-CV_EXPORTS void merge(const vector<GpuMat>& src, GpuMat& dst);
+CV_EXPORTS void merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null());
-//! makes multi-channel array out of several single-channel arrays (async version)
-CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, const Stream& stream);
-//! makes multi-channel array out of several single-channel arrays (async version)
-CV_EXPORTS void merge(const vector<GpuMat>& src, GpuMat& dst, const Stream& stream);
 //! copies each plane of a multi-channel array to a dedicated array
-CV_EXPORTS void split(const GpuMat& src, GpuMat* dst);
+CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null());
 //! copies each plane of a multi-channel array to a dedicated array
-CV_EXPORTS void split(const GpuMat& src, vector<GpuMat>& dst);
+CV_EXPORTS void split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream = Stream::Null());
-//! copies each plane of a multi-channel array to a dedicated array (async version)
-CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, const Stream& stream);
-//! copies each plane of a multi-channel array to a dedicated array (async version)
-CV_EXPORTS void split(const GpuMat& src, vector<GpuMat>& dst, const Stream& stream);
 //! computes magnitude of complex (x(i).re, x(i).im) vector
 //! supports only CV_32FC2 type
-CV_EXPORTS void magnitude(const GpuMat& x, GpuMat& magnitude);
+CV_EXPORTS void magnitude(const GpuMat& x, GpuMat& magnitude, Stream& stream = Stream::Null());
 //! computes squared magnitude of complex (x(i).re, x(i).im) vector
 //! supports only CV_32FC2 type
-CV_EXPORTS void magnitudeSqr(const GpuMat& x, GpuMat& magnitude);
+CV_EXPORTS void magnitudeSqr(const GpuMat& x, GpuMat& magnitude, Stream& stream = Stream::Null());
 //! computes magnitude of each (x(i), y(i)) vector
 //! supports only floating-point source
-CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude);
+CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, const Stream& stream);
 //! computes squared magnitude of each (x(i), y(i)) vector
 //! supports only floating-point source
-CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude);
+CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, const Stream& stream);
 //! computes angle (angle(i)) of each (x(i), y(i)) vector
 //! supports only floating-point source
-CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false);
+CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, const Stream& stream);
 //! converts Cartesian coordinates to polar
 //! supports only floating-point source
-CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false);
+CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees, const Stream& stream);
 //! converts polar coordinates to Cartesian
 //! supports only floating-point source
-CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false);
+CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, const Stream& stream);
 //////////////////////////// Per-element operations ////////////////////////////////////
 //! adds one matrix to another (c = a + b)
 //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types
-CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c);
+CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null());
 //! adds scalar to a matrix (c = a + s)
 //! supports CV_32FC1 and CV_32FC2 type
-CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c);
+CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, Stream& stream = Stream::Null());
 //! subtracts one matrix from another (c = a - b)
 //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types
-CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c);
+CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null());
 //! subtracts scalar from a matrix (c = a - s)
 //! supports CV_32FC1 and CV_32FC2 type
-CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c);
+CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, Stream& stream = Stream::Null());
 //! computes element-wise product of the two arrays (c = a * b)
 //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types
-CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c);
+CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null());
 //! multiplies matrix to a scalar (c = a * s)
 //! supports CV_32FC1 and CV_32FC2 type
-CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c);
+CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, Stream& stream = Stream::Null());
 //! computes element-wise quotient of the two arrays (c = a / b)
 //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types
-CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c);
+CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null());
 //! computes element-wise quotient of matrix and scalar (c = a / s)
 //! supports CV_32FC1 and CV_32FC2 type
-CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c);
+CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, Stream& stream = Stream::Null());
 //! computes exponent of each matrix element (b = e**a)
 //! supports only CV_32FC1 type
-CV_EXPORTS void exp(const GpuMat& a, GpuMat& b);
+CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
 //! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
 //! supports only CV_32FC1 type
-CV_EXPORTS void log(const GpuMat& a, GpuMat& b);
+CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
 //! computes element-wise absolute difference of two arrays (c = abs(a - b))
 //! supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32FC1 types
-CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c);
+CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null());
 //! computes element-wise absolute difference of array and scalar (c = abs(a - s))
 //! supports only CV_32FC1 type
-CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c);
+CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null());
 //! compares elements of two arrays (c = a <cmpop> b)
 //! supports CV_8UC4, CV_32FC1 types
-CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop);
+CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
 //! performs per-elements bit-wise inversion
-CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat());
+CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, const Stream& stream);
 //! calculates per-element bit-wise disjunction of two arrays
-CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat());
+CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream);
 //! calculates per-element bit-wise conjunction of two arrays
-CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat());
+CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream);
 //! calculates per-element bit-wise "exclusive or" operation
-CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat());
+CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream);
 //! computes per-element minimum of two arrays (dst = min(src1, src2))
-CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
+CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
-//! Async version
-CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);
 //! computes per-element minimum of array and scalar (dst = min(src1, src2))
-CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst);
+CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
-//! Async version
-CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);
 //! computes per-element maximum of two arrays (dst = max(src1, src2))
-CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
+CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
-//! Async version
-CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);
 //! computes per-element maximum of array and scalar (dst = max(src1, src2))
-CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst);
+CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
-//! Async version
-CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);
 ////////////////////////////// Image processing //////////////////////////////
@@ -645,68 +616,60 @@ namespace cv
 //! Does coloring of disparity image: [0..ndisp) -> [0..240, 1, 1] in HSV.
 //! Supported types of input disparity: CV_8U, CV_16S.
 //! Output disparity has CV_8UC4 type in BGRA format (alpha = 255).
-CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp);
+CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, const Stream& stream);
 //! Reprojects disparity image to 3D space.
 //! Supports CV_8U and CV_16S types of input disparity.
 //! The output is a 4-channel floating-point (CV_32FC4) matrix.
 //! Each element of this matrix will contain the 3D coordinates of the point (x,y,z,1), computed from the disparity map.
 //! Q is the 4x4 perspective transformation matrix that can be obtained with cvStereoRectify.
-CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q);
+CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const Stream& stream);
 //! converts image from one color space to another
-CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0);
+CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, const Stream& stream);
 //! applies fixed threshold to the image
-CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type);
+CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
-//! async version
-CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, const Stream& stream);
 //! resizes the image
 //! Supports INTER_NEAREST, INTER_LINEAR
 //! supports CV_8UC1, CV_8UC4 types
-CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR);
+CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
 //! warps the image using affine transformation
 //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
-CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR);
+CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null());
 //! warps the image using perspective transformation
 //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
-CV_EXPORTS void warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR);
+CV_EXPORTS void warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null());
 //! rotate 8bit single or four channel image
 //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
 //! supports CV_8UC1, CV_8UC4 types
-CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR);
+CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
 //! copies 2D array to a larger destination array and pads borders with user-specifiable constant
 //! supports CV_8UC1, CV_8UC4, CV_32SC1 and CV_32FC1 types
-CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value = Scalar());
+CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value = Scalar(), Stream& stream = Stream::Null());
 //! computes the integral image
 //! sum will have CV_32S type, but will contain unsigned int values
 //! supports only CV_8UC1 source type
-CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum);
+CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null());
 //! buffered version
-CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer);
+CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null());
 //! computes the integral image and integral for the squared image
 //! sum will have CV_32S type, sqsum - CV32F type
 //! supports only CV_8UC1 source type
-CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum);
+CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream = Stream::Null());
 //! computes squared integral image
 //! result matrix will have 64F type, but will contain 64U values
 //! supports source images of 8UC1 type only
-CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum);
+CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null());
 //! computes vertical sum, supports only CV_32FC1 images
 CV_EXPORTS void columnSum(const GpuMat& src, GpuMat& sum);
@@ -714,14 +677,7 @@ namespace cv
 //! computes the standard deviation of integral images
 //! supports only CV_32SC1 source type and CV_32FC1 sqr type
 //! output will have CV_32FC1 type
-CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect);
+CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null());
-// applies Canny edge detector and produces the edge map
-// disabled until fix crash
-//CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
-//CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, GpuMat& buffer, double threshold1, double threshold2, int apertureSize = 3);
-//CV_EXPORTS void Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);
-//CV_EXPORTS void Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, GpuMat& buffer, double threshold1, double threshold2, int apertureSize = 3);
 //! computes Harris cornerness criteria at each image pixel
 CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);
@@ -792,7 +748,7 @@ namespace cv
 //! performs linear blending of two images
 //! to avoid accuracy errors sum of weights shouldn't be very close to zero
-CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
-                            GpuMat& result);
+CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
+                            GpuMat& result, Stream& stream = Stream::Null());
 ////////////////////////////// Matrix reductions //////////////////////////////
@@ -863,17 +819,11 @@ namespace cv
 ///////////////////////////// Calibration 3D //////////////////////////////////
-CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                                GpuMat& dst);
-CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                                GpuMat& dst, const Stream& stream);
+CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
+                                GpuMat& dst, Stream& stream = Stream::Null());
-CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                              const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst);
-CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
-                              const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst,
-                              const Stream& stream);
+CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
+                              const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst,
+                              Stream& stream = Stream::Null());
 CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
                                const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false,
@@ -893,7 +843,7 @@ namespace cv
 public:
 BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
 virtual ~BaseRowFilter_GPU() {}
-virtual void operator()(const GpuMat& src, GpuMat& dst) = 0;
+virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
 int ksize, anchor;
 };
@@ -908,7 +858,7 @@ namespace cv
 public:
 BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {}
 virtual ~BaseColumnFilter_GPU() {}
-virtual void operator()(const GpuMat& src, GpuMat& dst) = 0;
+virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
 int ksize, anchor;
 };
@@ -922,7 +872,7 @@ namespace cv
 public:
 BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {}
 virtual ~BaseFilter_GPU() {}
-virtual void operator()(const GpuMat& src, GpuMat& dst) = 0;
+virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
 Size ksize;
 Point anchor;
 };
@@ -938,7 +888,7 @@ namespace cv
 public:
 virtual ~FilterEngine_GPU() {}
-virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1)) = 0;
+virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
 };
 //! returns the non-separable filter engine with the specified filter
@@ -1027,47 +977,47 @@ namespace cv
 //! smooths the image using the normalized box filter
 //! supports CV_8UC1, CV_8UC4 types
-CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1));
+CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null());
 //! a synonym for normalized box filter
-static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1)) { boxFilter(src, dst, -1, ksize, anchor); }
+static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) { boxFilter(src, dst, -1, ksize, anchor, stream); }
 //! erodes the image (applies the local minimum operator)
-CV_EXPORTS void erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
+CV_EXPORTS void erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
 //! dilates the image (applies the local maximum operator)
-CV_EXPORTS void dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
+CV_EXPORTS void dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
 //! applies an advanced morphological operation to the image
-CV_EXPORTS void morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
+CV_EXPORTS void morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
 //! applies non-separable 2D linear filter to the image
-CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1));
+CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null());
 //! applies separable 2D linear filter to the image
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
+                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 //! applies generalized Sobel operator to the image
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
+                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 //! applies the vertical or horizontal Scharr operator to the image
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
+                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 //! smooths the image using Gaussian filter.
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
+                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 //! applies Laplacian operator to the image
 //! supports only ksize = 1 and ksize = 3
-CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1);
+CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, Stream& stream = Stream::Null());
 //////////////////////////////// Image Labeling ////////////////////////////////
 //! performs labeling via graph cuts
-CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf);
+CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& stream = Stream::Null());
 ////////////////////////////////// Histograms //////////////////////////////////
@@ -1076,23 +1026,23 @@ namespace cv
//! Calculates histogram with evenly distributed bins for single channel source.
//! Supports CV_8UC1, CV_16UC1 and CV_16SC1 source types.
//! Output hist will have one row, histSize cols and CV_32SC1 type.
CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null());

//! Calculates histogram with evenly distributed bins for four-channel source.
//! All channels of source are processed separately.
//! Supports CV_8UC4, CV_16UC4 and CV_16SC4 source types.
//! Output hist[i] will have one row, histSize[i] cols and CV_32SC1 type.
CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null());

//! Calculates histogram with bins determined by levels array.
//! levels must have one row and CV_32SC1 type if source has integer type, or CV_32FC1 otherwise.
//! Supports CV_8UC1, CV_16UC1, CV_16SC1 and CV_32FC1 source types.
//! Output hist will have one row, (levels.cols-1) cols and CV_32SC1 type.
CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream = Stream::Null());

//! Calculates histogram with bins determined by levels array.
//! All levels must have one row and CV_32SC1 type if source has integer type, or CV_32FC1 otherwise.
//! All channels of source are processed separately.
//! Supports CV_8UC4, CV_16UC4, CV_16SC4 and CV_32FC4 source types.
//! Output hist[i] will have one row, (levels[i].cols-1) cols and CV_32SC1 type.
CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null());
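// Usage sketch (illustration only; `img` is a hypothetical CV_8UC1 image):
//
//     GpuMat img, hist;                      // hist becomes 1 x 256, CV_32SC1
//     Stream s;
//     histEven(img, hist, 256, 0, 256, s);   // enqueued on s; omit s to block
//     s.waitForCompletion();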
//////////////////////////////// StereoBM_GPU ////////////////////////////////
@@ -1110,10 +1060,7 @@ namespace cv
//! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair.
//! Output disparity has CV_8U type.
void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());

//! Some heuristics that try to estimate
// whether the current GPU will be faster than the CPU for this algorithm.
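// Usage sketch (illustration only; `left`/`right` are hypothetical rectified inputs):
//
//     StereoBM_GPU bm;
//     GpuMat left, right, disp;
//     Stream s;
//     bm(left, right, disp, s);     // async; call without s for blocking behaviour
//     s.waitForCompletion();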
@@ -1165,15 +1112,11 @@ namespace cv
//! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair;
//! if disparity is empty, the output type will be CV_16S, otherwise disparity.type().
void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());

//! version for user specified data term
void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null());

int ndisp;
@@ -1194,7 +1137,7 @@ namespace cv
/////////////////////////// StereoConstantSpaceBP ///////////////////////////

// "A Constant-Space Belief Propagation Algorithm for Stereo Matching"
// Qingxiong Yang, Liang Wang, Narendra Ahuja
// http://vision.ai.uiuc.edu/~qyang6/

class CV_EXPORTS StereoConstantSpaceBP
@@ -1224,10 +1167,7 @@ namespace cv
//! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair;
//! if disparity is empty, the output type will be CV_16S, otherwise disparity.type().
void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null());

int ndisp;
@@ -1280,10 +1220,7 @@ namespace cv
//! the disparity map refinement operator. Refines a disparity map using joint bilateral filtering, given a single color image.
//! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type.
void operator()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null());

private:
    int ndisp;
@@ -1406,7 +1343,7 @@ namespace cv
// distance.at<float>(0, queryIdx) will contain distance
void matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs,
    GpuMat& trainIdx, GpuMat& distance,
    const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());

// Download trainIdx and distance to CPU vector with DMatch
static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches);
@@ -1425,7 +1362,7 @@ namespace cv
// distance.at<float>(0, queryIdx) will contain distance
void matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection,
    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
    const GpuMat& maskCollection, Stream& stream = Stream::Null());

// Download trainIdx, imgIdx and distance to CPU vector with DMatch
static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
@@ -1443,7 +1380,7 @@ namespace cv
// allDist.at<float>(queryIdx, trainIdx) will contain FLT_MAX if trainIdx is one of the k best,
// otherwise it will contain the distance between the queryIdx and trainIdx descriptors
void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
    GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());

// Download trainIdx and distance to CPU vector with DMatch
// compactResult is used when mask is not empty. If compactResult is false matches
@@ -1478,7 +1415,7 @@ namespace cv
// Matches are not sorted.
void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
    GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance,
    const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());

// Download trainIdx, nMatches and distance to CPU vector with DMatch.
// matches will be sorted in increasing order of distances.
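// Usage sketch (illustration only; assumes the templated BruteForceMatcher_GPU
// wrapper and hypothetical descriptor mats `query`/`train`):
//
//     BruteForceMatcher_GPU< L2<float> > matcher;
//     GpuMat query, train, trainIdx, distance;
//     Stream s;
//     matcher.matchSingle(query, train, trainIdx, distance, GpuMat(), s);
//     s.waitForCompletion();                 // results are valid only after sync
//     std::vector<DMatch> matches;
//     matcher.matchDownload(trainIdx, distance, matches);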
@@ -48,37 +48,36 @@ using namespace std;
#if !defined (HAVE_CUDA)

void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::exp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::log(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }
void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }
void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_nogpu(); }

#else /* !defined (HAVE_CUDA) */
////////////////////////////////////////////////////////////////////////
// transpose

void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
{
    CV_Assert(src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8);

    dst.create( src.cols, src.rows, src.type() );

    cudaStream_t stream = StreamAccessor::getStream(s);

    if (src.elemSize() == 1)
    {
        NppStreamHandler h(stream);

        NppiSize sz;
        sz.width  = src.cols;
        sz.height = src.rows;
@@ -87,6 +86,8 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
    }
    else if (src.elemSize() == 4)
    {
        NppStStreamHandler h(stream);

        NcvSize32u sz;
        sz.width  = src.cols;
        sz.height = src.rows;
@@ -96,6 +97,8 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
    }
    else // if (src.elemSize() == 8)
    {
        NppStStreamHandler h(stream);

        NcvSize32u sz;
        sz.width  = src.cols;
        sz.height = src.rows;
@@ -104,13 +107,14 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
            dst.ptr<Ncv64u>(), dst.step, sz) );
    }

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
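// The pattern above recurs throughout this commit: a scoped handler redirects the
// NPP calls to the caller's stream, cudaThreadSynchronize() is replaced by its
// CUDA 4.0 equivalent cudaDeviceSynchronize(), and the blocking synchronization
// runs only when the caller passed the default stream. Schematically:
//
//     cudaStream_t stream = StreamAccessor::getStream(s);  // 0 for Stream::Null()
//     NppStreamHandler h(stream);      // scoped: NPP primitives run on `stream`
//     nppSafeCall( ... );              // enqueue the NPP call
//     if (stream == 0)                 // keep blocking semantics for the sync API
//         cudaSafeCall( cudaDeviceSynchronize() );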
////////////////////////////////////////////////////////////////////////
// flip

void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& s)
{
    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4);
@@ -120,6 +124,10 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode)
    sz.width  = src.cols;
    sz.height = src.rows;

    cudaStream_t stream = StreamAccessor::getStream(s);
    NppStreamHandler h(stream);

    if (src.type() == CV_8UC1)
    {
        nppSafeCall( nppiMirror_8u_C1R(src.ptr<Npp8u>(), src.step,
@@ -133,13 +141,14 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode)
            (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
    }

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// LUT

void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
{
    class LevelsInit
    {
@@ -172,6 +181,10 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst)
    Mat nppLut;
    lut.convertTo(nppLut, CV_32S);

    cudaStream_t stream = StreamAccessor::getStream(s);
    NppStreamHandler h(stream);

    if (src.type() == CV_8UC1)
    {
        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
@@ -192,13 +205,14 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst)
        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, pValues3, lvls.pLevels3, lvls.nValues3) );
    }

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

////////////////////////////////////////////////////////////////////////
// exp

void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& s)
{
    CV_Assert(src.type() == CV_32FC1);
@@ -208,15 +222,20 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst)
    sz.width  = src.cols;
    sz.height = src.rows;

    cudaStream_t stream = StreamAccessor::getStream(s);
    NppStreamHandler h(stream);

    nppSafeCall( nppiExp_32f_C1R(src.ptr<Npp32f>(), src.step, dst.ptr<Npp32f>(), dst.step, sz) );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

////////////////////////////////////////////////////////////////////////
// log

void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& s)
{
    CV_Assert(src.type() == CV_32FC1);
@@ -226,9 +245,14 @@ void cv::gpu::log(const GpuMat& src, GpuMat& dst)
    sz.width  = src.cols;
    sz.height = src.rows;

    cudaStream_t stream = StreamAccessor::getStream(s);
    NppStreamHandler h(stream);

    nppSafeCall( nppiLn_32f_C1R(src.ptr<Npp32f>(), src.step, dst.ptr<Npp32f>(), dst.step, sz) );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

////////////////////////////////////////////////////////////////////////
@@ -238,7 +262,7 @@ namespace
{
    typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);

    inline void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream)
    {
        CV_Assert(src.type() == CV_32FC2);
@@ -248,20 +272,23 @@ namespace
        sz.width  = src.cols;
        sz.height = src.rows;

        NppStreamHandler h(stream);

        nppSafeCall( func(src.ptr<Npp32fc>(), src.step, dst.ptr<Npp32f>(), dst.step, sz) );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

void cv::gpu::magnitude(const GpuMat& src, GpuMat& dst, Stream& stream)
{
    ::npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
}

void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
{
    ::npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
}

////////////////////////////////////////////////////////////////////////
@@ -310,52 +337,27 @@ namespace
    }
}

void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{
    ::cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));
}

void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{
    ::cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));
}

void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream)
{
    ::cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
}

void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream)
{
    ::cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
}

void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream)
{
    ::polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
}
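// Usage sketch for the merged polar-conversion API (illustration only; `x`/`y`
// are hypothetical CV_32FC1 mats):
//
//     GpuMat x, y, mag, angle;
//     Stream s;
//     cv::gpu::cartToPolar(x, y, mag, angle, false, s);   // radians, enqueued on s
//     s.waitForCompletion();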
@@ -51,7 +51,6 @@ using namespace std;
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_nogpu(); }
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_nogpu(); }

void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }

#else /* !defined (HAVE_CUDA) */
@@ -101,7 +100,7 @@ namespace
    template <typename T>
    void bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
                                   GpuMat& table_color, GpuMat& table_space,
                                   const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
    {
        short edge_disc = max<short>(short(1), short(ndisp * edge_threshold + 0.5));
        short max_disc = short(ndisp * max_disc_threshold + 0.5);
@@ -109,14 +108,19 @@ namespace
        bf::load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);

        if (&dst != &disp)
        {
            if (stream)
                stream.enqueueCopy(disp, dst);
            else
                disp.copyTo(dst);
        }

        bf::bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
    }

    typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
                                                GpuMat& table_color, GpuMat& table_space,
                                                const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream);

    const bilateral_filter_operator_t operators[] =
        {bilateral_filter_operator<unsigned char>, 0, 0, bilateral_filter_operator<short>, 0, 0, 0, 0};
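// Note the branching above: when the caller supplied a real stream, the disparity
// copy must also be enqueued asynchronously (stream.enqueueCopy) rather than issued
// through the blocking copyTo, so the whole operation stays on the caller's stream.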
@@ -139,18 +143,11 @@ cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radi
    calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
}

void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
{
    CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
    CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
    operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream);
}

#endif /* !defined (HAVE_CUDA) */
@@ -48,8 +48,7 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA)

void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }

#else
@@ -57,14 +56,14 @@ namespace cv { namespace gpu
{
    template <typename T>
    void blendLinearCaller(int rows, int cols, int cn, const PtrStep_<T> img1, const PtrStep_<T> img2,
                           const PtrStepf weights1, const PtrStepf weights2, PtrStep_<T> result, cudaStream_t stream);

    void blendLinearCaller8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,
                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result, cudaStream_t stream);
}}

void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
                          GpuMat& result, Stream& stream)
{
    CV_Assert(img1.size() == img2.size());
    CV_Assert(img1.type() == img2.type());
@@ -83,12 +82,12 @@ void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat&
    {
    case CV_8U:
        if (cn != 4)
            blendLinearCaller<uchar>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
        else
            blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
        break;
    case CV_32F:
        blendLinearCaller<float>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
        break;
    default:
        CV_Error(CV_StsUnsupportedFormat, "bad image depth in linear blending function");
@@ -54,18 +54,18 @@ const vector<GpuMat>& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors()
void cv::gpu::BruteForceMatcher_GPU_base::clear() { throw_nogpu(); }
bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return true; }
bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; }
void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector<DMatch>&, const GpuMat&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector<GpuMat>&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, std::vector<DMatch>&, const std::vector<GpuMat>&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, int, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, float, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
@@ -77,47 +77,47 @@ namespace cv { namespace gpu { namespace bfmatcher
    template <typename T>
    void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        bool cc_12, cudaStream_t stream);
    template <typename T>
    void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        bool cc_12, cudaStream_t stream);
    template <typename T>
    void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        bool cc_12, cudaStream_t stream);
    template <typename T>
    void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        bool cc_12, cudaStream_t stream);
    template <typename T>
    void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        bool cc_12, cudaStream_t stream);
    template <typename T>
    void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        bool cc_12, cudaStream_t stream);
    template <typename T>
    void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
    template <typename T>
    void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
    template <typename T>
    void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
    template <typename T>
    void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
    template <typename T>
    void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
    template <typename T>
    void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
}}}

namespace
@@ -168,7 +168,7 @@ bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const
// Match

void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs,
    GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask, Stream& stream)
{
    if (queryDescs.empty() || trainDescs.empty())
        return;
@@ -177,7 +177,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        bool cc_12, cudaStream_t stream);

    static const match_caller_t match_callers[3][8] =
    {
@@ -213,7 +213,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
    // For single train there is no need to save imgIdx, so we just save imgIdx to trainIdx.
    // trainIdx is stored after imgIdx, so we don't lose its value.
    func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12, StreamAccessor::getStream(stream));
}

void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance,
@@ -301,7 +301,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect
}

void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection,
    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& maskCollection, Stream& stream)
{
    if (queryDescs.empty() || trainCollection.empty())
        return;
@@ -310,7 +310,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
        const DevMem2Df& distance, bool cc_12, cudaStream_t stream);

    static const match_caller_t match_callers[3][8] =
    {
@@ -344,7 +344,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
    bool cc_12 = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);

    func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12, StreamAccessor::getStream(stream));
}

void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx,
@@ -403,7 +403,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector
// KnnMatch

void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
    GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask, Stream& stream)
{
    if (queryDescs.empty() || trainDescs.empty())
        return;
@@ -411,7 +411,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
    using namespace cv::gpu::bfmatcher;

    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);

    static const match_caller_t match_callers[3][8] =
    {
@@ -436,16 +436,24 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
    const int nTrain = trainDescs.rows;

    trainIdx.create(nQuery, k, CV_32S);
    distance.create(nQuery, k, CV_32F);
    ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);

    if (stream)
    {
        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
        stream.enqueueMemSet(allDist, Scalar::all(numeric_limits<float>::max()));
    }
    else
    {
        trainIdx.setTo(Scalar::all(-1));
        allDist.setTo(Scalar::all(numeric_limits<float>::max()));
    }

    match_caller_t func = match_callers[distType][queryDescs.depth()];
    CV_Assert(func != 0);

    func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream));
}
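// Note the stream-dependent initialization above: on an async call the trainIdx and
// allDist buffers are cleared with stream.enqueueMemSet so the work stays on the
// caller's stream; the blocking setTo is used only for the default stream.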
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
@@ -547,7 +555,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs,
// RadiusMatch

void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
    GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, const GpuMat& mask, Stream& stream)
{
    if (queryDescs.empty() || trainDescs.empty())
        return;
@@ -555,7 +563,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
    using namespace cv::gpu::bfmatcher;

    typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);

    static const radiusMatch_caller_t radiusMatch_callers[3][8] =
    {
@@ -583,17 +591,21 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));

    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);

    if (trainIdx.empty())
    {
        trainIdx.create(nQuery, nTrain, CV_32SC1);
        distance.create(nQuery, nTrain, CV_32FC1);
    }

    if (stream)
        stream.enqueueMemSet(nMatches, Scalar::all(0));
    else
        nMatches.setTo(Scalar::all(0));

    radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()];
    CV_Assert(func != 0);

    func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches.ptr<unsigned int>(), distance, StreamAccessor::getStream(stream));
}

void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches,
@@ -44,20 +44,11 @@
#if !defined(HAVE_CUDA)

void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }

void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }

void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat&, Mat&, bool, int, float, int, vector<int>*) { throw_nogpu(); }

#else
@@ -66,14 +57,12 @@ using namespace cv::gpu;
namespace cv { namespace gpu { namespace transform_points
{
    void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
}}}

namespace
{
    void transformPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, cudaStream_t stream)
    {
        CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3);
        CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F);
@@ -88,30 +77,20 @@ namespace
    }
}

void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream)
{
    ::transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));
}

namespace cv { namespace gpu { namespace project_points
{
    void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
}}}

namespace
{
    void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream)
    {
        CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3);
        CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F);
@@ -124,20 +103,11 @@ namespace
        Rodrigues(rvec, rot);

        dst.create(src.size(), CV_32FC2);
        project_points::call(src, rot.ptr<float>(), tvec.ptr<float>(), camera_mat.ptr<float>(), dst, stream);
    }
}

void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream)
{
    ::projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));
}
@@ -47,8 +47,7 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA)

void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }

#else /* !defined (HAVE_CUDA) */
@@ -455,12 +454,7 @@ namespace
    }
}

void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
{
    cvtColor_caller(src, dst, code, dcn, StreamAccessor::getStream(stream));
}

View File

@@ -68,19 +68,22 @@ namespace cv { namespace gpu
    template <typename T>
    void blendLinearCaller(int rows, int cols, int cn, const PtrStep_<T> img1, const PtrStep_<T> img2,
-                           const PtrStepf weights1, const PtrStepf weights2, PtrStep_<T> result)
+                           const PtrStepf weights1, const PtrStepf weights2, PtrStep_<T> result, cudaStream_t stream)
    {
        dim3 threads(16, 16);
        dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));

-        blendLinearKernel<<<grid, threads>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
-        cudaSafeCall(cudaThreadSynchronize());
+        blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
    }

    template void blendLinearCaller<uchar>(int, int, int, const PtrStep, const PtrStep,
-                                           const PtrStepf, const PtrStepf, PtrStep);
+                                           const PtrStepf, const PtrStepf, PtrStep, cudaStream_t stream);
    template void blendLinearCaller<float>(int, int, int, const PtrStepf, const PtrStepf,
-                                           const PtrStepf, const PtrStepf, PtrStepf);
+                                           const PtrStepf, const PtrStepf, PtrStepf, cudaStream_t stream);

    __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,
@@ -105,13 +108,16 @@ namespace cv { namespace gpu
    void blendLinearCaller8UC4(int rows, int cols, const PtrStep img1, const PtrStep img2,
-                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result)
+                               const PtrStepf weights1, const PtrStepf weights2, PtrStep result, cudaStream_t stream)
    {
        dim3 threads(16, 16);
        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

-        blendLinearKernel8UC4<<<grid, threads>>>(rows, cols, img1, img2, weights1, weights2, result);
-        cudaSafeCall(cudaThreadSynchronize());
+        blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall(cudaDeviceSynchronize());
    }
}}
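Every caller in this hunk now follows the same CUDA 4.0 idiom: launch on the supplied stream, check the launch with cudaGetLastError(), and synchronize only when stream == 0 so the default-stream path keeps its old blocking semantics (cudaDeviceSynchronize() superseding the deprecated cudaThreadSynchronize()). A self-contained sketch of the idiom, with a made-up kernel and a simplified cudaSafeCall:

#include <cstdio>
#include <cuda_runtime.h>

// Simplified stand-in for the module's cudaSafeCall error check.
#define cudaSafeCall(expr) do {                                        \
        cudaError_t err = (expr);                                      \
        if (err != cudaSuccess)                                        \
            printf("CUDA error: %s\n", cudaGetErrorString(err));       \
    } while (0)

// Made-up kernel; any per-element operation would do.
__global__ void scaleKernel(float* data, int n, float s)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= s;
}

void scaleCaller(float* data, int n, float s, cudaStream_t stream)
{
    dim3 threads(256);
    dim3 grid((n + threads.x - 1) / threads.x);

    // The fourth launch parameter is the stream to enqueue on.
    scaleKernel<<<grid, threads, 0, stream>>>(data, n, s);
    cudaSafeCall( cudaGetLastError() );          // catches invalid launch configurations

    if (stream == 0)                             // default stream: keep the old blocking behaviour
        cudaSafeCall( cudaDeviceSynchronize() ); // CUDA 4.0 replacement for cudaThreadSynchronize
}

int main()
{
    const int n = 1 << 20;
    float* d = 0;
    cudaSafeCall( cudaMalloc(&d, n * sizeof(float)) );
    scaleCaller(d, n, 2.0f, 0);                  // stream 0: returns only after completion
    cudaSafeCall( cudaFree(d) );
    return 0;
}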

View File

@@ -589,7 +589,7 @@ namespace cv { namespace gpu { namespace bfmatcher
    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Train, typename Mask>
    void matchSimple_caller(const DevMem2D_<T>& queryDescs, const Train& train,
-        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
+        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream)
    {
        StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp
@@ -597,14 +597,15 @@ namespace cv { namespace gpu { namespace bfmatcher
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T>
-            <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
+            <<<grid, threads, 0, stream>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T, typename Train, typename Mask>
    void matchCached_caller(const DevMem2D_<T>& queryDescs, const Train& train,
-        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
+        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, cudaStream_t stream)
    {
        StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp
        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_DESCRIPTORS_LEN>::check(); // block size must be greater than descriptors length
@@ -614,10 +615,11 @@ namespace cv { namespace gpu { namespace bfmatcher
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);

        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T, typename Dist::ValueType>, Dist, T>
-            <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
+            <<<grid, threads, 0, stream>>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }

    ///////////////////////////////////////////////////////////////////////////////
@@ -626,167 +628,165 @@ namespace cv { namespace gpu { namespace bfmatcher
    template <typename Dist, typename T, typename Train, typename Mask>
    void matchDispatcher(const DevMem2D_<T>& queryDescs, const Train& train,
        const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        bool cc_12)
+        bool cc_12, cudaStream_t stream)
    {
        if (queryDescs.cols < 64)
-            matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
        else if (queryDescs.cols == 64)
-            matchCached_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
        else if (queryDescs.cols < 128)
-            matchCached_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
        else if (queryDescs.cols == 128)
-            matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
        else if (queryDescs.cols < 256)
-            matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
        else if (queryDescs.cols == 256 && cc_12)
-            matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
+            matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
        else
-            matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
-
-        cudaSafeCall( cudaThreadSynchronize() );
+            matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance, stream);
    }
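matchDispatcher maps the runtime descriptor width onto a compile-time cache configuration and now threads the stream through unchanged; the trailing cudaThreadSynchronize() is dropped because each caller already synchronizes on the default stream. A reduced sketch of that runtime-to-compile-time dispatch shape (hypothetical names, two sizes only, plain C++ so it compiles without CUDA):

#include <cstdio>

// Hypothetical stand-in for a kernel caller templated on a compile-time cache size.
template <int MAX_LEN, bool EXACT_LEN>
void matchCaller(int cols, void* stream)
{
    printf("cols=%d -> MAX_LEN=%d, exact=%d, stream=%p\n",
           cols, MAX_LEN, (int)EXACT_LEN, stream);
}

// Runtime descriptor width picks one statically sized instantiation;
// the stream is forwarded untouched, mirroring matchDispatcher above.
void dispatch(int cols, void* stream)
{
    if (cols < 64)
        matchCaller<64, false>(cols, stream);
    else if (cols == 64)
        matchCaller<64, true>(cols, stream);
    else
        matchCaller<128, false>(cols, stream);
}

int main()
{
    dispatch(33, 0);
    dispatch(64, 0);
    dispatch(100, 0);
    return 0;
}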
    template <typename T>
    void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        bool cc_12)
+        bool cc_12, cudaStream_t stream)
    {
        SingleTrain<T> train((DevMem2D_<T>)trainDescs);

        if (mask.data)
        {
            SingleMask m(mask);
-            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream);
        }
        else
        {
-            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
        }
    }

-    template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
    template <typename T>
    void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        bool cc_12)
+        bool cc_12, cudaStream_t stream)
    {
        SingleTrain<T> train((DevMem2D_<T>)trainDescs);

        if (mask.data)
        {
            SingleMask m(mask);
-            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream);
        }
        else
        {
-            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
        }
    }

-    template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
    template <typename T>
    void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        bool cc_12)
+        bool cc_12, cudaStream_t stream)
    {
        SingleTrain<T> train((DevMem2D_<T>)trainDescs);

        if (mask.data)
        {
            SingleMask m(mask);
-            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12, stream);
        }
        else
        {
-            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
        }
    }

-    template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchSingleHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchSingleHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchSingleHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
    template <typename T>
    void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-        const DevMem2Df& distance, bool cc_12)
+        const DevMem2Df& distance, bool cc_12, cudaStream_t stream)
    {
        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);

        if (maskCollection.data)
        {
            MaskCollection mask(maskCollection.data);
-            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream);
        }
        else
        {
-            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
        }
    }

-    template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
    template <typename T>
    void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-        const DevMem2Df& distance, bool cc_12)
+        const DevMem2Df& distance, bool cc_12, cudaStream_t stream)
    {
        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);

        if (maskCollection.data)
        {
            MaskCollection mask(maskCollection.data);
-            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream);
        }
        else
        {
-            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
        }
    }

-    template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
    template <typename T>
    void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-        const DevMem2Df& distance, bool cc_12)
+        const DevMem2Df& distance, bool cc_12, cudaStream_t stream)
    {
        TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);

        if (maskCollection.data)
        {
            MaskCollection mask(maskCollection.data);
-            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12, stream);
        }
        else
        {
-            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
+            matchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12, stream);
        }
    }

-    template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
-    template void matchCollectionHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
+    template void matchCollectionHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
+    template void matchCollectionHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
    ///////////////////////////////////////////////////////////////////////////////////
    //////////////////////////////////// Knn Match ////////////////////////////////////
@@ -833,16 +833,17 @@ namespace cv { namespace gpu { namespace bfmatcher
    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    void calcDistance_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
-        const Mask& mask, const DevMem2Df& distance)
+        const Mask& mask, const DevMem2Df& distance, cudaStream_t stream)
    {
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
        dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);

-        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(
+        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(
            queryDescs, trainDescs, mask, distance);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }

    ///////////////////////////////////////////////////////////////////////////////
@@ -1010,105 +1011,106 @@ namespace cv { namespace gpu { namespace bfmatcher
    // find knn match kernel caller
    template <int BLOCK_SIZE>
-    void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
+    void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
    {
        dim3 threads(BLOCK_SIZE, 1, 1);
        dim3 grid(trainIdx.rows, 1, 1);

        for (int i = 0; i < knn; ++i)
        {
-            findBestMatch<BLOCK_SIZE><<<grid, threads>>>(allDist, i, trainIdx, distance);
+            findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance);
            cudaSafeCall( cudaGetLastError() );
        }

-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
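findKnnMatch_caller issues one findBestMatch launch per k on the same stream; kernels on a single stream execute in issue order, so each iteration can read what the previous one wrote without an intermediate sync, and only the default-stream path blocks at the end. A minimal sketch of that per-iteration pattern (kernel and names are illustrative, not from this file):

#include <cuda_runtime.h>

// Illustrative kernel: each launch increments a device counter once.
__global__ void stepKernel(int* counter)
{
    if (threadIdx.x == 0 && blockIdx.x == 0)
        ++*counter;   // sees the value written by the previous launch
}

void runSteps(int* d_counter, int knn, cudaStream_t stream)
{
    for (int i = 0; i < knn; ++i)
    {
        // Same stream each time: launches are serialized in issue order,
        // so no synchronization is needed between iterations.
        stepKernel<<<1, 32, 0, stream>>>(d_counter);
        cudaGetLastError();   // per-launch check, as in the caller above
    }

    if (stream == 0)
        cudaDeviceSynchronize();   // block only on the legacy synchronous path
}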
    ///////////////////////////////////////////////////////////////////////////////
    // knn match caller

    template <typename Dist, typename T, typename Mask>
-    void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, const DevMem2Df& allDist)
+    void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
    {
-        calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist);
+        calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist, stream);
    }

-    void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
+    void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
    {
-        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
+        findKnnMatch_caller<256>(knn, trainIdx, distance, allDist, stream);
    }
    template <typename T>
    void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
    {
        if (mask.data)
        {
-            calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, SingleMask(mask), allDist);
+            calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, SingleMask(mask), allDist, stream);
        }
        else
        {
-            calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, WithOutMask(), allDist);
+            calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, WithOutMask(), allDist, stream);
        }

-        findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
+        findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
    }

-    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
    template <typename T>
    void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
    {
        if (mask.data)
        {
            calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                SingleMask(mask), allDist);
+                SingleMask(mask), allDist, stream);
        }
        else
        {
            calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                WithOutMask(), allDist);
+                WithOutMask(), allDist, stream);
        }

-        findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
+        findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
    }

-    template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
    template <typename T>
    void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
    {
        if (mask.data)
        {
            calcDistanceDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                SingleMask(mask), allDist);
+                SingleMask(mask), allDist, stream);
        }
        else
        {
            calcDistanceDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                WithOutMask(), allDist);
+                WithOutMask(), allDist, stream);
        }

-        findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
+        findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
    }

-    template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
-    template void knnMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
+    template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+    template void knnMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
    ///////////////////////////////////////////////////////////////////////////////////
    /////////////////////////////////// Radius Match //////////////////////////////////
@@ -1166,16 +1168,17 @@ namespace cv { namespace gpu { namespace bfmatcher
    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
    void radiusMatch_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
        float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches,
-        const DevMem2Df& distance)
+        const DevMem2Df& distance, cudaStream_t stream)
    {
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
        dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);

-        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(
+        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(
            queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }

    ///////////////////////////////////////////////////////////////////////////////
@@ -1184,77 +1187,77 @@ namespace cv { namespace gpu { namespace bfmatcher
    template <typename Dist, typename T, typename Mask>
    void radiusMatchDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
        float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches,
-        const DevMem2Df& distance)
+        const DevMem2Df& distance, cudaStream_t stream)
    {
        radiusMatch_caller<16, 16, Dist>(queryDescs, trainDescs, maxDistance, mask,
-            trainIdx, nMatches, distance);
+            trainIdx, nMatches, distance, stream);
    }
    template <typename T>
    void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
+                maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream);
        }
        else
        {
            radiusMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                maxDistance, WithOutMask(), trainIdx, nMatches, distance);
+                maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream);
        }
    }

-    template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
    template <typename T>
    void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
-        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)
+        const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream)
    {
        if (mask.data)
        {
            radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
+                maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream);
        }
        else
        {
            radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
-                maxDistance, WithOutMask(), trainIdx, nMatches, distance);
+                maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream);
        }
    }

-    template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
-    template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
+    template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
+    template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template <typename T> template <typename T>
void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance) const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
{ {
radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, SingleMask(mask), trainIdx, nMatches, distance); maxDistance, SingleMask(mask), trainIdx, nMatches, distance, stream);
} }
else else
{ {
radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, radiusMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, WithOutMask(), trainIdx, nMatches, distance); maxDistance, WithOutMask(), trainIdx, nMatches, distance, stream);
} }
} }
template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); template void radiusMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); template void radiusMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); template void radiusMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); template void radiusMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
template void radiusMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); template void radiusMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
}}} }}}
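The branch on mask.data above resolves the mask policy at the host level, so the device code never branches per element: radiusMatchDispatcher is instantiated once for SingleMask and once for WithOutMask. A minimal sketch of that tag-dispatch pattern, with hypothetical names (WithMask, NoMask, runMatch) standing in for the real dispatcher:

    #include <cuda_runtime.h>

    // Two mask policies with the same interface, selected at compile time.
    struct WithMask { const unsigned char* m; __device__ bool ok(int i) const { return m[i] != 0; } };
    struct NoMask   {                         __device__ bool ok(int)   const { return true; } };

    template <typename Mask>
    __global__ void matchKernel(const float* q, float* out, int n, Mask mask)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n && mask.ok(i))   // with NoMask this folds to 'true' and costs nothing
            out[i] = q[i];
    }

    template <typename Mask>
    void runMatch(const float* q, float* out, int n, Mask mask, cudaStream_t stream)
    {
        matchKernel<<<(n + 255) / 256, 256, 0, stream>>>(q, out, n, mask);
    }

    void run(const float* q, float* out, int n, const unsigned char* mask, cudaStream_t stream)
    {
        if (mask) runMatch(q, out, n, WithMask{mask}, stream);   // mirrors SingleMask(mask)
        else      runMatch(q, out, n, NoMask{},       stream);   // mirrors WithOutMask()
    }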

View File

@@ -184,7 +184,9 @@ namespace cv { namespace gpu
         computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
             num_points, object, image, dist_threshold, hypothesis_scores);
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaGetLastError() );
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
 } // namespace solvepnp_ransac
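The two-line replacement above is the recurring motif of this commit: cudaThreadSynchronize was deprecated in CUDA 4.0 in favor of cudaDeviceSynchronize, and a cudaGetLastError call is inserted first so that launch failures (bad grid dimensions, too much shared memory) are reported separately from asynchronous execution errors. A self-contained sketch of the same pattern, using a local check helper in place of OpenCV's cudaSafeCall:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Stand-in for cudaSafeCall: abort with a message on any CUDA error.
    static void check(cudaError_t err)
    {
        if (err != cudaSuccess) { std::fprintf(stderr, "%s\n", cudaGetErrorString(err)); std::exit(1); }
    }

    __global__ void scale(float* data, int n, float k)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) data[i] *= k;
    }

    int main()
    {
        const int n = 1 << 20;
        float* d = 0;
        check( cudaMalloc(&d, n * sizeof(float)) );
        scale<<<(n + 255) / 256, 256>>>(d, n, 2.0f);
        check( cudaGetLastError() );        // launch-time errors surface here
        check( cudaDeviceSynchronize() );   // CUDA 4.0 name; execution errors surface here
        check( cudaFree(d) );
        return 0;
    }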

View File

@@ -64,19 +64,19 @@ namespace cv { namespace gpu { namespace mathfunc
     };
     template <typename T1, typename T2>
-    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
     {
         NotEqual<T1, T2> op;
-        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
+        transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, stream);
     }
-    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
     {
-        compare_ne<uint, uint>(src1, src2, dst);
+        compare_ne<uint, uint>(src1, src2, dst, stream);
     }
-    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
+    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream)
     {
-        compare_ne<float, float>(src1, src2, dst);
+        compare_ne<float, float>(src1, src2, dst, stream);
     }
@@ -133,7 +133,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
@@ -165,7 +165,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
@@ -256,7 +256,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
@@ -290,7 +290,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
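The if (stream == 0) guard above, repeated throughout the commit, is what actually makes these functions stream-capable: with a non-default stream the call returns as soon as the work is enqueued, and synchronization becomes the caller's job. A hedged sketch of both call styles, reusing the illustrative scale kernel and check helper from the previous sketch:

    // Launch on the caller's stream; block only for the default stream.
    void scaleAsync(float* d, int n, float k, cudaStream_t stream)
    {
        scale<<<(n + 255) / 256, 256, 0, stream>>>(d, n, k);
        check( cudaGetLastError() );
        if (stream == 0)
            check( cudaDeviceSynchronize() );   // legacy blocking behavior preserved
    }

    // Caller side: overlap two independent operations on separate streams.
    void twoStreams(float* a, float* b, int n)
    {
        cudaStream_t s1, s2;
        check( cudaStreamCreate(&s1) );
        check( cudaStreamCreate(&s2) );
        scaleAsync(a, n, 2.0f, s1);             // returns immediately
        scaleAsync(b, n, 3.0f, s2);             // may run concurrently with s1's work
        check( cudaStreamSynchronize(s1) );
        check( cudaStreamSynchronize(s2) );
        check( cudaStreamDestroy(s1) );
        check( cudaStreamDestroy(s2) );
    }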

View File

@@ -129,7 +129,7 @@ namespace filter_krnls
 namespace cv { namespace gpu { namespace filters
 {
     template <int ksize, typename T, typename D, template<typename> class B>
-    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
+    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
     {
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
         dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
@@ -143,16 +143,17 @@ namespace cv { namespace gpu { namespace filters
                 "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
         }
-        filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
+        filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template <typename T, typename D>
-    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type)
+    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
     {
-        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
+        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
         static const caller_t callers[3][17] =
         {
             {
@@ -216,15 +217,15 @@ namespace cv { namespace gpu { namespace filters
         loadLinearKernel(kernel, ksize);
-        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
+        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
     }
-    template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearRowFilter_gpu<uchar , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearRowFilter_gpu<uchar4, float4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);;
+    template void linearRowFilter_gpu<short , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearRowFilter_gpu<short2, float2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<int   , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearRowFilter_gpu<int   , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearRowFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
 }}}
 namespace filter_krnls
@@ -269,7 +270,7 @@ namespace filter_krnls
 namespace cv { namespace gpu { namespace filters
 {
     template <int ksize, typename T, typename D, template<typename> class B>
-    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
+    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
     {
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
         dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
@@ -282,16 +283,17 @@ namespace cv { namespace gpu { namespace filters
                 "try bigger image or another border extrapolation mode", __FILE__, __LINE__);
         }
-        filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
+        filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template <typename T, typename D>
-    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type)
+    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
     {
-        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
+        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
         static const caller_t callers[3][17] =
         {
             {
@@ -355,15 +357,15 @@ namespace cv { namespace gpu { namespace filters
         loadLinearKernel(kernel, ksize);
-        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
+        callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
     }
-    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearColumnFilter_gpu<float , uchar >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearColumnFilter_gpu<float , short >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearColumnFilter_gpu<float2, short2>(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearColumnFilter_gpu<float , int   >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    template void linearColumnFilter_gpu<float , float >(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
 }}}
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -541,7 +543,7 @@ namespace cv { namespace gpu { namespace bf
         }
         if (stream != 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     void bilateral_filter_gpu(const DevMem2D& disp, const DevMem2D& img, int channels, int iters, cudaStream_t stream)
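The callers[3][17] table whose typedef changes above is a dispatch mechanism: the kernel is compiled once per (border mode, kernel size) pair, and the runtime arguments index into the table. Because every entry must now thread a cudaStream_t through, the typedef change ripples into all the instantiations. A reduced sketch of the idea with a 2x3 table; all names here are illustrative, not the library's:

    #include <cuda_runtime.h>

    template <int KSIZE, bool REFLECT>
    __global__ void filterKernel(const float* src, float* dst, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) dst[i] = src[i] * KSIZE + (REFLECT ? 1.0f : 0.0f);   // placeholder body
    }

    template <int KSIZE, bool REFLECT>
    void filterCaller(const float* src, float* dst, int n, cudaStream_t stream)
    {
        filterKernel<KSIZE, REFLECT><<<(n + 255) / 256, 256, 0, stream>>>(src, dst, n);
    }

    typedef void (*caller_t)(const float*, float*, int, cudaStream_t);

    void filter(const float* src, float* dst, int n, int brd, int ksize, cudaStream_t stream)
    {
        // One compiled instantiation per (border, ksize); runtime indices pick it.
        static const caller_t callers[2][3] =
        {
            { filterCaller<1, false>, filterCaller<3, false>, filterCaller<5, false> },
            { filterCaller<1, true >, filterCaller<3, true >, filterCaller<5, true > }
        };
        callers[brd][ksize / 2](src, dst, n, stream);   // ksize 1/3/5 -> column 0/1/2
    }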

View File

@@ -220,7 +220,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y,
         img_block_width, grad, qangle, scale, block_hists);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -324,7 +324,7 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -418,7 +418,7 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
         block_hists, coefs, free_coef, threshold, labels);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
 //----------------------------------------------------------------------------
@@ -463,7 +463,7 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i
         img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -512,7 +512,7 @@ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, i
         img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
 //----------------------------------------------------------------------------
@@ -636,7 +636,8 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im
     compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
 template <int nthreads, int correct_gamma>
@@ -707,7 +708,8 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
     compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -765,7 +767,9 @@ static void resize_for_hog(const DevMem2D& src, DevMem2D dst, TEX& tex)
     resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (DevMem2D_<T>)dst, colOfs);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall( cudaThreadSynchronize() );
+    cudaSafeCall( cudaDeviceSynchronize() );
     cudaSafeCall( cudaUnbindTexture(tex) );
 }

View File

@@ -139,7 +139,7 @@ namespace cv { namespace gpu { namespace imgproc
         remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
         cudaSafeCall( cudaUnbindTexture(tex_remap) );
     }
@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace imgproc
         remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
 /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
@@ -263,7 +263,7 @@ namespace cv { namespace gpu { namespace imgproc
         meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
         cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
     }
     extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps)
@@ -279,7 +279,7 @@ namespace cv { namespace gpu { namespace imgproc
         meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
         cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
     }
@@ -397,7 +397,7 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)
@@ -411,7 +411,7 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
@@ -462,7 +462,7 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     void reprojectImageTo3D_gpu(const DevMem2D& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
@@ -502,7 +502,7 @@ namespace cv { namespace gpu { namespace imgproc
         extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
 /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
@@ -611,7 +611,8 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         cudaSafeCall(cudaUnbindTexture(harrisDxTex));
         cudaSafeCall(cudaUnbindTexture(harrisDyTex));
     }
@@ -727,7 +728,8 @@ namespace cv { namespace gpu { namespace imgproc
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall(cudaDeviceSynchronize());
         cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
         cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
     }
@@ -763,7 +765,7 @@ namespace cv { namespace gpu { namespace imgproc
         column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
 //////////////////////////////////////////////////////////////////////////
@@ -791,7 +793,7 @@ namespace cv { namespace gpu { namespace imgproc
         mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
 //////////////////////////////////////////////////////////////////////////
@@ -820,7 +822,7 @@ namespace cv { namespace gpu { namespace imgproc
         mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
 //////////////////////////////////////////////////////////////////////////
@@ -850,7 +852,7 @@ namespace cv { namespace gpu { namespace imgproc
         mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
 //////////////////////////////////////////////////////////////////////////
@@ -880,7 +882,7 @@ namespace cv { namespace gpu { namespace imgproc
         mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
 /////////////////////////////////////////////////////////////////////////
@@ -904,7 +906,9 @@ namespace cv { namespace gpu { namespace imgproc
         dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
         downsampleKernel<<<grid, threads>>>(src, rows, cols, k, dst);
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaGetLastError() );
+        cudaSafeCall( cudaDeviceSynchronize() );
     }
     template void downsampleCaller(const PtrStep src, int rows, int cols, int k, PtrStep dst);

View File

@@ -46,6 +46,8 @@
 #include "opencv2/gpu/devmem2d.hpp"
 #include "safe_call.hpp"
 #include "cuda_runtime.h"
+#include "npp.h"
+#include "NPP_staging.hpp"
 namespace cv
 {
@@ -106,6 +108,41 @@ namespace cv
         cudaSafeCall( cudaGetTextureReference(&tex, name) );
         cudaSafeCall( cudaUnbindTexture(tex) );
     }
+    class NppStreamHandler
+    {
+    public:
+        inline explicit NppStreamHandler(cudaStream_t newStream = 0)
+        {
+            oldStream = nppGetStream();
+            nppSetStream(newStream);
+        }
+        inline ~NppStreamHandler()
+        {
+            nppSetStream(oldStream);
+        }
+    private:
+        cudaStream_t oldStream;
+    };
+    class NppStStreamHandler
+    {
+    public:
+        inline explicit NppStStreamHandler(cudaStream_t newStream = 0)
+        {
+            oldStream = nppStSetActiveCUDAstream(newStream);
+        }
+        inline ~NppStStreamHandler()
+        {
+            nppStSetActiveCUDAstream(oldStream);
+        }
+    private:
+        cudaStream_t oldStream;
+    };
 }
 }
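NPP routines read their stream from library-global state rather than taking it as a parameter, so the RAII handlers added above scope that state change: the constructor installs the requested stream and the destructor restores the previous one, even on early return. A hypothetical caller, assuming the header above is included and the handler is reachable from the calling namespace (the nppiAddC_8u_C1RSfs call and buffers are illustrative):

    #include <cuda_runtime.h>
    #include <npp.h>

    void addConstOnStream(const Npp8u* src, int srcStep, Npp8u* dst, int dstStep,
                          NppiSize roi, cudaStream_t stream)
    {
        NppStreamHandler h(stream);   // nppSetStream(stream) for this scope
        nppiAddC_8u_C1RSfs(src, srcStep, 42, dst, dstStep, roi, 0);   // enqueued on 'stream'
        if (stream == 0)
            cudaDeviceSynchronize();  // keep the legacy blocking behavior
    }   // ~NppStreamHandler restores the previous NPP stream here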

View File

@@ -134,7 +134,7 @@ void matchTemplateNaive_CCORR_32F(const DevMem2D image, const DevMem2D templ,
     }
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -165,7 +165,7 @@ void matchTemplateNaive_CCORR_8U(const DevMem2D image, const DevMem2D templ,
     }
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -228,7 +228,7 @@ void matchTemplateNaive_SQDIFF_32F(const DevMem2D image, const DevMem2D templ,
     }
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -259,7 +259,7 @@ void matchTemplateNaive_SQDIFF_8U(const DevMem2D image, const DevMem2D templ,
     }
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -309,7 +309,7 @@ void matchTemplatePrepared_SQDIFF_8U(
     }
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -360,7 +360,7 @@ void matchTemplatePrepared_SQDIFF_NORMED_8U(
     }
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -392,7 +392,7 @@ void matchTemplatePrepared_CCOFF_8U(
         w, h, (float)templ_sum / (w * h), image_sum, result);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -434,7 +434,7 @@ void matchTemplatePrepared_CCOFF_8UC2(
         image_sum_r, image_sum_g, result);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -490,7 +490,7 @@ void matchTemplatePrepared_CCOFF_8UC3(
         image_sum_r, image_sum_g, image_sum_b, result);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -556,7 +556,7 @@ void matchTemplatePrepared_CCOFF_8UC4(
         result);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -602,7 +602,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
         image_sum, image_sqsum, result);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -665,7 +665,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
         result);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -742,7 +742,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
         result);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -833,7 +833,7 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
         result);
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -877,7 +877,7 @@ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
     }
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -919,7 +919,7 @@ void extractFirstChannel_32F(const DevMem2D image, DevMem2Df result, int cn)
     }
     cudaSafeCall( cudaGetLastError() );
-    cudaSafeCall(cudaThreadSynchronize());
+    cudaSafeCall( cudaDeviceSynchronize() );
 }

View File

@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)
@@ -202,7 +202,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream)

View File

@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall ( cudaThreadSynchronize() );
+            cudaSafeCall ( cudaDeviceSynchronize() );
     }
     void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
@@ -199,7 +199,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall ( cudaThreadSynchronize() );
+            cudaSafeCall ( cudaDeviceSynchronize() );
     }
     template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
@@ -222,7 +222,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall ( cudaThreadSynchronize() );
+            cudaSafeCall ( cudaDeviceSynchronize() );
     }
     template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, int channels, cudaStream_t stream);

View File

@@ -275,7 +275,7 @@ namespace cv { namespace gpu { namespace mathfunc
         minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         T minval_, maxval_;
         cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
@@ -306,7 +306,7 @@ namespace cv { namespace gpu { namespace mathfunc
         minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         T minval_, maxval_;
         cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
@@ -363,7 +363,7 @@ namespace cv { namespace gpu { namespace mathfunc
         minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall(cudaDeviceSynchronize());
         T minval_, maxval_;
         cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
@@ -395,7 +395,7 @@ namespace cv { namespace gpu { namespace mathfunc
         minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         T minval_, maxval_;
         cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
@@ -609,7 +609,7 @@ namespace cv { namespace gpu { namespace mathfunc
             minloc_buf, maxloc_buf);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         T minval_, maxval_;
         cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
@@ -650,7 +650,7 @@ namespace cv { namespace gpu { namespace mathfunc
             minloc_buf, maxloc_buf);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         T minval_, maxval_;
         cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -724,7 +724,7 @@ namespace cv { namespace gpu { namespace mathfunc
         minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         T minval_, maxval_;
         cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -766,7 +766,7 @@ namespace cv { namespace gpu { namespace mathfunc
         minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         T minval_, maxval_;
         cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
@@ -895,7 +895,7 @@ namespace cv { namespace gpu { namespace mathfunc
         countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         uint count;
         cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
@@ -942,7 +942,7 @@ namespace cv { namespace gpu { namespace mathfunc
         countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         uint count;
         cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
@@ -1493,7 +1493,7 @@ namespace cv { namespace gpu { namespace mathfunc
             break;
         }
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         R result[4] = {0, 0, 0, 0};
         cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1543,7 +1543,7 @@ namespace cv { namespace gpu { namespace mathfunc
         }
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         R result[4] = {0, 0, 0, 0};
         cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1615,7 +1615,7 @@ namespace cv { namespace gpu { namespace mathfunc
             break;
         }
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         R result[4] = {0, 0, 0, 0};
         cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1665,7 +1665,7 @@ namespace cv { namespace gpu { namespace mathfunc
         }
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         R result[4] = {0, 0, 0, 0};
         cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1737,7 +1737,7 @@ namespace cv { namespace gpu { namespace mathfunc
             break;
         }
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         R result[4] = {0, 0, 0, 0};
         cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
@@ -1787,7 +1787,7 @@ namespace cv { namespace gpu { namespace mathfunc
         }
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall(cudaThreadSynchronize());
+        cudaSafeCall( cudaDeviceSynchronize() );
         R result[4] = {0, 0, 0, 0};
         cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
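Unlike the filter and arithmetic kernels, these reductions keep an unconditional cudaDeviceSynchronize: each launch is immediately followed by a blocking cudaMemcpy of the scalar result back to the host, so the call can never be made asynchronous and only the deprecated API name needed replacing. The shape of such a call, sketched with the check helper from the earlier sketch (the deliberately trivial single-thread kernel is illustrative only):

    __global__ void sumKernel(const float* src, int n, float* out)
    {
        if (threadIdx.x == 0)            // trivial serial sum, just to keep the sketch short
        {
            float s = 0.0f;
            for (int i = 0; i < n; ++i) s += src[i];
            *out = s;
        }
    }

    float sumOnDevice(const float* d_src, int n, float* d_buf)
    {
        sumKernel<<<1, 1>>>(d_src, n, d_buf);
        check( cudaGetLastError() );
        check( cudaDeviceSynchronize() );    // unconditional: the next line blocks anyway
        float result = 0.0f;
        check( cudaMemcpy(&result, d_buf, sizeof(float), cudaMemcpyDeviceToHost) );
        return result;
    }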

View File

@@ -236,7 +236,7 @@ namespace cv { namespace gpu { namespace split_merge {
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall(cudaDeviceSynchronize());
     }
@@ -253,7 +253,7 @@ namespace cv { namespace gpu { namespace split_merge {
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall(cudaDeviceSynchronize());
     }
@@ -271,7 +271,7 @@ namespace cv { namespace gpu { namespace split_merge {
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall(cudaDeviceSynchronize());
     }
@@ -445,7 +445,7 @@ namespace cv { namespace gpu { namespace split_merge {
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall(cudaDeviceSynchronize());
     }
@@ -462,7 +462,7 @@ namespace cv { namespace gpu { namespace split_merge {
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall(cudaDeviceSynchronize());
     }
@@ -480,7 +480,7 @@ namespace cv { namespace gpu { namespace split_merge {
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
+            cudaSafeCall(cudaDeviceSynchronize());
     }

View File

@@ -328,7 +328,7 @@ template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& ri
     cudaSafeCall( cudaGetLastError() );
     if (stream == 0)
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
 };
 typedef void (*kernel_caller_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int maxdisp, cudaStream_t & stream);
@@ -407,7 +407,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output,
     cudaSafeCall( cudaGetLastError() );
     if (stream == 0)
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
     cudaSafeCall( cudaUnbindTexture (texForSobel ) );
 }
@@ -532,9 +532,9 @@ extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float a
     cudaSafeCall( cudaGetLastError() );
     if (stream == 0)
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
     cudaSafeCall( cudaUnbindTexture (texForTF) );
 }
 }}}

View File

@@ -175,7 +175,7 @@ namespace cv { namespace gpu { namespace bp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template <> void comp_data_gpu<uchar, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
     {
@@ -189,7 +189,7 @@ namespace cv { namespace gpu { namespace bp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template <> void comp_data_gpu<uchar3, short>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
@@ -204,7 +204,7 @@ namespace cv { namespace gpu { namespace bp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template <> void comp_data_gpu<uchar3, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
     {
@@ -218,7 +218,7 @@ namespace cv { namespace gpu { namespace bp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template <> void comp_data_gpu<uchar4, short>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
@@ -233,7 +233,7 @@ namespace cv { namespace gpu { namespace bp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template <> void comp_data_gpu<uchar4, float>(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream)
     {
@@ -247,7 +247,7 @@ namespace cv { namespace gpu { namespace bp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 ///////////////////////////////////////////////////////////////
@@ -287,7 +287,7 @@ namespace cv { namespace gpu { namespace bp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
@@ -337,7 +337,7 @@ namespace cv { namespace gpu { namespace bp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2D* mus, DevMem2D* mds, DevMem2D* mls, DevMem2D* mrs, cudaStream_t stream);
@@ -457,7 +457,7 @@ namespace cv { namespace gpu { namespace bp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     }
@@ -520,7 +520,7 @@ namespace cv { namespace gpu { namespace bp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template void output_gpu<short>(const DevMem2D& u, const DevMem2D& d, const DevMem2D& l, const DevMem2D& r, const DevMem2D& data, const DevMem2D_<short>& disp, cudaStream_t stream);

View File

@@ -385,7 +385,7 @@ namespace cv { namespace gpu { namespace csbp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
         dim3 threads(32, 8, 1);
         dim3 grid(1, 1, 1);
@@ -401,7 +401,7 @@ namespace cv { namespace gpu { namespace csbp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
@@ -586,7 +586,7 @@ namespace cv { namespace gpu { namespace csbp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,
@@ -713,7 +713,7 @@ namespace cv { namespace gpu { namespace csbp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
@@ -815,7 +815,7 @@ namespace cv { namespace gpu { namespace csbp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     };
@@ -885,7 +885,7 @@ namespace cv { namespace gpu { namespace csbp
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,

View File

@@ -181,7 +181,7 @@ namespace cv { namespace gpu { namespace surf
         icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

     ////////////////////////////////////////////////////////////////////////
@@ -338,7 +338,7 @@ namespace cv { namespace gpu { namespace surf
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

     ////////////////////////////////////////////////////////////////////////
@@ -483,7 +483,7 @@ namespace cv { namespace gpu { namespace surf
         icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

     ////////////////////////////////////////////////////////////////////////
@@ -674,7 +674,7 @@ namespace cv { namespace gpu { namespace surf
         icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
         cudaSafeCall( cudaGetLastError() );
-        cudaSafeCall( cudaThreadSynchronize() );
+        cudaSafeCall( cudaDeviceSynchronize() );
     }

     ////////////////////////////////////////////////////////////////////////
@@ -986,24 +986,24 @@ namespace cv { namespace gpu { namespace surf
             compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
             cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
             normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
             cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
         }
         else
         {
             compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
             cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
             normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
             cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaThreadSynchronize() );
+            cudaSafeCall( cudaDeviceSynchronize() );
         }
     }
 }}}
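All of the SURF hunks make the same mechanical substitution: cudaThreadSynchronize() is deprecated as of CUDA 4.0 and cudaDeviceSynchronize() is its drop-in replacement (both block the host until all outstanding device work completes). If a codebase still has to build against older toolkits, a small guard on CUDART_VERSION (defined by cuda_runtime.h) keeps both working; this shim is a sketch, not something the commit adds:

    #include <cuda_runtime.h>

    inline cudaError_t deviceSynchronizeCompat()
    {
    #if CUDART_VERSION >= 4000
        return cudaDeviceSynchronize();   // CUDA 4.0 and newer
    #else
        return cudaThreadSynchronize();   // pre-4.0 toolkits
    #endif
    }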

View File

@@ -64,6 +64,8 @@ void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { thro
 void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
 void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
 void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); }
+Stream& cv::gpu::Stream::Null() { throw_nogpu(); static Stream s; return s; }
+cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
 #else /* !defined (HAVE_CUDA) */
@@ -117,7 +119,7 @@ namespace
     }
 }
-CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl->stream; };
+CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl ? stream.impl->stream : 0; };
 void cv::gpu::Stream::create()
 {
@@ -188,18 +190,35 @@ void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(sr
 void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyHostToDevice); }
 void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
+void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar s)
 {
     CV_Assert((src.depth() != CV_64F) ||
         (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
+    if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
+    {
+        cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, impl->stream) );
+        return;
+    }
+    if (src.depth() == CV_8U)
+    {
+        int cn = src.channels();
+        if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
+        {
+            int val = saturate_cast<uchar>(s[0]);
+            cudaSafeCall( cudaMemset2DAsync(src.data, src.step, val, src.cols * src.elemSize(), src.rows, impl->stream) );
+            return;
+        }
+    }
     typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
     static const set_caller_t set_callers[] =
     {
         kernelSet<uchar>, kernelSet<schar>, kernelSet<ushort>, kernelSet<short>,
         kernelSet<int>, kernelSet<float>, kernelSet<double>
     };
-    set_callers[src.depth()](src, val, impl->stream);
+    set_callers[src.depth()](src, s, impl->stream);
 }
 void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
@@ -246,5 +265,17 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
     matrix_operations::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
 }
+cv::gpu::Stream::operator bool() const
+{
+    return impl && impl->stream;
+}
+cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_) {}
+cv::gpu::Stream& cv::gpu::Stream::Null()
+{
+    static Stream s((Impl*)0);
+    return s;
+}
 #endif /* !defined (HAVE_CUDA) */
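The new Stream::Null() singleton carries a null Impl, so StreamAccessor::getStream() maps it to cudaStream_t 0 (the default stream) and operator bool() distinguishes a real stream from the null one. A short caller-side sketch, assuming the cv::gpu API shown in this diff (waitForCompletion() is the Stream class's existing host-sync method):

    #include "opencv2/gpu/gpu.hpp"
    #include "opencv2/gpu/stream_accessor.hpp"

    using namespace cv::gpu;

    void memsetTwoWays(GpuMat& img)
    {
        Stream stream;                                  // wraps a real cudaStream_t
        stream.enqueueMemSet(img, cv::Scalar::all(0));  // zero fill goes through cudaMemset2DAsync, returns at once
        stream.waitForCompletion();                     // explicit host sync

        Stream& nullStream = Stream::Null();            // stands for the default stream
        CV_Assert(!nullStream);                         // operator bool(): impl is 0
        CV_Assert(StreamAccessor::getStream(nullStream) == 0);
    }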

View File

@@ -47,35 +47,26 @@ using namespace cv::gpu;
 #if !defined (HAVE_CUDA)
-void cv::gpu::add(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::add(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::subtract(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::subtract(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::multiply(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::multiply(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::divide(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::divide(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::absdiff(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::absdiff(const GpuMat&, const Scalar&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::compare(const GpuMat&, const GpuMat&, GpuMat&, int) { throw_nogpu(); }
-void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }
-void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); }
-void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }
-void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); }
-void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }
-void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); }
-void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }
-void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); }
-void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
-void cv::gpu::min(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
-void cv::gpu::min(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
-void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
-void cv::gpu::max(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
-void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
-double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int) {throw_nogpu(); return 0.0;}
-double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int, const Stream&) {throw_nogpu(); return 0.0;}
+void cv::gpu::add(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::add(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::subtract(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::subtract(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::multiply(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::multiply(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::divide(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::divide(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::absdiff(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::absdiff(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::compare(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
+void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::min(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::max(const GpuMat&, double, GpuMat&, Stream&) { throw_nogpu(); }
+double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int, Stream&) {throw_nogpu(); return 0.0;}
 #else
@@ -90,7 +81,7 @@ namespace
     void nppArithmCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst,
         npp_arithm_8u_t npp_func_8uc1, npp_arithm_8u_t npp_func_8uc4,
-        npp_arithm_32s_t npp_func_32sc1, npp_arithm_32f_t npp_func_32fc1)
+        npp_arithm_32s_t npp_func_32sc1, npp_arithm_32f_t npp_func_32fc1, cudaStream_t stream)
     {
         CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
         CV_Assert(src1.type() == CV_8UC1 || src1.type() == CV_8UC4 || src1.type() == CV_32SC1 || src1.type() == CV_32FC1);
@@ -100,6 +91,8 @@ namespace
         sz.width = src1.cols;
         sz.height = src1.rows;
+        NppStreamHandler h(stream);
         switch (src1.type())
         {
         case CV_8UC1:
@@ -118,7 +111,8 @@ namespace
             CV_Assert(!"Unsupported source type");
         }
-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     template<int SCN> struct NppArithmScalarFunc;
@@ -135,7 +129,7 @@ namespace
     template<typename NppArithmScalarFunc<1>::func_ptr func> struct NppArithmScalar<1, func>
     {
-        static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst)
+        static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream)
         {
             dst.create(src.size(), src.type());
@@ -143,14 +137,17 @@ namespace
             sz.width = src.cols;
             sz.height = src.rows;
+            NppStreamHandler h(stream);
             nppSafeCall( func(src.ptr<Npp32f>(), src.step, (Npp32f)sc[0], dst.ptr<Npp32f>(), dst.step, sz) );
-            cudaSafeCall( cudaThreadSynchronize() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
     template<typename NppArithmScalarFunc<2>::func_ptr func> struct NppArithmScalar<2, func>
     {
-        static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst)
+        static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream)
        {
             dst.create(src.size(), src.type());
@@ -162,78 +159,81 @@ namespace
             nValue.re = (Npp32f)sc[0];
             nValue.im = (Npp32f)sc[1];
+            NppStreamHandler h(stream);
             nppSafeCall( func(src.ptr<Npp32fc>(), src.step, nValue, dst.ptr<Npp32fc>(), dst.step, sz) );
-            cudaSafeCall( cudaThreadSynchronize() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
 }
-void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
+void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
-    nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R);
+    nppArithmCaller(src1, src2, dst, nppiAdd_8u_C1RSfs, nppiAdd_8u_C4RSfs, nppiAdd_32s_C1R, nppiAdd_32f_C1R, StreamAccessor::getStream(stream));
 }
-void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
+void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
-    nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R);
+    nppArithmCaller(src2, src1, dst, nppiSub_8u_C1RSfs, nppiSub_8u_C4RSfs, nppiSub_32s_C1R, nppiSub_32f_C1R, StreamAccessor::getStream(stream));
 }
-void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
+void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
-    nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R);
+    nppArithmCaller(src1, src2, dst, nppiMul_8u_C1RSfs, nppiMul_8u_C4RSfs, nppiMul_32s_C1R, nppiMul_32f_C1R, StreamAccessor::getStream(stream));
 }
-void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
+void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
-    nppArithmCaller(src2, src1, dst, nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, nppiDiv_32s_C1R, nppiDiv_32f_C1R);
+    nppArithmCaller(src2, src1, dst, nppiDiv_8u_C1RSfs, nppiDiv_8u_C4RSfs, nppiDiv_32s_C1R, nppiDiv_32f_C1R, StreamAccessor::getStream(stream));
 }
-void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst)
+void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
-    typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst);
+    typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream);
     static const caller_t callers[] = {0, NppArithmScalar<1, nppiAddC_32f_C1R>::calc, NppArithmScalar<2, nppiAddC_32fc_C1R>::calc};
     CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2);
-    callers[src.channels()](src, sc, dst);
+    callers[src.channels()](src, sc, dst, StreamAccessor::getStream(stream));
 }
-void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst)
+void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
-    typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst);
+    typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream);
     static const caller_t callers[] = {0, NppArithmScalar<1, nppiSubC_32f_C1R>::calc, NppArithmScalar<2, nppiSubC_32fc_C1R>::calc};
     CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2);
-    callers[src.channels()](src, sc, dst);
+    callers[src.channels()](src, sc, dst, StreamAccessor::getStream(stream));
 }
-void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst)
+void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
-    typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst);
+    typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream);
     static const caller_t callers[] = {0, NppArithmScalar<1, nppiMulC_32f_C1R>::calc, NppArithmScalar<2, nppiMulC_32fc_C1R>::calc};
     CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2);
-    callers[src.channels()](src, sc, dst);
+    callers[src.channels()](src, sc, dst, StreamAccessor::getStream(stream));
 }
-void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst)
+void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
-    typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst);
+    typedef void (*caller_t)(const GpuMat& src, const Scalar& sc, GpuMat& dst, cudaStream_t stream);
     static const caller_t callers[] = {0, NppArithmScalar<1, nppiDivC_32f_C1R>::calc, NppArithmScalar<2, nppiDivC_32fc_C1R>::calc};
     CV_Assert(src.type() == CV_32FC1 || src.type() == CV_32FC2);
-    callers[src.channels()](src, sc, dst);
+    callers[src.channels()](src, sc, dst, StreamAccessor::getStream(stream));
 }
 //////////////////////////////////////////////////////////////////////////////
 // Absolute difference
-void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
+void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
 {
     CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
@@ -245,6 +245,10 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
     sz.width = src1.cols;
     sz.height = src1.rows;
+    cudaStream_t stream = StreamAccessor::getStream(s);
+    NppStreamHandler h(stream);
     switch (src1.type())
     {
     case CV_8UC1:
@@ -263,22 +267,28 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
         CV_Assert(!"Unsupported source type");
     }
-    cudaSafeCall( cudaThreadSynchronize() );
+    if (stream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
 }
-void cv::gpu::absdiff(const GpuMat& src, const Scalar& s, GpuMat& dst)
+void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& s)
 {
-    CV_Assert(src.type() == CV_32FC1);
+    CV_Assert(src1.type() == CV_32FC1);
-    dst.create( src.size(), src.type() );
+    dst.create( src1.size(), src1.type() );
     NppiSize sz;
-    sz.width = src.cols;
+    sz.width = src1.cols;
-    sz.height = src.rows;
+    sz.height = src1.rows;
-    nppSafeCall( nppiAbsDiffC_32f_C1R(src.ptr<Npp32f>(), src.step, dst.ptr<Npp32f>(), dst.step, sz, (Npp32f)s[0]) );
-    cudaSafeCall( cudaThreadSynchronize() );
+    cudaStream_t stream = StreamAccessor::getStream(s);
+    NppStreamHandler h(stream);
+    nppSafeCall( nppiAbsDiffC_32f_C1R(src1.ptr<Npp32f>(), src1.step, dst.ptr<Npp32f>(), dst.step, sz, (Npp32f)src2[0]) );
+    if (stream == 0)
+        cudaSafeCall( cudaDeviceSynchronize() );
 }
@@ -287,11 +297,11 @@ void cv::gpu::absdiff(const GpuMat& src, const Scalar& s, GpuMat& dst)
 namespace cv { namespace gpu { namespace mathfunc
 {
-    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
+    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
-    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
+    void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
 }}}
-void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop)
+void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& s)
 {
     CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
@@ -305,34 +315,42 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
     sz.width = src1.cols;
     sz.height = src1.rows;
+    cudaStream_t stream = StreamAccessor::getStream(s);
     if (src1.type() == CV_8UC4)
     {
         if (cmpop != CMP_NE)
         {
+            NppStreamHandler h(stream);
             nppSafeCall( nppiCompare_8u_C4R(src1.ptr<Npp8u>(), src1.step,
                 src2.ptr<Npp8u>(), src2.step,
                 dst.ptr<Npp8u>(), dst.step, sz, nppCmpOp[cmpop]) );
-            cudaSafeCall( cudaThreadSynchronize() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
         else
         {
-            mathfunc::compare_ne_8uc4(src1, src2, dst);
+            mathfunc::compare_ne_8uc4(src1, src2, dst, stream);
         }
     }
     else
     {
         if (cmpop != CMP_NE)
        {
+            NppStreamHandler h(stream);
             nppSafeCall( nppiCompare_32f_C1R(src1.ptr<Npp32f>(), src1.step,
                 src2.ptr<Npp32f>(), src2.step,
                 dst.ptr<Npp8u>(), dst.step, sz, nppCmpOp[cmpop]) );
-            cudaSafeCall( cudaThreadSynchronize() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
         else
         {
-            mathfunc::compare_ne_32f(src1, src2, dst);
+            mathfunc::compare_ne_32f(src1, src2, dst, stream);
         }
     }
 }
@@ -383,16 +401,7 @@ namespace
 }
-void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask)
-{
-    if (mask.empty())
-        ::bitwiseNotCaller(src, dst, 0);
-    else
-        ::bitwiseNotCaller(src, dst, mask, 0);
-}
-void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, const Stream& stream)
+void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream)
 {
     if (mask.empty())
         ::bitwiseNotCaller(src, dst, StreamAccessor::getStream(stream));
@@ -519,16 +528,7 @@ namespace
 }
-void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask)
-{
-    if (mask.empty())
-        ::bitwiseOrCaller(src1, src2, dst, 0);
-    else
-        ::bitwiseOrCaller(src1, src2, dst, mask, 0);
-}
-void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream)
+void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
 {
     if (mask.empty())
         ::bitwiseOrCaller(src1, src2, dst, StreamAccessor::getStream(stream));
@@ -537,16 +537,7 @@ void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, co
 }
-void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask)
-{
-    if (mask.empty())
-        ::bitwiseAndCaller(src1, src2, dst, 0);
-    else
-        ::bitwiseAndCaller(src1, src2, dst, mask, 0);
-}
-void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream)
+void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
 {
     if (mask.empty())
         ::bitwiseAndCaller(src1, src2, dst, StreamAccessor::getStream(stream));
@@ -555,16 +546,7 @@ void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
 }
-void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask)
-{
-    if (mask.empty())
-        ::bitwiseXorCaller(src1, src2, dst, 0);
-    else
-        ::bitwiseXorCaller(src1, src2, dst, mask, 0);
-}
-void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream)
+void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
 {
     if (mask.empty())
         ::bitwiseXorCaller(src1, src2, dst, StreamAccessor::getStream(stream));
@@ -624,22 +606,7 @@ namespace
     }
 }
-void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
-{
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-    CV_Assert((src1.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>,
-        min_caller<float>, min_caller<double>
-    };
-    funcs[src1.depth()](src1, src2, dst, 0);
-}
-void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
+void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
     CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
     CV_Assert((src1.depth() != CV_64F) ||
@@ -653,22 +620,7 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str
     };
     funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
 }
-void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
-{
-    CV_Assert((src1.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>,
-        min_caller<float>, min_caller<double>
-    };
-    funcs[src1.depth()](src1, src2, dst, 0);
-}
-void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
+void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
 {
     CV_Assert((src1.depth() != CV_64F) ||
         (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
@@ -682,22 +634,7 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st
     funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
 }
-void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
-{
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-    CV_Assert((src1.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>,
-        max_caller<float>, max_caller<double>
-    };
-    funcs[src1.depth()](src1, src2, dst, 0);
-}
-void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
+void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
 {
     CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
     CV_Assert((src1.depth() != CV_64F) ||
@@ -712,21 +649,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str
     funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
 }
-void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
-{
-    CV_Assert((src1.depth() != CV_64F) ||
-        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
-    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>,
-        max_caller<float>, max_caller<double>
-    };
-    funcs[src1.depth()](src1, src2, dst, 0);
-}
-void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
+void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
 {
     CV_Assert((src1.depth() != CV_64F) ||
         (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
@@ -760,10 +683,14 @@ namespace
     }
 }
-double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type)
+double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& s)
 {
+    cudaStream_t stream = StreamAccessor::getStream(s);
     if (src.type() == CV_32FC1 && type == THRESH_TRUNC)
     {
+        NppStreamHandler h(stream);
         dst.create(src.size(), src.type());
         NppiSize sz;
@@ -773,7 +700,8 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
         nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), src.step,
             dst.ptr<Npp32f>(), dst.step, sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );
-        cudaSafeCall( cudaThreadSynchronize() );
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
     else
     {
@@ -801,37 +729,10 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
             maxVal = cvRound(maxVal);
         }
-        callers[src.depth()](src, dst, thresh, maxVal, type, 0);
+        callers[src.depth()](src, dst, thresh, maxVal, type, stream);
     }
     return thresh;
 }
-double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, const Stream& stream)
-{
-    typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type,
-        cudaStream_t stream);
-    static const caller_t callers[] =
-    {
-        threshold_caller<unsigned char>, threshold_caller<signed char>,
-        threshold_caller<unsigned short>, threshold_caller<short>,
-        threshold_caller<int>, threshold_caller<float>, threshold_caller<double>
-    };
-    CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
-    CV_Assert(type <= THRESH_TOZERO_INV);
-    dst.create(src.size(), src.type());
-    if (src.depth() != CV_32F)
-    {
-        thresh = cvFloor(thresh);
-        maxVal = cvRound(maxVal);
-    }
-    callers[src.depth()](src, dst, thresh, maxVal, type, StreamAccessor::getStream(stream));
-    return thresh;
-}
 #endif
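Throughout this file the per-call NppStreamHandler guard is what lets NPP primitives run on the caller's stream: NPP routes its internal kernel launches through a globally set stream, so the guard points NPP at the desired stream for the duration of one call. A hedged sketch of what such a guard can look like, built on the public nppSetStream/nppGetStream API (the commit's actual helper lives in the module's NPP headers and may differ in detail):

    #include <npp.h>

    struct NppStreamGuard
    {
        explicit NppStreamGuard(cudaStream_t stream)
        {
            oldStream = nppGetStream();   // remember whatever NPP was using
            nppSetStream(stream);         // route subsequent NPP calls to 'stream'
        }
        ~NppStreamGuard()
        {
            nppSetStream(oldStream);      // restore on scope exit
        }
        cudaStream_t oldStream;
    };

Because the guard is scoped, only the nppSafeCall( nppi... ) work between its construction and destruction is redirected, and the trailing "if (stream == 0) cudaDeviceSynchronize()" preserves the old blocking semantics for default-stream callers.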

View File

@@ -66,16 +66,16 @@ Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, doubl
 Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
-void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point) { throw_nogpu(); }
+void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point, Stream&) { throw_nogpu(); }
-void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); }
+void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int, Stream&) { throw_nogpu(); }
-void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); }
+void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int, Stream&) { throw_nogpu(); }
-void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); }
+void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int, Stream&) { throw_nogpu(); }
-void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point) { throw_nogpu(); }
+void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, Stream&) { throw_nogpu(); }
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_nogpu(); }
+void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int, Stream&) { throw_nogpu(); }
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_nogpu(); }
+void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int, Stream&) { throw_nogpu(); }
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_nogpu(); }
+void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int, Stream&) { throw_nogpu(); }
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_nogpu(); }
+void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int, Stream&) { throw_nogpu(); }
-void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double) { throw_nogpu(); }
+void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double, Stream&) { throw_nogpu(); }
 #else
@@ -137,21 +137,25 @@ namespace
             filter2D(filter2D_), srcType(srcType_), dstType(dstType_)
         {}
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))
+        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
         {
             CV_Assert(src.type() == srcType);
             Size src_size = src.size();
             dst.create(src_size, dstType);
-            dst = Scalar(0.0);
+            if (stream)
+                stream.enqueueMemSet(dst, Scalar::all(0.0));
+            else
+                dst.setTo(Scalar::all(0.0));
             normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
             GpuMat srcROI = src(roi);
             GpuMat dstROI = dst(roi);
-            (*filter2D)(srcROI, dstROI);
+            (*filter2D)(srcROI, dstROI, stream);
         }
         Ptr<BaseFilter_GPU> filter2D;
@@ -181,16 +185,25 @@ namespace
             anchor = Point(rowFilter->anchor, columnFilter->anchor);
         }
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))
+        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
         {
             CV_Assert(src.type() == srcType);
             Size src_size = src.size();
             dst.create(src_size, dstType);
-            dst = Scalar(0.0);
             dstBuf.create(src_size, bufType);
+            if (stream)
+            {
+                stream.enqueueMemSet(dst, Scalar::all(0));
+                stream.enqueueMemSet(dstBuf, Scalar::all(0));
+            }
+            else
+            {
+                dst = Scalar(0.0);
                 dstBuf = Scalar(0.0);
+            }
             normalizeROI(roi, ksize, anchor, src_size);
@@ -198,8 +211,8 @@ namespace
             GpuMat dstROI = dst(roi);
             GpuMat dstBufROI = dstBuf(roi);
-            (*rowFilter)(srcROI, dstBufROI);
-            (*columnFilter)(dstBufROI, dstROI);
+            (*rowFilter)(srcROI, dstBufROI, stream);
+            (*columnFilter)(dstBufROI, dstROI, stream);
         }
         Ptr<BaseRowFilter_GPU> rowFilter;
@@ -229,15 +242,20 @@ namespace
     public:
         NppRowSumFilter(int ksize_, int anchor_) : BaseRowFilter_GPU(ksize_, anchor_) {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst)
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
         {
             NppiSize sz;
             sz.width = src.cols;
             sz.height = src.rows;
+            cudaStream_t stream = StreamAccessor::getStream(s);
+            NppStreamHandler h(stream);
             nppSafeCall( nppiSumWindowRow_8u32f_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp32f>(), dst.step, sz, ksize, anchor) );
-            cudaSafeCall( cudaThreadSynchronize() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
 }
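The `Stream& s = Stream::Null()` default argument on these operator() overloads is the pattern the whole filters file adopts: existing synchronous call sites compile unchanged, while stream-aware callers pass their own Stream and skip the device-wide sync. A caller-side sketch under the signatures in this diff (assuming anchor and stream keep their declared defaults in the public header):

    #include "opencv2/gpu/gpu.hpp"

    void blurTwoWays(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::Stream& stream)
    {
        // Old-style blocking call: the implicit Stream::Null() makes the
        // implementation synchronize before returning.
        cv::gpu::boxFilter(src, dst, -1, cv::Size(5, 5));

        // Stream-aware call: the box filter is queued on 'stream'.
        cv::gpu::boxFilter(src, dst, -1, cv::Size(5, 5), cv::Point(-1, -1), stream);
        stream.waitForCompletion();
    }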
@@ -258,15 +276,20 @@ namespace
     public:
         NppColumnSumFilter(int ksize_, int anchor_) : BaseColumnFilter_GPU(ksize_, anchor_) {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst)
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
         {
             NppiSize sz;
             sz.width = src.cols;
             sz.height = src.rows;
+            cudaStream_t stream = StreamAccessor::getStream(s);
+            NppStreamHandler h(stream);
             nppSafeCall( nppiSumWindowColumn_8u32f_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp32f>(), dst.step, sz, ksize, anchor) );
-            cudaSafeCall( cudaThreadSynchronize() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
     };
 }
@@ -293,7 +316,7 @@ namespace
     public:
         NPPBoxFilter(const Size& ksize_, const Point& anchor_, nppFilterBox_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst)
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
         {
             NppiSize sz;
             sz.width = src.cols;
@@ -305,9 +328,14 @@ namespace
             oAnchor.x = anchor.x;
             oAnchor.y = anchor.y;
+            cudaStream_t stream = StreamAccessor::getStream(s);
+            NppStreamHandler h(stream);
             nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, oKernelSize, oAnchor) );
-            cudaSafeCall( cudaThreadSynchronize() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
         nppFilterBox_t func;
@@ -331,7 +359,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int srcType, int dstType, con
     return createFilter2D_GPU(boxFilter, srcType, dstType);
 }
-void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor)
+void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor, Stream& stream)
 {
     int sdepth = src.depth(), cn = src.channels();
     if( ddepth < 0 )
@@ -340,7 +368,7 @@ void cv::gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize,
     dst.create(src.size(), CV_MAKETYPE(ddepth, cn));
     Ptr<FilterEngine_GPU> f = createBoxFilter_GPU(src.type(), dst.type(), ksize, anchor);
-    f->apply(src, dst);
+    f->apply(src, dst, Rect(0,0,-1,-1), stream);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -356,7 +384,7 @@ namespace
         NPPMorphFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, nppMorfFilter_t func_) :
             BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst)
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
         {
             NppiSize sz;
             sz.width = src.cols;
@@ -368,9 +396,14 @@ namespace
             oAnchor.x = anchor.x;
             oAnchor.y = anchor.y;
+            cudaStream_t stream = StreamAccessor::getStream(s);
+            NppStreamHandler h(stream);
             nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, kernel.ptr<Npp8u>(), oKernelSize, oAnchor) );
-            cudaSafeCall( cudaThreadSynchronize() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
         GpuMat kernel;
@@ -404,16 +437,16 @@ namespace
         MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type, int iters_) :
             Filter2DEngine_GPU(filter2D_, type, type), iters(iters_) {}
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))
+        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
         {
             if (iters > 1)
                 morfBuf.create(src.size(), src.type());
-            Filter2DEngine_GPU::apply(src, dst);
+            Filter2DEngine_GPU::apply(src, dst, roi, stream);
             for(int i = 1; i < iters; ++i)
             {
                 dst.swap(morfBuf);
-                Filter2DEngine_GPU::apply(morfBuf, dst);
+                Filter2DEngine_GPU::apply(morfBuf, dst, roi, stream);
             }
         }
@@ -435,7 +468,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, cons
 namespace
 {
-    void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations)
+    void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations, Stream& stream)
     {
         Mat kernel;
         Size ksize = _kernel.data ? _kernel.size() : Size(3, 3);
@@ -444,6 +477,9 @@ namespace
         if (iterations == 0 || _kernel.rows * _kernel.cols == 1)
         {
+            if (stream)
+                stream.enqueueCopy(src, dst);
+            else
                 src.copyTo(dst);
             return;
         }
@@ -468,49 +504,49 @@ namespace
         Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations);
-        f->apply(src, dst);
+        f->apply(src, dst, Rect(0,0,-1,-1), stream);
     }
 }
-void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
+void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations, Stream& stream)
 {
-    morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations);
+    morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations, stream);
 }
-void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
+void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations, Stream& stream)
 {
-    morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations);
+    morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations, stream);
 }
-void cv::gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations)
+void cv::gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations, Stream& stream)
 {
     GpuMat temp;
     switch( op )
     {
-    case MORPH_ERODE: erode( src, dst, kernel, anchor, iterations); break;
+    case MORPH_ERODE: erode( src, dst, kernel, anchor, iterations, stream); break;
-    case MORPH_DILATE: dilate( src, dst, kernel, anchor, iterations); break;
+    case MORPH_DILATE: dilate( src, dst, kernel, anchor, iterations, stream); break;
     case MORPH_OPEN:
-        erode( src, temp, kernel, anchor, iterations);
+        erode( src, temp, kernel, anchor, iterations, stream);
-        dilate( temp, dst, kernel, anchor, iterations);
+        dilate( temp, dst, kernel, anchor, iterations, stream);
         break;
     case CV_MOP_CLOSE:
-        dilate( src, temp, kernel, anchor, iterations);
+        dilate( src, temp, kernel, anchor, iterations, stream);
-        erode( temp, dst, kernel, anchor, iterations);
+        erode( temp, dst, kernel, anchor, iterations, stream);
         break;
     case CV_MOP_GRADIENT:
-        erode( src, temp, kernel, anchor, iterations);
+        erode( src, temp, kernel, anchor, iterations, stream);
-        dilate( src, dst, kernel, anchor, iterations);
+        dilate( src, dst, kernel, anchor, iterations, stream);
-        subtract(dst, temp, dst);
+        subtract(dst, temp, dst, stream);
         break;
     case CV_MOP_TOPHAT:
-        erode( src, dst, kernel, anchor, iterations);
+        erode( src, dst, kernel, anchor, iterations, stream);
-        dilate( dst, temp, kernel, anchor, iterations);
+        dilate( dst, temp, kernel, anchor, iterations, stream);
-        subtract(src, temp, dst);
+        subtract(src, temp, dst, stream);
         break;
     case CV_MOP_BLACKHAT:
-        dilate( src, dst, kernel, anchor, iterations);
+        dilate( src, dst, kernel, anchor, iterations, stream);
-        erode( dst, temp, kernel, anchor, iterations);
+        erode( dst, temp, kernel, anchor, iterations, stream);
-        subtract(temp, src, dst);
+        subtract(temp, src, dst, stream);
         break;
     default:
         CV_Error( CV_StsBadArg, "unknown morphological operation" );
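Since erode, dilate and subtract all take the same Stream here, every compound operation in this switch is queued end to end on one stream with no intermediate host syncs. A usage sketch (function name is illustrative):

    #include "opencv2/gpu/gpu.hpp"
    #include "opencv2/imgproc/imgproc.hpp"

    void morphGradientAsync(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst)
    {
        cv::gpu::Stream stream;
        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
        cv::gpu::morphologyEx(src, dst, cv::MORPH_GRADIENT, kernel,
                              cv::Point(-1, -1), 1, stream);   // erode + dilate + subtract, all queued
        stream.waitForCompletion();                            // single sync at the end
    }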
@@ -531,7 +567,7 @@ namespace
         NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) :
             BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst)
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
         {
             NppiSize sz;
             sz.width = src.cols;
@@ -543,10 +579,15 @@ namespace
             oAnchor.x = anchor.x;
             oAnchor.y = anchor.y;
+            cudaStream_t stream = StreamAccessor::getStream(s);
+            NppStreamHandler h(stream);
             nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz,
                 kernel.ptr<Npp32s>(), oKernelSize, oAnchor, nDivisor) );
-            cudaSafeCall( cudaThreadSynchronize() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
         GpuMat kernel;
@@ -578,7 +619,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType,
     return createFilter2D_GPU(linearFilter, srcType, dstType);
 }
-void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor)
+void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor, Stream& stream)
 {
     if( ddepth < 0 )
         ddepth = src.depth();
@@ -586,7 +627,7 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
     dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
     Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst.type(), kernel, anchor);
-    f->apply(src, dst);
+    f->apply(src, dst, Rect(0, 0, -1, -1), stream);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -595,10 +636,10 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
 namespace cv { namespace gpu { namespace filters
 {
     template <typename T, typename D>
-    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
     template <typename T, typename D>
-    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
 }}}
 namespace
@@ -606,7 +647,7 @@ namespace
     typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI,
         const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);
-    typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type);
+    typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
     class NppLinearRowFilter : public BaseRowFilter_GPU
     {
@@ -614,15 +655,20 @@ namespace
         NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
             BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst)
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
         {
             NppiSize sz;
             sz.width = src.cols;
             sz.height = src.rows;
+            cudaStream_t stream = StreamAccessor::getStream(s);
+            NppStreamHandler h(stream);
             nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
-            cudaSafeCall( cudaThreadSynchronize() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
         GpuMat kernel;
@@ -636,9 +682,9 @@ namespace
         GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) :
             BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst)
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
         {
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type);
+            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, StreamAccessor::getStream(s));
         }
         Mat kernel;
@@ -720,15 +766,20 @@ namespace
         NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
             BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst)
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
         {
             NppiSize sz;
             sz.width = src.cols;
             sz.height = src.rows;
+            cudaStream_t stream = StreamAccessor::getStream(s);
+            NppStreamHandler h(stream);
             nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
-            cudaSafeCall( cudaThreadSynchronize() );
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
         }
         GpuMat kernel;
@@ -742,9 +793,9 @@ namespace
         GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) :
             BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
-        virtual void operator()(const GpuMat& src, GpuMat& dst)
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
         {
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type);
+            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, StreamAccessor::getStream(s));
         }
         Mat kernel;
@@ -834,7 +885,8 @@ Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int
     return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType);
 }
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, int rowBorderType, int columnBorderType)
+void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, int rowBorderType, int columnBorderType,
+    Stream& stream)
 {
     if( ddepth < 0 )
         ddepth = src.depth();
@@ -842,7 +894,7 @@ void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat&
     dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
     Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
+    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -855,7 +907,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, i
     return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
 }
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType)
+void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream)
 {
     Mat kx, ky;
     getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
@@ -870,10 +922,10 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
         ky *= scale;
     }
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
+    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType, stream);
 }
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType)
+void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType, Stream& stream)
 {
     Mat kx, ky;
     getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);
@@ -888,10 +940,10 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
         ky *= scale;
     }
-    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
+    sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType, stream);
 }
-void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale)
+void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, Stream& stream)
 {
     CV_Assert(ksize == 1 || ksize == 3);
@@ -904,7 +956,7 @@ void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, d
     if (scale != 1)
         kernel *= scale;
-    filter2D(src, dst, ddepth, kernel, Point(-1,-1));
+    filter2D(src, dst, ddepth, kernel, Point(-1,-1), stream);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -938,7 +990,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, do
     return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
 }
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
+void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream)
 {
     if (ksize.width == 1 && ksize.height == 1)
     {
@@ -949,7 +1001,7 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double si
     dst.create(src.size(), src.type());
     Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
+    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -965,7 +1017,7 @@ namespace
public: public:
NPPRankFilter(const Size& ksize_, const Point& anchor_, nppFilterRank_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {} NPPRankFilter(const Size& ksize_, const Point& anchor_, nppFilterRank_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}
virtual void operator()(const GpuMat& src, GpuMat& dst) virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
{ {
NppiSize sz; NppiSize sz;
sz.width = src.cols; sz.width = src.cols;
@@ -977,9 +1029,14 @@ namespace
oAnchor.x = anchor.x; oAnchor.x = anchor.x;
oAnchor.y = anchor.y; oAnchor.y = anchor.y;
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, oKernelSize, oAnchor) ); nppSafeCall( func(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, oKernelSize, oAnchor) );
cudaSafeCall( cudaThreadSynchronize() ); if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
} }
nppFilterRank_t func; nppFilterRank_t func;
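All of the filter hunks above apply the same CUDA 4.0 idiom: pull the raw cudaStream_t out of the public Stream handle, bind NPP to it for the duration of the call, and block only when the caller passed the default stream. A minimal sketch of that idiom, assuming the module's usual headers and that NppStreamHandler is an RAII helper that sets the NPP stream on construction and restores the previous one on destruction; nppiCopy_8u_C1R merely stands in for any NPP primitive:

// Sketch of the stream-aware NPP call pattern used throughout this commit.
void nppCopyOnStream(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::Stream& s)
{
    dst.create(src.size(), src.type());

    cudaStream_t stream = cv::gpu::StreamAccessor::getStream(s);
    NppStreamHandler h(stream); // subsequent NPP calls run on 'stream'

    NppiSize sz;
    sz.width = src.cols;
    sz.height = src.rows;
    nppSafeCall( nppiCopy_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz) );

    // Blocking semantics are kept only for the default (null) stream;
    // with an explicit stream the call returns immediately.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}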
View File
@@ -44,11 +44,11 @@
#if !defined (HAVE_CUDA)
-void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
-void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf)
+void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
{
Size src_size = terminals.size();
CV_Assert(terminals.type() == CV_32S);
@@ -73,17 +73,17 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans
if ((size_t)bufsz > buf.cols * buf.rows * buf.elemSize())
buf.create(1, bufsz, CV_8U);
+cudaStream_t stream = StreamAccessor::getStream(s);
+NppStreamHandler h(stream);
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
terminals.step, leftTransp.step, sznpp, labels.ptr<Npp8u>(), labels.step, buf.ptr<Npp8u>()) );
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
#endif /* !defined (HAVE_CUDA) */
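With the stream threaded through, the graph cut can be queued alongside other work and synchronized once at the end. A hypothetical usage sketch, assuming the CV_32S terminal and edge matrices are prepared as the asserts above require:

cv::gpu::Stream stream;
cv::gpu::GpuMat terminals, leftTransp, rightTransp, top, bottom; // CV_32S, filled by the caller
cv::gpu::GpuMat labels, buf; // buf is grown on demand, so reuse it across calls

cv::gpu::graphcut(terminals, leftTransp, rightTransp, top, bottom, labels, buf, stream);
// ... enqueue further work on the same stream ...
stream.waitForCompletion(); // block only when labels are needed on the host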
View File
@@ -50,30 +50,24 @@ using namespace cv::gpu;
void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&){ throw_nogpu(); }
void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); }
void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); }
-void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
-void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, const Stream&) { throw_nogpu(); }
-void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&) { throw_nogpu(); }
-void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, const Stream&) { throw_nogpu(); }
-void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int) { throw_nogpu(); }
-void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, const Scalar&) { throw_nogpu(); }
-void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int) { throw_nogpu(); }
-void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int) { throw_nogpu(); }
-void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int) { throw_nogpu(); }
-void cv::gpu::integral(const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::integral(const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&) { throw_nogpu(); }
+void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
+void cv::gpu::reprojectImageTo3D(const GpuMat&, GpuMat&, const Mat&, Stream&) { throw_nogpu(); }
+void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); }
+void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, const Scalar&, Stream&) { throw_nogpu(); }
+void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int, Stream&) { throw_nogpu(); }
+void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int, Stream&) { throw_nogpu(); }
+void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int, Stream&) { throw_nogpu(); }
+void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::integral(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::columnSum(const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&) { throw_nogpu(); }
+void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&, Stream&) { throw_nogpu(); }
-//void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }
-//void cv::gpu::Canny(const GpuMat&, GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }
-//void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }
-//void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }
void cv::gpu::evenLevels(GpuMat&, int, int, int) { throw_nogpu(); }
-void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }
+void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_nogpu(); }
-void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*) { throw_nogpu(); }
+void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*, Stream&) { throw_nogpu(); }
-void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }
+void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*) { throw_nogpu(); }
+void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, Stream&) { throw_nogpu(); }
void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); }
void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }
void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool) { throw_nogpu(); }
@@ -203,14 +197,7 @@ namespace
const drawColorDisp_caller_t drawColorDisp_callers[] = {drawColorDisp_caller<unsigned char>, 0, 0, drawColorDisp_caller<short>, 0, 0, 0, 0};
}
-void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp)
-{
-CV_Assert(src.type() == CV_8U || src.type() == CV_16S);
-drawColorDisp_callers[src.type()](src, dst, ndisp, 0);
-}
-void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, const Stream& stream)
+void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& stream)
{
CV_Assert(src.type() == CV_8U || src.type() == CV_16S);
@@ -234,14 +221,7 @@ namespace
const reprojectImageTo3D_caller_t reprojectImageTo3D_callers[] = {reprojectImageTo3D_caller<unsigned char>, 0, 0, reprojectImageTo3D_caller<short>, 0, 0, 0, 0};
}
-void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q)
-{
-CV_Assert((disp.type() == CV_8U || disp.type() == CV_16S) && Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4);
-reprojectImageTo3D_callers[disp.type()](disp, xyzw, Q, 0);
-}
-void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const Stream& stream)
+void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, Stream& stream)
{
CV_Assert((disp.type() == CV_8U || disp.type() == CV_16S) && Q.type() == CV_32F && Q.rows == 4 && Q.cols == 4);
@@ -251,7 +231,7 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q,
////////////////////////////////////////////////////////////////////////
// resize
-void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation)
+void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR/*, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS*/};
@@ -284,6 +264,10 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
dstsz.width = dst.cols;
dstsz.height = dst.rows;
+cudaStream_t stream = StreamAccessor::getStream(s);
+NppStreamHandler h(stream);
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiResize_8u_C1R(src.ptr<Npp8u>(), srcsz, src.step, srcrect,
@@ -295,13 +279,14 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
dst.ptr<Npp8u>(), dst.step, dstsz, fx, fy, npp_inter[interpolation]) );
}
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
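resize stays on the NPP backend, so only nearest-neighbour and linear interpolation are wired up (the npp_inter table comments out the rest). A usage sketch of the new asynchronous form:

cv::gpu::GpuMat src; // CV_8UC1 or CV_8UC4, per the branches above
cv::gpu::GpuMat dst;
cv::gpu::Stream stream;

cv::gpu::resize(src, dst, cv::Size(), 0.5, 0.5, cv::INTER_LINEAR, stream);
stream.waitForCompletion(); // or keep queueing work on 'stream'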
////////////////////////////////////////////////////////////////////////
// copyMakeBorder
-void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value)
+void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, const Scalar& value, Stream& s)
{
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1);
@@ -314,6 +299,10 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
dstsz.width = dst.cols;
dstsz.height = dst.rows;
+cudaStream_t stream = StreamAccessor::getStream(s);
+NppStreamHandler h(stream);
switch (src.type())
{
case CV_8UC1:
@@ -349,7 +338,8 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
CV_Assert(!"Unsupported source type");
}
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
@@ -372,7 +362,7 @@ namespace
void nppWarpCaller(const GpuMat& src, GpuMat& dst, double coeffs[][3], const Size& dsize, int flags,
npp_warp_8u_t npp_warp_8u[][2], npp_warp_16u_t npp_warp_16u[][2],
-npp_warp_32s_t npp_warp_32s[][2], npp_warp_32f_t npp_warp_32f[][2])
+npp_warp_32s_t npp_warp_32s[][2], npp_warp_32f_t npp_warp_32f[][2], cudaStream_t stream)
{
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
@@ -397,6 +387,8 @@ namespace
int warpInd = (flags & WARP_INVERSE_MAP) >> 4;
+NppStreamHandler h(stream);
switch (src.depth())
{
case CV_8U:
@@ -419,11 +411,12 @@ namespace
CV_Assert(!"Unsupported source type");
}
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
}
-void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags)
+void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, Stream& s)
{
static npp_warp_8u_t npp_warpAffine_8u[][2] =
{
@@ -464,10 +457,10 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
Mat coeffsMat(2, 3, CV_64F, (void*)coeffs);
M.convertTo(coeffsMat, coeffsMat.type());
-nppWarpCaller(src, dst, coeffs, dsize, flags, npp_warpAffine_8u, npp_warpAffine_16u, npp_warpAffine_32s, npp_warpAffine_32f);
+nppWarpCaller(src, dst, coeffs, dsize, flags, npp_warpAffine_8u, npp_warpAffine_16u, npp_warpAffine_32s, npp_warpAffine_32f, StreamAccessor::getStream(s));
}
-void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags)
+void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, Stream& s)
{
static npp_warp_8u_t npp_warpPerspective_8u[][2] =
{
@@ -508,13 +501,13 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
M.convertTo(coeffsMat, coeffsMat.type());
-nppWarpCaller(src, dst, coeffs, dsize, flags, npp_warpPerspective_8u, npp_warpPerspective_16u, npp_warpPerspective_32s, npp_warpPerspective_32f);
+nppWarpCaller(src, dst, coeffs, dsize, flags, npp_warpPerspective_8u, npp_warpPerspective_16u, npp_warpPerspective_32s, npp_warpPerspective_32f, StreamAccessor::getStream(s));
}
////////////////////////////////////////////////////////////////////////
// rotate
-void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation)
+void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& s)
{
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
@@ -535,6 +528,10 @@ void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, d
dstroi.height = dst.rows;
dstroi.width = dst.cols;
+cudaStream_t stream = StreamAccessor::getStream(s);
+NppStreamHandler h(stream);
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiRotate_8u_C1R(src.ptr<Npp8u>(), srcsz, src.step, srcroi,
@@ -546,19 +543,20 @@ void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, d
dst.ptr<Npp8u>(), dst.step, dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
}
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// integral
-void cv::gpu::integral(const GpuMat& src, GpuMat& sum)
+void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
{
GpuMat buffer;
-integralBuffered(src, sum, buffer);
+integralBuffered(src, sum, buffer, s);
}
-void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer)
+void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
{
CV_Assert(src.type() == CV_8UC1);
@@ -575,35 +573,45 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer)
nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
+cudaStream_t stream = StreamAccessor::getStream(s);
+NppStStreamHandler h(stream);
nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), src.step,
sum.ptr<Ncv32u>(), sum.step, roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
-void cv::gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum)
+void cv::gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& s)
{
CV_Assert(src.type() == CV_8UC1);
-int w = src.cols + 1, h = src.rows + 1;
-sum.create(h, w, CV_32S);
-sqsum.create(h, w, CV_32F);
+int width = src.cols + 1, height = src.rows + 1;
+sum.create(height, width, CV_32S);
+sqsum.create(height, width, CV_32F);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
-nppSafeCall( nppiSqrIntegral_8u32s32f_C1R(const_cast<Npp8u*>(src.ptr<Npp8u>()), src.step, sum.ptr<Npp32s>(),
-sum.step, sqsum.ptr<Npp32f>(), sqsum.step, sz, 0, 0.0f, h) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaStream_t stream = StreamAccessor::getStream(s);
+NppStreamHandler h(stream);
+nppSafeCall( nppiSqrIntegral_8u32s32f_C1R(const_cast<Npp8u*>(src.ptr<Npp8u>()), src.step, sum.ptr<Npp32s>(),
+sum.step, sqsum.ptr<Npp32f>(), sqsum.step, sz, 0, 0.0f, height) );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////////
// sqrIntegral
-void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum)
+void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
{
CV_Assert(src.type() == CV_8U);
@@ -618,11 +626,16 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum)
nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
GpuMat buf(1, bufSize, CV_8U);
+cudaStream_t stream = StreamAccessor::getStream(s);
+NppStStreamHandler h(stream);
sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
nppSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), src.step,
sqsum.ptr<Ncv64u>(0), sqsum.step, roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////////
@@ -641,7 +654,7 @@ void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst)
imgproc::columnSum_32F(src, dst);
}
-void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect)
+void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
{
CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_32FC1);
@@ -657,69 +670,17 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
nppRect.x = rect.x;
nppRect.y = rect.y;
+cudaStream_t stream = StreamAccessor::getStream(s);
+NppStreamHandler h(stream);
nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), src.step, sqr.ptr<Npp32f>(), sqr.step,
dst.ptr<Npp32f>(), dst.step, sz, nppRect) );
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
-////////////////////////////////////////////////////////////////////////
-// Canny
-//void cv::gpu::Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize)
-//{
-//    CV_Assert(!"disabled until fix crash");
-//
-//    GpuMat srcDx, srcDy;
-//
-//    Sobel(image, srcDx, CV_32F, 1, 0, apertureSize);
-//    Sobel(image, srcDy, CV_32F, 0, 1, apertureSize);
-//
-//    GpuMat buf;
-//
-//    Canny(srcDx, srcDy, edges, buf, threshold1, threshold2, apertureSize);
-//}
-//
-//void cv::gpu::Canny(const GpuMat& image, GpuMat& edges, GpuMat& buf, double threshold1, double threshold2, int apertureSize)
-//{
-//    CV_Assert(!"disabled until fix crash");
-//
-//    GpuMat srcDx, srcDy;
-//
-//    Sobel(image, srcDx, CV_32F, 1, 0, apertureSize);
-//    Sobel(image, srcDy, CV_32F, 0, 1, apertureSize);
-//
-//    Canny(srcDx, srcDy, edges, buf, threshold1, threshold2, apertureSize);
-//}
-//
-//void cv::gpu::Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, double threshold1, double threshold2, int apertureSize)
-//{
-//    CV_Assert(!"disabled until fix crash");
-//
-//    GpuMat buf;
-//    Canny(srcDx, srcDy, edges, buf, threshold1, threshold2, apertureSize);
-//}
-//
-//void cv::gpu::Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, GpuMat& buf, double threshold1, double threshold2, int apertureSize)
-//{
-//    CV_Assert(!"disabled until fix crash");
-//    CV_Assert(srcDx.type() == CV_32FC1 && srcDy.type() == CV_32FC1 && srcDx.size() == srcDy.size());
-//
-//    edges.create(srcDx.size(), CV_8UC1);
-//
-//    NppiSize sz;
-//    sz.height = srcDx.rows;
-//    sz.width = srcDx.cols;
-//
-//    int bufsz;
-//    nppSafeCall( nppiCannyGetBufferSize(sz, &bufsz) );
-//    ensureSizeIsEnough(1, bufsz, CV_8UC1, buf);
-//
-//    nppSafeCall( nppiCanny_32f8u_C1R(srcDx.ptr<Npp32f>(), srcDx.step, srcDy.ptr<Npp32f>(), srcDy.step,
-//        edges.ptr<Npp8u>(), edges.step, sz, (Npp32f)threshold1, (Npp32f)threshold2, buf.ptr<Npp8u>()) );
-//
-//    cudaSafeCall( cudaThreadSynchronize() );
-//}
////////////////////////////////////////////////////////////////////////
// Histogram
@@ -755,7 +716,7 @@ namespace
{
typedef typename NppHistogramEvenFuncC1<SDEPTH>::src_t src_t;
-static void hist(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel)
+static void hist(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
{
int levels = histSize + 1;
hist.create(1, histSize, CV_32S);
@@ -769,10 +730,14 @@ namespace
get_buf_size(sz, levels, &buf_size);
buffer.create(1, buf_size, CV_8U);
+NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<src_t>(), src.step, sz, hist.ptr<Npp32s>(), levels,
lowerLevel, upperLevel, buffer.ptr<Npp8u>()) );
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppHistogramEvenFuncC4<SDEPTH>::func_ptr func, get_buf_size_c4_t get_buf_size>
@@ -780,7 +745,7 @@ namespace
{
typedef typename NppHistogramEvenFuncC4<SDEPTH>::src_t src_t;
-static void hist(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4])
+static void hist(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream)
{
int levels[] = {histSize[0] + 1, histSize[1] + 1, histSize[2] + 1, histSize[3] + 1};
hist[0].create(1, histSize[0], CV_32S);
@@ -799,9 +764,13 @@ namespace
get_buf_size(sz, levels, &buf_size);
buffer.create(1, buf_size, CV_8U);
+NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<src_t>(), src.step, sz, pHist, levels, lowerLevel, upperLevel, buffer.ptr<Npp8u>()) );
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -849,7 +818,7 @@ namespace
typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};
-static void hist(const GpuMat& src, GpuMat& hist, const GpuMat& levels)
+static void hist(const GpuMat& src, GpuMat& hist, const GpuMat& levels, cudaStream_t stream)
{
CV_Assert(levels.type() == LEVEL_TYPE_CODE && levels.rows == 1);
@@ -864,9 +833,13 @@ namespace
get_buf_size(sz, levels.cols, &buf_size);
buffer.create(1, buf_size, CV_8U);
+NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<src_t>(), src.step, sz, hist.ptr<Npp32s>(), levels.ptr<level_t>(), levels.cols, buffer.ptr<Npp8u>()) );
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppHistogramRangeFuncC4<SDEPTH>::func_ptr func, get_buf_size_c4_t get_buf_size>
@@ -876,7 +849,7 @@ namespace
typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};
-static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4])
+static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], cudaStream_t stream)
{
CV_Assert(levels[0].type() == LEVEL_TYPE_CODE && levels[0].rows == 1);
CV_Assert(levels[1].type() == LEVEL_TYPE_CODE && levels[1].rows == 1);
@@ -901,9 +874,13 @@ namespace
get_buf_size(sz, nLevels, &buf_size);
buffer.create(1, buf_size, CV_8U);
+NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<src_t>(), src.step, sz, pHist, pLevels, nLevels, buffer.ptr<Npp8u>()) );
-cudaSafeCall( cudaThreadSynchronize() );
+if (stream == 0)
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
}
@@ -915,11 +892,11 @@ void cv::gpu::evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperL
levels.upload(host_levels);
}
-void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel)
+void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream)
{
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 );
-typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, int levels, int lowerLevel, int upperLevel);
+typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, int levels, int lowerLevel, int upperLevel, cudaStream_t stream);
static const hist_t hist_callers[] =
{
NppHistogramEvenC1<CV_8U , nppiHistogramEven_8u_C1R , nppiHistogramEvenGetBufferSize_8u_C1R >::hist,
@@ -928,14 +905,14 @@ void cv::gpu::histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerL
NppHistogramEvenC1<CV_16S, nppiHistogramEven_16s_C1R, nppiHistogramEvenGetBufferSize_16s_C1R>::hist
};
-hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel);
+hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
}
-void cv::gpu::histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4])
+void cv::gpu::histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
{
CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 );
-typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], int levels[4], int lowerLevel[4], int upperLevel[4]);
+typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], int levels[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream);
static const hist_t hist_callers[] =
{
NppHistogramEvenC4<CV_8U , nppiHistogramEven_8u_C4R , nppiHistogramEvenGetBufferSize_8u_C4R >::hist,
@@ -944,14 +921,14 @@ void cv::gpu::histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int l
NppHistogramEvenC4<CV_16S, nppiHistogramEven_16s_C4R, nppiHistogramEvenGetBufferSize_16s_C4R>::hist
};
-hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel);
+hist_callers[src.depth()](src, hist, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
}
-void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels)
+void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream)
{
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1);
-typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, const GpuMat& levels);
+typedef void (*hist_t)(const GpuMat& src, GpuMat& hist, const GpuMat& levels, cudaStream_t stream);
static const hist_t hist_callers[] =
{
NppHistogramRangeC1<CV_8U , nppiHistogramRange_8u_C1R , nppiHistogramRangeGetBufferSize_8u_C1R >::hist,
@@ -962,14 +939,14 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels)
NppHistogramRangeC1<CV_32F, nppiHistogramRange_32f_C1R, nppiHistogramRangeGetBufferSize_32f_C1R>::hist
};
-hist_callers[src.depth()](src, hist, levels);
+hist_callers[src.depth()](src, hist, levels, StreamAccessor::getStream(stream));
}
-void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4])
+void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream)
{
CV_Assert(src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 || src.type() == CV_32FC4);
-typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4]);
+typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], cudaStream_t stream);
static const hist_t hist_callers[] =
{
NppHistogramRangeC4<CV_8U , nppiHistogramRange_8u_C4R , nppiHistogramRangeGetBufferSize_8u_C4R >::hist,
@@ -980,7 +957,7 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4
NppHistogramRangeC4<CV_32F, nppiHistogramRange_32f_C4R, nppiHistogramRangeGetBufferSize_32f_C4R>::hist
};
-hist_callers[src.depth()](src, hist, levels);
+hist_callers[src.depth()](src, hist, levels, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
View File
@@ -128,7 +128,7 @@ void cv::gpu::GpuMat::copyTo( GpuMat& m ) const
CV_DbgAssert(!this->empty());
m.create(size(), type());
cudaSafeCall( cudaMemcpy2D(m.data, m.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
void cv::gpu::GpuMat::copyTo( GpuMat& mat, const GpuMat& mask ) const
@@ -179,7 +179,7 @@ namespace
sz.height = src.rows;
nppSafeCall( func(src.ptr<src_t>(), src.step, dst.ptr<dst_t>(), dst.step, sz) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
@@ -193,7 +193,7 @@ namespace
sz.height = src.rows;
nppSafeCall( func(src.ptr<Npp32f>(), src.step, dst.ptr<dst_t>(), dst.step, sz, NPP_RND_NEAR) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -349,7 +349,7 @@ namespace
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), src.step, sz) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
@@ -364,7 +364,7 @@ namespace
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), src.step, sz) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -400,7 +400,7 @@ namespace
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), src.step, sz, mask.ptr<Npp8u>(), mask.step) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
@@ -415,7 +415,7 @@ namespace
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), src.step, sz, mask.ptr<Npp8u>(), mask.step) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
@@ -463,8 +463,8 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
{
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<uchar>,kernelSet<uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
{kernelSet<schar>,kernelSet<schar>,kernelSet<schar>,kernelSet<schar>},
-{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,kernelSet<ushort>,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
-{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,kernelSet<short>,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
+{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,NppSet<CV_16U, 2, nppiSet_16u_C2R>::set,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
+{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,NppSet<CV_16S, 2, nppiSet_16s_C2R>::set,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
{kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>},
View File
@@ -114,24 +114,14 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
sz.width = src.cols;
sz.height = src.rows;
-#if NPP_VERSION_MAJOR >= 4
DeviceBuffer dbuf(2);
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, dbuf, (double*)dbuf + 1) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
double* ptrs[2] = {mean.val, stddev.val};
dbuf.download(ptrs);
-#else
-nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, mean.val, stddev.val) );
-cudaSafeCall( cudaThreadSynchronize() );
-#endif
}
@@ -185,24 +175,14 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
double retVal;
-#if NPP_VERSION_MAJOR >= 4
DeviceBuffer dbuf;
nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, sz, dbuf) );
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
dbuf.download(&retVal);
-#else
-nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, sz, &retVal) );
-cudaSafeCall( cudaThreadSynchronize() );
-#endif
return retVal;
}
View File
@@ -332,7 +332,7 @@ namespace cv
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
@@ -349,7 +349,7 @@ namespace cv
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<> struct TransformDispatcher<true>
@@ -370,7 +370,7 @@ namespace cv
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
@@ -389,7 +389,7 @@ namespace cv
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
-cudaSafeCall( cudaThreadSynchronize() );
+cudaSafeCall( cudaDeviceSynchronize() );
}
};
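The same conditional synchronization follows raw kernel launches in these dispatchers: check the launch result immediately, then block only on the default stream, since work on a user stream is ordered by the stream itself. A sketch of that epilogue around a hypothetical kernel named transformSimple:

template <typename T, typename D, typename UnOp>
void launchTransform(const cv::gpu::DevMem2D_<T>& src, const cv::gpu::DevMem2D_<D>& dst, UnOp op, cudaStream_t stream)
{
    dim3 threads(16, 16);
    dim3 grid((src.cols + threads.x - 1) / threads.x, (src.rows + threads.y - 1) / threads.y);

    transformSimple<<<grid, threads, 0, stream>>>(src, dst, op); // hypothetical kernel

    cudaSafeCall( cudaGetLastError() );          // catch launch-configuration errors now
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() ); // keep default-stream calls blocking
}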
View File
@@ -77,8 +77,8 @@
#include "nvidia/NPP_staging/NPP_staging.hpp"
#include "nvidia/NCVHaarObjectDetection.hpp"
-#define CUDART_MINIMUM_REQUIRED_VERSION 3020
-#define NPP_MINIMUM_REQUIRED_VERSION 3216
+#define CUDART_MINIMUM_REQUIRED_VERSION 4000
+#define NPP_MINIMUM_REQUIRED_VERSION 4000
#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
#error "Insufficient Cuda Runtime library version, please update it."
View File
@@ -46,14 +46,10 @@ using namespace std;
#if !defined (HAVE_CUDA)
-void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
-void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
-void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/) { throw_nogpu(); }
-void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/) { throw_nogpu(); }
-void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
-void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, Stream& /*stream*/) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
@@ -148,51 +144,25 @@ namespace cv { namespace gpu { namespace split_merge
}}}
-void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst)
-{
-split_merge::merge(src, n, dst, 0);
-}
-void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst)
-{
-split_merge::merge(&src[0], src.size(), dst, 0);
-}
-void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, const Stream& stream)
+void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream)
{
split_merge::merge(src, n, dst, StreamAccessor::getStream(stream));
}
-void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst, const Stream& stream)
+void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream)
{
split_merge::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
}
-void cv::gpu::split(const GpuMat& src, GpuMat* dst)
-{
-split_merge::split(src, dst, 0);
-}
-void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst)
-{
-dst.resize(src.channels());
-if(src.channels() > 0)
-split_merge::split(src, &dst[0], 0);
-}
-void cv::gpu::split(const GpuMat& src, GpuMat* dst, const Stream& stream)
+void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream)
{
split_merge::split(src, dst, StreamAccessor::getStream(stream));
}
-void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst, const Stream& stream)
+void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream)
{
dst.resize(src.channels());
if(src.channels() > 0)
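Collapsing the six overloads into four that take a Stream (presumably defaulted to Stream::Null() in the header, as elsewhere in this commit) keeps channel-wise pipelines on a single stream. A hypothetical round trip:

cv::gpu::GpuMat src; // e.g. CV_8UC3
cv::gpu::GpuMat dst;
std::vector<cv::gpu::GpuMat> channels;
cv::gpu::Stream stream;

cv::gpu::split(src, channels, stream);  // asynchronous on a non-null stream
// ... per-channel processing enqueued on the same stream ...
cv::gpu::merge(channels, dst, stream);
stream.waitForCompletion();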
View File
@@ -51,8 +51,7 @@ cv::gpu::StereoBM_GPU::StereoBM_GPU() { throw_nogpu(); }
cv::gpu::StereoBM_GPU::StereoBM_GPU(int, int, int) { throw_nogpu(); }
bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable() { throw_nogpu(); return false; }
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
+void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
@@ -124,13 +123,7 @@ static void stereo_bm_gpu_operator ( GpuMat& minSSD, GpuMat& leBuf, GpuMat& ri
bm::postfilter_textureness(le_for_bm, winSize, avergeTexThreshold, disparity, stream);
}
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity)
-{
-::stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, 0);
-}
-void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const Stream& stream)
+void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream)
{
::stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, StreamAccessor::getStream(stream));
}
View File
@@ -53,10 +53,8 @@ void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int, int, int&,
cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int, int, int, int) { throw_nogpu(); } cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int, int, int, int) { throw_nogpu(); }
cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int, int, int, float, float, float, float, int) { throw_nogpu(); } cv::gpu::StereoBeliefPropagation::StereoBeliefPropagation(int, int, int, float, float, float, float, int) { throw_nogpu(); }
void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */ #else /* !defined (HAVE_CUDA) */
@@ -133,7 +131,7 @@ namespace
CV_Assert(rthis.msg_type == CV_32F || (1 << (rthis.levels - 1)) * scale * rthis.max_data_term < numeric_limits<short>::max()); CV_Assert(rthis.msg_type == CV_32F || (1 << (rthis.levels - 1)) * scale * rthis.max_data_term < numeric_limits<short>::max());
} }
void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, cudaStream_t stream) void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
{ {
typedef void (*comp_data_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream); typedef void (*comp_data_t)(const DevMem2D& left, const DevMem2D& right, const DevMem2D& data, cudaStream_t stream);
static const comp_data_t comp_data_callers[2][5] = static const comp_data_t comp_data_callers[2][5] =
@@ -154,16 +152,16 @@ namespace
const int min_image_dim_size = 2; const int min_image_dim_size = 2;
CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size); CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size);
init(); init(stream);
datas[0].create(rows * rthis.ndisp, cols, rthis.msg_type); datas[0].create(rows * rthis.ndisp, cols, rthis.msg_type);
comp_data_callers[rthis.msg_type == CV_32F][left.channels()](left, right, datas[0], stream); comp_data_callers[rthis.msg_type == CV_32F][left.channels()](left, right, datas[0], StreamAccessor::getStream(stream));
calcBP(disp, stream); calcBP(disp, stream);
} }
void operator()(const GpuMat& data, GpuMat& disp, cudaStream_t stream) void operator()(const GpuMat& data, GpuMat& disp, Stream& stream)
{ {
CV_Assert((data.type() == rthis.msg_type) && (data.rows % rthis.ndisp == 0)); CV_Assert((data.type() == rthis.msg_type) && (data.rows % rthis.ndisp == 0));
@@ -176,14 +174,14 @@ namespace
const int min_image_dim_size = 2; const int min_image_dim_size = 2;
CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size); CV_Assert(min(lowest_cols, lowest_rows) > min_image_dim_size);
init(); init(stream);
datas[0] = data; datas[0] = data;
calcBP(disp, stream); calcBP(disp, stream);
} }
private: private:
void init() void init(Stream& stream)
{ {
u.create(rows * rthis.ndisp, cols, rthis.msg_type); u.create(rows * rthis.ndisp, cols, rthis.msg_type);
d.create(rows * rthis.ndisp, cols, rthis.msg_type); d.create(rows * rthis.ndisp, cols, rthis.msg_type);
@@ -193,10 +191,20 @@ namespace
if (rthis.levels & 1) if (rthis.levels & 1)
{ {
//can clear less area //can clear less area
u = zero; if (stream)
d = zero; {
l = zero; stream.enqueueMemSet(u, zero);
r = zero; stream.enqueueMemSet(d, zero);
stream.enqueueMemSet(l, zero);
stream.enqueueMemSet(r, zero);
}
else
{
u.setTo(zero);
d.setTo(zero);
l.setTo(zero);
r.setTo(zero);
}
} }
if (rthis.levels > 1) if (rthis.levels > 1)
@@ -211,10 +219,20 @@ namespace
if ((rthis.levels & 1) == 0) if ((rthis.levels & 1) == 0)
{ {
u2 = zero; if (stream)
d2 = zero; {
l2 = zero; stream.enqueueMemSet(u2, zero);
r2 = zero; stream.enqueueMemSet(d2, zero);
stream.enqueueMemSet(l2, zero);
stream.enqueueMemSet(r2, zero);
}
else
{
u2.setTo(zero);
d2.setTo(zero);
l2.setTo(zero);
r2.setTo(zero);
}
} }
} }
@@ -229,7 +247,7 @@ namespace
rows_all[0] = rows; rows_all[0] = rows;
} }
void calcBP(GpuMat& disp, cudaStream_t stream) void calcBP(GpuMat& disp, Stream& stream)
{ {
using namespace cv::gpu::bp; using namespace cv::gpu::bp;
@@ -259,6 +277,8 @@ namespace
const int funcIdx = rthis.msg_type == CV_32F; const int funcIdx = rthis.msg_type == CV_32F;
cudaStream_t cudaStream = StreamAccessor::getStream(stream);
for (int i = 1; i < rthis.levels; ++i) for (int i = 1; i < rthis.levels; ++i)
{ {
cols_all[i] = (cols_all[i-1] + 1) / 2; cols_all[i] = (cols_all[i-1] + 1) / 2;
@@ -266,7 +286,7 @@ namespace
datas[i].create(rows_all[i] * rthis.ndisp, cols_all[i], rthis.msg_type); datas[i].create(rows_all[i] * rthis.ndisp, cols_all[i], rthis.msg_type);
data_step_down_callers[funcIdx](cols_all[i], rows_all[i], rows_all[i-1], datas[i-1], datas[i], stream); data_step_down_callers[funcIdx](cols_all[i], rows_all[i], rows_all[i-1], datas[i-1], datas[i], cudaStream);
} }
DevMem2D mus[] = {u, u2}; DevMem2D mus[] = {u, u2};
@@ -280,9 +300,9 @@ namespace
         {
             // for lower level we have already computed messages by setting to zero
             if (i != rthis.levels - 1)
-                level_up_messages_callers[funcIdx](mem_idx, cols_all[i], rows_all[i], rows_all[i+1], mus, mds, mls, mrs, stream);
+                level_up_messages_callers[funcIdx](mem_idx, cols_all[i], rows_all[i], rows_all[i+1], mus, mds, mls, mrs, cudaStream);

-            calc_all_iterations_callers[funcIdx](cols_all[i], rows_all[i], rthis.iters, mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas[i], stream);
+            calc_all_iterations_callers[funcIdx](cols_all[i], rows_all[i], rthis.iters, mus[mem_idx], mds[mem_idx], mls[mem_idx], mrs[mem_idx], datas[i], cudaStream);

             mem_idx = (mem_idx + 1) & 1;
         }
@@ -291,13 +311,22 @@ namespace
             disp.create(rows, cols, CV_16S);

         out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
-        out = zero;

-        output_callers[funcIdx](u, d, l, r, datas.front(), out, stream);
+        if (stream)
+            stream.enqueueMemSet(out, zero);
+        else
+            out.setTo(zero);
+
+        output_callers[funcIdx](u, d, l, r, datas.front(), out, cudaStream);

         if (disp.type() != CV_16S)
-            out.convertTo(disp, disp.type());
+        {
+            if (stream)
+                stream.enqueueConvert(out, disp, disp.type());
+            else
+                out.convertTo(disp, disp.type());
+        }
     }

     StereoBeliefPropagation& rthis;
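enqueueConvert is the asynchronous counterpart of GpuMat::convertTo, so the result conversion stays on the user's stream instead of forcing a synchronization. A typical caller then drains the queue explicitly; enqueueDownload and waitForCompletion are existing Stream members in this API, and the variable names below are illustrative:

// Sketch: retrieve the disparity without blocking the enqueue path.
cv::gpu::Stream stream;
cv::gpu::GpuMat d_disp;   // filled by the stereo matcher on 'stream'
cv::Mat h_disp;
stream.enqueueDownload(d_disp, h_disp); // async device-to-host copy
stream.waitForCompletion();             // block once, when the result is needed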
@@ -323,28 +352,16 @@ namespace
     };
 }

-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp)
-{
-    ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
-    impl(left, right, disp, 0);
-}
-
 void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
 {
     ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
-    impl(left, right, disp, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& data, GpuMat& disp)
-{
-    ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
-    impl(data, disp, 0);
+    impl(left, right, disp, stream);
 }

 void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& data, GpuMat& disp, Stream& stream)
 {
     ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
-    impl(data, disp, StreamAccessor::getStream(stream));
+    impl(data, disp, stream);
 }

 #endif /* !defined (HAVE_CUDA) */
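With the plain overloads deleted, every caller goes through the Stream& variant; synchronous use passes the null stream (in this API revision presumably via a Stream::Null() default argument in the header, which is not shown in these hunks). A hedged usage sketch:

// Usage sketch for the reworked API (parameter values are illustrative).
cv::gpu::GpuMat left, right, disp;      // device images, same size/type
cv::gpu::StereoBeliefPropagation bp;    // default ndisp/iters/levels

cv::gpu::Stream stream;
bp(left, right, disp, stream);          // fully asynchronous
stream.waitForCompletion();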
View File
@@ -53,7 +53,6 @@ void cv::gpu::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int&, i
 cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, int) { throw_nogpu(); }
 cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, float, float, float, float, int, int) { throw_nogpu(); }
-void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */
@@ -136,7 +135,7 @@ cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, in
 template<class T>
 static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2], GpuMat l[2], GpuMat r[2],
                           GpuMat disp_selected_pyr[2], GpuMat& data_cost, GpuMat& data_cost_selected,
-                          GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, cudaStream_t stream)
+                          GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
 {
     CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
                  && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());
@@ -145,6 +144,8 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
     const Scalar zero = Scalar::all(0);

+    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
+
     ////////////////////////////////////////////////////////////////////////////////////////////
     // Init
@@ -210,18 +211,36 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
     csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight,
                          rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);

-    l[0] = zero;
-    d[0] = zero;
-    r[0] = zero;
-    u[0] = zero;
-
-    l[1] = zero;
-    d[1] = zero;
-    r[1] = zero;
-    u[1] = zero;
-
-    data_cost = zero;
-    data_cost_selected = zero;
+    if (stream)
+    {
+        stream.enqueueMemSet(l[0], zero);
+        stream.enqueueMemSet(d[0], zero);
+        stream.enqueueMemSet(r[0], zero);
+        stream.enqueueMemSet(u[0], zero);
+
+        stream.enqueueMemSet(l[1], zero);
+        stream.enqueueMemSet(d[1], zero);
+        stream.enqueueMemSet(r[1], zero);
+        stream.enqueueMemSet(u[1], zero);
+
+        stream.enqueueMemSet(data_cost, zero);
+        stream.enqueueMemSet(data_cost_selected, zero);
+    }
+    else
+    {
+        l[0].setTo(zero);
+        d[0].setTo(zero);
+        r[0].setTo(zero);
+        u[0].setTo(zero);
+
+        l[1].setTo(zero);
+        d[1].setTo(zero);
+        r[1].setTo(zero);
+        u[1].setTo(zero);
+
+        data_cost.setTo(zero);
+        data_cost_selected.setTo(zero);
+    }

     int cur_idx = 0;
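The same stream-or-blocking branch reappears here across ten buffers; with a helper like the hypothetical setZero sketched earlier, the whole block would collapse to a loop:

// Equivalent zero-initialization via the hypothetical setZero helper.
cv::gpu::GpuMat* bufs[] = { &l[0], &d[0], &r[0], &u[0],
                            &l[1], &d[1], &r[1], &u[1],
                            &data_cost, &data_cost_selected };
for (size_t k = 0; k < sizeof(bufs) / sizeof(bufs[0]); ++k)
    setZero(*bufs[k], stream);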
@@ -230,12 +249,12 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
         if (i == levels - 1)
         {
             csbp::init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
-                step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, stream);
+                step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream);
         }
         else
         {
             csbp::compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1],
-                left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), stream);
+                left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream);

             int new_idx = (cur_idx + 1) & 1;
@@ -243,46 +262,49 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
                 u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                 disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
                 data_cost_selected.ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], rows_pyr[i],
-                cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], stream);
+                cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);

             cur_idx = new_idx;
         }

         csbp::calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                                   data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[i],
-                                  rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, stream);
+                                  rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);
     }

     if (disp.empty())
         disp.create(rows, cols, CV_16S);

     out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
-    out = zero;
+
+    if (stream)
+        stream.enqueueMemSet(out, zero);
+    else
+        out.setTo(zero);

     csbp::compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
-                       data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[0], out, nr_plane_pyr[0], stream);
+                       data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[0], out, nr_plane_pyr[0], cudaStream);

     if (disp.type() != CV_16S)
-        out.convertTo(disp, disp.type());
+    {
+        if (stream)
+            stream.enqueueConvert(out, disp, disp.type());
+        else
+            out.convertTo(disp, disp.type());
+    }
 }
 typedef void (*csbp_operator_t)(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2], GpuMat l[2], GpuMat r[2],
                                 GpuMat disp_selected_pyr[2], GpuMat& data_cost, GpuMat& data_cost_selected,
-                                GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, cudaStream_t stream);
+                                GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream);

 const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator<short>, 0, csbp_operator<float>, 0, 0};

-void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp)
-{
-    CV_Assert(msg_type == CV_32F || msg_type == CV_16S);
-    operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out, left, right, disp, 0);
-}
-
 void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
 {
     CV_Assert(msg_type == CV_32F || msg_type == CV_16S);
-    operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out, left, right, disp, StreamAccessor::getStream(stream));
+    operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out, left, right, disp, stream);
 }

 #endif /* !defined (HAVE_CUDA) */
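For completeness, a hedged usage sketch of the updated constant-space BP entry point (the ndisp value is illustrative):

// StereoConstantSpaceBP with an explicit stream.
cv::gpu::GpuMat left, right, disp;      // already uploaded to the device
cv::gpu::StereoConstantSpaceBP csbp(128 /* ndisp */);

cv::gpu::Stream stream;
csbp(left, right, disp, stream);        // queued on 'stream'
stream.waitForCompletion();             // synchronize before reading 'disp'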