Merge pull request #1748 from SpecLad:merge-2.4
This commit is contained in:
commit
8b19df3fec
1792
cmake/FindCUDA.cmake
Normal file
1792
cmake/FindCUDA.cmake
Normal file
File diff suppressed because it is too large
Load Diff
93
cmake/FindCUDA/make2cmake.cmake
Normal file
93
cmake/FindCUDA/make2cmake.cmake
Normal file
@ -0,0 +1,93 @@
|
||||
# James Bigler, NVIDIA Corp (nvidia.com - jbigler)
|
||||
# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
|
||||
#
|
||||
# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Copyright (c) 2007-2009
|
||||
# Scientific Computing and Imaging Institute, University of Utah
|
||||
#
|
||||
# This code is licensed under the MIT License. See the FindCUDA.cmake script
|
||||
# for the text of the license.
|
||||
|
||||
# The MIT License
|
||||
#
|
||||
# License for the specific language governing rights and limitations under
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included
|
||||
# in all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
# DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
|
||||
#######################################################################
|
||||
# This converts a file written in makefile syntax into one that can be included
|
||||
# by CMake.
|
||||
|
||||
# Script entry point (run with `cmake -D input_file=... -D output_file=... -P`).
#
#   input_file  - dependency file written in makefile syntax (e.g. by nvcc -M)
#   output_file - CMake file to generate; it sets CUDA_NVCC_DEPEND to the
#                 sorted, de-duplicated list of absolute dependency paths.

file(READ "${input_file}" depend_text)

# Define the results up front so the code after the if() block always operates
# on existing variables, even when the dependency file is empty.
set(dependency_list "")
set(cuda_nvcc_depend "")

# Test the variable by name.  Expanding the file contents directly into if()
# would let CMake split them on ';' or treat them as a variable name.
if(depend_text MATCHES ".+")

  # Remember, four backslashes is escaped to one backslash in the string.
  string(REGEX REPLACE "\\\\ " " " depend_text "${depend_text}")

  # This works for the nvcc -M generated dependency files: drop the
  # "target : " prefix, then turn every (optionally backslash-continued) line
  # break into a list separator.
  string(REGEX REPLACE "^.* : " "" depend_text "${depend_text}")
  string(REGEX REPLACE "[ \\\\]*\n" ";" depend_text "${depend_text}")

  foreach(file ${depend_text})

    # Strip the leading indentation of continuation lines.
    string(REGEX REPLACE "^ +" "" file "${file}")

    # OK, now if we had a UNC path, nvcc has a tendency to only output the first '/'
    # instead of '//'. Here we will test to see if the file exists, if it doesn't then
    # try to prepend another '/' to the path and test again. If it still fails remove the
    # path.
    if(NOT EXISTS "${file}")
      if(EXISTS "/${file}")
        set(file "/${file}")
      else()
        message(WARNING " Removing non-existent dependency file: ${file}")
        set(file "")
      endif()
    endif()

    # Make sure we still have a path before asking whether it is a directory:
    # if(NOT IS_DIRECTORY "") is TRUE, and resolving "" to an absolute path
    # would record the current directory as a dependency.
    if(NOT "${file}" STREQUAL "" AND NOT IS_DIRECTORY "${file}")
      # If softlinks start to matter, we should change this to REALPATH. For now we need
      # to flatten paths, because nvcc can generate stuff like /bin/../include instead of
      # just /include.
      get_filename_component(file_absolute "${file}" ABSOLUTE)
      list(APPEND dependency_list "${file_absolute}")
    endif()

  endforeach()

endif()

# Remove the duplicate entries and sort them.
list(REMOVE_DUPLICATES dependency_list)
list(SORT dependency_list)

# Emit one quoted path per line.
foreach(file ${dependency_list})
  set(cuda_nvcc_depend "${cuda_nvcc_depend} \"${file}\"\n")
endforeach()

file(WRITE "${output_file}" "# Generated by: make2cmake.cmake\nSET(CUDA_NVCC_DEPEND\n ${cuda_nvcc_depend})\n\n")
|
110
cmake/FindCUDA/parse_cubin.cmake
Normal file
110
cmake/FindCUDA/parse_cubin.cmake
Normal file
@ -0,0 +1,110 @@
|
||||
# James Bigler, NVIDIA Corp (nvidia.com - jbigler)
|
||||
# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
|
||||
#
|
||||
# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Copyright (c) 2007-2009
|
||||
# Scientific Computing and Imaging Institute, University of Utah
|
||||
#
|
||||
# This code is licensed under the MIT License. See the FindCUDA.cmake script
|
||||
# for the text of the license.
|
||||
|
||||
# The MIT License
|
||||
#
|
||||
# License for the specific language governing rights and limitations under
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included
|
||||
# in all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
# DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
|
||||
#######################################################################
|
||||
# Parses a .cubin file produced by nvcc and reports statistics about the file.
|
||||
|
||||
|
||||
# Script entry point (run with `cmake -D input_file=... -P`).
#
#   input_file - .cubin file to read.
#
# For every "code { }" block found in the file this prints the kernel name and
# the values of its reg, lmem and smem entries, followed by a blank line when
# the closing brace of the block is reached.

file(READ ${input_file} file_text)

# Test the variables by name (as done for `line` below) instead of expanding
# file contents into if(), where they could be split on ';' or be mistaken for
# a variable name.
if(file_text MATCHES ".+")

  # Remember, four backslashes is escaped to one backslash in the string.
  # NOTE(review): ${file_text} is expanded unquoted, so CMake splits it on ';'
  # before string() sees it -- confirm this escaping has the intended effect.
  string(REGEX REPLACE ";" "\\\\;" file_text ${file_text})
  # Make every "code" section its own list element.
  string(REGEX REPLACE "\ncode" ";code" file_text ${file_text})

  foreach(line ${file_text})

    # Only look at "code { }" blocks.
    if(line MATCHES "^code")

      # Break into individual lines.
      string(REGEX REPLACE "\n" ";" line ${line})

      foreach(entry ${line})

        # Extract kernel names.  [^g] rejects keys that merely end in "name"
        # after a 'g'.
        if(entry MATCHES "[^g]name = ([^ ]+)")
          string(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry "${entry}")
          message("Kernel: ${entry}")
        endif()

        # Registers
        if(entry MATCHES "reg([ ]+)=([ ]+)([^ ]+)")
          string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry "${entry}")
          message("Registers: ${entry}")
        endif()

        # Local memory
        if(entry MATCHES "lmem([ ]+)=([ ]+)([^ ]+)")
          string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry "${entry}")
          message("Local: ${entry}")
        endif()

        # Shared memory
        if(entry MATCHES "smem([ ]+)=([ ]+)([^ ]+)")
          string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry "${entry}")
          message("Shared: ${entry}")
        endif()

        # End of the block: separate it from the next report.
        if(entry MATCHES "^}")
          message("")
        endif()

      endforeach()

    endif()

  endforeach()

endif()
|
288
cmake/FindCUDA/run_nvcc.cmake
Normal file
288
cmake/FindCUDA/run_nvcc.cmake
Normal file
@ -0,0 +1,288 @@
|
||||
# James Bigler, NVIDIA Corp (nvidia.com - jbigler)
|
||||
#
|
||||
# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# This code is licensed under the MIT License. See the FindCUDA.cmake script
|
||||
# for the text of the license.
|
||||
|
||||
# The MIT License
|
||||
#
|
||||
# License for the specific language governing rights and limitations under
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included
|
||||
# in all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
# DEALINGS IN THE SOFTWARE.
|
||||
|
||||
|
||||
##########################################################################
|
||||
# This file runs the nvcc commands to produce the desired output file along with
|
||||
# the dependency file needed by CMake to compute dependencies. In addition the
|
||||
# file checks the output of each command and if the command fails it deletes the
|
||||
# output files.
|
||||
|
||||
# Input variables
|
||||
#
|
||||
# verbose:BOOL=<> OFF: Be as quiet as possible (default)
|
||||
# ON : Describe each step
|
||||
#
|
||||
# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or
|
||||
# RelWithDebInfo, but it should match one of the
|
||||
# entries in CUDA_HOST_FLAGS. This is the build
|
||||
# configuration used when compiling the code. If
|
||||
# blank or unspecified Debug is assumed as this is
|
||||
# what CMake does.
|
||||
#
|
||||
# generated_file:STRING=<> File to generate. This argument must be passed in.
|
||||
#
|
||||
# generated_cubin_file:STRING=<> File to generate. This argument must be passed
|
||||
# in if build_cubin is true.
|
||||
|
||||
if(NOT generated_file)
|
||||
message(FATAL_ERROR "You must specify generated_file on the command line")
|
||||
endif()
|
||||
|
||||
# Set these up as variables to make reading the generated file easier
|
||||
set(CMAKE_COMMAND "@CMAKE_COMMAND@") # path
|
||||
set(source_file "@source_file@") # path
|
||||
set(NVCC_generated_dependency_file "@NVCC_generated_dependency_file@") # path
|
||||
set(cmake_dependency_file "@cmake_dependency_file@") # path
|
||||
set(CUDA_make2cmake "@CUDA_make2cmake@") # path
|
||||
set(CUDA_parse_cubin "@CUDA_parse_cubin@") # path
|
||||
set(build_cubin @build_cubin@) # bool
|
||||
set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # path
|
||||
# We won't actually use these variables for now, but we need to set this, in
|
||||
# order to force this file to be run again if it changes.
|
||||
set(generated_file_path "@generated_file_path@") # path
|
||||
set(generated_file_internal "@generated_file@") # path
|
||||
set(generated_cubin_file_internal "@generated_cubin_file@") # path
|
||||
|
||||
set(CUDA_NVCC_EXECUTABLE "@CUDA_NVCC_EXECUTABLE@") # path
|
||||
set(CUDA_NVCC_FLAGS @CUDA_NVCC_FLAGS@ ;; @CUDA_WRAP_OPTION_NVCC_FLAGS@) # list
|
||||
@CUDA_NVCC_FLAGS_CONFIG@
|
||||
set(nvcc_flags @nvcc_flags@) # list
|
||||
set(CUDA_NVCC_INCLUDE_ARGS "@CUDA_NVCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly).
|
||||
set(format_flag "@format_flag@") # string
|
||||
|
||||
if(build_cubin AND NOT generated_cubin_file)
|
||||
message(FATAL_ERROR "You must specify generated_cubin_file on the command line")
|
||||
endif()
|
||||
|
||||
# This is the list of host compilation flags. Either C or CXX should already have
|
||||
# been chosen by FindCUDA.cmake.
|
||||
@CUDA_HOST_FLAGS@
|
||||
|
||||
# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler
|
||||
set(nvcc_host_compiler_flags "")
|
||||
# If we weren't given a build_configuration, use Debug.
|
||||
if(NOT build_configuration)
|
||||
set(build_configuration Debug)
|
||||
endif()
|
||||
string(TOUPPER "${build_configuration}" build_configuration)
|
||||
#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}")
|
||||
foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
|
||||
# Extra quotes are added around each flag to help nvcc parse out flags with spaces.
|
||||
set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"")
|
||||
endforeach()
|
||||
if (nvcc_host_compiler_flags)
|
||||
set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags})
|
||||
endif()
|
||||
#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"")
|
||||
# Add the build specific configuration flags
|
||||
list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
|
||||
|
||||
# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
|
||||
list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
|
||||
list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
|
||||
if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
|
||||
if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
|
||||
set(CCBIN -ccbin "${CCBIN}")
|
||||
else()
|
||||
set(CCBIN -ccbin "${CUDA_HOST_COMPILER}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# cuda_execute_process(<status> COMMAND <cmd> [<args>...])
#
# Runs <cmd> through execute_process().  When `verbose` is true the status
# message and a printable form of the command line are echoed first.
#
#   status  - message echoed before the command when verbose is true
#   command - must be the literal keyword COMMAND, mirroring the usual
#             execute_process() argument structure
#   ARGN    - the command to run, followed by its arguments
#
# Result:
#   CUDA_result - return value from running the command
#
# This is deliberately a macro and not a function, so that RESULT_VARIABLE and
# any other variables set while executing the process are still present in
# the caller afterwards.
macro(cuda_execute_process status command)
  set(_command ${command})
  if(_command STREQUAL "COMMAND")
    # Well-formed call; nothing to report.
  else()
    message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})")
  endif()

  if(verbose)
    # Announce the step.
    execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})

    # Assemble a printable command line.  Only quotes and spaces are handled;
    # anything else is left for the user to fix up before copying and pasting
    # the echoed text into a shell.
    unset(cuda_execute_process_string)
    foreach(arg ${ARGN})
      # Escape embedded quotes so they come through in the echo.
      string(REPLACE "\"" "\\\"" arg ${arg})
      if(NOT arg MATCHES " ")
        list(APPEND cuda_execute_process_string ${arg})
      else()
        # An argument containing spaces must be quoted to stay a single argument.
        list(APPEND cuda_execute_process_string "\"${arg}\"")
      endif()
    endforeach()

    # Echo the command
    execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string})
  endif()

  # Run the command
  execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result )
endmacro()
|
||||
|
||||
# Delete the target file
|
||||
cuda_execute_process(
|
||||
"Removing ${generated_file}"
|
||||
COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
|
||||
)
|
||||
|
||||
# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag
|
||||
# for dependency generation and hope for the best.
|
||||
set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
|
||||
set(CUDA_VERSION @CUDA_VERSION@)
|
||||
if(CUDA_VERSION VERSION_LESS "3.0")
|
||||
cmake_policy(PUSH)
|
||||
# CMake policy 0007 NEW states that empty list elements are not
|
||||
# ignored. I'm just setting it to avoid the warning that's printed.
|
||||
cmake_policy(SET CMP0007 NEW)
|
||||
# Note that this will remove all occurrences of -G.
|
||||
list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G")
|
||||
cmake_policy(POP)
|
||||
endif()
|
||||
|
||||
# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This
|
||||
# can cause incorrect dependencies when #including files based on this macro which is
|
||||
# defined in the generating passes of nvcc invocation. We will go ahead and manually
|
||||
# define this for now until a future version fixes this bug.
|
||||
set(CUDACC_DEFINE -D__CUDACC__)
|
||||
|
||||
# Generate the dependency file
|
||||
cuda_execute_process(
|
||||
"Generating dependency file: ${NVCC_generated_dependency_file}"
|
||||
COMMAND "${CUDA_NVCC_EXECUTABLE}"
|
||||
-M
|
||||
${CUDACC_DEFINE}
|
||||
"${source_file}"
|
||||
-o "${NVCC_generated_dependency_file}"
|
||||
${CCBIN}
|
||||
${nvcc_flags}
|
||||
${nvcc_host_compiler_flags}
|
||||
${depends_CUDA_NVCC_FLAGS}
|
||||
-DNVCC
|
||||
${CUDA_NVCC_INCLUDE_ARGS}
|
||||
)
|
||||
|
||||
if(CUDA_result)
|
||||
message(FATAL_ERROR "Error generating ${generated_file}")
|
||||
endif()
|
||||
|
||||
# Generate the cmake readable dependency file to a temp file. Don't put the
|
||||
# quotes just around the filenames for the input_file and output_file variables.
|
||||
# CMake will pass the quotes through and not be able to find the file.
|
||||
cuda_execute_process(
|
||||
"Generating temporary cmake readable file: ${cmake_dependency_file}.tmp"
|
||||
COMMAND "${CMAKE_COMMAND}"
|
||||
-D "input_file:FILEPATH=${NVCC_generated_dependency_file}"
|
||||
-D "output_file:FILEPATH=${cmake_dependency_file}.tmp"
|
||||
-P "${CUDA_make2cmake}"
|
||||
)
|
||||
|
||||
if(CUDA_result)
|
||||
message(FATAL_ERROR "Error generating ${generated_file}")
|
||||
endif()
|
||||
|
||||
# Copy the file if it is different
|
||||
cuda_execute_process(
|
||||
"Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}"
|
||||
COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}"
|
||||
)
|
||||
|
||||
if(CUDA_result)
|
||||
message(FATAL_ERROR "Error generating ${generated_file}")
|
||||
endif()
|
||||
|
||||
# Delete the temporary file
|
||||
cuda_execute_process(
|
||||
"Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}"
|
||||
COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}"
|
||||
)
|
||||
|
||||
if(CUDA_result)
|
||||
message(FATAL_ERROR "Error generating ${generated_file}")
|
||||
endif()
|
||||
|
||||
# Generate the code
|
||||
cuda_execute_process(
|
||||
"Generating ${generated_file}"
|
||||
COMMAND "${CUDA_NVCC_EXECUTABLE}"
|
||||
"${source_file}"
|
||||
${format_flag} -o "${generated_file}"
|
||||
${CCBIN}
|
||||
${nvcc_flags}
|
||||
${nvcc_host_compiler_flags}
|
||||
${CUDA_NVCC_FLAGS}
|
||||
-DNVCC
|
||||
${CUDA_NVCC_INCLUDE_ARGS}
|
||||
)
|
||||
|
||||
if(CUDA_result)
|
||||
# Since nvcc can sometimes leave half done files make sure that we delete the output file.
|
||||
cuda_execute_process(
|
||||
"Removing ${generated_file}"
|
||||
COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
|
||||
)
|
||||
message(FATAL_ERROR "Error generating file ${generated_file}")
|
||||
else()
|
||||
if(verbose)
|
||||
message("Generated ${generated_file} successfully.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Cubin resource report commands.
|
||||
if( build_cubin )
|
||||
# Run with -cubin to produce resource usage report.
|
||||
cuda_execute_process(
|
||||
"Generating ${generated_cubin_file}"
|
||||
COMMAND "${CUDA_NVCC_EXECUTABLE}"
|
||||
"${source_file}"
|
||||
${CUDA_NVCC_FLAGS}
|
||||
${nvcc_flags}
|
||||
${CCBIN}
|
||||
${nvcc_host_compiler_flags}
|
||||
-DNVCC
|
||||
-cubin
|
||||
-o "${generated_cubin_file}"
|
||||
${CUDA_NVCC_INCLUDE_ARGS}
|
||||
)
|
||||
|
||||
# Execute the parser script.
|
||||
cuda_execute_process(
|
||||
"Executing the parser script"
|
||||
COMMAND "${CMAKE_COMMAND}"
|
||||
-D "input_file:STRING=${generated_cubin_file}"
|
||||
-P "${CUDA_parse_cubin}"
|
||||
)
|
||||
|
||||
endif()
|
@ -8,8 +8,24 @@ if(CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Cl
|
||||
return()
|
||||
endif()
|
||||
|
||||
set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
|
||||
|
||||
foreach(var INCLUDE LIBRARY PROGRAM)
|
||||
set(__old_frpm_${var} "${CMAKE_FIND_ROOT_PATH_MODE_${var}}")
|
||||
endforeach()
|
||||
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
|
||||
|
||||
find_package(CUDA "${MIN_VER_CUDA}" QUIET)
|
||||
|
||||
foreach(var INCLUDE LIBRARY PROGRAM)
|
||||
set(CMAKE_FIND_ROOT_PATH_MODE_${var} "${__old_frpm_${var}}")
|
||||
endforeach()
|
||||
|
||||
list(REMOVE_AT CMAKE_MODULE_PATH 0)
|
||||
|
||||
if(CUDA_FOUND)
|
||||
set(HAVE_CUDA 1)
|
||||
|
||||
@ -21,47 +37,6 @@ if(CUDA_FOUND)
|
||||
set(HAVE_CUBLAS 1)
|
||||
endif()
|
||||
|
||||
if(${CUDA_VERSION} VERSION_LESS "5.5")
|
||||
find_cuda_helper_libs(npp)
|
||||
else()
|
||||
# hack for CUDA 5.5
|
||||
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm")
|
||||
unset(CUDA_TOOLKIT_INCLUDE CACHE)
|
||||
unset(CUDA_CUDART_LIBRARY CACHE)
|
||||
unset(CUDA_cublas_LIBRARY CACHE)
|
||||
unset(CUDA_cufft_LIBRARY CACHE)
|
||||
unset(CUDA_npp_LIBRARY CACHE)
|
||||
|
||||
if(SOFTFP)
|
||||
set(cuda_arm_path "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabi")
|
||||
else()
|
||||
set(cuda_arm_path "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
|
||||
endif()
|
||||
|
||||
set(CUDA_TOOLKIT_INCLUDE "${cuda_arm_path}/include" CACHE PATH "include path")
|
||||
set(CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
|
||||
|
||||
set(cuda_arm_library_path "${cuda_arm_path}/lib")
|
||||
|
||||
set(CUDA_CUDART_LIBRARY "${cuda_arm_library_path}/libcudart.so" CACHE FILEPATH "cudart library")
|
||||
set(CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY})
|
||||
set(CUDA_cublas_LIBRARY "${cuda_arm_library_path}/libcublas.so" CACHE FILEPATH "cublas library")
|
||||
set(CUDA_cufft_LIBRARY "${cuda_arm_library_path}/libcufft.so" CACHE FILEPATH "cufft library")
|
||||
set(CUDA_nppc_LIBRARY "${cuda_arm_library_path}/libnppc.so" CACHE FILEPATH "nppc library")
|
||||
set(CUDA_nppi_LIBRARY "${cuda_arm_library_path}/libnppi.so" CACHE FILEPATH "nppi library")
|
||||
set(CUDA_npps_LIBRARY "${cuda_arm_library_path}/libnpps.so" CACHE FILEPATH "npps library")
|
||||
set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}" CACHE STRING "npp library")
|
||||
else()
|
||||
unset(CUDA_npp_LIBRARY CACHE)
|
||||
|
||||
find_cuda_helper_libs(nppc)
|
||||
find_cuda_helper_libs(nppi)
|
||||
find_cuda_helper_libs(npps)
|
||||
|
||||
set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}" CACHE STRING "npp library")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_NVCUVID)
|
||||
find_cuda_helper_libs(nvcuvid)
|
||||
if(WIN32)
|
||||
@ -166,10 +141,6 @@ if(CUDA_FOUND)
|
||||
set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
|
||||
endforeach()
|
||||
|
||||
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm")
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --target-cpu-architecture=ARM")
|
||||
endif()
|
||||
|
||||
# These vars will be processed in other scripts
|
||||
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA})
|
||||
set(OpenCV_CUDA_CC "${NVCC_FLAGS_EXTRA}")
|
||||
|
@ -499,7 +499,7 @@ macro(ocv_glob_module_sources)
|
||||
source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
|
||||
|
||||
file(GLOB cl_kernels "src/opencl/*.cl")
|
||||
if(HAVE_OPENCL AND cl_kernels)
|
||||
if(HAVE_opencv_ocl AND cl_kernels)
|
||||
ocv_include_directories(${OPENCL_INCLUDE_DIRS})
|
||||
add_custom_command(
|
||||
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp"
|
||||
|
@ -48,10 +48,10 @@ The structure of package contents looks as follows:
|
||||
|
||||
::
|
||||
|
||||
OpenCV-2.4.6-android-sdk
|
||||
OpenCV-2.4.7-android-sdk
|
||||
|_ apk
|
||||
| |_ OpenCV_2.4.6_binary_pack_armv7a.apk
|
||||
| |_ OpenCV_2.4.6_Manager_2.9_XXX.apk
|
||||
| |_ OpenCV_2.4.7_binary_pack_armv7a.apk
|
||||
| |_ OpenCV_2.4.7_Manager_2.13_XXX.apk
|
||||
|
|
||||
|_ doc
|
||||
|_ samples
|
||||
@ -157,10 +157,10 @@ Get the OpenCV4Android SDK
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
unzip ~/Downloads/OpenCV-2.4.6-android-sdk.zip
|
||||
unzip ~/Downloads/OpenCV-2.4.7-android-sdk.zip
|
||||
|
||||
.. |opencv_android_bin_pack| replace:: :file:`OpenCV-2.4.6-android-sdk.zip`
|
||||
.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.6/OpenCV-2.4.6-android-sdk.zip/download
|
||||
.. |opencv_android_bin_pack| replace:: :file:`OpenCV-2.4.7-android-sdk.zip`
|
||||
.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.7/OpenCV-2.4.7-android-sdk.zip/download
|
||||
.. |opencv_android_bin_pack_url| replace:: |opencv_android_bin_pack|
|
||||
.. |seven_zip| replace:: 7-Zip
|
||||
.. _seven_zip: http://www.7-zip.org/
|
||||
@ -295,7 +295,7 @@ Well, running samples from Eclipse is very simple:
|
||||
.. code-block:: sh
|
||||
:linenos:
|
||||
|
||||
<Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.6_Manager_2.9_armv7a-neon.apk
|
||||
<Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.7_Manager_2.13_armv7a-neon.apk
|
||||
|
||||
.. note:: ``armeabi``, ``armv7a-neon``, ``arm7a-neon-android8``, ``mips`` and ``x86`` stand for
|
||||
platform targets:
|
||||
|
@ -55,14 +55,14 @@ Manager to access OpenCV libraries externally installed in the target system.
|
||||
:guilabel:`File -> Import -> Existing project in your workspace`.
|
||||
|
||||
Press :guilabel:`Browse` button and locate OpenCV4Android SDK
|
||||
(:file:`OpenCV-2.4.6-android-sdk/sdk`).
|
||||
(:file:`OpenCV-2.4.7-android-sdk/sdk`).
|
||||
|
||||
.. image:: images/eclipse_opencv_dependency0.png
|
||||
:alt: Add dependency from OpenCV library
|
||||
:align: center
|
||||
|
||||
#. In application project add a reference to the OpenCV Java SDK in
|
||||
:guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.6``.
|
||||
:guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.7``.
|
||||
|
||||
.. image:: images/eclipse_opencv_dependency1.png
|
||||
:alt: Add dependency from OpenCV library
|
||||
@ -128,27 +128,27 @@ described above.
|
||||
#. Add the OpenCV library project to your workspace the same way as for the async initialization
|
||||
above. Use menu :guilabel:`File -> Import -> Existing project in your workspace`,
|
||||
press :guilabel:`Browse` button and select OpenCV SDK path
|
||||
(:file:`OpenCV-2.4.6-android-sdk/sdk`).
|
||||
(:file:`OpenCV-2.4.7-android-sdk/sdk`).
|
||||
|
||||
.. image:: images/eclipse_opencv_dependency0.png
|
||||
:alt: Add dependency from OpenCV library
|
||||
:align: center
|
||||
|
||||
#. In the application project add a reference to the OpenCV4Android SDK in
|
||||
:guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.6``;
|
||||
:guilabel:`Project -> Properties -> Android -> Library -> Add` select ``OpenCV Library - 2.4.7``;
|
||||
|
||||
.. image:: images/eclipse_opencv_dependency1.png
|
||||
:alt: Add dependency from OpenCV library
|
||||
:align: center
|
||||
|
||||
#. If your application project **doesn't have a JNI part**, just copy the corresponding OpenCV
|
||||
native libs from :file:`<OpenCV-2.4.6-android-sdk>/sdk/native/libs/<target_arch>` to your
|
||||
native libs from :file:`<OpenCV-2.4.7-android-sdk>/sdk/native/libs/<target_arch>` to your
|
||||
project directory to folder :file:`libs/<target_arch>`.
|
||||
|
||||
In case of the application project **with a JNI part**, instead of manual libraries copying you
|
||||
need to modify your ``Android.mk`` file:
|
||||
add the following two code lines after the ``"include $(CLEAR_VARS)"`` and before
|
||||
``"include path_to_OpenCV-2.4.6-android-sdk/sdk/native/jni/OpenCV.mk"``
|
||||
``"include path_to_OpenCV-2.4.7-android-sdk/sdk/native/jni/OpenCV.mk"``
|
||||
|
||||
.. code-block:: make
|
||||
:linenos:
|
||||
@ -221,7 +221,7 @@ taken:
|
||||
|
||||
.. code-block:: make
|
||||
|
||||
include C:\Work\OpenCV4Android\OpenCV-2.4.6-android-sdk\sdk\native\jni\OpenCV.mk
|
||||
include C:\Work\OpenCV4Android\OpenCV-2.4.7-android-sdk\sdk\native\jni\OpenCV.mk
|
||||
|
||||
Should be inserted into the :file:`jni/Android.mk` file **after** this line:
|
||||
|
||||
|
@ -84,64 +84,63 @@ public:
|
||||
|
||||
for (size_t j = 0; j < n; ++j)
|
||||
{
|
||||
int tag = tags[j];
|
||||
stringstream s;
|
||||
s << tag;
|
||||
int tag = tags[j];
|
||||
stringstream s;
|
||||
s << tag;
|
||||
|
||||
const string filename = "output_"+s.str()+".avi";
|
||||
const string filename = "output_"+s.str()+".avi";
|
||||
|
||||
try
|
||||
{
|
||||
double fps = fps0;
|
||||
Size frame_s = Size(img_c, img_r);
|
||||
|
||||
if( tag == VideoWriter::fourcc('H', '2', '6', '1') )
|
||||
frame_s = Size(352, 288);
|
||||
else if( tag == VideoWriter::fourcc('H', '2', '6', '3') )
|
||||
frame_s = Size(704, 576);
|
||||
/*else if( tag == CV_FOURCC('M', 'J', 'P', 'G') ||
|
||||
tag == CV_FOURCC('j', 'p', 'e', 'g') )
|
||||
frame_s = Size(1920, 1080);*/
|
||||
|
||||
if( tag == VideoWriter::fourcc('M', 'P', 'E', 'G') )
|
||||
try
|
||||
{
|
||||
frame_s = Size(720, 576);
|
||||
fps = 25;
|
||||
}
|
||||
double fps = fps0;
|
||||
Size frame_s = Size(img_c, img_r);
|
||||
|
||||
VideoWriter writer(filename, tag, fps, frame_s);
|
||||
if( tag == VideoWriter::fourcc('H', '2', '6', '1') )
|
||||
frame_s = Size(352, 288);
|
||||
else if( tag == VideoWriter::fourcc('H', '2', '6', '3') )
|
||||
frame_s = Size(704, 576);
|
||||
/*else if( tag == CV_FOURCC('M', 'J', 'P', 'G') ||
|
||||
tag == CV_FOURCC('j', 'p', 'e', 'g') )
|
||||
frame_s = Size(1920, 1080);*/
|
||||
|
||||
if (writer.isOpened() == false)
|
||||
{
|
||||
ts->printf(ts->LOG, "\n\nFile name: %s\n", filename.c_str());
|
||||
ts->printf(ts->LOG, "Codec id: %d Codec tag: %c%c%c%c\n", j,
|
||||
tag & 255, (tag >> 8) & 255, (tag >> 16) & 255, (tag >> 24) & 255);
|
||||
ts->printf(ts->LOG, "Error: cannot create video file.");
|
||||
ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT);
|
||||
}
|
||||
else
|
||||
{
|
||||
Mat img(frame_s, CV_8UC3, Scalar::all(0));
|
||||
const int coeff = cvRound(min(frame_s.width, frame_s.height)/(fps0 * time_sec));
|
||||
|
||||
for (int i = 0 ; i < static_cast<int>(fps * time_sec); i++ )
|
||||
if( tag == VideoWriter::fourcc('M', 'P', 'E', 'G') )
|
||||
{
|
||||
//circle(img, Point2i(img_c / 2, img_r / 2), min(img_r, img_c) / 2 * (i + 1), Scalar(255, 0, 0, 0), 2);
|
||||
rectangle(img, Point2i(coeff * i, coeff * i), Point2i(coeff * (i + 1), coeff * (i + 1)),
|
||||
Scalar::all(255 * (1.0 - static_cast<double>(i) / (fps * time_sec * 2) )), -1);
|
||||
writer << img;
|
||||
frame_s = Size(720, 576);
|
||||
fps = 25;
|
||||
}
|
||||
|
||||
if (!created) created = true;
|
||||
else remove(filename.c_str());
|
||||
}
|
||||
}
|
||||
catch(...)
|
||||
{
|
||||
ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT);
|
||||
}
|
||||
ts->set_failed_test_info(cvtest::TS::OK);
|
||||
VideoWriter writer(filename, tag, fps, frame_s);
|
||||
|
||||
if (writer.isOpened() == false)
|
||||
{
|
||||
ts->printf(ts->LOG, "\n\nFile name: %s\n", filename.c_str());
|
||||
ts->printf(ts->LOG, "Codec id: %d Codec tag: %c%c%c%c\n", j,
|
||||
tag & 255, (tag >> 8) & 255, (tag >> 16) & 255, (tag >> 24) & 255);
|
||||
ts->printf(ts->LOG, "Error: cannot create video file.");
|
||||
ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT);
|
||||
}
|
||||
else
|
||||
{
|
||||
Mat img(frame_s, CV_8UC3, Scalar::all(0));
|
||||
const int coeff = cvRound(min(frame_s.width, frame_s.height)/(fps0 * time_sec));
|
||||
|
||||
for (int i = 0 ; i < static_cast<int>(fps * time_sec); i++ )
|
||||
{
|
||||
//circle(img, Point2i(img_c / 2, img_r / 2), min(img_r, img_c) / 2 * (i + 1), Scalar(255, 0, 0, 0), 2);
|
||||
rectangle(img, Point2i(coeff * i, coeff * i), Point2i(coeff * (i + 1), coeff * (i + 1)),
|
||||
Scalar::all(255 * (1.0 - static_cast<double>(i) / (fps * time_sec * 2) )), -1);
|
||||
writer << img;
|
||||
}
|
||||
|
||||
if (!created) created = true;
|
||||
else remove(filename.c_str());
|
||||
}
|
||||
}
|
||||
catch(...)
|
||||
{
|
||||
ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT);
|
||||
}
|
||||
ts->set_failed_test_info(cvtest::TS::OK);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -3175,8 +3175,8 @@ public:
|
||||
int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
|
||||
int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
|
||||
int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
|
||||
XY[x1*2] = (short)(sx >> INTER_BITS);
|
||||
XY[x1*2+1] = (short)(sy >> INTER_BITS);
|
||||
XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
|
||||
XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
|
||||
A[x1] = (ushort)v;
|
||||
}
|
||||
}
|
||||
@ -3189,8 +3189,8 @@ public:
|
||||
int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
|
||||
int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
|
||||
int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
|
||||
XY[x1*2] = (short)(sx >> INTER_BITS);
|
||||
XY[x1*2+1] = (short)(sy >> INTER_BITS);
|
||||
XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
|
||||
XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
|
||||
A[x1] = (ushort)v;
|
||||
}
|
||||
}
|
||||
@ -3404,8 +3404,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
{
|
||||
int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
|
||||
int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
|
||||
dst1[x*2] = (short)(ix >> INTER_BITS);
|
||||
dst1[x*2+1] = (short)(iy >> INTER_BITS);
|
||||
dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
|
||||
dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
|
||||
dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
|
||||
}
|
||||
}
|
||||
@ -3422,8 +3422,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
|
||||
{
|
||||
int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
|
||||
int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
|
||||
dst1[x*2] = (short)(ix >> INTER_BITS);
|
||||
dst1[x*2+1] = (short)(iy >> INTER_BITS);
|
||||
dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
|
||||
dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
|
||||
dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
|
||||
}
|
||||
}
|
||||
|
@ -32,6 +32,11 @@ public class OpenCVLoader
|
||||
*/
|
||||
public static final String OPENCV_VERSION_2_4_6 = "2.4.6";
|
||||
|
||||
/**
|
||||
* OpenCV Library version 2.4.7.
|
||||
*/
|
||||
public static final String OPENCV_VERSION_2_4_7 = "2.4.7";
|
||||
|
||||
|
||||
/**
|
||||
* Loads and initializes OpenCV library from current application package. Roughly, it's an analog of system.loadLibrary("opencv_java").
|
||||
|
@ -543,6 +543,8 @@ static void calcSIFTDescriptor( const Mat& img, Point2f ptf, float ori, float sc
|
||||
float exp_scale = -1.f/(d * d * 0.5f);
|
||||
float hist_width = SIFT_DESCR_SCL_FCTR * scl;
|
||||
int radius = cvRound(hist_width * 1.4142135623730951f * (d + 1) * 0.5f);
|
||||
// Clip the radius to the diagonal of the image to avoid autobuffer too large exception
|
||||
radius = std::min(radius, (int) sqrt((double) img.cols*img.cols + img.rows*img.rows));
|
||||
cos_t /= hist_width;
|
||||
sin_t /= hist_width;
|
||||
|
||||
|
@ -133,7 +133,7 @@ Creates a normalized 2D box filter.
|
||||
|
||||
.. ocv:function:: Ptr<BaseFilter_GPU> ocl::getBoxFilter_GPU(int srcType, int dstType, const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
|
||||
|
||||
:param srcType: Input image type supporting ``CV_8UC1`` and ``CV_8UC4`` .
|
||||
:param srcType: Input image type.
|
||||
|
||||
:param dstType: Output image type. It supports only the same values as the source type.
|
||||
|
||||
@ -141,9 +141,7 @@ Creates a normalized 2D box filter.
|
||||
|
||||
:param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
|
||||
|
||||
:param borderType: Supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP.
|
||||
|
||||
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
|
||||
:param borderType: Border type.
|
||||
|
||||
.. seealso:: :ocv:func:`boxFilter`
|
||||
|
||||
@ -153,21 +151,19 @@ Smooths the image using the normalized box filter.
|
||||
|
||||
.. ocv:function:: void ocl::boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
|
||||
|
||||
:param src: Input image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
|
||||
:param src: Input image.
|
||||
|
||||
:param dst: Output image type. The size and type is the same as ``src`` .
|
||||
|
||||
:param ddepth: Output image depth. If -1, the output image has the same depth as the input one. The only values allowed here are ``CV_8U`` and -1.
|
||||
:param ddepth: Desired depth of the destination image. If it is negative, it is the same as ``src.depth()`` . It supports only the same depth as the source image depth.
|
||||
|
||||
:param ksize: Kernel size.
|
||||
|
||||
:param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
|
||||
|
||||
:param borderType: Supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP.
|
||||
:param borderType: Border type.
|
||||
|
||||
Smoothes image using box filter.Supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4.
|
||||
|
||||
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
|
||||
Smoothes image using box filter.
|
||||
|
||||
ocl::blur
|
||||
-------------
|
||||
@ -175,7 +171,7 @@ Acts as a synonym for the normalized box filter.
|
||||
|
||||
.. ocv:function:: void ocl::blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1), int borderType = BORDER_CONSTANT)
|
||||
|
||||
:param src: Input image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
|
||||
:param src: Input image.
|
||||
|
||||
:param dst: Output image type with the same size and type as ``src`` .
|
||||
|
||||
@ -183,9 +179,7 @@ Acts as a synonym for the normalized box filter.
|
||||
|
||||
:param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
|
||||
|
||||
:param borderType: Supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP.
|
||||
|
||||
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
|
||||
:param borderType: Border type.
|
||||
|
||||
.. seealso:: :ocv:func:`blur`, :ocv:func:`ocl::boxFilter`
|
||||
|
||||
@ -217,11 +211,11 @@ Creates a non-separable linear filter.
|
||||
|
||||
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
|
||||
|
||||
:param srcType: Input image type. Supports ``CV_8U`` , ``CV_16U`` and ``CV_32F`` one and four channel image.
|
||||
:param srcType: Input image type.
|
||||
|
||||
:param dstType: Output image type. The same type as ``src`` is supported.
|
||||
|
||||
:param kernel: 2D array of filter coefficients. Floating-point coefficients will be converted to fixed-point representation before the actual processing. Supports size up to 16. For larger kernels use :ocv:func:`ocl::convolve`.
|
||||
:param kernel: 2D array of filter coefficients.
|
||||
|
||||
:param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
|
||||
|
||||
@ -234,9 +228,9 @@ ocl::filter2D
|
||||
-----------------
|
||||
Applies the non-separable 2D linear filter to an image.
|
||||
|
||||
.. ocv:function:: void ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT)
|
||||
.. ocv:function:: void ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor = Point(-1, -1), double delta = 0.0, int borderType = BORDER_DEFAULT)
|
||||
|
||||
:param src: Source image. Supports ``CV_8U`` , ``CV_16U`` and ``CV_32F`` one and four channel image.
|
||||
:param src: Source image.
|
||||
|
||||
:param dst: Destination image. The size and the number of channels is the same as ``src`` .
|
||||
|
||||
@ -246,9 +240,9 @@ Applies the non-separable 2D linear filter to an image.
|
||||
|
||||
:param anchor: Anchor of the kernel that indicates the relative position of a filtered point within the kernel. The anchor resides within the kernel. The special default value (-1,-1) means that the anchor is at the kernel center.
|
||||
|
||||
:param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
|
||||
:param delta: optional value added to the filtered pixels before storing them in ``dst``. Value '0' is supported only.
|
||||
|
||||
:param stream: Stream for the asynchronous version.
|
||||
:param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
|
||||
|
||||
ocl::getLinearRowFilter_GPU
|
||||
-------------------------------
|
||||
@ -447,7 +441,7 @@ ocl::Laplacian
|
||||
------------------
|
||||
Returns void
|
||||
|
||||
.. ocv:function:: void ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1)
|
||||
.. ocv:function:: void ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1, double delta = 0, int borderType = BORDER_DEFAULT)
|
||||
|
||||
:param src: The source image
|
||||
|
||||
@ -459,6 +453,10 @@ Returns void
|
||||
|
||||
:param scale: The optional scale factor for the computed Laplacian values (by default, no scaling is applied).
|
||||
|
||||
:param delta: Optional delta value that is added to the results prior to storing them in ``dst`` . Supported value is 0 only.
|
||||
|
||||
:param borderType: Pixel extrapolation method.
|
||||
|
||||
The function calculates the Laplacian of the source image by adding up the second x and y derivatives calculated using the Sobel operator.
|
||||
|
||||
ocl::ConvolveBuf
|
||||
|
@ -86,3 +86,27 @@ Finds centers of clusters and groups input samples around the clusters.
|
||||
* **KMEANS_USE_INITIAL_LABELS** During the first (and possibly the only) attempt, use the user-supplied labels instead of computing them from the initial centers. For the second and further attempts, use the random or semi-random centers. Use one of ``KMEANS_*_CENTERS`` flag to specify the exact method.
|
||||
|
||||
:param centers: Output matrix of the cluster centers, one row per each cluster center.
|
||||
|
||||
ocl::distanceToCenters
|
||||
----------------------
|
||||
For each sample in ``src``, find its closest neighbour in ``centers``.
|
||||
|
||||
.. ocv:function:: void ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers, int distType = NORM_L2SQR, const oclMat &indices = oclMat())
|
||||
|
||||
:param dists: The output distances calculated from each sample to the best matched center.
|
||||
|
||||
:param labels: The output index of best matched center for each row of sample.
|
||||
|
||||
:param src: Floating-point matrix of input samples. One row per sample.
|
||||
|
||||
:param centers: Floating-point matrix of center candidates. One row per center.
|
||||
|
||||
:param distType: Distance metric to calculate distances. Supports ``NORM_L1`` and ``NORM_L2SQR``.
|
||||
|
||||
:param indices: Optional source indices. If not empty:
|
||||
|
||||
* only the indexed source samples will be processed
|
||||
* outputs, i.e., ``dists`` and ``labels``, have the same size of indices
|
||||
* outputs are in the same order of indices instead of the order of src
|
||||
|
||||
The method is a utility function which may be used by multiple clustering algorithms such as K-means.
|
||||
|
@ -23,7 +23,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -111,6 +111,7 @@ namespace cv
|
||||
|
||||
bool haveDoubleSupport;
|
||||
bool isUnifiedMemory; // 1 means integrated GPU, otherwise this value is 0
|
||||
bool isIntelDevice;
|
||||
|
||||
std::string compilationExtraOptions;
|
||||
|
||||
@ -154,7 +155,8 @@ namespace cv
|
||||
{
|
||||
FEATURE_CL_DOUBLE = 1,
|
||||
FEATURE_CL_UNIFIED_MEM,
|
||||
FEATURE_CL_VER_1_2
|
||||
FEATURE_CL_VER_1_2,
|
||||
FEATURE_CL_INTEL_DEVICE
|
||||
};
|
||||
|
||||
// Represents OpenCL context, interface
|
||||
@ -737,11 +739,12 @@ namespace cv
|
||||
CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT );
|
||||
|
||||
//! applies Laplacian operator to the image
|
||||
// supports only ksize = 1 and ksize = 3 8UC1 8UC4 32FC1 32FC4 data type
|
||||
CV_EXPORTS void Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1);
|
||||
// supports only ksize = 1 and ksize = 3
|
||||
CV_EXPORTS void Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1,
|
||||
double delta=0, int borderType=BORDER_DEFAULT);
|
||||
|
||||
//! returns 2D box filter
|
||||
// supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type
|
||||
// dst type must be the same as source type
|
||||
CV_EXPORTS Ptr<BaseFilter_GPU> getBoxFilter_GPU(int srcType, int dstType,
|
||||
const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
|
||||
|
||||
@ -750,17 +753,16 @@ namespace cv
|
||||
const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
|
||||
|
||||
//! returns 2D filter with the specified kernel
|
||||
// supports CV_8UC1 and CV_8UC4 types
|
||||
// supports: dst type must be the same as source type
|
||||
CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
|
||||
const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
|
||||
|
||||
//! returns the non-separable linear filter engine
|
||||
// supports: dst type must be the same as source type
|
||||
CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel,
|
||||
const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
|
||||
|
||||
//! smooths the image using the normalized box filter
|
||||
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
|
||||
// supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP
|
||||
CV_EXPORTS void boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize,
|
||||
Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
|
||||
|
||||
@ -776,8 +778,6 @@ namespace cv
|
||||
const Point &anchor = Point(-1, -1), int iterations = 1);
|
||||
|
||||
//! a synonym for normalized box filter
|
||||
// supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4
|
||||
// supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101
|
||||
static inline void blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1),
|
||||
int borderType = BORDER_CONSTANT)
|
||||
{
|
||||
@ -785,10 +785,8 @@ namespace cv
|
||||
}
|
||||
|
||||
//! applies non-separable 2D linear filter to the image
|
||||
// Note, at the moment this function only works when anchor point is in the kernel center
|
||||
// and kernel size supported is either 3x3 or 5x5; otherwise the function will fail to output valid result
|
||||
CV_EXPORTS void filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel,
|
||||
Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
|
||||
Point anchor = Point(-1, -1), double delta = 0.0, int borderType = BORDER_DEFAULT);
|
||||
|
||||
//! applies separable 2D linear filter to the image
|
||||
CV_EXPORTS void sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY,
|
||||
@ -897,7 +895,10 @@ namespace cv
|
||||
|
||||
//! Computes the closest center for each row in the source and labels it with that center's index
|
||||
// supports CV_32FC1/CV_32FC2/CV_32FC4 data type
|
||||
CV_EXPORTS void distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers);
|
||||
// supports NORM_L1 and NORM_L2SQR distType
|
||||
// if indices is provided, only the indexed rows will be calculated and their results are in the same
|
||||
// order of indices
|
||||
CV_EXPORTS void distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers, int distType = NORM_L2SQR, const oclMat &indices = oclMat());
|
||||
|
||||
//!Does k-means procedure on GPU
|
||||
// supports CV_32FC1/CV_32FC2/CV_32FC4 data type
|
||||
@ -964,12 +965,12 @@ namespace cv
|
||||
|
||||
struct CV_EXPORTS CannyBuf
|
||||
{
|
||||
CannyBuf() : counter(NULL) {}
|
||||
CannyBuf() : counter(1, 1, CV_32S) { }
|
||||
~CannyBuf()
|
||||
{
|
||||
release();
|
||||
}
|
||||
explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(NULL)
|
||||
explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(1, 1, CV_32S)
|
||||
{
|
||||
create(image_size, apperture_size);
|
||||
}
|
||||
@ -981,7 +982,7 @@ namespace cv
|
||||
oclMat dx_buf, dy_buf;
|
||||
oclMat magBuf, mapBuf;
|
||||
oclMat trackBuf1, trackBuf2;
|
||||
void *counter;
|
||||
oclMat counter;
|
||||
Ptr<FilterEngine_GPU> filterDX, filterDY;
|
||||
};
|
||||
|
||||
@ -1618,7 +1619,12 @@ namespace cv
|
||||
float pos, oclMat &newFrame, oclMat &buf);
|
||||
|
||||
//! computes moments of the rasterized shape or a vector of points
|
||||
CV_EXPORTS Moments ocl_moments(InputArray _array, bool binaryImage);
|
||||
//! contour should be a vector of points representing the contour
|
||||
CV_EXPORTS Moments ocl_moments(InputArray contour);
|
||||
//! src should be a general image uploaded to the GPU.
|
||||
//! the supported oclMat type are CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 and CV_64FC1
|
||||
//! to use type of CV_64FC1, the GPU should support CV_64FC1
|
||||
CV_EXPORTS Moments ocl_moments(oclMat& src, bool binary);
|
||||
|
||||
class CV_EXPORTS StereoBM_OCL
|
||||
{
|
||||
|
@ -21,7 +21,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
|
@ -21,7 +21,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -103,7 +103,11 @@ CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
|
||||
const cv::ocl::ProgramEntry* source, String kernelName);
|
||||
CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
|
||||
const cv::ocl::ProgramEntry* source, String kernelName, const char *build_options);
|
||||
CV_EXPORTS cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source,
|
||||
String kernelName, int channels, int depth, const char *build_options);
|
||||
CV_EXPORTS void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
|
||||
CV_EXPORTS void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
|
||||
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args);
|
||||
CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, String kernelName, std::vector< std::pair<size_t, const void *> > &args,
|
||||
int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
|
||||
CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName,
|
||||
|
@ -342,7 +342,7 @@ PERF_TEST_P(CartToPolarFixture, CartToPolar, OCL_TYPICAL_MAT_SIZES)
|
||||
if (srcSize == OCL_SIZE_4000)
|
||||
declare.time(3.6);
|
||||
|
||||
if (RUN_OCL_IMPL)
|
||||
if (RUN_OCL_IMPL)
|
||||
{
|
||||
ocl::oclMat oclSrc1(src1), oclSrc2(src2),
|
||||
oclDst1(srcSize, src1.type()), oclDst2(srcSize, src1.type());
|
||||
@ -374,7 +374,7 @@ PERF_TEST_P(PolarToCartFixture, PolarToCart, OCL_TYPICAL_MAT_SIZES)
|
||||
{
|
||||
const Size srcSize = GetParam();
|
||||
|
||||
Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
|
||||
Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
|
||||
dst1(srcSize, CV_32FC1), dst2(srcSize, CV_32FC1);
|
||||
declare.in(src1, src2).out(dst1, dst2);
|
||||
randu(src1, 0, 256);
|
||||
@ -421,7 +421,7 @@ PERF_TEST_P(MagnitudeFixture, Magnitude, OCL_TYPICAL_MAT_SIZES)
|
||||
randu(src2, 0, 1);
|
||||
declare.in(src1, src2).out(dst);
|
||||
|
||||
if (RUN_OCL_IMPL)
|
||||
if (RUN_OCL_IMPL)
|
||||
{
|
||||
ocl::oclMat oclSrc1(src1), oclSrc2(src2),
|
||||
oclDst(srcSize, src1.type());
|
||||
@ -457,7 +457,7 @@ PERF_TEST_P(TransposeFixture, Transpose,
|
||||
Mat src(srcSize, type), dst(srcSize, type);
|
||||
declare.in(src, WARMUP_RNG).out(dst);
|
||||
|
||||
if (RUN_OCL_IMPL)
|
||||
if (RUN_OCL_IMPL)
|
||||
{
|
||||
ocl::oclMat oclSrc(src), oclDst(srcSize, type);
|
||||
|
||||
@ -562,7 +562,7 @@ PERF_TEST_P(minMaxLocFixture, minMaxLoc,
|
||||
::testing::Combine(OCL_TYPICAL_MAT_SIZES,
|
||||
OCL_PERF_ENUM(CV_8UC1, CV_32FC1)))
|
||||
{
|
||||
const Size_MatType_t params = GetParam();
|
||||
const Size_MatType_t params = GetParam();
|
||||
const Size srcSize = get<0>(params);
|
||||
const int type = get<1>(params);
|
||||
|
||||
@ -607,7 +607,7 @@ PERF_TEST_P(SumFixture, Sum,
|
||||
const Size srcSize = get<0>(params);
|
||||
const int type = get<1>(params);
|
||||
|
||||
Mat src(srcSize, type);
|
||||
Mat src(srcSize, type);
|
||||
Scalar result;
|
||||
randu(src, 0, 60);
|
||||
declare.in(src);
|
||||
@ -708,16 +708,16 @@ PERF_TEST_P(BitwiseAndFixture, bitwise_and,
|
||||
::testing::Combine(OCL_TYPICAL_MAT_SIZES,
|
||||
OCL_PERF_ENUM(CV_8UC1, CV_32SC1)))
|
||||
{
|
||||
const Size_MatType_t params = GetParam();
|
||||
const Size_MatType_t params = GetParam();
|
||||
const Size srcSize = get<0>(params);
|
||||
const int type = get<1>(params);
|
||||
|
||||
Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
|
||||
Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
|
||||
declare.in(src1, src2).out(dst);
|
||||
randu(src1, 0, 256);
|
||||
randu(src2, 0, 256);
|
||||
|
||||
if (RUN_OCL_IMPL)
|
||||
if (RUN_OCL_IMPL)
|
||||
{
|
||||
ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type());
|
||||
|
||||
@ -737,6 +737,80 @@ PERF_TEST_P(BitwiseAndFixture, bitwise_and,
|
||||
OCL_PERF_ELSE
|
||||
}
|
||||
|
||||
///////////// bitwise_xor ////////////////////////
|
||||
|
||||
typedef Size_MatType BitwiseXorFixture;
|
||||
|
||||
PERF_TEST_P(BitwiseXorFixture, bitwise_xor,
|
||||
::testing::Combine(OCL_TYPICAL_MAT_SIZES,
|
||||
OCL_PERF_ENUM(CV_8UC1, CV_32SC1)))
|
||||
{
|
||||
const Size_MatType_t params = GetParam();
|
||||
const Size srcSize = get<0>(params);
|
||||
const int type = get<1>(params);
|
||||
|
||||
Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
|
||||
declare.in(src1, src2).out(dst);
|
||||
randu(src1, 0, 256);
|
||||
randu(src2, 0, 256);
|
||||
|
||||
if (RUN_OCL_IMPL)
|
||||
{
|
||||
ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type());
|
||||
|
||||
OCL_TEST_CYCLE() cv::ocl::bitwise_xor(oclSrc1, oclSrc2, oclDst);
|
||||
|
||||
oclDst.download(dst);
|
||||
|
||||
SANITY_CHECK(dst);
|
||||
}
|
||||
else if (RUN_PLAIN_IMPL)
|
||||
{
|
||||
TEST_CYCLE() cv::bitwise_xor(src1, src2, dst);
|
||||
|
||||
SANITY_CHECK(dst);
|
||||
}
|
||||
else
|
||||
OCL_PERF_ELSE
|
||||
}
|
||||
|
||||
///////////// bitwise_or ////////////////////////
|
||||
|
||||
typedef Size_MatType BitwiseOrFixture;
|
||||
|
||||
PERF_TEST_P(BitwiseOrFixture, bitwise_or,
|
||||
::testing::Combine(OCL_TYPICAL_MAT_SIZES,
|
||||
OCL_PERF_ENUM(CV_8UC1, CV_32SC1)))
|
||||
{
|
||||
const Size_MatType_t params = GetParam();
|
||||
const Size srcSize = get<0>(params);
|
||||
const int type = get<1>(params);
|
||||
|
||||
Mat src1(srcSize, type), src2(srcSize, type), dst(srcSize, type);
|
||||
declare.in(src1, src2).out(dst);
|
||||
randu(src1, 0, 256);
|
||||
randu(src2, 0, 256);
|
||||
|
||||
if (RUN_OCL_IMPL)
|
||||
{
|
||||
ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type());
|
||||
|
||||
OCL_TEST_CYCLE() cv::ocl::bitwise_or(oclSrc1, oclSrc2, oclDst);
|
||||
|
||||
oclDst.download(dst);
|
||||
|
||||
SANITY_CHECK(dst);
|
||||
}
|
||||
else if (RUN_PLAIN_IMPL)
|
||||
{
|
||||
TEST_CYCLE() cv::bitwise_or(src1, src2, dst);
|
||||
|
||||
SANITY_CHECK(dst);
|
||||
}
|
||||
else
|
||||
OCL_PERF_ELSE
|
||||
}
|
||||
|
||||
///////////// bitwise_not////////////////////////
|
||||
|
||||
typedef Size_MatType BitwiseNotFixture;
|
||||
|
@ -47,48 +47,61 @@
|
||||
#include "perf_precomp.hpp"
|
||||
|
||||
using namespace perf;
|
||||
using namespace cv;
|
||||
using std::tr1::get;
|
||||
|
||||
///////////// blend ////////////////////////
|
||||
|
||||
template <typename T>
|
||||
static void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2,
|
||||
const cv::Mat &weights1, const cv::Mat &weights2,
|
||||
cv::Mat &result_gold)
|
||||
static void blendLinearGold(const Mat &img1, const Mat &img2,
|
||||
const Mat &weights1, const Mat &weights2,
|
||||
Mat &result_gold)
|
||||
{
|
||||
CV_Assert(img1.size() == img2.size() && img1.type() == img2.type());
|
||||
CV_Assert(weights1.size() == weights2.size() && weights1.size() == img1.size() &&
|
||||
weights1.type() == CV_32FC1 && weights2.type() == CV_32FC1);
|
||||
|
||||
result_gold.create(img1.size(), img1.type());
|
||||
|
||||
int cn = img1.channels();
|
||||
int step1 = img1.cols * img1.channels();
|
||||
|
||||
for (int y = 0; y < img1.rows; ++y)
|
||||
{
|
||||
const float *weights1_row = weights1.ptr<float>(y);
|
||||
const float *weights2_row = weights2.ptr<float>(y);
|
||||
const T *img1_row = img1.ptr<T>(y);
|
||||
const T *img2_row = img2.ptr<T>(y);
|
||||
T *result_gold_row = result_gold.ptr<T>(y);
|
||||
const float * const weights1_row = weights1.ptr<float>(y);
|
||||
const float * const weights2_row = weights2.ptr<float>(y);
|
||||
const T * const img1_row = img1.ptr<T>(y);
|
||||
const T * const img2_row = img2.ptr<T>(y);
|
||||
T * const result_gold_row = result_gold.ptr<T>(y);
|
||||
|
||||
for (int x = 0; x < img1.cols * cn; ++x)
|
||||
for (int x = 0; x < step1; ++x)
|
||||
{
|
||||
int x1 = x * cn;
|
||||
float w1 = weights1_row[x];
|
||||
float w2 = weights2_row[x];
|
||||
result_gold_row[x] = static_cast<T>((img1_row[x1] * w1
|
||||
+ img2_row[x1] * w2) / (w1 + w2 + 1e-5f));
|
||||
int x1 = x / cn;
|
||||
float w1 = weights1_row[x1], w2 = weights2_row[x1];
|
||||
result_gold_row[x] = saturate_cast<T>(((float)img1_row[x] * w1
|
||||
+ (float)img2_row[x] * w2) / (w1 + w2 + 1e-5f));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef TestBaseWithParam<Size> blendLinearFixture;
|
||||
typedef void (*blendFunction)(const Mat &img1, const Mat &img2,
|
||||
const Mat &weights1, const Mat &weights2,
|
||||
Mat &result_gold);
|
||||
|
||||
PERF_TEST_P(blendLinearFixture, blendLinear, OCL_TYPICAL_MAT_SIZES)
|
||||
typedef Size_MatType blendLinearFixture;
|
||||
|
||||
PERF_TEST_P(blendLinearFixture, blendLinear, ::testing::Combine(
|
||||
OCL_TYPICAL_MAT_SIZES, testing::Values(CV_8UC1, CV_8UC3, CV_32FC1)))
|
||||
{
|
||||
const Size srcSize = GetParam();
|
||||
const int type = CV_8UC1;
|
||||
Size_MatType_t params = GetParam();
|
||||
const Size srcSize = get<0>(params);
|
||||
const int srcType = get<1>(params);
|
||||
const double eps = CV_MAT_DEPTH(srcType) <= CV_32S ? 1.0 : 0.2;
|
||||
|
||||
Mat src1(srcSize, type), src2(srcSize, CV_8UC1), dst;
|
||||
Mat src1(srcSize, srcType), src2(srcSize, srcType), dst(srcSize, srcType);
|
||||
Mat weights1(srcSize, CV_32FC1), weights2(srcSize, CV_32FC1);
|
||||
|
||||
declare.in(src1, src2, WARMUP_RNG);
|
||||
declare.in(src1, src2, WARMUP_RNG).out(dst);
|
||||
randu(weights1, 0.0f, 1.0f);
|
||||
randu(weights2, 0.0f, 1.0f);
|
||||
|
||||
@ -97,17 +110,20 @@ PERF_TEST_P(blendLinearFixture, blendLinear, OCL_TYPICAL_MAT_SIZES)
|
||||
ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst;
|
||||
ocl::oclMat oclWeights1(weights1), oclWeights2(weights2);
|
||||
|
||||
OCL_TEST_CYCLE() cv::ocl::blendLinear(oclSrc1, oclSrc2, oclWeights1, oclWeights2, oclDst);
|
||||
OCL_TEST_CYCLE() ocl::blendLinear(oclSrc1, oclSrc2, oclWeights1, oclWeights2, oclDst);
|
||||
|
||||
oclDst.download(dst);
|
||||
|
||||
SANITY_CHECK(dst);
|
||||
SANITY_CHECK(dst, eps);
|
||||
}
|
||||
else if (RUN_PLAIN_IMPL)
|
||||
{
|
||||
TEST_CYCLE() blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
|
||||
blendFunction funcs[] = { (blendFunction)blendLinearGold<uchar>, (blendFunction)blendLinearGold<float> };
|
||||
int funcIdx = CV_MAT_DEPTH(srcType) == CV_8UC1 ? 0 : 1;
|
||||
|
||||
SANITY_CHECK(dst);
|
||||
TEST_CYCLE() (funcs[funcIdx])(src1, src2, weights1, weights2, dst);
|
||||
|
||||
SANITY_CHECK(dst, eps);
|
||||
}
|
||||
else
|
||||
OCL_PERF_ELSE
|
||||
|
@ -53,8 +53,8 @@ using namespace perf;
|
||||
|
||||
typedef TestBaseWithParam<Size> BruteForceMatcherFixture;
|
||||
|
||||
PERF_TEST_P(BruteForceMatcherFixture, DISABLED_match,
|
||||
OCL_BFMATCHER_TYPICAL_MAT_SIZES) // TODO too big difference between implementations
|
||||
PERF_TEST_P(BruteForceMatcherFixture, match,
|
||||
OCL_BFMATCHER_TYPICAL_MAT_SIZES)
|
||||
{
|
||||
const Size srcSize = GetParam();
|
||||
|
||||
@ -82,14 +82,14 @@ PERF_TEST_P(BruteForceMatcherFixture, DISABLED_match,
|
||||
|
||||
oclMatcher.matchDownload(oclTrainIdx, oclDistance, matches);
|
||||
|
||||
SANITY_CHECK_MATCHES(matches);
|
||||
SANITY_CHECK_MATCHES(matches, 1e-5);
|
||||
}
|
||||
else
|
||||
OCL_PERF_ELSE
|
||||
}
|
||||
|
||||
PERF_TEST_P(BruteForceMatcherFixture, DISABLED_knnMatch,
|
||||
OCL_BFMATCHER_TYPICAL_MAT_SIZES) // TODO too big difference between implementations
|
||||
PERF_TEST_P(BruteForceMatcherFixture, knnMatch,
|
||||
OCL_BFMATCHER_TYPICAL_MAT_SIZES)
|
||||
{
|
||||
const Size srcSize = GetParam();
|
||||
|
||||
@ -123,8 +123,8 @@ PERF_TEST_P(BruteForceMatcherFixture, DISABLED_knnMatch,
|
||||
oclMatcher.knnMatchDownload(oclTrainIdx, oclDistance, matches);
|
||||
|
||||
std::vector<DMatch> & matches0 = matches[0], & matches1 = matches[1];
|
||||
SANITY_CHECK_MATCHES(matches0);
|
||||
SANITY_CHECK_MATCHES(matches1);
|
||||
SANITY_CHECK_MATCHES(matches0, 1e-5);
|
||||
SANITY_CHECK_MATCHES(matches1, 1e-5);
|
||||
}
|
||||
else
|
||||
OCL_PERF_ELSE
|
||||
|
@ -22,7 +22,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
|
@ -56,6 +56,7 @@ typedef TestBaseWithParam<Size> equalizeHistFixture;
|
||||
PERF_TEST_P(equalizeHistFixture, equalizeHist, OCL_TYPICAL_MAT_SIZES)
|
||||
{
|
||||
const Size srcSize = GetParam();
|
||||
const double eps = 1 + DBL_EPSILON;
|
||||
|
||||
Mat src(srcSize, CV_8UC1), dst(srcSize, CV_8UC1);
|
||||
declare.in(src, WARMUP_RNG).out(dst);
|
||||
@ -68,13 +69,13 @@ PERF_TEST_P(equalizeHistFixture, equalizeHist, OCL_TYPICAL_MAT_SIZES)
|
||||
|
||||
oclDst.download(dst);
|
||||
|
||||
SANITY_CHECK(dst, 1 + DBL_EPSILON);
|
||||
SANITY_CHECK(dst, eps);
|
||||
}
|
||||
else if (RUN_PLAIN_IMPL)
|
||||
{
|
||||
TEST_CYCLE() cv::equalizeHist(src, dst);
|
||||
|
||||
SANITY_CHECK(dst, 1 + DBL_EPSILON);
|
||||
SANITY_CHECK(dst, eps);
|
||||
}
|
||||
else
|
||||
OCL_PERF_ELSE
|
||||
@ -82,15 +83,20 @@ PERF_TEST_P(equalizeHistFixture, equalizeHist, OCL_TYPICAL_MAT_SIZES)
|
||||
|
||||
/////////// CopyMakeBorder //////////////////////
|
||||
|
||||
typedef Size_MatType CopyMakeBorderFixture;
|
||||
CV_ENUM(Border, BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,
|
||||
BORDER_WRAP, BORDER_REFLECT_101)
|
||||
|
||||
typedef tuple<Size, MatType, Border> CopyMakeBorderParamType;
|
||||
typedef TestBaseWithParam<CopyMakeBorderParamType> CopyMakeBorderFixture;
|
||||
|
||||
PERF_TEST_P(CopyMakeBorderFixture, CopyMakeBorder,
|
||||
::testing::Combine(OCL_TYPICAL_MAT_SIZES,
|
||||
OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
|
||||
OCL_PERF_ENUM(CV_8UC1, CV_8UC4),
|
||||
Border::all()))
|
||||
{
|
||||
const Size_MatType_t params = GetParam();
|
||||
const CopyMakeBorderParamType params = GetParam();
|
||||
const Size srcSize = get<0>(params);
|
||||
const int type = get<1>(params), borderType = BORDER_CONSTANT;
|
||||
const int type = get<1>(params), borderType = get<2>(params);
|
||||
|
||||
Mat src(srcSize, type), dst;
|
||||
const Size dstSize = srcSize + Size(12, 12);
|
||||
@ -360,20 +366,23 @@ PERF_TEST_P(resizeFixture, resize,
|
||||
|
||||
///////////// threshold////////////////////////
|
||||
|
||||
CV_ENUM(ThreshType, THRESH_BINARY, THRESH_TRUNC)
|
||||
CV_ENUM(ThreshType, THRESH_BINARY, THRESH_TOZERO_INV)
|
||||
|
||||
typedef tuple<Size, ThreshType> ThreshParams;
|
||||
typedef tuple<Size, MatType, ThreshType> ThreshParams;
|
||||
typedef TestBaseWithParam<ThreshParams> ThreshFixture;
|
||||
|
||||
PERF_TEST_P(ThreshFixture, threshold,
|
||||
::testing::Combine(OCL_TYPICAL_MAT_SIZES,
|
||||
OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC4, CV_32FC1),
|
||||
ThreshType::all()))
|
||||
{
|
||||
const ThreshParams params = GetParam();
|
||||
const Size srcSize = get<0>(params);
|
||||
const int threshType = get<1>(params);
|
||||
const int srcType = get<1>(params);
|
||||
const int threshType = get<2>(params);
|
||||
const double maxValue = 220.0, threshold = 50;
|
||||
|
||||
Mat src(srcSize, CV_8U), dst(srcSize, CV_8U);
|
||||
Mat src(srcSize, srcType), dst(srcSize, srcType);
|
||||
randu(src, 0, 100);
|
||||
declare.in(src).out(dst);
|
||||
|
||||
@ -381,7 +390,7 @@ PERF_TEST_P(ThreshFixture, threshold,
|
||||
{
|
||||
ocl::oclMat oclSrc(src), oclDst(srcSize, CV_8U);
|
||||
|
||||
OCL_TEST_CYCLE() cv::ocl::threshold(oclSrc, oclDst, 50.0, 0.0, threshType);
|
||||
OCL_TEST_CYCLE() cv::ocl::threshold(oclSrc, oclDst, threshold, maxValue, threshType);
|
||||
|
||||
oclDst.download(dst);
|
||||
|
||||
@ -389,7 +398,7 @@ PERF_TEST_P(ThreshFixture, threshold,
|
||||
}
|
||||
else if (RUN_PLAIN_IMPL)
|
||||
{
|
||||
TEST_CYCLE() cv::threshold(src, dst, 50.0, 0.0, threshType);
|
||||
TEST_CYCLE() cv::threshold(src, dst, threshold, maxValue, threshType);
|
||||
|
||||
SANITY_CHECK(dst);
|
||||
}
|
||||
@ -860,3 +869,64 @@ PERF_TEST_P(columnSumFixture, columnSum, OCL_TYPICAL_MAT_SIZES)
|
||||
else
|
||||
OCL_PERF_ELSE
|
||||
}
|
||||
|
||||
//////////////////////////////distanceToCenters////////////////////////////////////////////////
|
||||
|
||||
CV_ENUM(DistType, NORM_L1, NORM_L2SQR);
|
||||
typedef tuple<Size, DistType> distanceToCentersParameters;
|
||||
typedef TestBaseWithParam<distanceToCentersParameters> distanceToCentersFixture;
|
||||
|
||||
static void distanceToCentersPerfTest(Mat& src, Mat& centers, Mat& dists, Mat& labels, int distType)
|
||||
{
|
||||
Mat batch_dists;
|
||||
cv::batchDistance(src,centers,batch_dists, CV_32FC1, noArray(), distType);
|
||||
std::vector<float> dists_v;
|
||||
std::vector<int> labels_v;
|
||||
for(int i = 0; i<batch_dists.rows; i++)
|
||||
{
|
||||
Mat r = batch_dists.row(i);
|
||||
double mVal;
|
||||
Point mLoc;
|
||||
minMaxLoc(r, &mVal, NULL, &mLoc, NULL);
|
||||
dists_v.push_back((float)mVal);
|
||||
labels_v.push_back(mLoc.x);
|
||||
}
|
||||
Mat temp_dists(dists_v);
|
||||
Mat temp_labels(labels_v);
|
||||
temp_dists.reshape(1,1).copyTo(dists);
|
||||
temp_labels.reshape(1,1).copyTo(labels);
|
||||
}
|
||||
|
||||
PERF_TEST_P(distanceToCentersFixture, distanceToCenters, ::testing::Combine(::testing::Values(cv::Size(256,256), cv::Size(512,512)), DistType::all()) )
|
||||
{
|
||||
Size size = get<0>(GetParam());
|
||||
int distType = get<1>(GetParam());
|
||||
Mat src(size, CV_32FC1);
|
||||
Mat centers(size, CV_32FC1);
|
||||
Mat dists(cv::Size(src.rows,1), CV_32FC1);
|
||||
Mat labels(cv::Size(src.rows,1), CV_32SC1);
|
||||
declare.in(src, centers, WARMUP_RNG).out(dists, labels);
|
||||
if (RUN_OCL_IMPL)
|
||||
{
|
||||
ocl::oclMat ocl_src(src);
|
||||
ocl::oclMat ocl_centers(centers);
|
||||
ocl::oclMat ocl_dists(dists);
|
||||
ocl::oclMat ocl_labels(labels);
|
||||
|
||||
OCL_TEST_CYCLE() ocl::distanceToCenters(ocl_dists,ocl_labels,ocl_src, ocl_centers, distType);
|
||||
|
||||
ocl_dists.download(dists);
|
||||
ocl_labels.download(labels);
|
||||
|
||||
SANITY_CHECK(dists, 1e-6, ERROR_RELATIVE);
|
||||
SANITY_CHECK(labels);
|
||||
}
|
||||
else if (RUN_PLAIN_IMPL)
|
||||
{
|
||||
TEST_CYCLE() distanceToCentersPerfTest(src,centers,dists,labels,distType);
|
||||
SANITY_CHECK(dists, 1e-6, ERROR_RELATIVE);
|
||||
SANITY_CHECK(labels);
|
||||
}
|
||||
else
|
||||
OCL_PERF_ELSE
|
||||
}
|
||||
|
@ -156,15 +156,17 @@ PERF_TEST_P(setToFixture, setTo,
|
||||
OCL_PERF_ELSE
|
||||
}
|
||||
|
||||
#if 0
|
||||
|
||||
/////////////////// upload ///////////////////////////
|
||||
|
||||
typedef tuple<Size, int, int> uploadParams;
|
||||
typedef tuple<Size, MatDepth, int> uploadParams;
|
||||
typedef TestBaseWithParam<uploadParams> uploadFixture;
|
||||
|
||||
PERF_TEST_P(uploadFixture, upload,
|
||||
testing::Combine(
|
||||
OCL_TYPICAL_MAT_SIZES,
|
||||
testing::Range(CV_8U, CV_64F),
|
||||
testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F),
|
||||
testing::Range(1, 5)))
|
||||
{
|
||||
const uploadParams params = GetParam();
|
||||
@ -200,7 +202,7 @@ typedef TestBaseWithParam<uploadParams> downloadFixture;
|
||||
PERF_TEST_P(downloadFixture, download,
|
||||
testing::Combine(
|
||||
OCL_TYPICAL_MAT_SIZES,
|
||||
testing::Range(CV_8U, CV_64F),
|
||||
testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F),
|
||||
testing::Range(1, 5)))
|
||||
{
|
||||
const uploadParams params = GetParam();
|
||||
@ -228,3 +230,5 @@ PERF_TEST_P(downloadFixture, download,
|
||||
|
||||
SANITY_CHECK_NOTHING();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -26,7 +26,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
// and/or other Materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -49,41 +49,42 @@
|
||||
using namespace perf;
|
||||
using std::tr1::tuple;
|
||||
using std::tr1::get;
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
using namespace cvtest;
|
||||
using namespace testing;
|
||||
using namespace std;
|
||||
|
||||
|
||||
///////////// Moments ////////////////////////
|
||||
//*! performance of image
|
||||
typedef tuple<Size, MatType, bool> MomentsParamType;
|
||||
typedef TestBaseWithParam<MomentsParamType> MomentsFixture;
|
||||
|
||||
typedef Size_MatType MomentsFixture;
|
||||
|
||||
PERF_TEST_P(MomentsFixture, DISABLED_Moments,
|
||||
::testing::Combine(OCL_TYPICAL_MAT_SIZES,
|
||||
OCL_PERF_ENUM(CV_8UC1, CV_16SC1, CV_32FC1, CV_64FC1))) // TODO does not work properly (see below)
|
||||
PERF_TEST_P(MomentsFixture, Moments,
|
||||
::testing::Combine(OCL_TYPICAL_MAT_SIZES,
|
||||
OCL_PERF_ENUM(CV_8UC1, CV_16SC1, CV_16UC1, CV_32FC1), ::testing::Bool()))
|
||||
{
|
||||
const Size_MatType_t params = GetParam();
|
||||
const MomentsParamType params = GetParam();
|
||||
const Size srcSize = get<0>(params);
|
||||
const int type = get<1>(params);
|
||||
const bool binaryImage = get<2>(params);
|
||||
|
||||
Mat src(srcSize, type), dst(7, 1, CV_64F);
|
||||
const bool binaryImage = false;
|
||||
Mat src(srcSize, type), dst(7, 1, CV_64F);
|
||||
randu(src, 0, 255);
|
||||
|
||||
oclMat src_d(src);
|
||||
cv::Moments mom;
|
||||
|
||||
declare.in(src, WARMUP_RNG).out(dst);
|
||||
|
||||
if (RUN_OCL_IMPL)
|
||||
{
|
||||
ocl::oclMat oclSrc(src);
|
||||
|
||||
OCL_TEST_CYCLE() mom = cv::ocl::ocl_moments(oclSrc, binaryImage); // TODO Use oclSrc
|
||||
cv::HuMoments(mom, dst);
|
||||
|
||||
SANITY_CHECK(dst);
|
||||
OCL_TEST_CYCLE() mom = cv::ocl::ocl_moments(src_d, binaryImage);
|
||||
}
|
||||
else if (RUN_PLAIN_IMPL)
|
||||
{
|
||||
TEST_CYCLE() mom = cv::moments(src, binaryImage);
|
||||
cv::HuMoments(mom, dst);
|
||||
|
||||
SANITY_CHECK(dst);
|
||||
}
|
||||
else
|
||||
OCL_PERF_ELSE
|
||||
cv::HuMoments(mom, dst);
|
||||
SANITY_CHECK(dst, 2e-1);
|
||||
}
|
||||
|
@ -474,9 +474,13 @@ static void arithmetic_minMax_run(const oclMat &src, const oclMat & mask, cl_mem
|
||||
|
||||
std::ostringstream stream;
|
||||
stream << "-D T=" << typeMap[src.depth()] << channelMap[src.channels()];
|
||||
stream << " -D MAX_VAL=" << (WT)std::numeric_limits<T>::max();
|
||||
stream << " -D MIN_VAL=" << (std::numeric_limits<T>::is_integer ?
|
||||
(WT)std::numeric_limits<T>::min() : -(WT)(std::numeric_limits<T>::max()));
|
||||
if (std::numeric_limits<T>::is_integer)
|
||||
{
|
||||
stream << " -D MAX_VAL=" << (WT)std::numeric_limits<T>::max();
|
||||
stream << " -D MIN_VAL=" << (WT)std::numeric_limits<T>::min();
|
||||
}
|
||||
else
|
||||
stream << " -D DEPTH_" << src.depth();
|
||||
std::string buildOptions = stream.str();
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
@ -684,7 +688,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
|
||||
break;
|
||||
}
|
||||
if (isRelative)
|
||||
r = r / norm(src2, normType);
|
||||
r = r / (norm(src2, normType) + DBL_EPSILON);
|
||||
|
||||
return r;
|
||||
}
|
||||
@ -693,83 +697,47 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
|
||||
////////////////////////////////// flip //////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, String kernelName)
|
||||
enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS };
|
||||
|
||||
static void arithmetic_flip_run(const oclMat &src, oclMat &dst, String kernelName, int flipType)
|
||||
{
|
||||
int channels = dst.oclchannels();
|
||||
int depth = dst.depth();
|
||||
int cols = dst.cols, rows = dst.rows;
|
||||
if ((cols == 1 && flipType == FLIP_COLS) ||
|
||||
(rows == 1 && flipType == FLIP_ROWS) ||
|
||||
(rows == 1 && cols == 1 && flipType == FLIP_BOTH))
|
||||
{
|
||||
src.copyTo(dst);
|
||||
return;
|
||||
}
|
||||
|
||||
int vector_lengths[4][7] = {{4, 4, 4, 4, 1, 1, 1},
|
||||
{4, 4, 4, 4, 1, 1, 1},
|
||||
{4, 4, 4, 4, 1, 1, 1},
|
||||
{4, 4, 4, 4, 1, 1, 1}
|
||||
};
|
||||
cols = flipType == FLIP_COLS ? divUp(cols, 2) : cols;
|
||||
rows = flipType & FLIP_ROWS ? divUp(rows, 2) : rows;
|
||||
|
||||
size_t vector_length = vector_lengths[channels - 1][depth];
|
||||
int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
|
||||
|
||||
int cols = divUp(dst.cols * channels + offset_cols, vector_length);
|
||||
int rows = divUp(dst.rows, 2);
|
||||
const char * const channelMap[] = { "", "", "2", "4", "4" };
|
||||
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
|
||||
std::string buildOptions = format("-D T=%s%s", typeMap[dst.depth()], channelMap[dst.oclchannels()]);
|
||||
|
||||
size_t localThreads[3] = { 64, 4, 1 };
|
||||
size_t globalThreads[3] = { cols, rows, 1 };
|
||||
|
||||
int dst_step1 = dst.cols * dst.elemSize();
|
||||
int elemSize = src.elemSize();
|
||||
int src_step = src.step / elemSize, src_offset = src.offset / elemSize;
|
||||
int dst_step = dst.step / elemSize, dst_offset = dst.offset / elemSize;
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
|
||||
|
||||
openCLExecuteKernel(src.clCxt, &arithm_flip, kernelName, globalThreads, localThreads, args, -1, depth);
|
||||
}
|
||||
|
||||
static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, String kernelName, bool isVertical)
|
||||
{
|
||||
int channels = dst.oclchannels();
|
||||
int depth = dst.depth();
|
||||
|
||||
int vector_lengths[4][7] = {{1, 1, 1, 1, 1, 1, 1},
|
||||
{1, 1, 1, 1, 1, 1, 1},
|
||||
{1, 1, 1, 1, 1, 1, 1},
|
||||
{1, 1, 1, 1, 1, 1, 1}
|
||||
};
|
||||
|
||||
size_t vector_length = vector_lengths[channels - 1][depth];
|
||||
int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1);
|
||||
int cols = divUp(dst.cols + offset_cols, vector_length);
|
||||
cols = isVertical ? cols : divUp(cols, 2);
|
||||
int rows = isVertical ? divUp(dst.rows, 2) : dst.rows;
|
||||
|
||||
size_t localThreads[3] = { 64, 4, 1 };
|
||||
size_t globalThreads[3] = { cols, rows, 1 };
|
||||
|
||||
int dst_step1 = dst.cols * dst.elemSize();
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
|
||||
|
||||
if (isVertical)
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
|
||||
else
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
|
||||
|
||||
const cv::ocl::ProgramEntry* source = isVertical ? &arithm_flip_rc : &arithm_flip;
|
||||
|
||||
openCLExecuteKernel(src.clCxt, source, kernelName, globalThreads, localThreads, args, src.oclchannels(), depth);
|
||||
openCLExecuteKernel(src.clCxt, &arithm_flip, kernelName, globalThreads, localThreads, args,
|
||||
-1, -1, buildOptions.c_str());
|
||||
}
|
||||
|
||||
void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode)
|
||||
@ -783,11 +751,11 @@ void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode)
|
||||
dst.create(src.size(), src.type());
|
||||
|
||||
if (flipCode == 0)
|
||||
arithmetic_flip_rows_run(src, dst, "arithm_flip_rows");
|
||||
arithmetic_flip_run(src, dst, "arithm_flip_rows", FLIP_ROWS);
|
||||
else if (flipCode > 0)
|
||||
arithmetic_flip_cols_run(src, dst, "arithm_flip_cols", false);
|
||||
arithmetic_flip_run(src, dst, "arithm_flip_cols", FLIP_COLS);
|
||||
else
|
||||
arithmetic_flip_cols_run(src, dst, "arithm_flip_rc", true);
|
||||
arithmetic_flip_run(src, dst, "arithm_flip_rows_cols", FLIP_BOTH);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -49,35 +49,51 @@
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
|
||||
void cv::ocl::blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &weights1, const oclMat &weights2,
|
||||
oclMat &result)
|
||||
void cv::ocl::blendLinear(const oclMat &src1, const oclMat &src2, const oclMat &weights1, const oclMat &weights2,
|
||||
oclMat &dst)
|
||||
{
|
||||
cv::ocl::Context *ctx = img1.clCxt;
|
||||
CV_Assert(ctx == img2.clCxt && ctx == weights1.clCxt && ctx == weights2.clCxt);
|
||||
int channels = img1.oclchannels();
|
||||
int depth = img1.depth();
|
||||
int rows = img1.rows;
|
||||
int cols = img1.cols;
|
||||
int istep = img1.step1();
|
||||
int wstep = weights1.step1();
|
||||
size_t globalSize[] = {cols * channels / 4, rows, 1};
|
||||
size_t localSize[] = {256, 1, 1};
|
||||
CV_Assert(src1.depth() <= CV_32F);
|
||||
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
|
||||
CV_Assert(weights1.size() == weights2.size() && weights1.size() == src1.size() &&
|
||||
weights1.type() == CV_32FC1 && weights2.type() == CV_32FC1);
|
||||
|
||||
dst.create(src1.size(), src1.type());
|
||||
|
||||
size_t globalSize[] = { dst.cols, dst.rows, 1};
|
||||
size_t localSize[] = { 16, 16, 1 };
|
||||
|
||||
int depth = dst.depth(), ocn = dst.oclchannels();
|
||||
int src1_step = src1.step / src1.elemSize(), src1_offset = src1.offset / src1.elemSize();
|
||||
int src2_step = src2.step / src2.elemSize(), src2_offset = src2.offset / src2.elemSize();
|
||||
int weight1_step = weights1.step / weights1.elemSize(), weight1_offset = weights1.offset / weights1.elemSize();
|
||||
int weight2_step = weights2.step / weights2.elemSize(), weight2_offset = weights2.offset / weights2.elemSize();
|
||||
int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize();
|
||||
|
||||
const char * const channelMap[] = { "", "", "2", "4", "4" };
|
||||
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
|
||||
std::string buildOptions = format("-D T=%s%s -D convertToT=convert_%s%s%s -D FT=float%s -D convertToFT=convert_float%s",
|
||||
typeMap[depth], channelMap[ocn], typeMap[depth], channelMap[ocn],
|
||||
depth >= CV_32S ? "" : "_sat_rte", channelMap[ocn], channelMap[ocn]);
|
||||
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
result.create(img1.size(), CV_MAKE_TYPE(depth,img1.channels()));
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&img1.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&img2.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights1.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights2.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&istep ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&wstep ));
|
||||
String kernelName = "BlendLinear";
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_offset ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_offset ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights1.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight1_offset ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight1_step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights2.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight2_offset ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight2_step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
|
||||
|
||||
openCLExecuteKernel(ctx, &blend_linear, kernelName, globalSize, localSize, args, channels, depth);
|
||||
}
|
||||
openCLExecuteKernel(src1.clCxt, &blend_linear, "blendLinear", globalSize, localSize, args,
|
||||
-1, -1, buildOptions.c_str());
|
||||
}
|
||||
|
@ -49,7 +49,7 @@
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
|
||||
cv::ocl::CannyBuf::CannyBuf(const oclMat &dx_, const oclMat &dy_) : dx(dx_), dy(dy_), counter(NULL)
|
||||
cv::ocl::CannyBuf::CannyBuf(const oclMat &dx_, const oclMat &dy_) : dx(dx_), dy(dy_), counter(1, 1, CV_32SC1)
|
||||
{
|
||||
CV_Assert(dx_.type() == CV_32SC1 && dy_.type() == CV_32SC1 && dx_.size() == dy_.size());
|
||||
|
||||
@ -81,17 +81,8 @@ void cv::ocl::CannyBuf::create(const Size &image_size, int apperture_size)
|
||||
ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, magBuf);
|
||||
ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, mapBuf);
|
||||
|
||||
ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1);
|
||||
ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2);
|
||||
|
||||
int counter_i [1] = { 0 };
|
||||
int err = 0;
|
||||
if(counter)
|
||||
{
|
||||
openCLFree(counter);
|
||||
}
|
||||
counter = clCreateBuffer( *((cl_context*)getClContextPtr()), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err );
|
||||
openCLSafeCall(err);
|
||||
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf1);
|
||||
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf2);
|
||||
}
|
||||
|
||||
void cv::ocl::CannyBuf::release()
|
||||
@ -104,11 +95,6 @@ void cv::ocl::CannyBuf::release()
|
||||
mapBuf.release();
|
||||
trackBuf1.release();
|
||||
trackBuf2.release();
|
||||
if(counter)
|
||||
{
|
||||
openCLFree(counter);
|
||||
counter = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
namespace cv
|
||||
@ -124,9 +110,9 @@ namespace cv
|
||||
|
||||
void calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int rows, int cols, float low_thresh, float high_thresh);
|
||||
|
||||
void edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols);
|
||||
void edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, oclMat& counter, int rows, int cols);
|
||||
|
||||
void edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols);
|
||||
void edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, oclMat& counter, int rows, int cols);
|
||||
|
||||
void getEdges_gpu(oclMat &map, oclMat &dst, int rows, int cols);
|
||||
}
|
||||
@ -320,54 +306,61 @@ void canny::calcMap_gpu(oclMat &dx, oclMat &dy, oclMat &mag, oclMat &map, int ro
|
||||
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols)
|
||||
void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, oclMat& counter, int rows, int cols)
|
||||
{
|
||||
Context *clCxt = map.clCxt;
|
||||
String kernelName = "edgesHysteresisLocal";
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
|
||||
Mat counterMat(counter.rows, counter.cols, counter.type());
|
||||
counterMat.at<int>(0, 0) = 0;
|
||||
counter.upload(counterMat);
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset));
|
||||
cl_int stepBytes = map.step;
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&stepBytes));
|
||||
cl_int offsetBytes = map.offset;
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&offsetBytes));
|
||||
|
||||
size_t globalThreads[3] = {cols, rows, 1};
|
||||
size_t localThreads[3] = {16, 16, 1};
|
||||
|
||||
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisLocal", globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols)
|
||||
void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, oclMat& counter, int rows, int cols)
|
||||
{
|
||||
unsigned int count;
|
||||
openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL));
|
||||
Context *clCxt = map.clCxt;
|
||||
String kernelName = "edgesHysteresisGlobal";
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
size_t localThreads[3] = {128, 1, 1};
|
||||
|
||||
int count_i[1] = {0};
|
||||
while(count > 0)
|
||||
while(1 > 0)
|
||||
{
|
||||
openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL));
|
||||
Mat counterMat; counter.download(counterMat);
|
||||
int count = counterMat.at<int>(0, 0);
|
||||
CV_Assert(count >= 0);
|
||||
if (count == 0)
|
||||
break;
|
||||
|
||||
counterMat.at<int>(0, 0) = 0;
|
||||
counter.upload(counterMat);
|
||||
|
||||
args.clear();
|
||||
size_t globalThreads[3] = {std::min(count, 65535u) * 128, divUp(count, 65535), 1};
|
||||
size_t globalThreads[3] = {std::min((unsigned)count, 65535u) * 128, divUp(count, 65535), 1};
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st2.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&count));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset));
|
||||
|
||||
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
|
||||
openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisGlobal", globalThreads, localThreads, args, -1, -1);
|
||||
std::swap(st1, st2);
|
||||
}
|
||||
}
|
||||
|
@ -448,6 +448,17 @@ static int initializeOpenCLDevices()
|
||||
{
|
||||
deviceInfo.info.haveDoubleSupport = false;
|
||||
}
|
||||
|
||||
size_t intel_platform = platformInfo.info.platformVendor.find("Intel");
|
||||
if(intel_platform != std::string::npos)
|
||||
{
|
||||
deviceInfo.info.compilationExtraOptions += " -D INTEL_DEVICE";
|
||||
deviceInfo.info.isIntelDevice = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
deviceInfo.info.isIntelDevice = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -471,7 +482,7 @@ DeviceInfo::DeviceInfo()
|
||||
deviceVendorId(-1),
|
||||
maxWorkGroupSize(0), maxComputeUnits(0), localMemorySize(0), maxMemAllocSize(0),
|
||||
deviceVersionMajor(0), deviceVersionMinor(0),
|
||||
haveDoubleSupport(false), isUnifiedMemory(false),
|
||||
haveDoubleSupport(false), isUnifiedMemory(false),isIntelDevice(false),
|
||||
platform(NULL)
|
||||
{
|
||||
// nothing
|
||||
@ -572,6 +583,8 @@ bool ContextImpl::supportsFeature(FEATURE_TYPE featureType) const
|
||||
{
|
||||
switch (featureType)
|
||||
{
|
||||
case FEATURE_CL_INTEL_DEVICE:
|
||||
return deviceInfo.isIntelDevice;
|
||||
case FEATURE_CL_DOUBLE:
|
||||
return deviceInfo.haveDoubleSupport;
|
||||
case FEATURE_CL_UNIFIED_MEM:
|
||||
|
@ -109,6 +109,31 @@ cl_mem openCLCreateBuffer(Context *ctx, size_t flag , size_t size)
|
||||
return buffer;
|
||||
}
|
||||
|
||||
//#define CHECK_MEMORY_CORRUPTION
|
||||
#ifdef CHECK_MEMORY_CORRUPTION
|
||||
//#define CHECK_MEMORY_CORRUPTION_PRINT_ERROR
|
||||
#define CHECK_MEMORY_CORRUPTION_RAISE_ERROR
|
||||
static const int __memory_corruption_check_bytes = 1024*1024;
|
||||
static const int __memory_corruption_check_pattern = 0x14326547; // change pattern for sizeof(int)==8
|
||||
struct CheckBuffers
|
||||
{
|
||||
cl_mem mainBuffer;
|
||||
size_t size;
|
||||
size_t widthInBytes, height;
|
||||
CheckBuffers()
|
||||
: mainBuffer(NULL), size(0), widthInBytes(0), height(0)
|
||||
{
|
||||
// nothing
|
||||
}
|
||||
CheckBuffers(cl_mem _mainBuffer, size_t _size, size_t _widthInBytes, size_t _height)
|
||||
: mainBuffer(_mainBuffer), size(_size), widthInBytes(_widthInBytes), height(_height)
|
||||
{
|
||||
// notihng
|
||||
}
|
||||
};
|
||||
static std::map<cl_mem, CheckBuffers> __check_buffers;
|
||||
#endif
|
||||
|
||||
void openCLMallocPitch(Context *ctx, void **dev_ptr, size_t *pitch,
|
||||
size_t widthInBytes, size_t height)
|
||||
{
|
||||
@ -119,9 +144,34 @@ void openCLMallocPitchEx(Context *ctx, void **dev_ptr, size_t *pitch,
|
||||
size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
|
||||
{
|
||||
cl_int status;
|
||||
size_t size = widthInBytes * height;
|
||||
#ifndef CHECK_MEMORY_CORRUPTION
|
||||
*dev_ptr = clCreateBuffer(getClContext(ctx), gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
|
||||
widthInBytes * height, 0, &status);
|
||||
size, 0, &status);
|
||||
openCLVerifyCall(status);
|
||||
#else
|
||||
size_t allocSize = size + __memory_corruption_check_bytes * 2;
|
||||
cl_mem mainBuffer = clCreateBuffer(getClContext(ctx), gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
|
||||
allocSize, 0, &status);
|
||||
openCLVerifyCall(status);
|
||||
cl_buffer_region r = {__memory_corruption_check_bytes, size};
|
||||
*dev_ptr = clCreateSubBuffer(mainBuffer,
|
||||
gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
|
||||
CL_BUFFER_CREATE_TYPE_REGION, &r,
|
||||
&status);
|
||||
openCLVerifyCall(status);
|
||||
std::vector<int> tmp(__memory_corruption_check_bytes / sizeof(int),
|
||||
__memory_corruption_check_pattern);
|
||||
CV_Assert(tmp.size() * sizeof(int) == __memory_corruption_check_bytes);
|
||||
openCLVerifyCall(clEnqueueWriteBuffer(getClCommandQueue(ctx),
|
||||
mainBuffer, CL_TRUE, 0, __memory_corruption_check_bytes, &tmp[0],
|
||||
0, NULL, NULL));
|
||||
openCLVerifyCall(clEnqueueWriteBuffer(getClCommandQueue(ctx),
|
||||
mainBuffer, CL_TRUE, __memory_corruption_check_bytes + size, __memory_corruption_check_bytes, &tmp[0],
|
||||
0, NULL, NULL));
|
||||
CheckBuffers data(mainBuffer, size, widthInBytes, height);
|
||||
__check_buffers.insert(std::pair<cl_mem, CheckBuffers>((cl_mem)*dev_ptr, data));
|
||||
#endif
|
||||
*pitch = widthInBytes;
|
||||
}
|
||||
|
||||
@ -174,7 +224,59 @@ void openCLCopyBuffer2D(Context *ctx, void *dst, size_t dpitch, int dst_offset,
|
||||
|
||||
void openCLFree(void *devPtr)
|
||||
{
|
||||
#ifdef CHECK_MEMORY_CORRUPTION
|
||||
bool failBefore = false, failAfter = false;
|
||||
CheckBuffers data;
|
||||
std::map<cl_mem, CheckBuffers>::iterator i = __check_buffers.find((cl_mem)devPtr);
|
||||
if (i != __check_buffers.end())
|
||||
{
|
||||
data = i->second;
|
||||
Context* ctx = Context::getContext();
|
||||
std::vector<uchar> checkBefore(__memory_corruption_check_bytes);
|
||||
std::vector<uchar> checkAfter(__memory_corruption_check_bytes);
|
||||
openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(ctx),
|
||||
data.mainBuffer, CL_TRUE, 0, __memory_corruption_check_bytes, &checkBefore[0],
|
||||
0, NULL, NULL));
|
||||
openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(ctx),
|
||||
data.mainBuffer, CL_TRUE, __memory_corruption_check_bytes + data.size, __memory_corruption_check_bytes, &checkAfter[0],
|
||||
0, NULL, NULL));
|
||||
|
||||
std::vector<int> tmp(__memory_corruption_check_bytes / sizeof(int),
|
||||
__memory_corruption_check_pattern);
|
||||
|
||||
if (memcmp(&checkBefore[0], &tmp[0], __memory_corruption_check_bytes) != 0)
|
||||
{
|
||||
failBefore = true;
|
||||
}
|
||||
if (memcmp(&checkAfter[0], &tmp[0], __memory_corruption_check_bytes) != 0)
|
||||
{
|
||||
failAfter = true;
|
||||
}
|
||||
openCLSafeCall(clReleaseMemObject(data.mainBuffer));
|
||||
__check_buffers.erase(i);
|
||||
}
|
||||
#endif
|
||||
openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
|
||||
#ifdef CHECK_MEMORY_CORRUPTION
|
||||
if (failBefore)
|
||||
{
|
||||
#ifdef CHECK_MEMORY_CORRUPTION_PRINT_ERROR
|
||||
std::cerr << "ERROR: Memory corruption detected: before buffer: " << cv::format("widthInBytes=%d height=%d", (int)data.widthInBytes, (int)data.height) << std::endl;
|
||||
#endif
|
||||
#ifdef CHECK_MEMORY_CORRUPTION_RAISE_ERROR
|
||||
CV_Error(CV_StsInternal, "Memory corruption detected: before buffer");
|
||||
#endif
|
||||
}
|
||||
if (failAfter)
|
||||
{
|
||||
#ifdef CHECK_MEMORY_CORRUPTION_PRINT_ERROR
|
||||
std::cerr << "ERROR: Memory corruption detected: after buffer: " << cv::format("widthInBytes=%d height=%d", (int)data.widthInBytes, (int)data.height) << std::endl;
|
||||
#endif
|
||||
#ifdef CHECK_MEMORY_CORRUPTION_RAISE_ERROR
|
||||
CV_Error(CV_StsInternal, "Memory corruption detected: after buffer");
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName)
|
||||
@ -234,8 +336,7 @@ static std::string removeDuplicatedWhiteSpaces(const char * buildOptions)
|
||||
return opt;
|
||||
}
|
||||
|
||||
void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
|
||||
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels,
|
||||
cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, int channels,
|
||||
int depth, const char *build_options)
|
||||
{
|
||||
//construct kernel name
|
||||
@ -248,10 +349,14 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, Str
|
||||
idxStr << "_D" << depth;
|
||||
kernelName = kernelName + idxStr.str();
|
||||
|
||||
cl_kernel kernel;
|
||||
std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options);
|
||||
kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
|
||||
cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
|
||||
return kernel;
|
||||
}
|
||||
|
||||
void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
|
||||
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args)
|
||||
{
|
||||
if ( localThreads != NULL)
|
||||
{
|
||||
globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
|
||||
@ -297,6 +402,15 @@ void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, Str
|
||||
openCLSafeCall(clReleaseKernel(kernel));
|
||||
}
|
||||
|
||||
// Convenience overload: obtains the kernel for (source, kernelName, channels,
// depth, build_options) via openCLGetKernelFromSource and immediately hands it
// to the cl_kernel-based openCLExecuteKernel together with the launch
// geometry and the argument list.
void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
                          size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels,
                          int depth, const char *build_options)
{
    openCLExecuteKernel(ctx,
                        openCLGetKernelFromSource(ctx, source, kernelName, channels, depth, build_options),
                        globalThreads, localThreads, args);
}
|
||||
|
||||
void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName,
|
||||
size_t globalThreads[3], size_t localThreads[3],
|
||||
std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
|
||||
|
@ -428,7 +428,7 @@ struct ProgramFileCache
|
||||
|
||||
if(status != CL_SUCCESS)
|
||||
{
|
||||
if(status == CL_BUILD_PROGRAM_FAILURE)
|
||||
if (status == CL_BUILD_PROGRAM_FAILURE || status == CL_INVALID_BUILD_OPTIONS)
|
||||
{
|
||||
size_t buildLogSize = 0;
|
||||
openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx),
|
||||
|
@ -11,7 +11,7 @@
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
@ -69,37 +69,14 @@ inline void normalizeAnchor(Point &anchor, const Size &ksize)
|
||||
normalizeAnchor(anchor.y, ksize.height);
|
||||
}
|
||||
|
||||
inline void normalizeROI(Rect &roi, const Size &ksize, const Point &anchor, const Size &src_size)
|
||||
inline void normalizeROI(Rect &roi, const Size &ksize, const Point &/*anchor*/, const Size &src_size)
|
||||
{
|
||||
if (roi == Rect(0, 0, -1, -1))
|
||||
roi = Rect(0, 0, src_size.width, src_size.height);
|
||||
|
||||
CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1));
|
||||
CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1));
|
||||
CV_Assert(roi.x >= 0 && roi.y >= 0 && roi.width <= src_size.width && roi.height <= src_size.height);
|
||||
}
|
||||
|
||||
|
||||
inline void normalizeKernel(const Mat &kernel, oclMat &gpu_krnl, int type = CV_8U, int *nDivisor = 0, bool reverse = false)
|
||||
{
|
||||
int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1;
|
||||
|
||||
if (nDivisor)
|
||||
*nDivisor = scale;
|
||||
Mat temp(kernel.size(), type);
|
||||
kernel.convertTo(temp, type, scale);
|
||||
Mat cont_krnl = temp.reshape(1, 1);
|
||||
|
||||
if (reverse)
|
||||
{
|
||||
int count = cont_krnl.cols >> 1;
|
||||
|
||||
for (int i = 0; i < count; ++i)
|
||||
std::swap(cont_krnl.at<int>(0, i), cont_krnl.at<int>(0, cont_krnl.cols - 1 - i));
|
||||
}
|
||||
|
||||
gpu_krnl.upload(cont_krnl);
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -168,7 +145,7 @@ typedef void (*GPUMorfFilter_t)(const oclMat & , oclMat & , oclMat & , Size &, c
|
||||
class MorphFilter_GPU : public BaseFilter_GPU
|
||||
{
|
||||
public:
|
||||
MorphFilter_GPU(const Size &ksize_, const Point &anchor_, const oclMat &kernel_, GPUMorfFilter_t func_) :
|
||||
MorphFilter_GPU(const Size &ksize_, const Point &anchor_, const Mat &kernel_, GPUMorfFilter_t func_) :
|
||||
BaseFilter_GPU(ksize_, anchor_, BORDER_CONSTANT), kernel(kernel_), func(func_), rectKernel(false) {}
|
||||
|
||||
virtual void operator()(const oclMat &src, oclMat &dst)
|
||||
@ -345,27 +322,22 @@ static void GPUDilate(const oclMat &src, oclMat &dst, oclMat &mat_kernel,
|
||||
openCLExecuteKernel(clCxt, &filtering_morph, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
|
||||
}
|
||||
|
||||
Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize, Point anchor)
|
||||
Ptr<BaseFilter_GPU> cv::ocl::getMorphologyFilter_GPU(int op, int type, const Mat &_kernel, const Size &ksize, Point anchor)
|
||||
{
|
||||
static const GPUMorfFilter_t GPUMorfFilter_callers[2][5] =
|
||||
{
|
||||
{0, GPUErode, 0, GPUErode, GPUErode },
|
||||
{0, GPUDilate, 0, GPUDilate, GPUDilate}
|
||||
};
|
||||
|
||||
CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
|
||||
CV_Assert(type == CV_8UC1 || type == CV_8UC3 || type == CV_8UC4 || type == CV_32FC1 || type == CV_32FC3 || type == CV_32FC4);
|
||||
|
||||
oclMat gpu_krnl;
|
||||
normalizeKernel(kernel, gpu_krnl);
|
||||
normalizeAnchor(anchor, ksize);
|
||||
Mat kernel8U;
|
||||
_kernel.convertTo(kernel8U, CV_8U);
|
||||
Mat kernel = kernel8U.reshape(1, 1);
|
||||
|
||||
bool noZero = true;
|
||||
for(int i = 0; i < kernel.rows * kernel.cols; ++i)
|
||||
if(kernel.data[i] != 1)
|
||||
if(kernel.at<uchar>(i) != 1)
|
||||
noZero = false;
|
||||
|
||||
MorphFilter_GPU* mfgpu = new MorphFilter_GPU(ksize, anchor, gpu_krnl, GPUMorfFilter_callers[op][CV_MAT_CN(type)]);
|
||||
MorphFilter_GPU* mfgpu = new MorphFilter_GPU(ksize, anchor, kernel, op == MORPH_ERODE ? GPUErode : GPUDilate);
|
||||
if(noZero)
|
||||
mfgpu->rectKernel = true;
|
||||
|
||||
@ -445,14 +417,15 @@ void morphOp(int op, const oclMat &src, oclMat &dst, const Mat &_kernel, Point a
|
||||
else if (iterations > 1 && countNonZero(_kernel) == _kernel.rows * _kernel.cols)
|
||||
{
|
||||
anchor = Point(anchor.x * iterations, anchor.y * iterations);
|
||||
kernel = getStructuringElement(MORPH_RECT, Size(ksize.width + iterations * (ksize.width - 1),
|
||||
ksize.height + iterations * (ksize.height - 1)), anchor);
|
||||
kernel = getStructuringElement(MORPH_RECT, Size(ksize.width + (iterations - 1) * (ksize.width - 1),
|
||||
ksize.height + (iterations - 1) * (ksize.height - 1)), anchor);
|
||||
iterations = 1;
|
||||
}
|
||||
else
|
||||
kernel = _kernel;
|
||||
|
||||
Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations);
|
||||
Ptr<MorphologyFilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations)
|
||||
.staticCast<MorphologyFilterEngine_GPU>();
|
||||
|
||||
f->apply(src, dst);
|
||||
}
|
||||
@ -525,12 +498,12 @@ void cv::ocl::morphologyEx(const oclMat &src, oclMat &dst, int op, const Mat &ke
|
||||
|
||||
namespace
|
||||
{
|
||||
typedef void (*GPUFilter2D_t)(const oclMat & , oclMat & , const oclMat & , const Size &, const Point&, const int);
|
||||
typedef void (*GPUFilter2D_t)(const oclMat & , oclMat & , const Mat & , const Size &, const Point&, const int);
|
||||
|
||||
class LinearFilter_GPU : public BaseFilter_GPU
|
||||
{
|
||||
public:
|
||||
LinearFilter_GPU(const Size &ksize_, const Point &anchor_, const oclMat &kernel_, GPUFilter2D_t func_,
|
||||
LinearFilter_GPU(const Size &ksize_, const Point &anchor_, const Mat &kernel_, GPUFilter2D_t func_,
|
||||
int borderType_) :
|
||||
BaseFilter_GPU(ksize_, anchor_, borderType_), kernel(kernel_), func(func_) {}
|
||||
|
||||
@ -539,123 +512,217 @@ public:
|
||||
func(src, dst, kernel, ksize, anchor, borderType) ;
|
||||
}
|
||||
|
||||
oclMat kernel;
|
||||
Mat kernel;
|
||||
GPUFilter2D_t func;
|
||||
};
|
||||
}
|
||||
|
||||
static void GPUFilter2D(const oclMat &src, oclMat &dst, const oclMat &mat_kernel,
|
||||
// prepare kernel: transpose and make double rows (+align). Returns size of aligned row
|
||||
// Samples:
|
||||
// a b c
|
||||
// Input: d e f
|
||||
// g h i
|
||||
// Output, last two zeros is the alignment:
|
||||
// a d g a d g 0 0
|
||||
// b e h b e h 0 0
|
||||
// c f i c f i 0 0
|
||||
template <typename T>
|
||||
static int _prepareKernelFilter2D(std::vector<T>& data, const Mat &kernel)
|
||||
{
|
||||
Mat _kernel; kernel.convertTo(_kernel, DataDepth<T>::value);
|
||||
int size_y_aligned = roundUp(kernel.rows * 2, 4);
|
||||
data.clear(); data.resize(size_y_aligned * kernel.cols, 0);
|
||||
for (int x = 0; x < kernel.cols; x++)
|
||||
{
|
||||
for (int y = 0; y < kernel.rows; y++)
|
||||
{
|
||||
data[x * size_y_aligned + y] = _kernel.at<T>(y, x);
|
||||
data[x * size_y_aligned + y + kernel.rows] = _kernel.at<T>(y, x);
|
||||
}
|
||||
}
|
||||
return size_y_aligned;
|
||||
}
|
||||
|
||||
static void GPUFilter2D(const oclMat &src, oclMat &dst, const Mat &kernel,
|
||||
const Size &ksize, const Point& anchor, const int borderType)
|
||||
{
|
||||
CV_Assert(src.clCxt == dst.clCxt);
|
||||
CV_Assert((src.cols == dst.cols) &&
|
||||
(src.rows == dst.rows));
|
||||
CV_Assert((src.oclchannels() == dst.oclchannels()));
|
||||
CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1));
|
||||
CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1));
|
||||
CV_Assert(ksize.width == ksize.height);
|
||||
Context *clCxt = src.clCxt;
|
||||
CV_Assert(src.oclchannels() == dst.oclchannels());
|
||||
|
||||
int filterWidth = ksize.width;
|
||||
bool ksize_3x3 = filterWidth == 3 && src.type() != CV_32FC4 && src.type() != CV_32FC3; // CV_32FC4 is not tuned up with filter2d_3x3 kernel
|
||||
CV_Assert(kernel.cols == ksize.width && kernel.rows == ksize.height);
|
||||
CV_Assert(kernel.channels() == 1);
|
||||
|
||||
String kernelName = ksize_3x3 ? "filter2D_3x3" : "filter2D";
|
||||
CV_Assert(anchor.x >= 0 && anchor.x < kernel.cols);
|
||||
CV_Assert(anchor.y >= 0 && anchor.y < kernel.rows);
|
||||
|
||||
size_t src_offset_x = (src.offset % src.step) / src.elemSize();
|
||||
size_t src_offset_y = src.offset / src.step;
|
||||
bool useDouble = src.depth() == CV_64F;
|
||||
|
||||
size_t dst_offset_x = (dst.offset % dst.step) / dst.elemSize();
|
||||
size_t dst_offset_y = dst.offset / dst.step;
|
||||
|
||||
int paddingPixels = filterWidth & (-2);
|
||||
|
||||
size_t localThreads[3] = {ksize_3x3 ? 256 : 16, ksize_3x3 ? 1 : 16, 1};
|
||||
size_t globalThreads[3] = {src.wholecols, src.wholerows, 1};
|
||||
|
||||
int cn = src.oclchannels();
|
||||
int src_step = (int)(src.step/src.elemSize());
|
||||
int dst_step = (int)(dst.step/src.elemSize());
|
||||
|
||||
int localWidth = localThreads[0] + paddingPixels;
|
||||
int localHeight = localThreads[1] + paddingPixels;
|
||||
|
||||
size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize();
|
||||
|
||||
int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4},
|
||||
{4, 4, 1, 1, 1, 1, 1},
|
||||
{1, 1, 1, 1, 1, 1, 1},
|
||||
{4, 4, 4, 4, 1, 1, 4}
|
||||
};
|
||||
int cols = dst.cols + ((dst_offset_x) & (vector_lengths[cn - 1][src.depth()] - 1));
|
||||
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_step));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_step));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
|
||||
args.push_back(std::make_pair(localMemSize, (void *)NULL));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset_x));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset_y));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_offset_x));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_offset_y));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&cols));
|
||||
char btype[30];
|
||||
switch (borderType)
|
||||
std::vector<float> kernelDataFloat;
|
||||
std::vector<double> kernelDataDouble;
|
||||
int kernel_size_y2_aligned = useDouble ?
|
||||
_prepareKernelFilter2D<double>(kernelDataDouble, kernel)
|
||||
: _prepareKernelFilter2D<float>(kernelDataFloat, kernel);
|
||||
oclMat oclKernelParameter;
|
||||
if (useDouble)
|
||||
{
|
||||
case 0:
|
||||
sprintf(btype, "BORDER_CONSTANT");
|
||||
break;
|
||||
case 1:
|
||||
sprintf(btype, "BORDER_REPLICATE");
|
||||
break;
|
||||
case 2:
|
||||
sprintf(btype, "BORDER_REFLECT");
|
||||
break;
|
||||
case 3:
|
||||
CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!");
|
||||
return;
|
||||
case 4:
|
||||
sprintf(btype, "BORDER_REFLECT_101");
|
||||
break;
|
||||
oclKernelParameter.createEx(1, kernelDataDouble.size(), CV_64FC1, DEVICE_MEM_R_ONLY, DEVICE_MEM_DEFAULT);
|
||||
openCLMemcpy2D(src.clCxt, oclKernelParameter.data, kernelDataDouble.size()*sizeof(double),
|
||||
&kernelDataDouble[0], kernelDataDouble.size()*sizeof(double),
|
||||
kernelDataDouble.size()*sizeof(double), 1, clMemcpyHostToDevice);
|
||||
}
|
||||
int type = src.depth();
|
||||
char build_options[150];
|
||||
sprintf(build_options, "-D %s -D IMG_C_%d_%d -D CN=%d -D FILTER_SIZE=%d", btype, cn, type, cn, ksize.width);
|
||||
openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
|
||||
else
|
||||
{
|
||||
oclKernelParameter.createEx(1, kernelDataFloat.size(), CV_32FC1, DEVICE_MEM_R_ONLY, DEVICE_MEM_DEFAULT);
|
||||
openCLMemcpy2D(src.clCxt, oclKernelParameter.data, kernelDataFloat.size()*sizeof(float),
|
||||
&kernelDataFloat[0], kernelDataFloat.size()*sizeof(float),
|
||||
kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
|
||||
do {
|
||||
size_t BLOCK_SIZE = tryWorkItems;
|
||||
while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
|
||||
BLOCK_SIZE /= 2;
|
||||
#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
|
||||
size_t BLOCK_SIZE_Y = 1;
|
||||
#else
|
||||
size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
|
||||
while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
|
||||
BLOCK_SIZE_Y *= 2;
|
||||
#endif
|
||||
|
||||
CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
|
||||
|
||||
bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
|
||||
cl_uint stepBytes = src.step;
|
||||
args.push_back( std::make_pair( sizeof(cl_uint), (void *)&stepBytes));
|
||||
int offsetXBytes = src.offset % src.step;
|
||||
int offsetX = offsetXBytes / src.elemSize();
|
||||
CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
|
||||
int offsetY = src.offset / src.step;
|
||||
int endX = (offsetX + src.cols);
|
||||
int endY = (offsetY + src.rows);
|
||||
cl_int rect[4] = {offsetX, offsetY, endX, endY};
|
||||
if (!isIsolatedBorder)
|
||||
{
|
||||
rect[2] = src.wholecols;
|
||||
rect[3] = src.wholerows;
|
||||
}
|
||||
args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
|
||||
cl_uint _stepBytes = dst.step;
|
||||
args.push_back( std::make_pair( sizeof(cl_uint), (void *)&_stepBytes));
|
||||
int _offsetXBytes = dst.offset % dst.step;
|
||||
int _offsetX = _offsetXBytes / dst.elemSize();
|
||||
CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
|
||||
int _offsetY = dst.offset / dst.step;
|
||||
int _endX = (_offsetX + dst.cols);
|
||||
int _endY = (_offsetY + dst.rows);
|
||||
cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
|
||||
args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
|
||||
|
||||
float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
|
||||
double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
|
||||
if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
|
||||
{
|
||||
if (useDouble)
|
||||
args.push_back( std::make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
|
||||
else
|
||||
args.push_back( std::make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
|
||||
}
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data));
|
||||
|
||||
const char* btype = NULL;
|
||||
|
||||
switch (borderType & ~BORDER_ISOLATED)
|
||||
{
|
||||
case BORDER_CONSTANT:
|
||||
btype = "BORDER_CONSTANT";
|
||||
break;
|
||||
case BORDER_REPLICATE:
|
||||
btype = "BORDER_REPLICATE";
|
||||
break;
|
||||
case BORDER_REFLECT:
|
||||
btype = "BORDER_REFLECT";
|
||||
break;
|
||||
case BORDER_WRAP:
|
||||
CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
|
||||
return;
|
||||
case BORDER_REFLECT101:
|
||||
btype = "BORDER_REFLECT_101";
|
||||
break;
|
||||
}
|
||||
|
||||
int requiredTop = anchor.y;
|
||||
int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
|
||||
int requiredBottom = ksize.height - 1 - anchor.y;
|
||||
int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
|
||||
int h = isIsolatedBorder ? src.rows : src.wholerows;
|
||||
int w = isIsolatedBorder ? src.cols : src.wholecols;
|
||||
bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
|
||||
|
||||
char build_options[1024];
|
||||
sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
|
||||
"-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
|
||||
"-D %s -D %s -D %s",
|
||||
(int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
|
||||
src.depth(), src.oclchannels(), useDouble ? 1 : 0,
|
||||
anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned,
|
||||
btype,
|
||||
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
|
||||
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
|
||||
|
||||
size_t lt[3] = {BLOCK_SIZE, 1, 1};
|
||||
size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
|
||||
|
||||
cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_filter2D, "filter2D", -1, -1, build_options);
|
||||
|
||||
size_t kernelWorkGroupSize;
|
||||
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
|
||||
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
|
||||
if (lt[0] > kernelWorkGroupSize)
|
||||
{
|
||||
clReleaseKernel(kernel);
|
||||
CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
|
||||
tryWorkItems = kernelWorkGroupSize;
|
||||
continue;
|
||||
}
|
||||
|
||||
openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
|
||||
} while (false);
|
||||
}
|
||||
|
||||
Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize,
|
||||
Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize,
|
||||
const Point &anchor, int borderType)
|
||||
{
|
||||
static const GPUFilter2D_t GPUFilter2D_callers[] = {0, GPUFilter2D, 0, GPUFilter2D, GPUFilter2D};
|
||||
|
||||
CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_32FC1 || srcType == CV_32FC3 || srcType == CV_32FC4) && dstType == srcType);
|
||||
|
||||
oclMat gpu_krnl;
|
||||
Point norm_archor = anchor;
|
||||
normalizeKernel(kernel, gpu_krnl, CV_32FC1);
|
||||
normalizeAnchor(norm_archor, ksize);
|
||||
|
||||
return makePtr<LinearFilter_GPU>(ksize, anchor, gpu_krnl, GPUFilter2D_callers[CV_MAT_CN(srcType)],
|
||||
borderType);
|
||||
return Ptr<BaseFilter_GPU>(new LinearFilter_GPU(ksize, norm_archor, kernel, GPUFilter2D,
|
||||
borderType));
|
||||
}
|
||||
|
||||
Ptr<FilterEngine_GPU> cv::ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor,
|
||||
int borderType)
|
||||
{
|
||||
Size ksize = kernel.size();
|
||||
Size ksize = kernel.size(); // TODO remove duplicated parameter
|
||||
Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor, borderType);
|
||||
|
||||
return createFilter2D_GPU(linearFilter);
|
||||
}
|
||||
|
||||
void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor, int borderType)
|
||||
void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, Point anchor, double delta, int borderType)
|
||||
{
|
||||
CV_Assert(delta == 0);
|
||||
|
||||
if (ddepth < 0)
|
||||
ddepth = src.depth();
|
||||
|
||||
@ -714,276 +781,146 @@ Ptr<FilterEngine_GPU> cv::ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter
|
||||
return makePtr<SeparableFilterEngine_GPU>(rowFilter, columnFilter);
|
||||
}
|
||||
|
||||
/*
|
||||
**data type supported: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4
|
||||
**support four border types: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101
|
||||
*/
|
||||
|
||||
static void GPUFilterBox_8u_C1R(const oclMat &src, oclMat &dst,
|
||||
static void GPUFilterBox(const oclMat &src, oclMat &dst,
|
||||
Size &ksize, const Point anchor, const int borderType)
|
||||
{
|
||||
//Normalize the result by default
|
||||
float alpha = ksize.height * ksize.width;
|
||||
float alpha = 1.0f / (ksize.height * ksize.width);
|
||||
|
||||
CV_Assert(src.clCxt == dst.clCxt);
|
||||
CV_Assert((src.cols == dst.cols) &&
|
||||
(src.rows == dst.rows));
|
||||
Context *clCxt = src.clCxt;
|
||||
CV_Assert(src.oclchannels() == dst.oclchannels());
|
||||
|
||||
String kernelName = "boxFilter_C1_D0";
|
||||
size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
|
||||
do {
|
||||
size_t BLOCK_SIZE = tryWorkItems;
|
||||
while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
|
||||
BLOCK_SIZE /= 2;
|
||||
size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
|
||||
while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
|
||||
BLOCK_SIZE_Y *= 2;
|
||||
|
||||
char btype[30];
|
||||
CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
|
||||
|
||||
switch (borderType)
|
||||
{
|
||||
case 0:
|
||||
sprintf(btype, "BORDER_CONSTANT");
|
||||
break;
|
||||
case 1:
|
||||
sprintf(btype, "BORDER_REPLICATE");
|
||||
break;
|
||||
case 2:
|
||||
sprintf(btype, "BORDER_REFLECT");
|
||||
break;
|
||||
case 3:
|
||||
CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!");
|
||||
return;
|
||||
case 4:
|
||||
sprintf(btype, "BORDER_REFLECT_101");
|
||||
break;
|
||||
}
|
||||
bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
|
||||
|
||||
char build_options[150];
|
||||
sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype);
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
|
||||
size_t blockSizeX = 256, blockSizeY = 1;
|
||||
size_t gSize = blockSizeX - (ksize.width - 1);
|
||||
size_t threads = (dst.offset % dst.step % 4 + dst.cols + 3) / 4;
|
||||
size_t globalSizeX = threads % gSize == 0 ? threads / gSize * blockSizeX : (threads / gSize + 1) * blockSizeX;
|
||||
size_t globalSizeY = ((dst.rows + 1) / 2) % blockSizeY == 0 ? ((dst.rows + 1) / 2) : (((dst.rows + 1) / 2) / blockSizeY + 1) * blockSizeY;
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
|
||||
cl_uint stepBytes = src.step;
|
||||
args.push_back( std::make_pair( sizeof(cl_uint), (void *)&stepBytes));
|
||||
int offsetXBytes = src.offset % src.step;
|
||||
int offsetX = offsetXBytes / src.elemSize();
|
||||
CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
|
||||
int offsetY = src.offset / src.step;
|
||||
int endX = (offsetX + src.cols);
|
||||
int endY = (offsetY + src.rows);
|
||||
cl_int rect[4] = {offsetX, offsetY, endX, endY};
|
||||
if (!isIsolatedBorder)
|
||||
{
|
||||
rect[2] = src.wholecols;
|
||||
rect[3] = src.wholerows;
|
||||
}
|
||||
args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
|
||||
|
||||
size_t globalThreads[3] = { globalSizeX, globalSizeY, 1 };
|
||||
size_t localThreads[3] = { blockSizeX, blockSizeY, 1 };
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
|
||||
cl_uint _stepBytes = dst.step;
|
||||
args.push_back( std::make_pair( sizeof(cl_uint), (void *)&_stepBytes));
|
||||
int _offsetXBytes = dst.offset % dst.step;
|
||||
int _offsetX = _offsetXBytes / dst.elemSize();
|
||||
CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
|
||||
int _offsetY = dst.offset / dst.step;
|
||||
int _endX = (_offsetX + dst.cols);
|
||||
int _endY = (_offsetY + dst.rows);
|
||||
cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
|
||||
args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), &src.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), &dst.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
|
||||
bool useDouble = src.depth() == CV_64F;
|
||||
|
||||
openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
|
||||
float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
|
||||
double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
|
||||
if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
|
||||
{
|
||||
if (useDouble)
|
||||
args.push_back( std::make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
|
||||
else
|
||||
args.push_back( std::make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
|
||||
}
|
||||
|
||||
double alphaDouble = alpha; // DON'T move into 'if' body
|
||||
if (useDouble)
|
||||
args.push_back( std::make_pair( sizeof(double), (void *)&alphaDouble));
|
||||
else
|
||||
args.push_back( std::make_pair( sizeof(float), (void *)&alpha));
|
||||
|
||||
const char* btype = NULL;
|
||||
|
||||
switch (borderType & ~BORDER_ISOLATED)
|
||||
{
|
||||
case BORDER_CONSTANT:
|
||||
btype = "BORDER_CONSTANT";
|
||||
break;
|
||||
case BORDER_REPLICATE:
|
||||
btype = "BORDER_REPLICATE";
|
||||
break;
|
||||
case BORDER_REFLECT:
|
||||
btype = "BORDER_REFLECT";
|
||||
break;
|
||||
case BORDER_WRAP:
|
||||
CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
|
||||
return;
|
||||
case BORDER_REFLECT101:
|
||||
btype = "BORDER_REFLECT_101";
|
||||
break;
|
||||
}
|
||||
|
||||
int requiredTop = anchor.y;
|
||||
int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
|
||||
int requiredBottom = ksize.height - 1 - anchor.y;
|
||||
int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
|
||||
int h = isIsolatedBorder ? src.rows : src.wholerows;
|
||||
int w = isIsolatedBorder ? src.cols : src.wholecols;
|
||||
bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
|
||||
|
||||
CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well
|
||||
|
||||
char build_options[1024];
|
||||
sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s",
|
||||
(int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
|
||||
src.depth(), src.oclchannels(), useDouble ? 1 : 0,
|
||||
anchor.x, anchor.y, ksize.width, ksize.height,
|
||||
btype,
|
||||
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
|
||||
isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
|
||||
|
||||
size_t lt[3] = {BLOCK_SIZE, 1, 1};
|
||||
size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
|
||||
|
||||
cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_boxFilter, "boxFilter", -1, -1, build_options);
|
||||
|
||||
size_t kernelWorkGroupSize;
|
||||
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
|
||||
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
|
||||
if (lt[0] > kernelWorkGroupSize)
|
||||
{
|
||||
clReleaseKernel(kernel);
|
||||
CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
|
||||
tryWorkItems = kernelWorkGroupSize;
|
||||
continue;
|
||||
}
|
||||
|
||||
openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
|
||||
} while (false);
|
||||
}
|
||||
|
||||
static void GPUFilterBox_8u_C4R(const oclMat &src, oclMat &dst,
|
||||
Size &ksize, const Point anchor, const int borderType)
|
||||
{
|
||||
//Normalize the result by default
|
||||
float alpha = ksize.height * ksize.width;
|
||||
|
||||
CV_Assert(src.clCxt == dst.clCxt);
|
||||
CV_Assert((src.cols == dst.cols) &&
|
||||
(src.rows == dst.rows));
|
||||
Context *clCxt = src.clCxt;
|
||||
|
||||
String kernelName = "boxFilter_C4_D0";
|
||||
|
||||
char btype[30];
|
||||
|
||||
switch (borderType)
|
||||
{
|
||||
case 0:
|
||||
sprintf(btype, "BORDER_CONSTANT");
|
||||
break;
|
||||
case 1:
|
||||
sprintf(btype, "BORDER_REPLICATE");
|
||||
break;
|
||||
case 2:
|
||||
sprintf(btype, "BORDER_REFLECT");
|
||||
break;
|
||||
case 3:
|
||||
CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!");
|
||||
return;
|
||||
case 4:
|
||||
sprintf(btype, "BORDER_REFLECT_101");
|
||||
break;
|
||||
}
|
||||
|
||||
char build_options[150];
|
||||
sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype);
|
||||
|
||||
size_t blockSizeX = 256, blockSizeY = 1;
|
||||
size_t gSize = blockSizeX - ksize.width / 2 * 2;
|
||||
size_t globalSizeX = (src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX;
|
||||
size_t rows_per_thread = 2;
|
||||
size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? ((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
|
||||
|
||||
size_t globalThreads[3] = { globalSizeX, globalSizeY, 1};
|
||||
size_t localThreads[3] = { blockSizeX, blockSizeY, 1};
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), &src.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), &dst.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
|
||||
|
||||
openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
|
||||
}
|
||||
|
||||
static void GPUFilterBox_32F_C1R(const oclMat &src, oclMat &dst,
|
||||
Size &ksize, const Point anchor, const int borderType)
|
||||
{
|
||||
//Normalize the result by default
|
||||
float alpha = ksize.height * ksize.width;
|
||||
|
||||
CV_Assert(src.clCxt == dst.clCxt);
|
||||
CV_Assert((src.cols == dst.cols) &&
|
||||
(src.rows == dst.rows));
|
||||
Context *clCxt = src.clCxt;
|
||||
|
||||
String kernelName = "boxFilter_C1_D5";
|
||||
|
||||
char btype[30];
|
||||
|
||||
switch (borderType)
|
||||
{
|
||||
case 0:
|
||||
sprintf(btype, "BORDER_CONSTANT");
|
||||
break;
|
||||
case 1:
|
||||
sprintf(btype, "BORDER_REPLICATE");
|
||||
break;
|
||||
case 2:
|
||||
sprintf(btype, "BORDER_REFLECT");
|
||||
break;
|
||||
case 3:
|
||||
CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!");
|
||||
return;
|
||||
case 4:
|
||||
sprintf(btype, "BORDER_REFLECT_101");
|
||||
break;
|
||||
}
|
||||
|
||||
char build_options[150];
|
||||
sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype);
|
||||
|
||||
size_t blockSizeX = 256, blockSizeY = 1;
|
||||
size_t gSize = blockSizeX - ksize.width / 2 * 2;
|
||||
size_t globalSizeX = (src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX;
|
||||
size_t rows_per_thread = 2;
|
||||
size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? ((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
|
||||
|
||||
|
||||
size_t globalThreads[3] = { globalSizeX, globalSizeY, 1};
|
||||
size_t localThreads[3] = { blockSizeX, blockSizeY, 1};
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), &src.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), &dst.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
|
||||
|
||||
openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
|
||||
}
|
||||
|
||||
static void GPUFilterBox_32F_C4R(const oclMat &src, oclMat &dst,
|
||||
Size &ksize, const Point anchor, const int borderType)
|
||||
{
|
||||
//Normalize the result by default
|
||||
float alpha = ksize.height * ksize.width;
|
||||
|
||||
CV_Assert(src.clCxt == dst.clCxt);
|
||||
CV_Assert((src.cols == dst.cols) &&
|
||||
(src.rows == dst.rows));
|
||||
Context *clCxt = src.clCxt;
|
||||
|
||||
String kernelName = "boxFilter_C4_D5";
|
||||
|
||||
char btype[30];
|
||||
|
||||
switch (borderType)
|
||||
{
|
||||
case 0:
|
||||
sprintf(btype, "BORDER_CONSTANT");
|
||||
break;
|
||||
case 1:
|
||||
sprintf(btype, "BORDER_REPLICATE");
|
||||
break;
|
||||
case 2:
|
||||
sprintf(btype, "BORDER_REFLECT");
|
||||
break;
|
||||
case 3:
|
||||
CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!");
|
||||
return;
|
||||
case 4:
|
||||
sprintf(btype, "BORDER_REFLECT_101");
|
||||
break;
|
||||
}
|
||||
|
||||
char build_options[150];
|
||||
sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype);
|
||||
|
||||
size_t blockSizeX = 256, blockSizeY = 1;
|
||||
size_t gSize = blockSizeX - ksize.width / 2 * 2;
|
||||
size_t globalSizeX = (src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX;
|
||||
size_t rows_per_thread = 2;
|
||||
size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? ((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
|
||||
|
||||
|
||||
size_t globalThreads[3] = { globalSizeX, globalSizeY, 1};
|
||||
size_t localThreads[3] = { blockSizeX, blockSizeY, 1};
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), &src.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), &dst.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step));
|
||||
|
||||
openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
|
||||
}
|
||||
|
||||
|
||||
Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int srcType, int dstType,
|
||||
Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/,
|
||||
const Size &ksize, Point anchor, int borderType)
|
||||
{
|
||||
static const FilterBox_t FilterBox_callers[2][5] = {{0, GPUFilterBox_8u_C1R, 0, GPUFilterBox_8u_C4R, GPUFilterBox_8u_C4R},
|
||||
{0, GPUFilterBox_32F_C1R, 0, GPUFilterBox_32F_C4R, GPUFilterBox_32F_C4R}
|
||||
};
|
||||
//Remove this check if more data types need to be supported.
|
||||
CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_32FC1 ||
|
||||
srcType == CV_32FC3 || srcType == CV_32FC4) && dstType == srcType);
|
||||
|
||||
normalizeAnchor(anchor, ksize);
|
||||
|
||||
return makePtr<GPUBoxFilter>(ksize, anchor,
|
||||
borderType, FilterBox_callers[(CV_MAT_DEPTH(srcType) == CV_32F)][CV_MAT_CN(srcType)]);
|
||||
return Ptr<BaseFilter_GPU>(new GPUBoxFilter(ksize, anchor,
|
||||
borderType, GPUFilterBox));
|
||||
}
|
||||
|
||||
Ptr<FilterEngine_GPU> cv::ocl::createBoxFilter_GPU(int srcType, int dstType,
|
||||
@ -1373,8 +1310,11 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
|
||||
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, bordertype);
|
||||
}
|
||||
|
||||
void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale)
|
||||
void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale,
|
||||
double delta, int borderType)
|
||||
{
|
||||
CV_Assert(delta == 0);
|
||||
|
||||
if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
|
||||
{
|
||||
CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
|
||||
@ -1383,17 +1323,17 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d
|
||||
|
||||
CV_Assert(ksize == 1 || ksize == 3);
|
||||
|
||||
int K[2][9] =
|
||||
double K[2][9] =
|
||||
{
|
||||
{0, 1, 0, 1, -4, 1, 0, 1, 0},
|
||||
{2, 0, 2, 0, -8, 0, 2, 0, 2}
|
||||
};
|
||||
Mat kernel(3, 3, CV_32S, (void *)K[ksize == 3]);
|
||||
Mat kernel(3, 3, CV_64F, (void *)K[ksize == 3 ? 1 : 0]);
|
||||
|
||||
if (scale != 1)
|
||||
kernel *= scale;
|
||||
|
||||
filter2D(src, dst, ddepth, kernel, Point(-1, -1));
|
||||
filter2D(src, dst, ddepth, kernel, Point(-1, -1), 0, borderType);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -1431,6 +1371,15 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
|
||||
|
||||
void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype)
|
||||
{
|
||||
if (bordertype != BORDER_CONSTANT)
|
||||
{
|
||||
if (src.rows == 1)
|
||||
ksize.height = 1;
|
||||
|
||||
if (src.cols == 1)
|
||||
ksize.width = 1;
|
||||
}
|
||||
|
||||
if (ksize.width == 1 && ksize.height == 1)
|
||||
{
|
||||
src.copyTo(dst);
|
||||
@ -1453,15 +1402,6 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si
|
||||
|
||||
dst.create(src.size(), src.type());
|
||||
|
||||
if (bordertype != BORDER_CONSTANT)
|
||||
{
|
||||
if (src.rows == 1)
|
||||
ksize.height = 1;
|
||||
|
||||
if (src.cols == 1)
|
||||
ksize.width = 1;
|
||||
}
|
||||
|
||||
Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype);
|
||||
f->apply(src, dst);
|
||||
}
|
||||
|
@ -849,16 +849,138 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv:
|
||||
args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&pq ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction ));
|
||||
|
||||
const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
|
||||
if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE))
|
||||
{
|
||||
//setup local group size
|
||||
localThreads[0] = 8;
|
||||
localThreads[1] = 16;
|
||||
localThreads[2] = 1;
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
|
||||
//init maximal number of workgroups
|
||||
int WGNumX = 1+(sizev[0].width /(localThreads[0]));
|
||||
int WGNumY = 1+(sizev[0].height/(localThreads[1]));
|
||||
int WGNumZ = loopcount;
|
||||
int WGNum = 0; //accurate number of non -empty workgroups
|
||||
oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U);
|
||||
{
|
||||
cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status);
|
||||
openCLVerifyCall(status);
|
||||
for(int z=0;z<WGNumZ;++z)
|
||||
{
|
||||
int Width = (scaleinfo[z].width_height >> 16)&0xFFFF;
|
||||
int Height = (scaleinfo[z].width_height >> 0 )& 0xFFFF;
|
||||
for(int y=0;y<WGNumY;++y)
|
||||
{
|
||||
int gy = y*localThreads[1];
|
||||
if(gy>=(Height-cascade->orig_window_size.height))
|
||||
continue; // no data to process
|
||||
for(int x=0;x<WGNumX;++x)
|
||||
{
|
||||
int gx = x*localThreads[0];
|
||||
if(gx>=(Width-cascade->orig_window_size.width))
|
||||
continue; // no data to process
|
||||
|
||||
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
|
||||
// save no-empty workgroup info into array
|
||||
pWGInfo[WGNum].s[0] = scaleinfo[z].width_height;
|
||||
pWGInfo[WGNum].s[1] = (gx << 16) | gy;
|
||||
pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff;
|
||||
memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float));
|
||||
WGNum++;
|
||||
}
|
||||
}
|
||||
}
|
||||
openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,pWGInfo,0,0,0));
|
||||
pWGInfo = NULL;
|
||||
}
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
if(candidate[4 * i + 2] != 0)
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
|
||||
candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
// setup global sizes to have linear array of workgroups with WGNum size
|
||||
globalThreads[0] = localThreads[0]*WGNum;
|
||||
globalThreads[1] = localThreads[1];
|
||||
globalThreads[2] = 1;
|
||||
|
||||
#define NODE_SIZE 12
|
||||
// pack node info to have less memory loads
|
||||
oclMat oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U);
|
||||
{
|
||||
cl_int status;
|
||||
cl_int* pNodesPK = (cl_int*)clEnqueueMapBuffer(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,true,CL_MAP_WRITE, 0, oclNodesPK.step, 0,0,0,&status);
|
||||
openCLVerifyCall(status);
|
||||
//use known local data stride to precalulate indexes
|
||||
int DATA_SIZE_X = (localThreads[0]+cascade->orig_window_size.width);
|
||||
// check that maximal value is less than maximal unsigned short
|
||||
assert(DATA_SIZE_X*cascade->orig_window_size.height+cascade->orig_window_size.width < USHRT_MAX);
|
||||
for(int i = 0;i<nodenum;++i)
|
||||
{//process each node from classifier
|
||||
struct NodePK
|
||||
{
|
||||
unsigned short slm_index[3][4];
|
||||
float weight[3];
|
||||
float threshold;
|
||||
float alpha[2];
|
||||
};
|
||||
struct NodePK * pOut = (struct NodePK *)(pNodesPK + NODE_SIZE*i);
|
||||
for(int k=0;k<3;++k)
|
||||
{// calc 4 short indexes in shared local mem for each rectangle instead of 2 (x,y) pair.
|
||||
int* p = &(node[i].p[k][0]);
|
||||
pOut->slm_index[k][0] = (unsigned short)(p[1]*DATA_SIZE_X+p[0]);
|
||||
pOut->slm_index[k][1] = (unsigned short)(p[1]*DATA_SIZE_X+p[2]);
|
||||
pOut->slm_index[k][2] = (unsigned short)(p[3]*DATA_SIZE_X+p[0]);
|
||||
pOut->slm_index[k][3] = (unsigned short)(p[3]*DATA_SIZE_X+p[2]);
|
||||
}
|
||||
//store used float point values for each node
|
||||
pOut->weight[0] = node[i].weight[0];
|
||||
pOut->weight[1] = node[i].weight[1];
|
||||
pOut->weight[2] = node[i].weight[2];
|
||||
pOut->threshold = node[i].threshold;
|
||||
pOut->alpha[0] = node[i].alpha[0];
|
||||
pOut->alpha[1] = node[i].alpha[1];
|
||||
}
|
||||
openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,pNodesPK,0,0,0));
|
||||
pNodesPK = NULL;
|
||||
}
|
||||
// add 2 additional buffers (WGinfo and packed nodes) as 2 last args
|
||||
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&oclNodesPK.datastart ));
|
||||
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&oclWGInfo.datastart ));
|
||||
|
||||
//form build options for kernel
|
||||
String options = "-D PACKED_CLASSIFIER";
|
||||
options = options + format(" -D NODE_SIZE=%d",NODE_SIZE);
|
||||
options = options + format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
|
||||
options = options + format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
|
||||
options = options + format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
|
||||
options = options + format(" -D LSx=%d",localThreads[0]);
|
||||
options = options + format(" -D LSy=%d",localThreads[1]);
|
||||
options = options + format(" -D SPLITNODE=%d",splitnode);
|
||||
options = options + format(" -D SPLITSTAGE=%d",splitstage);
|
||||
options = options + format(" -D OUTPUTSZ=%d",outputsz);
|
||||
|
||||
// init candiate global count by 0
|
||||
int pattern = 0;
|
||||
openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL));
|
||||
// execute face detector
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str());
|
||||
//read candidate buffer back and put it into host list
|
||||
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
|
||||
assert(candidate[0]<outputsz);
|
||||
//printf("candidate[0]=%d\n",candidate[0]);
|
||||
for(int i = 1; i <= candidate[0]; i++)
|
||||
{
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
|
||||
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
|
||||
|
||||
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
if(candidate[4 * i + 2] != 0)
|
||||
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
|
||||
candidate[4 * i + 2], candidate[4 * i + 3]));
|
||||
}
|
||||
|
||||
free(scaleinfo);
|
||||
free(candidate);
|
||||
@ -934,11 +1056,11 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv:
|
||||
{
|
||||
sz = sizev[i];
|
||||
factor = scalev[i];
|
||||
int ystep = cvRound(std::max(2., factor));
|
||||
int equRect_x = (int)(factor * gcascade->p0 + 0.5);
|
||||
int equRect_y = (int)(factor * gcascade->p1 + 0.5);
|
||||
int equRect_w = (int)(factor * gcascade->p3 + 0.5);
|
||||
int equRect_h = (int)(factor * gcascade->p2 + 0.5);
|
||||
double ystep = std::max(2., factor);
|
||||
int equRect_x = cvRound(factor * gcascade->p0);
|
||||
int equRect_y = cvRound(factor * gcascade->p1);
|
||||
int equRect_w = cvRound(factor * gcascade->p3);
|
||||
int equRect_h = cvRound(factor * gcascade->p2);
|
||||
p[i].s[0] = equRect_x;
|
||||
p[i].s[1] = equRect_y;
|
||||
p[i].s[2] = equRect_x + equRect_w;
|
||||
|
@ -99,79 +99,85 @@ namespace cv
|
||||
/////////////////////////////////////////////////////////////////////////////////////
|
||||
// threshold
|
||||
|
||||
typedef void (*gpuThresh_t)(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type);
|
||||
|
||||
static void threshold_8u(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
|
||||
static std::vector<uchar> scalarToVector(const cv::Scalar & sc, int depth, int ocn, int cn)
|
||||
{
|
||||
uchar thresh_uchar = cvFloor(thresh);
|
||||
uchar max_val = cvRound(maxVal);
|
||||
CV_Assert(ocn == cn || (ocn == 4 && cn == 3));
|
||||
|
||||
size_t cols = (dst.cols + (dst.offset % 16) + 15) / 16;
|
||||
size_t bSizeX = 16, bSizeY = 16;
|
||||
size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
|
||||
size_t gSizeY = dst.rows;
|
||||
size_t globalThreads[3] = {gSizeX, gSizeY, 1};
|
||||
size_t localThreads[3] = {bSizeX, bSizeY, 1};
|
||||
static const int sizeMap[] = { sizeof(uchar), sizeof(char), sizeof(ushort),
|
||||
sizeof(short), sizeof(int), sizeof(float), sizeof(double) };
|
||||
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
args.push_back( std::make_pair(sizeof(cl_mem), &src.data));
|
||||
args.push_back( std::make_pair(sizeof(cl_mem), &dst.data));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.offset));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.step));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.offset));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.step));
|
||||
args.push_back( std::make_pair(sizeof(cl_uchar), (void *)&thresh_uchar));
|
||||
args.push_back( std::make_pair(sizeof(cl_uchar), (void *)&max_val));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&type));
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, src.oclchannels(), src.depth());
|
||||
int elemSize1 = sizeMap[depth];
|
||||
int bufSize = elemSize1 * ocn;
|
||||
std::vector<uchar> _buf(bufSize);
|
||||
uchar * buf = &_buf[0];
|
||||
scalarToRawData(sc, buf, CV_MAKE_TYPE(depth, cn));
|
||||
memset(buf + elemSize1 * cn, 0, (ocn - cn) * elemSize1);
|
||||
|
||||
return _buf;
|
||||
}
|
||||
|
||||
static void threshold_32f(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
|
||||
static void threshold_runner(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType)
|
||||
{
|
||||
float thresh_f = thresh;
|
||||
float max_val = maxVal;
|
||||
int dst_offset = (dst.offset >> 2);
|
||||
int dst_step = (dst.step >> 2);
|
||||
int src_offset = (src.offset >> 2);
|
||||
int src_step = (src.step >> 2);
|
||||
bool ival = src.depth() < CV_32F;
|
||||
int cn = src.channels(), vecSize = 4, depth = src.depth();
|
||||
std::vector<uchar> thresholdValue = scalarToVector(cv::Scalar::all(ival ? cvFloor(thresh) : thresh), dst.depth(),
|
||||
dst.oclchannels(), dst.channels());
|
||||
std::vector<uchar> maxValue = scalarToVector(cv::Scalar::all(maxVal), dst.depth(), dst.oclchannels(), dst.channels());
|
||||
|
||||
size_t cols = (dst.cols + (dst_offset & 3) + 3) / 4;
|
||||
size_t bSizeX = 16, bSizeY = 16;
|
||||
size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
|
||||
size_t gSizeY = dst.rows;
|
||||
size_t globalThreads[3] = {gSizeX, gSizeY, 1};
|
||||
size_t localThreads[3] = {bSizeX, bSizeY, 1};
|
||||
const char * const thresholdMap[] = { "THRESH_BINARY", "THRESH_BINARY_INV", "THRESH_TRUNC",
|
||||
"THRESH_TOZERO", "THRESH_TOZERO_INV" };
|
||||
const char * const channelMap[] = { "", "", "2", "4", "4" };
|
||||
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
|
||||
std::string buildOptions = format("-D T=%s%s -D %s", typeMap[depth], channelMap[cn], thresholdMap[thresholdType]);
|
||||
|
||||
int elemSize = src.elemSize();
|
||||
int src_step = src.step / elemSize, src_offset = src.offset / elemSize;
|
||||
int dst_step = dst.step / elemSize, dst_offset = dst.offset / elemSize;
|
||||
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
args.push_back( std::make_pair(sizeof(cl_mem), &src.data));
|
||||
args.push_back( std::make_pair(sizeof(cl_mem), &dst.data));
|
||||
args.push_back( std::make_pair(sizeof(cl_mem), (void *)&src.data));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_offset));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_step));
|
||||
args.push_back( std::make_pair(sizeof(cl_mem), (void *)&dst.data));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_offset));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_step));
|
||||
args.push_back( std::make_pair(sizeof(cl_float), (void *)&thresh_f));
|
||||
args.push_back( std::make_pair(sizeof(cl_float), (void *)&max_val));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&type));
|
||||
args.push_back( std::make_pair(thresholdValue.size(), (void *)&thresholdValue[0]));
|
||||
args.push_back( std::make_pair(maxValue.size(), (void *)&maxValue[0]));
|
||||
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, src.oclchannels(), src.depth());
|
||||
int max_index = dst.cols, cols = dst.cols;
|
||||
if (cn == 1 && vecSize > 1)
|
||||
{
|
||||
CV_Assert(((vecSize - 1) & vecSize) == 0 && vecSize <= 16);
|
||||
cols = divUp(cols, vecSize);
|
||||
buildOptions += format(" -D VECTORIZED -D VT=%s%d -D VLOADN=vload%d -D VECSIZE=%d -D VSTOREN=vstore%d",
|
||||
typeMap[depth], vecSize, vecSize, vecSize, vecSize);
|
||||
|
||||
int vecSizeBytes = vecSize * dst.elemSize1();
|
||||
if ((dst.offset % dst.step) % vecSizeBytes == 0 && dst.step % vecSizeBytes == 0)
|
||||
buildOptions += " -D DST_ALIGNED";
|
||||
if ((src.offset % src.step) % vecSizeBytes == 0 && src.step % vecSizeBytes == 0)
|
||||
buildOptions += " -D SRC_ALIGNED";
|
||||
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&max_index));
|
||||
}
|
||||
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&cols));
|
||||
|
||||
size_t localThreads[3] = { 16, 16, 1 };
|
||||
size_t globalThreads[3] = { cols, dst.rows, 1 };
|
||||
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args,
|
||||
-1, -1, buildOptions.c_str());
|
||||
}
|
||||
|
||||
// threshold: support 8UC1 and 32FC1 data type and five threshold type
|
||||
double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
|
||||
double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType)
|
||||
{
|
||||
//TODO: These limitations shall be removed later.
|
||||
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
|
||||
CV_Assert(type == THRESH_BINARY || type == THRESH_BINARY_INV || type == THRESH_TRUNC
|
||||
|| type == THRESH_TOZERO || type == THRESH_TOZERO_INV );
|
||||
CV_Assert(thresholdType == THRESH_BINARY || thresholdType == THRESH_BINARY_INV || thresholdType == THRESH_TRUNC
|
||||
|| thresholdType == THRESH_TOZERO || thresholdType == THRESH_TOZERO_INV);
|
||||
|
||||
static const gpuThresh_t gpuThresh_callers[2] = {threshold_8u, threshold_32f};
|
||||
|
||||
dst.create( src.size(), src.type() );
|
||||
gpuThresh_callers[(src.type() == CV_32FC1)](src, dst, thresh, maxVal, type);
|
||||
dst.create(src.size(), src.type());
|
||||
threshold_runner(src, dst, thresh, maxVal, thresholdType);
|
||||
|
||||
return thresh;
|
||||
}
|
||||
@ -891,8 +897,60 @@ namespace cv
|
||||
|
||||
if (ksize > 0)
|
||||
{
|
||||
Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
|
||||
Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
|
||||
Context* clCxt = Context::getContext();
|
||||
if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 &&
|
||||
src.cols % 8 == 0 && src.rows % 8 == 0 &&
|
||||
ksize==3 &&
|
||||
(borderType ==cv::BORDER_REFLECT ||
|
||||
borderType == cv::BORDER_REPLICATE ||
|
||||
borderType ==cv::BORDER_REFLECT101 ||
|
||||
borderType ==cv::BORDER_WRAP))
|
||||
{
|
||||
Dx.create(src.size(), CV_32FC1);
|
||||
Dy.create(src.size(), CV_32FC1);
|
||||
|
||||
const unsigned int block_x = 8;
|
||||
const unsigned int block_y = 8;
|
||||
|
||||
unsigned int src_pitch = src.step;
|
||||
unsigned int dst_pitch = Dx.cols;
|
||||
|
||||
float _scale = scale;
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
|
||||
args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch ));
|
||||
args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale ));
|
||||
size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1};
|
||||
|
||||
String option = "-D BLK_X=8 -D BLK_Y=8";
|
||||
switch(borderType)
|
||||
{
|
||||
case cv::BORDER_REPLICATE:
|
||||
option = option + " -D BORDER_REPLICATE";
|
||||
break;
|
||||
case cv::BORDER_REFLECT:
|
||||
option = option + " -D BORDER_REFLECT";
|
||||
break;
|
||||
case cv::BORDER_REFLECT101:
|
||||
option = option + " -D BORDER_REFLECT101";
|
||||
break;
|
||||
case cv::BORDER_WRAP:
|
||||
option = option + " -D BORDER_WRAP";
|
||||
break;
|
||||
}
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() );
|
||||
}
|
||||
else
|
||||
{
|
||||
Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
|
||||
Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -954,6 +1012,7 @@ namespace cv
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols));
|
||||
args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_float) , (void *)&k));
|
||||
|
||||
openCLExecuteKernel(dst.clCxt, source, kernelName, gt, lt, args, -1, -1, buildOptions.c_str());
|
||||
}
|
||||
|
||||
@ -969,15 +1028,15 @@ namespace cv
|
||||
{
|
||||
if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
|
||||
{
|
||||
CV_Error(Error::OpenCLDoubleNotSupported, "Select device doesn't support double");
|
||||
CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
|
||||
return;
|
||||
}
|
||||
|
||||
CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
|
||||
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE
|
||||
|| borderType == cv::BORDER_REFLECT);
|
||||
|
||||
extractCovData(src, dx, dy, blockSize, ksize, borderType);
|
||||
dst.create(src.size(), CV_32F);
|
||||
dst.create(src.size(), CV_32FC1);
|
||||
corner_ocl(&imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
|
||||
}
|
||||
|
||||
@ -991,12 +1050,13 @@ namespace cv
|
||||
{
|
||||
if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
|
||||
{
|
||||
CV_Error(Error::OpenCLDoubleNotSupported, "select device don't support double");
|
||||
CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
|
||||
return;
|
||||
}
|
||||
|
||||
CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
|
||||
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
|
||||
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 ||
|
||||
borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
|
||||
|
||||
extractCovData(src, dx, dy, blockSize, ksize, borderType);
|
||||
dst.create(src.size(), CV_32F);
|
||||
|
||||
|
@ -160,32 +160,61 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,
|
||||
}
|
||||
}
|
||||
|
||||
void cv::ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers)
|
||||
void cv::ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers, int distType, const oclMat &indices)
|
||||
{
|
||||
//if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F)
|
||||
//{
|
||||
// CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
|
||||
// return;
|
||||
//}
|
||||
CV_Assert(src.cols*src.oclchannels() == centers.cols*centers.oclchannels());
|
||||
CV_Assert(src.depth() == CV_32F && centers.depth() == CV_32F);
|
||||
bool is_label_row_major = false;
|
||||
ensureSizeIsEnough(1, src.rows, CV_32FC1, dists);
|
||||
if(labels.empty() || (!labels.empty() && labels.rows == src.rows && labels.cols == 1))
|
||||
{
|
||||
ensureSizeIsEnough(src.rows, 1, CV_32SC1, labels);
|
||||
is_label_row_major = true;
|
||||
}
|
||||
CV_Assert(distType == NORM_L1 || distType == NORM_L2SQR);
|
||||
|
||||
Context *clCxt = src.clCxt;
|
||||
int labels_step = (int)(labels.step/labels.elemSize());
|
||||
std::stringstream build_opt_ss;
|
||||
build_opt_ss
|
||||
<< (distType == NORM_L1 ? "-D L1_DIST" : "-D L2SQR_DIST")
|
||||
<< (indices.empty() ? "" : " -D USE_INDEX");
|
||||
|
||||
String build_opt = build_opt_ss.str();
|
||||
|
||||
const int src_step = (int)(src.oclchannels() * src.step / src.elemSize());
|
||||
const int centers_step = (int)(centers.oclchannels() * centers.step / centers.elemSize());
|
||||
|
||||
const int colsNumb = centers.cols*centers.oclchannels();
|
||||
|
||||
const int label_step = is_label_row_major ? (int)(labels.step / labels.elemSize()) : 1;
|
||||
String kernelname = "distanceToCenters";
|
||||
int threadNum = src.rows > 256 ? 256 : src.rows;
|
||||
size_t localThreads[3] = {1, threadNum, 1};
|
||||
size_t globalThreads[3] = {1, src.rows, 1};
|
||||
|
||||
const int number_of_input = indices.empty() ? src.rows : indices.size().area();
|
||||
|
||||
const int src_offset = (int)src.offset/src.elemSize();
|
||||
const int centers_offset = (int)centers.offset/centers.elemSize();
|
||||
|
||||
size_t globalThreads[3] = {number_of_input, 1, 1};
|
||||
|
||||
std::vector<std::pair<size_t, const void *> > args;
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&labels_step));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers.rows));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&labels.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers.cols));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&centers.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void*)&dists.data));
|
||||
if(!indices.empty())
|
||||
{
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&indices.data));
|
||||
}
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&labels.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dists.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&colsNumb));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_step));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers_step));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&label_step));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&number_of_input));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers.rows));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers_offset));
|
||||
|
||||
openCLExecuteKernel(clCxt, &kmeans_kernel, kernelname, globalThreads, localThreads, args, -1, -1, NULL);
|
||||
openCLExecuteKernel(Context::getContext(), &kmeans_kernel,
|
||||
kernelname, globalThreads, NULL, args, -1, -1, build_opt.c_str());
|
||||
}
|
||||
///////////////////////////////////k - means /////////////////////////////////////////////////////////
|
||||
double cv::ocl::kmeans(const oclMat &_src, int K, oclMat &_bestLabels,
|
||||
@ -404,17 +433,17 @@ double cv::ocl::kmeans(const oclMat &_src, int K, oclMat &_bestLabels,
|
||||
|
||||
_bestLabels.upload(_labels);
|
||||
_centers.upload(centers);
|
||||
|
||||
distanceToCenters(_dists, _bestLabels, _src, _centers);
|
||||
|
||||
Mat dists;
|
||||
_dists.download(dists);
|
||||
_bestLabels.download(_labels);
|
||||
|
||||
double* dist = dists.ptr<double>(0);
|
||||
float* dist = dists.ptr<float>(0);
|
||||
compactness = 0;
|
||||
for( i = 0; i < N; i++ )
|
||||
{
|
||||
compactness += dist[i];
|
||||
compactness += (double)dist[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -10,12 +10,12 @@
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jin Ma, jin@multicorewareinc.com
|
||||
// Sen Liu, swjtuls1987@126.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -26,7 +26,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
// and/or other Materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -50,295 +50,342 @@
|
||||
|
||||
#include "opencl_kernels.hpp"
|
||||
|
||||
#if defined _MSC_VER
|
||||
#define snprintf sprintf_s
|
||||
#endif
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
{
|
||||
// The function calculates center of gravity and the central second order moments
|
||||
static void icvCompleteMomentState( CvMoments* moments )
|
||||
{
|
||||
double cx = 0, cy = 0;
|
||||
double mu20, mu11, mu02;
|
||||
|
||||
assert( moments != 0 );
|
||||
moments->inv_sqrt_m00 = 0;
|
||||
|
||||
if( fabs(moments->m00) > DBL_EPSILON )
|
||||
namespace ocl
|
||||
{
|
||||
double inv_m00 = 1. / moments->m00;
|
||||
cx = moments->m10 * inv_m00;
|
||||
cy = moments->m01 * inv_m00;
|
||||
moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) );
|
||||
}
|
||||
|
||||
// mu20 = m20 - m10*cx
|
||||
mu20 = moments->m20 - moments->m10 * cx;
|
||||
// mu11 = m11 - m10*cy
|
||||
mu11 = moments->m11 - moments->m10 * cy;
|
||||
// mu02 = m02 - m01*cy
|
||||
mu02 = moments->m02 - moments->m01 * cy;
|
||||
|
||||
moments->mu20 = mu20;
|
||||
moments->mu11 = mu11;
|
||||
moments->mu02 = mu02;
|
||||
|
||||
// mu30 = m30 - cx*(3*mu20 + cx*m10)
|
||||
moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10);
|
||||
mu11 += mu11;
|
||||
// mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20
|
||||
moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20;
|
||||
// mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02
|
||||
moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02;
|
||||
// mu03 = m03 - cy*(3*mu02 + cy*m01)
|
||||
moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01);
|
||||
}
|
||||
|
||||
|
||||
static void icvContourMoments( CvSeq* contour, CvMoments* mom )
|
||||
{
|
||||
if( contour->total )
|
||||
{
|
||||
CvSeqReader reader;
|
||||
int lpt = contour->total;
|
||||
double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03;
|
||||
|
||||
cvStartReadSeq( contour, &reader, 0 );
|
||||
|
||||
size_t reader_size = lpt << 1;
|
||||
cv::Mat reader_mat(1,reader_size,CV_32FC1);
|
||||
|
||||
bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
|
||||
|
||||
if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float)
|
||||
// The function calculates center of gravity and the central second order moments
|
||||
static void icvCompleteMomentState( CvMoments* moments )
|
||||
{
|
||||
CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
|
||||
}
|
||||
double cx = 0, cy = 0;
|
||||
double mu20, mu11, mu02;
|
||||
|
||||
if( is_float )
|
||||
{
|
||||
for(size_t i = 0; i < reader_size; ++i)
|
||||
assert( moments != 0 );
|
||||
moments->inv_sqrt_m00 = 0;
|
||||
|
||||
if( fabs(moments->m00) > DBL_EPSILON )
|
||||
{
|
||||
reader_mat.at<float>(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x;
|
||||
reader_mat.at<float>(0, i) = ((CvPoint2D32f*)(reader.ptr))->y;
|
||||
CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
|
||||
double inv_m00 = 1. / moments->m00;
|
||||
cx = moments->m10 * inv_m00;
|
||||
cy = moments->m01 * inv_m00;
|
||||
moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) );
|
||||
}
|
||||
|
||||
// mu20 = m20 - m10*cx
|
||||
mu20 = moments->m20 - moments->m10 * cx;
|
||||
// mu11 = m11 - m10*cy
|
||||
mu11 = moments->m11 - moments->m10 * cy;
|
||||
// mu02 = m02 - m01*cy
|
||||
mu02 = moments->m02 - moments->m01 * cy;
|
||||
|
||||
moments->mu20 = mu20;
|
||||
moments->mu11 = mu11;
|
||||
moments->mu02 = mu02;
|
||||
|
||||
// mu30 = m30 - cx*(3*mu20 + cx*m10)
|
||||
moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10);
|
||||
mu11 += mu11;
|
||||
// mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20
|
||||
moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20;
|
||||
// mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02
|
||||
moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02;
|
||||
// mu03 = m03 - cy*(3*mu02 + cy*m01)
|
||||
moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01);
|
||||
}
|
||||
else
|
||||
|
||||
|
||||
static void icvContourMoments( CvSeq* contour, CvMoments* mom )
|
||||
{
|
||||
for(size_t i = 0; i < reader_size; ++i)
|
||||
if( contour->total )
|
||||
{
|
||||
reader_mat.at<float>(0, i++) = ((CvPoint*)(reader.ptr))->x;
|
||||
reader_mat.at<float>(0, i) = ((CvPoint*)(reader.ptr))->y;
|
||||
CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
|
||||
CvSeqReader reader;
|
||||
int lpt = contour->total;
|
||||
double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03;
|
||||
|
||||
cvStartReadSeq( contour, &reader, 0 );
|
||||
|
||||
size_t reader_size = lpt << 1;
|
||||
cv::Mat reader_mat(1,reader_size,CV_32FC1);
|
||||
|
||||
bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
|
||||
|
||||
if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float)
|
||||
{
|
||||
CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
|
||||
}
|
||||
|
||||
if( is_float )
|
||||
{
|
||||
for(size_t i = 0; i < reader_size; ++i)
|
||||
{
|
||||
reader_mat.at<float>(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x;
|
||||
reader_mat.at<float>(0, i) = ((CvPoint2D32f*)(reader.ptr))->y;
|
||||
CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(size_t i = 0; i < reader_size; ++i)
|
||||
{
|
||||
reader_mat.at<float>(0, i++) = ((CvPoint*)(reader.ptr))->x;
|
||||
reader_mat.at<float>(0, i) = ((CvPoint*)(reader.ptr))->y;
|
||||
CV_NEXT_SEQ_ELEM( contour->elem_size, reader );
|
||||
}
|
||||
}
|
||||
|
||||
cv::ocl::oclMat dst_a(10, lpt, CV_64FC1);
|
||||
cv::ocl::oclMat reader_oclmat(reader_mat);
|
||||
int llength = std::min(lpt,128);
|
||||
size_t localThreads[3] = { llength, 1, 1};
|
||||
size_t globalThreads[3] = { lpt, 1, 1};
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&contour->total ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a.data ));
|
||||
cl_int dst_step = (cl_int)dst_a.step;
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step ));
|
||||
|
||||
char builOption[128];
|
||||
snprintf(builOption, 128, "-D CV_8UC1");
|
||||
|
||||
openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1, builOption);
|
||||
|
||||
cv::Mat dst(dst_a);
|
||||
a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
|
||||
if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
|
||||
{
|
||||
for (int i = 0; i < contour->total; ++i)
|
||||
{
|
||||
a00 += dst.at<cl_long>(0, i);
|
||||
a10 += dst.at<cl_long>(1, i);
|
||||
a01 += dst.at<cl_long>(2, i);
|
||||
a20 += dst.at<cl_long>(3, i);
|
||||
a11 += dst.at<cl_long>(4, i);
|
||||
a02 += dst.at<cl_long>(5, i);
|
||||
a30 += dst.at<cl_long>(6, i);
|
||||
a21 += dst.at<cl_long>(7, i);
|
||||
a12 += dst.at<cl_long>(8, i);
|
||||
a03 += dst.at<cl_long>(9, i);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
a00 = cv::sum(dst.row(0))[0];
|
||||
a10 = cv::sum(dst.row(1))[0];
|
||||
a01 = cv::sum(dst.row(2))[0];
|
||||
a20 = cv::sum(dst.row(3))[0];
|
||||
a11 = cv::sum(dst.row(4))[0];
|
||||
a02 = cv::sum(dst.row(5))[0];
|
||||
a30 = cv::sum(dst.row(6))[0];
|
||||
a21 = cv::sum(dst.row(7))[0];
|
||||
a12 = cv::sum(dst.row(8))[0];
|
||||
a03 = cv::sum(dst.row(9))[0];
|
||||
}
|
||||
|
||||
double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60;
|
||||
if( fabs(a00) > FLT_EPSILON )
|
||||
{
|
||||
if( a00 > 0 )
|
||||
{
|
||||
db1_2 = 0.5;
|
||||
db1_6 = 0.16666666666666666666666666666667;
|
||||
db1_12 = 0.083333333333333333333333333333333;
|
||||
db1_24 = 0.041666666666666666666666666666667;
|
||||
db1_20 = 0.05;
|
||||
db1_60 = 0.016666666666666666666666666666667;
|
||||
}
|
||||
else
|
||||
{
|
||||
db1_2 = -0.5;
|
||||
db1_6 = -0.16666666666666666666666666666667;
|
||||
db1_12 = -0.083333333333333333333333333333333;
|
||||
db1_24 = -0.041666666666666666666666666666667;
|
||||
db1_20 = -0.05;
|
||||
db1_60 = -0.016666666666666666666666666666667;
|
||||
}
|
||||
|
||||
// spatial moments
|
||||
mom->m00 = a00 * db1_2;
|
||||
mom->m10 = a10 * db1_6;
|
||||
mom->m01 = a01 * db1_6;
|
||||
mom->m20 = a20 * db1_12;
|
||||
mom->m11 = a11 * db1_24;
|
||||
mom->m02 = a02 * db1_12;
|
||||
mom->m30 = a30 * db1_20;
|
||||
mom->m21 = a21 * db1_60;
|
||||
mom->m12 = a12 * db1_60;
|
||||
mom->m03 = a03 * db1_20;
|
||||
|
||||
icvCompleteMomentState( mom );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cv::ocl::oclMat dst_a(10, lpt, CV_64FC1);
|
||||
cv::ocl::oclMat reader_oclmat(reader_mat);
|
||||
int llength = std::min(lpt,128);
|
||||
size_t localThreads[3] = { llength, 1, 1};
|
||||
size_t globalThreads[3] = { lpt, 1, 1};
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&contour->total ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a.data ));
|
||||
cl_int dst_step = (cl_int)dst_a.step;
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step ));
|
||||
|
||||
openCLExecuteKernel2(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
cv::Mat dst(dst_a);
|
||||
a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
|
||||
if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
|
||||
Moments ocl_moments(oclMat& src, bool binary) //for image
|
||||
{
|
||||
for (int i = 0; i < contour->total; ++i)
|
||||
CV_Assert(src.oclchannels() == 1);
|
||||
if(src.type() == CV_64FC1 && !Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
|
||||
{
|
||||
a00 += dst.at<cl_long>(0, i);
|
||||
a10 += dst.at<cl_long>(1, i);
|
||||
a01 += dst.at<cl_long>(2, i);
|
||||
a20 += dst.at<cl_long>(3, i);
|
||||
a11 += dst.at<cl_long>(4, i);
|
||||
a02 += dst.at<cl_long>(5, i);
|
||||
a30 += dst.at<cl_long>(6, i);
|
||||
a21 += dst.at<cl_long>(7, i);
|
||||
a12 += dst.at<cl_long>(8, i);
|
||||
a03 += dst.at<cl_long>(9, i);
|
||||
CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
a00 = cv::sum(dst.row(0))[0];
|
||||
a10 = cv::sum(dst.row(1))[0];
|
||||
a01 = cv::sum(dst.row(2))[0];
|
||||
a20 = cv::sum(dst.row(3))[0];
|
||||
a11 = cv::sum(dst.row(4))[0];
|
||||
a02 = cv::sum(dst.row(5))[0];
|
||||
a30 = cv::sum(dst.row(6))[0];
|
||||
a21 = cv::sum(dst.row(7))[0];
|
||||
a12 = cv::sum(dst.row(8))[0];
|
||||
a03 = cv::sum(dst.row(9))[0];
|
||||
}
|
||||
|
||||
double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60;
|
||||
if( fabs(a00) > FLT_EPSILON )
|
||||
{
|
||||
if( a00 > 0 )
|
||||
if(binary)
|
||||
{
|
||||
db1_2 = 0.5;
|
||||
db1_6 = 0.16666666666666666666666666666667;
|
||||
db1_12 = 0.083333333333333333333333333333333;
|
||||
db1_24 = 0.041666666666666666666666666666667;
|
||||
db1_20 = 0.05;
|
||||
db1_60 = 0.016666666666666666666666666666667;
|
||||
oclMat mask;
|
||||
if(src.type() != CV_8UC1)
|
||||
{
|
||||
src.convertTo(mask, CV_8UC1);
|
||||
}
|
||||
oclMat src8u(src.size(), CV_8UC1);
|
||||
src8u.setTo(Scalar(255), mask);
|
||||
src = src8u;
|
||||
}
|
||||
const int TILE_SIZE = 256;
|
||||
|
||||
CvMoments mom;
|
||||
memset(&mom, 0, sizeof(mom));
|
||||
|
||||
cv::Size size = src.size();
|
||||
int blockx, blocky;
|
||||
blockx = (size.width + TILE_SIZE - 1)/TILE_SIZE;
|
||||
blocky = (size.height + TILE_SIZE - 1)/TILE_SIZE;
|
||||
|
||||
oclMat dst_m;
|
||||
int tile_height = TILE_SIZE;
|
||||
|
||||
size_t localThreads[3] = {1, tile_height, 1};
|
||||
size_t globalThreads[3] = {blockx, size.height, 1};
|
||||
|
||||
if(Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE))
|
||||
{
|
||||
dst_m.create(blocky * 10, blockx, CV_64FC1);
|
||||
}else
|
||||
{
|
||||
dst_m.create(blocky * 10, blockx, CV_32FC1);
|
||||
}
|
||||
|
||||
int src_step = (int)(src.step/src.elemSize());
|
||||
int dstm_step = (int)(dst_m.step/dst_m.elemSize());
|
||||
|
||||
std::vector<std::pair<size_t , const void *> > args,args_sum;
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstm_step ));
|
||||
|
||||
int binary_;
|
||||
if(binary)
|
||||
binary_ = 1;
|
||||
else
|
||||
binary_ = 0;
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&binary_));
|
||||
|
||||
char builOption[128];
|
||||
if(binary || src.type() == CV_8UC1)
|
||||
{
|
||||
db1_2 = -0.5;
|
||||
db1_6 = -0.16666666666666666666666666666667;
|
||||
db1_12 = -0.083333333333333333333333333333333;
|
||||
db1_24 = -0.041666666666666666666666666666667;
|
||||
db1_20 = -0.05;
|
||||
db1_60 = -0.016666666666666666666666666666667;
|
||||
snprintf(builOption, 128, "-D CV_8UC1");
|
||||
}else if(src.type() == CV_16UC1)
|
||||
{
|
||||
snprintf(builOption, 128, "-D CV_16UC1");
|
||||
}else if(src.type() == CV_16SC1)
|
||||
{
|
||||
snprintf(builOption, 128, "-D CV_16SC1");
|
||||
}else if(src.type() == CV_32FC1)
|
||||
{
|
||||
snprintf(builOption, 128, "-D CV_32FC1");
|
||||
}else if(src.type() == CV_64FC1)
|
||||
{
|
||||
snprintf(builOption, 128, "-D CV_64FC1");
|
||||
}else
|
||||
{
|
||||
CV_Error( CV_StsUnsupportedFormat, "" );
|
||||
}
|
||||
|
||||
// spatial moments
|
||||
mom->m00 = a00 * db1_2;
|
||||
mom->m10 = a10 * db1_6;
|
||||
mom->m01 = a01 * db1_6;
|
||||
mom->m20 = a20 * db1_12;
|
||||
mom->m11 = a11 * db1_24;
|
||||
mom->m02 = a02 * db1_12;
|
||||
mom->m30 = a30 * db1_20;
|
||||
mom->m21 = a21 * db1_60;
|
||||
mom->m12 = a12 * db1_60;
|
||||
mom->m03 = a03 * db1_20;
|
||||
openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, -1, builOption);
|
||||
|
||||
icvCompleteMomentState( mom );
|
||||
Mat tmp(dst_m);
|
||||
tmp.convertTo(tmp, CV_64FC1);
|
||||
|
||||
double tmp_m[10] = {0};
|
||||
|
||||
for(int j = 0; j < tmp.rows; j += 10)
|
||||
{
|
||||
for(int i = 0; i < tmp.cols; i++)
|
||||
{
|
||||
tmp_m[0] += tmp.at<double>(j, i);
|
||||
tmp_m[1] += tmp.at<double>(j + 1, i);
|
||||
tmp_m[2] += tmp.at<double>(j + 2, i);
|
||||
tmp_m[3] += tmp.at<double>(j + 3, i);
|
||||
tmp_m[4] += tmp.at<double>(j + 4, i);
|
||||
tmp_m[5] += tmp.at<double>(j + 5, i);
|
||||
tmp_m[6] += tmp.at<double>(j + 6, i);
|
||||
tmp_m[7] += tmp.at<double>(j + 7, i);
|
||||
tmp_m[8] += tmp.at<double>(j + 8, i);
|
||||
tmp_m[9] += tmp.at<double>(j + 9, i);
|
||||
}
|
||||
}
|
||||
|
||||
mom.m00 = tmp_m[0];
|
||||
mom.m10 = tmp_m[1];
|
||||
mom.m01 = tmp_m[2];
|
||||
mom.m20 = tmp_m[3];
|
||||
mom.m11 = tmp_m[4];
|
||||
mom.m02 = tmp_m[5];
|
||||
mom.m30 = tmp_m[6];
|
||||
mom.m21 = tmp_m[7];
|
||||
mom.m12 = tmp_m[8];
|
||||
mom.m03 = tmp_m[9];
|
||||
icvCompleteMomentState( &mom );
|
||||
return mom;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
|
||||
{
|
||||
const int TILE_SIZE = 256;
|
||||
int type, depth, cn, coi = 0;
|
||||
CvMat stub, *mat = (CvMat*)array;
|
||||
CvContour contourHeader;
|
||||
CvSeq* contour = 0;
|
||||
CvSeqBlock block;
|
||||
if( CV_IS_SEQ( array ))
|
||||
{
|
||||
contour = (CvSeq*)array;
|
||||
if( !CV_IS_SEQ_POINT_SET( contour ))
|
||||
CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" );
|
||||
}
|
||||
|
||||
if( !mom )
|
||||
CV_Error( CV_StsNullPtr, "" );
|
||||
|
||||
memset( mom, 0, sizeof(*mom));
|
||||
|
||||
if( !contour )
|
||||
{
|
||||
|
||||
mat = cvGetMat( mat, &stub, &coi );
|
||||
type = CV_MAT_TYPE( mat->type );
|
||||
|
||||
if( type == CV_32SC2 || type == CV_32FC2 )
|
||||
Moments ocl_moments(InputArray _contour) //for contour
|
||||
{
|
||||
contour = cvPointSeqFromMat(
|
||||
CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED,
|
||||
mat, &contourHeader, &block );
|
||||
CvMoments mom;
|
||||
memset(&mom, 0, sizeof(mom));
|
||||
|
||||
Mat arr = _contour.getMat();
|
||||
CvMat c_array = arr;
|
||||
|
||||
const void* array = &c_array;
|
||||
|
||||
CvSeq* contour = 0;
|
||||
if( CV_IS_SEQ( array ))
|
||||
{
|
||||
contour = (CvSeq*)(array);
|
||||
if( !CV_IS_SEQ_POINT_SET( contour ))
|
||||
CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" );
|
||||
}
|
||||
|
||||
int type, coi = 0;
|
||||
|
||||
CvMat stub, *mat = (CvMat*)(array);
|
||||
CvContour contourHeader;
|
||||
CvSeqBlock block;
|
||||
|
||||
if( !contour )
|
||||
{
|
||||
mat = cvGetMat( mat, &stub, &coi );
|
||||
type = CV_MAT_TYPE( mat->type );
|
||||
|
||||
if( type == CV_32SC2 || type == CV_32FC2 )
|
||||
{
|
||||
contour = cvPointSeqFromMat(
|
||||
CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED,
|
||||
mat, &contourHeader, &block );
|
||||
}
|
||||
}
|
||||
|
||||
CV_Assert(contour);
|
||||
|
||||
icvContourMoments(contour, &mom);
|
||||
return mom;
|
||||
}
|
||||
}
|
||||
if( contour )
|
||||
{
|
||||
icvContourMoments( contour, mom );
|
||||
return;
|
||||
}
|
||||
|
||||
type = CV_MAT_TYPE( mat->type );
|
||||
depth = CV_MAT_DEPTH( type );
|
||||
cn = CV_MAT_CN( type );
|
||||
|
||||
cv::Size size = cvGetMatSize( mat );
|
||||
if( cn > 1 && coi == 0 )
|
||||
CV_Error( CV_StsBadArg, "Invalid image type" );
|
||||
|
||||
if( size.width <= 0 || size.height <= 0 )
|
||||
return;
|
||||
|
||||
cv::Mat src0 = cv::cvarrToMat(mat);
|
||||
cv::ocl::oclMat src(src0);
|
||||
cv::Size tileSize;
|
||||
int blockx,blocky;
|
||||
if(size.width%TILE_SIZE == 0)
|
||||
blockx = size.width/TILE_SIZE;
|
||||
else
|
||||
blockx = size.width/TILE_SIZE + 1;
|
||||
if(size.height%TILE_SIZE == 0)
|
||||
blocky = size.height/TILE_SIZE;
|
||||
else
|
||||
blocky = size.height/TILE_SIZE + 1;
|
||||
oclMat dst_m(blocky * 10, blockx, CV_64FC1);
|
||||
oclMat sum(1, 10, CV_64FC1);
|
||||
int tile_width = std::min(size.width,TILE_SIZE);
|
||||
int tile_height = std::min(size.height,TILE_SIZE);
|
||||
size_t localThreads[3] = { tile_height, 1, 1};
|
||||
size_t globalThreads[3] = { size.height, blockx, 1};
|
||||
std::vector<std::pair<size_t , const void *> > args,args_sum;
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&blocky ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&depth ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cn ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&coi ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&binary ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
|
||||
openCLExecuteKernel2(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
|
||||
|
||||
size_t localThreadss[3] = { 128, 1, 1};
|
||||
size_t globalThreadss[3] = { 128, 1, 1};
|
||||
args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
|
||||
args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
|
||||
args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_height ));
|
||||
args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_width ));
|
||||
args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
|
||||
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum.data ));
|
||||
args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
|
||||
args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
|
||||
openCLExecuteKernel2(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
|
||||
|
||||
Mat dstsum(sum);
|
||||
mom->m00 = dstsum.at<double>(0, 0);
|
||||
mom->m10 = dstsum.at<double>(0, 1);
|
||||
mom->m01 = dstsum.at<double>(0, 2);
|
||||
mom->m20 = dstsum.at<double>(0, 3);
|
||||
mom->m11 = dstsum.at<double>(0, 4);
|
||||
mom->m02 = dstsum.at<double>(0, 5);
|
||||
mom->m30 = dstsum.at<double>(0, 6);
|
||||
mom->m21 = dstsum.at<double>(0, 7);
|
||||
mom->m12 = dstsum.at<double>(0, 8);
|
||||
mom->m03 = dstsum.at<double>(0, 9);
|
||||
|
||||
icvCompleteMomentState( mom );
|
||||
}
|
||||
|
||||
|
||||
// Computes the moments of _array via the OpenCL implementation.
// The input is viewed through a legacy CvMat header and handed to
// ocl_cvMoments together with the binaryImage flag; the CvMoments it
// fills in is returned (converted to Moments by the return statement).
Moments ocl_moments( InputArray _array, bool binaryImage )
{
    // Legacy C header over the same data; ocl_cvMoments takes a C-style array.
    Mat srcMat = _array.getMat();
    CvMat legacyHeader = srcMat;

    CvMoments result;
    ocl_cvMoments(&legacyHeader, &result, binaryImage);
    return result;
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
|
@ -67,7 +67,6 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x);
|
||||
|
||||
@ -97,7 +96,6 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x);
|
||||
|
||||
|
@ -44,14 +44,18 @@
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#define CV_PI 3.1415926535897932384626433832795
|
||||
#ifndef DBL_EPSILON
|
||||
#define DBL_EPSILON 0x1.0p-52
|
||||
#endif
|
||||
#else
|
||||
#define CV_PI 3.1415926535897932384626433832795f
|
||||
#ifndef DBL_EPSILON
|
||||
#define DBL_EPSILON 0x1.0p-52f
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define CV_PI 3.1415926535897932384626433832795
|
||||
|
||||
#ifndef DBL_EPSILON
|
||||
#define DBL_EPSILON 0x1.0p-52
|
||||
#endif
|
||||
|
||||
__kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int src1_offset,
|
||||
__global float *src2, int src2_step, int src2_offset,
|
||||
@ -82,9 +86,9 @@ __kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int sr
|
||||
float tmp = y >= 0 ? 0 : CV_PI*2;
|
||||
tmp = x < 0 ? CV_PI : tmp;
|
||||
|
||||
float tmp1 = y >= 0 ? CV_PI*0.5 : CV_PI*1.5;
|
||||
cartToPolar = y2 <= x2 ? x*y/(x2 + 0.28f*y2 + (float)DBL_EPSILON) + tmp :
|
||||
tmp1 - x*y/(y2 + 0.28f*x2 + (float)DBL_EPSILON);
|
||||
float tmp1 = y >= 0 ? CV_PI*0.5f : CV_PI*1.5f;
|
||||
cartToPolar = y2 <= x2 ? x*y/(x2 + 0.28f*y2 + DBL_EPSILON) + tmp :
|
||||
tmp1 - x*y/(y2 + 0.28f*x2 + DBL_EPSILON);
|
||||
|
||||
cartToPolar = angInDegree == 0 ? cartToPolar : cartToPolar * (float)(180/CV_PI);
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,753 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////////flip rows and cols///////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Flip about both axes, 1 channel, uchar elements.
// Work-item (x, y) exchanges the element at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y). Only items with x < cols and y < thread_rows
// do any work (thread_rows is chosen by the host launcher, not visible here).
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C1_D0 (__global uchar *src, int src_step, int src_offset,
                                    __global uchar *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, x + src_offset);
    const int src_rev = mad24(mirror_y, src_step, mirror_x + src_offset);
    const int dst_fwd = mad24(y, dst_step, x + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, mirror_x + dst_offset);

    // Read both elements before writing either one.
    const uchar fwd_val = src[src_fwd];
    const uchar rev_val = src[src_rev];

    dst[dst_fwd] = rev_val;
    dst[dst_rev] = fwd_val;
}
|
||||
// Flip about both axes, 1 channel, char elements.
// Work-item (x, y) exchanges the element at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y). Only items with x < cols and y < thread_rows
// do any work (thread_rows is chosen by the host launcher, not visible here).
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C1_D1 (__global char *src, int src_step, int src_offset,
                                    __global char *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, x + src_offset);
    const int src_rev = mad24(mirror_y, src_step, mirror_x + src_offset);
    const int dst_fwd = mad24(y, dst_step, x + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, mirror_x + dst_offset);

    // Read both elements before writing either one.
    const char fwd_val = src[src_fwd];
    const char rev_val = src[src_rev];

    dst[dst_fwd] = rev_val;
    dst[dst_rev] = fwd_val;
}
|
||||
// Flip about both axes, 1 channel, ushort elements.
// Work-item (x, y) exchanges the element at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y). Steps and offsets are applied as byte
// quantities (addresses are formed on a char pointer); x is scaled by 2.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C1_D2 (__global ushort *src, int src_step, int src_offset,
                                    __global ushort *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x << 1) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, (mirror_x << 1) + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x << 1) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, (mirror_x << 1) + dst_offset);

    __global char *src_bytes = (__global char *)src;
    __global char *dst_bytes = (__global char *)dst;

    // Read both elements before writing either one.
    const ushort fwd_val = *(__global ushort *)(src_bytes + src_fwd);
    const ushort rev_val = *(__global ushort *)(src_bytes + src_rev);

    *(__global ushort *)(dst_bytes + dst_fwd) = rev_val;
    *(__global ushort *)(dst_bytes + dst_rev) = fwd_val;
}
|
||||
// Flip about both axes, 1 channel, short elements.
// Work-item (x, y) exchanges the element at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y). Steps and offsets are applied as byte
// quantities (addresses are formed on a char pointer); x is scaled by 2.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C1_D3 (__global short *src, int src_step, int src_offset,
                                    __global short *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x << 1) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, (mirror_x << 1) + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x << 1) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, (mirror_x << 1) + dst_offset);

    __global char *src_bytes = (__global char *)src;
    __global char *dst_bytes = (__global char *)dst;

    // Read both elements before writing either one.
    const short fwd_val = *(__global short *)(src_bytes + src_fwd);
    const short rev_val = *(__global short *)(src_bytes + src_rev);

    *(__global short *)(dst_bytes + dst_fwd) = rev_val;
    *(__global short *)(dst_bytes + dst_rev) = fwd_val;
}
|
||||
// Flip about both axes, 1 channel, int elements.
// Work-item (x, y) exchanges the element at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y). Steps and offsets are applied as byte
// quantities (addresses are formed on a char pointer); x is scaled by 4.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C1_D4 (__global int *src, int src_step, int src_offset,
                                    __global int *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x << 2) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, (mirror_x << 2) + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x << 2) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, (mirror_x << 2) + dst_offset);

    __global char *src_bytes = (__global char *)src;
    __global char *dst_bytes = (__global char *)dst;

    // Read both elements before writing either one.
    const int fwd_val = *(__global int *)(src_bytes + src_fwd);
    const int rev_val = *(__global int *)(src_bytes + src_rev);

    *(__global int *)(dst_bytes + dst_fwd) = rev_val;
    *(__global int *)(dst_bytes + dst_rev) = fwd_val;
}
|
||||
// Flip about both axes, 1 channel, float elements.
// Work-item (x, y) exchanges the element at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y). Steps and offsets are applied as byte
// quantities (addresses are formed on a char pointer); x is scaled by 4.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C1_D5 (__global float *src, int src_step, int src_offset,
                                    __global float *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x << 2) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, (mirror_x << 2) + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x << 2) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, (mirror_x << 2) + dst_offset);

    __global char *src_bytes = (__global char *)src;
    __global char *dst_bytes = (__global char *)dst;

    // Read both elements before writing either one.
    const float fwd_val = *(__global float *)(src_bytes + src_fwd);
    const float rev_val = *(__global float *)(src_bytes + src_rev);

    *(__global float *)(dst_bytes + dst_fwd) = rev_val;
    *(__global float *)(dst_bytes + dst_rev) = fwd_val;
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
// Flip about both axes, 1 channel, double elements.
// Work-item (x, y) exchanges the element at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y). Steps and offsets are applied as byte
// quantities (addresses are formed on a char pointer); x is scaled by 8.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C1_D6 (__global double *src, int src_step, int src_offset,
                                    __global double *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x << 3) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, (mirror_x << 3) + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x << 3) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, (mirror_x << 3) + dst_offset);

    __global char *src_bytes = (__global char *)src;
    __global char *dst_bytes = (__global char *)dst;

    // Read both elements before writing either one.
    const double fwd_val = *(__global double *)(src_bytes + src_fwd);
    const double rev_val = *(__global double *)(src_bytes + src_rev);

    *(__global double *)(dst_bytes + dst_fwd) = rev_val;
    *(__global double *)(dst_bytes + dst_rev) = fwd_val;
}
|
||||
#endif
|
||||
// Flip about both axes, 2 channels, uchar elements.
// Work-item (x, y) exchanges the 2-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), moving each pixel as a single uchar2.
// x is scaled by 2 (the size of one pixel) when forming the address.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C2_D0 (__global uchar *src, int src_step, int src_offset,
                                    __global uchar *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x << 1) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, (mirror_x << 1) + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x << 1) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, (mirror_x << 1) + dst_offset);

    // Read both pixels before writing either one.
    const uchar2 fwd_px = *(__global uchar2 *)(src + src_fwd);
    const uchar2 rev_px = *(__global uchar2 *)(src + src_rev);

    *(__global uchar2 *)(dst + dst_fwd) = rev_px;
    *(__global uchar2 *)(dst + dst_rev) = fwd_px;
}
|
||||
// Flip about both axes, 2 channels, char elements.
// Work-item (x, y) exchanges the 2-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), moving each pixel as a single char2.
// x is scaled by 2 (the size of one pixel) when forming the address.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C2_D1 (__global char *src, int src_step, int src_offset,
                                    __global char *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x << 1) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, (mirror_x << 1) + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x << 1) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, (mirror_x << 1) + dst_offset);

    // Read both pixels before writing either one.
    const char2 fwd_px = *(__global char2 *)(src + src_fwd);
    const char2 rev_px = *(__global char2 *)(src + src_rev);

    *(__global char2 *)(dst + dst_fwd) = rev_px;
    *(__global char2 *)(dst + dst_rev) = fwd_px;
}
|
||||
// Flip about both axes, 2 channels, ushort elements.
// Work-item (x, y) exchanges the 2-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), moving each pixel as a single ushort2.
// Steps and offsets are applied as byte quantities; x is scaled by 4.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C2_D2 (__global ushort *src, int src_step, int src_offset,
                                    __global ushort *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x << 2) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, (mirror_x << 2) + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x << 2) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, (mirror_x << 2) + dst_offset);

    __global char *src_bytes = (__global char *)src;
    __global char *dst_bytes = (__global char *)dst;

    // Read both pixels before writing either one.
    const ushort2 fwd_px = *(__global ushort2 *)(src_bytes + src_fwd);
    const ushort2 rev_px = *(__global ushort2 *)(src_bytes + src_rev);

    *(__global ushort2 *)(dst_bytes + dst_fwd) = rev_px;
    *(__global ushort2 *)(dst_bytes + dst_rev) = fwd_px;
}
|
||||
// Flip about both axes, 2 channels, short elements.
// Work-item (x, y) exchanges the 2-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), moving each pixel as a single short2.
// Steps and offsets are applied as byte quantities; x is scaled by 4.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C2_D3 (__global short *src, int src_step, int src_offset,
                                    __global short *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x << 2) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, (mirror_x << 2) + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x << 2) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, (mirror_x << 2) + dst_offset);

    __global char *src_bytes = (__global char *)src;
    __global char *dst_bytes = (__global char *)dst;

    // Read both pixels before writing either one.
    const short2 fwd_px = *(__global short2 *)(src_bytes + src_fwd);
    const short2 rev_px = *(__global short2 *)(src_bytes + src_rev);

    *(__global short2 *)(dst_bytes + dst_fwd) = rev_px;
    *(__global short2 *)(dst_bytes + dst_rev) = fwd_px;
}
|
||||
// Flip about both axes, 2 channels, int elements.
// Work-item (x, y) exchanges the 2-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), moving each pixel as a single int2.
// Steps and offsets are applied as byte quantities; x is scaled by 8.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C2_D4 (__global int *src, int src_step, int src_offset,
                                    __global int *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x << 3) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, (mirror_x << 3) + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x << 3) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, (mirror_x << 3) + dst_offset);

    __global char *src_bytes = (__global char *)src;
    __global char *dst_bytes = (__global char *)dst;

    // Read both pixels before writing either one.
    const int2 fwd_px = *(__global int2 *)(src_bytes + src_fwd);
    const int2 rev_px = *(__global int2 *)(src_bytes + src_rev);

    *(__global int2 *)(dst_bytes + dst_fwd) = rev_px;
    *(__global int2 *)(dst_bytes + dst_rev) = fwd_px;
}
|
||||
// Flip about both axes, 2 channels, float elements.
// Work-item (x, y) exchanges the 2-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), moving each pixel as a single float2.
// Steps and offsets are applied as byte quantities; x is scaled by 8.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C2_D5 (__global float *src, int src_step, int src_offset,
                                    __global float *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x << 3) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, (mirror_x << 3) + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x << 3) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, (mirror_x << 3) + dst_offset);

    __global char *src_bytes = (__global char *)src;
    __global char *dst_bytes = (__global char *)dst;

    // Read both pixels before writing either one.
    const float2 fwd_px = *(__global float2 *)(src_bytes + src_fwd);
    const float2 rev_px = *(__global float2 *)(src_bytes + src_rev);

    *(__global float2 *)(dst_bytes + dst_fwd) = rev_px;
    *(__global float2 *)(dst_bytes + dst_rev) = fwd_px;
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
// Flip about both axes, 2 channels, double elements.
// Work-item (x, y) exchanges the 2-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), moving each pixel as a single double2.
// Steps and offsets are applied as byte quantities; x is scaled by 16.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C2_D6 (__global double *src, int src_step, int src_offset,
                                    __global double *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x << 4) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, (mirror_x << 4) + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x << 4) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, (mirror_x << 4) + dst_offset);

    __global char *src_bytes = (__global char *)src;
    __global char *dst_bytes = (__global char *)dst;

    // Read both pixels before writing either one.
    const double2 fwd_px = *(__global double2 *)(src_bytes + src_fwd);
    const double2 rev_px = *(__global double2 *)(src_bytes + src_rev);

    *(__global double2 *)(dst_bytes + dst_fwd) = rev_px;
    *(__global double2 *)(dst_bytes + dst_rev) = fwd_px;
}
|
||||
#endif
|
||||
|
||||
// Flip about both axes, 3 channels, uchar elements.
// Work-item (x, y) exchanges the 3-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), one component at a time.
// x is scaled by 3 (components per pixel) when forming the address.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C3_D0 (__global uchar *src, int src_step, int src_offset,
                                    __global uchar *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x * 3) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, mirror_x * 3 + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x * 3) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, mirror_x * 3 + dst_offset);

    __global uchar *src_fwd_px = src + src_fwd;
    __global uchar *src_rev_px = src + src_rev;
    __global uchar *dst_fwd_px = dst + dst_fwd;
    __global uchar *dst_rev_px = dst + dst_rev;

    // Load all six components before storing any of them.
    const uchar fwd0 = src_fwd_px[0];
    const uchar fwd1 = src_fwd_px[1];
    const uchar fwd2 = src_fwd_px[2];

    const uchar rev0 = src_rev_px[0];
    const uchar rev1 = src_rev_px[1];
    const uchar rev2 = src_rev_px[2];

    dst_fwd_px[0] = rev0;
    dst_fwd_px[1] = rev1;
    dst_fwd_px[2] = rev2;

    dst_rev_px[0] = fwd0;
    dst_rev_px[1] = fwd1;
    dst_rev_px[2] = fwd2;
}
|
||||
// Flip about both axes, 3 channels, char elements.
// Work-item (x, y) exchanges the 3-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), one component at a time.
// x is scaled by 3 (components per pixel) when forming the address.
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C3_D1 (__global char *src, int src_step, int src_offset,
                                    __global char *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, (x * 3) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, mirror_x * 3 + src_offset);
    const int dst_fwd = mad24(y, dst_step, (x * 3) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, mirror_x * 3 + dst_offset);

    __global char *src_fwd_px = src + src_fwd;
    __global char *src_rev_px = src + src_rev;
    __global char *dst_fwd_px = dst + dst_fwd;
    __global char *dst_rev_px = dst + dst_rev;

    // Load all six components before storing any of them.
    const char fwd0 = src_fwd_px[0];
    const char fwd1 = src_fwd_px[1];
    const char fwd2 = src_fwd_px[2];

    const char rev0 = src_rev_px[0];
    const char rev1 = src_rev_px[1];
    const char rev2 = src_rev_px[2];

    dst_fwd_px[0] = rev0;
    dst_fwd_px[1] = rev1;
    dst_fwd_px[2] = rev2;

    dst_rev_px[0] = fwd0;
    dst_rev_px[1] = fwd1;
    dst_rev_px[2] = fwd2;
}
|
||||
// Flip about both axes, 3 channels, ushort elements.
// Work-item (x, y) exchanges the 3-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), one component at a time.
// Steps and offsets are applied as byte quantities; the column index is
// multiplied by 3 and then shifted left by 1 (6 bytes per pixel).
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C3_D2 (__global ushort *src, int src_step, int src_offset,
                                    __global ushort *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, ((x * 3) << 1) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, ((mirror_x * 3) << 1) + src_offset);
    const int dst_fwd = mad24(y, dst_step, ((x * 3) << 1) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, ((mirror_x * 3) << 1) + dst_offset);

    // Typed views of the four pixels; element i sits i * sizeof(ushort) bytes in.
    __global ushort *src_fwd_px = (__global ushort *)((__global char *)src + src_fwd);
    __global ushort *src_rev_px = (__global ushort *)((__global char *)src + src_rev);
    __global ushort *dst_fwd_px = (__global ushort *)((__global char *)dst + dst_fwd);
    __global ushort *dst_rev_px = (__global ushort *)((__global char *)dst + dst_rev);

    // Load all six components before storing any of them.
    const ushort fwd0 = src_fwd_px[0];
    const ushort fwd1 = src_fwd_px[1];
    const ushort fwd2 = src_fwd_px[2];

    const ushort rev0 = src_rev_px[0];
    const ushort rev1 = src_rev_px[1];
    const ushort rev2 = src_rev_px[2];

    dst_fwd_px[0] = rev0;
    dst_fwd_px[1] = rev1;
    dst_fwd_px[2] = rev2;

    dst_rev_px[0] = fwd0;
    dst_rev_px[1] = fwd1;
    dst_rev_px[2] = fwd2;
}
|
||||
// Flip about both axes, 3 channels, short elements.
// Work-item (x, y) exchanges the 3-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), one component at a time.
// Steps and offsets are applied as byte quantities; the column index is
// multiplied by 3 and then shifted left by 1 (6 bytes per pixel).
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C3_D3 (__global short *src, int src_step, int src_offset,
                                    __global short *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, ((x * 3) << 1) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, ((mirror_x * 3) << 1) + src_offset);
    const int dst_fwd = mad24(y, dst_step, ((x * 3) << 1) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, ((mirror_x * 3) << 1) + dst_offset);

    // Typed views of the four pixels; element i sits i * sizeof(short) bytes in.
    __global short *src_fwd_px = (__global short *)((__global char *)src + src_fwd);
    __global short *src_rev_px = (__global short *)((__global char *)src + src_rev);
    __global short *dst_fwd_px = (__global short *)((__global char *)dst + dst_fwd);
    __global short *dst_rev_px = (__global short *)((__global char *)dst + dst_rev);

    // Load all six components before storing any of them.
    const short fwd0 = src_fwd_px[0];
    const short fwd1 = src_fwd_px[1];
    const short fwd2 = src_fwd_px[2];

    const short rev0 = src_rev_px[0];
    const short rev1 = src_rev_px[1];
    const short rev2 = src_rev_px[2];

    dst_fwd_px[0] = rev0;
    dst_fwd_px[1] = rev1;
    dst_fwd_px[2] = rev2;

    dst_rev_px[0] = fwd0;
    dst_rev_px[1] = fwd1;
    dst_rev_px[2] = fwd2;
}
|
||||
|
||||
// Flip about both axes, 3 channels, int elements.
// Work-item (x, y) exchanges the 3-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), one component at a time.
// Steps and offsets are applied as byte quantities; the column index is
// multiplied by 3 and then shifted left by 2 (12 bytes per pixel).
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C3_D4 (__global int *src, int src_step, int src_offset,
                                    __global int *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, ((x * 3) << 2) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, ((mirror_x * 3) << 2) + src_offset);
    const int dst_fwd = mad24(y, dst_step, ((x * 3) << 2) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, ((mirror_x * 3) << 2) + dst_offset);

    // Typed views of the four pixels; element i sits i * sizeof(int) bytes in.
    __global int *src_fwd_px = (__global int *)((__global char *)src + src_fwd);
    __global int *src_rev_px = (__global int *)((__global char *)src + src_rev);
    __global int *dst_fwd_px = (__global int *)((__global char *)dst + dst_fwd);
    __global int *dst_rev_px = (__global int *)((__global char *)dst + dst_rev);

    // Load all six components before storing any of them.
    const int fwd0 = src_fwd_px[0];
    const int fwd1 = src_fwd_px[1];
    const int fwd2 = src_fwd_px[2];

    const int rev0 = src_rev_px[0];
    const int rev1 = src_rev_px[1];
    const int rev2 = src_rev_px[2];

    dst_fwd_px[0] = rev0;
    dst_fwd_px[1] = rev1;
    dst_fwd_px[2] = rev2;

    dst_rev_px[0] = fwd0;
    dst_rev_px[1] = fwd1;
    dst_rev_px[2] = fwd2;
}
|
||||
// Flip about both axes, 3 channels, float elements.
// Work-item (x, y) exchanges the 3-component pixel at (x, y) with the one at
// (cols - 1 - x, rows - 1 - y), one component at a time.
// Steps and offsets are applied as byte quantities; the column index is
// multiplied by 3 and then shifted left by 2 (12 bytes per pixel).
// dst_step1 is accepted for interface compatibility but not used.
__kernel void arithm_flip_rc_C3_D5 (__global float *src, int src_step, int src_offset,
                                    __global float *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Guard clause: nothing to do outside the processed region.
    if (x >= cols || y >= thread_rows)
        return;

    const int mirror_x = cols - x - 1;
    const int mirror_y = rows - y - 1;

    const int src_fwd = mad24(y, src_step, ((x * 3) << 2) + src_offset);
    const int src_rev = mad24(mirror_y, src_step, ((mirror_x * 3) << 2) + src_offset);
    const int dst_fwd = mad24(y, dst_step, ((x * 3) << 2) + dst_offset);
    const int dst_rev = mad24(mirror_y, dst_step, ((mirror_x * 3) << 2) + dst_offset);

    // Typed views of the four pixels; element i sits i * sizeof(float) bytes in.
    __global float *src_fwd_px = (__global float *)((__global char *)src + src_fwd);
    __global float *src_rev_px = (__global float *)((__global char *)src + src_rev);
    __global float *dst_fwd_px = (__global float *)((__global char *)dst + dst_fwd);
    __global float *dst_rev_px = (__global float *)((__global char *)dst + dst_rev);

    // Load all six components before storing any of them.
    const float fwd0 = src_fwd_px[0];
    const float fwd1 = src_fwd_px[1];
    const float fwd2 = src_fwd_px[2];

    const float rev0 = src_rev_px[0];
    const float rev1 = src_rev_px[1];
    const float rev2 = src_rev_px[2];

    dst_fwd_px[0] = rev0;
    dst_fwd_px[1] = rev1;
    dst_fwd_px[2] = rev2;

    dst_rev_px[0] = fwd0;
    dst_rev_px[1] = fwd1;
    dst_rev_px[2] = fwd2;
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_flip_rc_C3_D6 (__global double *src, int src_step, int src_offset,
|
||||
__global double *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int thread_rows, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < thread_rows)
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x * 3 << 3) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) * 3 << 3) + src_offset);
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x * 3 << 3) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) * 3 << 3) + dst_offset);
|
||||
|
||||
double data0_0 = *((__global double *)((__global char *)src + src_index_0 + 0 ));
|
||||
double data0_1 = *((__global double *)((__global char *)src + src_index_0 + 8 ));
|
||||
double data0_2 = *((__global double *)((__global char *)src + src_index_0 + 16));
|
||||
|
||||
double data1_0 = *((__global double *)((__global char *)src + src_index_1 + 0 ));
|
||||
double data1_1 = *((__global double *)((__global char *)src + src_index_1 + 8 ));
|
||||
double data1_2 = *((__global double *)((__global char *)src + src_index_1 + 16));
|
||||
|
||||
*((__global double *)((__global char *)dst + dst_index_0 + 0 )) = data1_0;
|
||||
*((__global double *)((__global char *)dst + dst_index_0 + 8 )) = data1_1;
|
||||
*((__global double *)((__global char *)dst + dst_index_0 + 16)) = data1_2;
|
||||
|
||||
*((__global double *)((__global char *)dst + dst_index_1 + 0 )) = data0_0;
|
||||
*((__global double *)((__global char *)dst + dst_index_1 + 8 )) = data0_1;
|
||||
*((__global double *)((__global char *)dst + dst_index_1 + 16)) = data0_2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
__kernel void arithm_flip_rc_C4_D0 (__global uchar *src, int src_step, int src_offset,
|
||||
__global uchar *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int thread_rows, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < thread_rows)
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
|
||||
|
||||
uchar4 data0 = *((__global uchar4 *)(src + src_index_0));
|
||||
uchar4 data1 = *((__global uchar4 *)(src + src_index_1));
|
||||
|
||||
*((__global uchar4 *)(dst + dst_index_0)) = data1;
|
||||
*((__global uchar4 *)(dst + dst_index_1)) = data0;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_flip_rc_C4_D1 (__global char *src, int src_step, int src_offset,
|
||||
__global char *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int thread_rows, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < thread_rows)
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset);
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset);
|
||||
|
||||
char4 data0 = *((__global char4 *)(src + src_index_0));
|
||||
char4 data1 = *((__global char4 *)(src + src_index_1));
|
||||
|
||||
*((__global char4 *)(dst + dst_index_0)) = data1;
|
||||
*((__global char4 *)(dst + dst_index_1)) = data0;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_flip_rc_C4_D2 (__global ushort *src, int src_step, int src_offset,
|
||||
__global ushort *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int thread_rows, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < thread_rows)
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
|
||||
|
||||
ushort4 data0 = *((__global ushort4 *)((__global char *)src + src_index_0));
|
||||
ushort4 data1 = *((__global ushort4 *)((__global char *)src + src_index_1));
|
||||
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index_0)) = data1;
|
||||
*((__global ushort4 *)((__global char *)dst + dst_index_1)) = data0;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_flip_rc_C4_D3 (__global short *src, int src_step, int src_offset,
|
||||
__global short *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int thread_rows, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < thread_rows)
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset);
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset);
|
||||
|
||||
short4 data0 = *((__global short4 *)((__global char *)src + src_index_0));
|
||||
short4 data1 = *((__global short4 *)((__global char *)src + src_index_1));
|
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index_0)) = data1;
|
||||
*((__global short4 *)((__global char *)dst + dst_index_1)) = data0;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_flip_rc_C4_D4 (__global int *src, int src_step, int src_offset,
|
||||
__global int *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int thread_rows, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < thread_rows)
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
|
||||
|
||||
int4 data0 = *((__global int4 *)((__global char *)src + src_index_0));
|
||||
int4 data1 = *((__global int4 *)((__global char *)src + src_index_1));
|
||||
|
||||
*((__global int4 *)((__global char *)dst + dst_index_0)) = data1;
|
||||
*((__global int4 *)((__global char *)dst + dst_index_1)) = data0;
|
||||
}
|
||||
}
|
||||
__kernel void arithm_flip_rc_C4_D5 (__global float *src, int src_step, int src_offset,
|
||||
__global float *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int thread_rows, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < thread_rows)
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 4) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 4) + src_offset);
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 4) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 4) + dst_offset);
|
||||
|
||||
float4 data0 = *((__global float4 *)((__global char *)src + src_index_0));
|
||||
float4 data1 = *((__global float4 *)((__global char *)src + src_index_1));
|
||||
|
||||
*((__global float4 *)((__global char *)dst + dst_index_0)) = data1;
|
||||
*((__global float4 *)((__global char *)dst + dst_index_1)) = data0;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
__kernel void arithm_flip_rc_C4_D6 (__global double *src, int src_step, int src_offset,
|
||||
__global double *dst, int dst_step, int dst_offset,
|
||||
int rows, int cols, int thread_rows, int dst_step1)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < thread_rows)
|
||||
{
|
||||
int src_index_0 = mad24(y, src_step, (x << 5) + src_offset);
|
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 5) + src_offset);
|
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 5) + dst_offset);
|
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 5) + dst_offset);
|
||||
|
||||
double4 data0 = *((__global double4 *)((__global char *)src + src_index_0));
|
||||
double4 data1 = *((__global double4 *)((__global char *)src + src_index_1));
|
||||
|
||||
*((__global double4 *)((__global char *)dst + dst_index_0)) = data1;
|
||||
*((__global double4 *)((__global char *)dst + dst_index_1)) = data0;
|
||||
}
|
||||
}
|
||||
#endif
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -53,61 +53,66 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
|
||||
#ifdef DEPTH_5
|
||||
#define MIN_VAL (-FLT_MAX)
|
||||
#define MAX_VAL FLT_MAX
|
||||
#elif defined DEPTH_6
|
||||
#define MIN_VAL (-DBL_MAX)
|
||||
#define MAX_VAL DBL_MAX
|
||||
#endif
|
||||
|
||||
/**************************************Array minMax**************************************/
|
||||
|
||||
__kernel void arithm_op_minMax(__global const T * src, __global T * dst,
|
||||
int cols, int invalid_cols, int offset, int elemnum, int groupnum)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
unsigned int id = get_global_id(0);
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
int id = get_global_id(0);
|
||||
|
||||
unsigned int idx = offset + id + (id / cols) * invalid_cols;
|
||||
int idx = offset + id + (id / cols) * invalid_cols;
|
||||
|
||||
__local T localmem_max[128], localmem_min[128];
|
||||
T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
|
||||
__local T localmem_max[128], localmem_min[128];
|
||||
T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
|
||||
|
||||
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
|
||||
{
|
||||
idx = offset + id + (id / cols) * invalid_cols;
|
||||
temp = src[idx];
|
||||
minval = min(minval, temp);
|
||||
maxval = max(maxval, temp);
|
||||
}
|
||||
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
|
||||
{
|
||||
idx = offset + id + (id / cols) * invalid_cols;
|
||||
temp = src[idx];
|
||||
minval = min(minval, temp);
|
||||
maxval = max(maxval, temp);
|
||||
}
|
||||
|
||||
if(lid > 127)
|
||||
{
|
||||
localmem_min[lid - 128] = minval;
|
||||
localmem_max[lid - 128] = maxval;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid > 127)
|
||||
{
|
||||
localmem_min[lid - 128] = minval;
|
||||
localmem_max[lid - 128] = maxval;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(lid < 128)
|
||||
{
|
||||
localmem_min[lid] = min(minval, localmem_min[lid]);
|
||||
localmem_max[lid] = max(maxval, localmem_max[lid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid < 128)
|
||||
{
|
||||
localmem_min[lid] = min(minval, localmem_min[lid]);
|
||||
localmem_max[lid] = max(maxval, localmem_max[lid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int lsize = 64; lsize > 0; lsize >>= 1)
|
||||
{
|
||||
if (lid < lsize)
|
||||
{
|
||||
int lid2 = lsize + lid;
|
||||
localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
|
||||
localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
for (int lsize = 64; lsize > 0; lsize >>= 1)
|
||||
{
|
||||
if (lid < lsize)
|
||||
{
|
||||
int lid2 = lsize + lid;
|
||||
localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
|
||||
localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (lid == 0)
|
||||
{
|
||||
dst[gid] = localmem_min[0];
|
||||
dst[gid + groupnum] = localmem_max[0];
|
||||
}
|
||||
if (lid == 0)
|
||||
{
|
||||
dst[gid] = localmem_min[0];
|
||||
dst[gid + groupnum] = localmem_max[0];
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst,
|
||||
@ -115,57 +120,57 @@ __kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst,
|
||||
int elemnum, int groupnum,
|
||||
const __global uchar * mask, int minvalid_cols, int moffset)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
unsigned int id = get_global_id(0);
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
int id = get_global_id(0);
|
||||
|
||||
unsigned int idx = offset + id + (id / cols) * invalid_cols;
|
||||
unsigned int midx = moffset + id + (id / cols) * minvalid_cols;
|
||||
int idx = offset + id + (id / cols) * invalid_cols;
|
||||
int midx = moffset + id + (id / cols) * minvalid_cols;
|
||||
|
||||
__local T localmem_max[128], localmem_min[128];
|
||||
T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
|
||||
__local T localmem_max[128], localmem_min[128];
|
||||
T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
|
||||
|
||||
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
|
||||
{
|
||||
idx = offset + id + (id / cols) * invalid_cols;
|
||||
midx = moffset + id + (id / cols) * minvalid_cols;
|
||||
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
|
||||
{
|
||||
idx = offset + id + (id / cols) * invalid_cols;
|
||||
midx = moffset + id + (id / cols) * minvalid_cols;
|
||||
|
||||
if (mask[midx])
|
||||
{
|
||||
temp = src[idx];
|
||||
minval = min(minval, temp);
|
||||
maxval = max(maxval, temp);
|
||||
}
|
||||
}
|
||||
if (mask[midx])
|
||||
{
|
||||
temp = src[idx];
|
||||
minval = min(minval, temp);
|
||||
maxval = max(maxval, temp);
|
||||
}
|
||||
}
|
||||
|
||||
if(lid > 127)
|
||||
{
|
||||
localmem_min[lid - 128] = minval;
|
||||
localmem_max[lid - 128] = maxval;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid > 127)
|
||||
{
|
||||
localmem_min[lid - 128] = minval;
|
||||
localmem_max[lid - 128] = maxval;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(lid < 128)
|
||||
{
|
||||
localmem_min[lid] = min(minval, localmem_min[lid]);
|
||||
localmem_max[lid] = max(maxval, localmem_max[lid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid < 128)
|
||||
{
|
||||
localmem_min[lid] = min(minval, localmem_min[lid]);
|
||||
localmem_max[lid] = max(maxval, localmem_max[lid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int lsize = 64; lsize > 0; lsize >>= 1)
|
||||
{
|
||||
if (lid < lsize)
|
||||
{
|
||||
int lid2 = lsize + lid;
|
||||
localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
|
||||
localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
for (int lsize = 64; lsize > 0; lsize >>= 1)
|
||||
{
|
||||
if (lid < lsize)
|
||||
{
|
||||
int lid2 = lsize + lid;
|
||||
localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
|
||||
localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (lid == 0)
|
||||
{
|
||||
dst[gid] = localmem_min[0];
|
||||
dst[gid + groupnum] = localmem_max[0];
|
||||
}
|
||||
if (lid == 0)
|
||||
{
|
||||
dst[gid] = localmem_min[0];
|
||||
dst[gid + groupnum] = localmem_max[0];
|
||||
}
|
||||
}
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -137,118 +137,114 @@
|
||||
#define repeat_e(a) a.s3 = a.s0;a.s2 = a.s0;a.s1 = a.s0;
|
||||
#endif
|
||||
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
|
||||
|
||||
/**************************************Array minMax**************************************/
|
||||
|
||||
__kernel void arithm_op_minMaxLoc(int cols, int invalid_cols, int offset, int elemnum, int groupnum,
|
||||
__global VEC_TYPE *src, __global RES_TYPE *dst)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
unsigned int id = get_global_id(0);
|
||||
unsigned int idx = offset + id + (id / cols) * invalid_cols;
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
int id = get_global_id(0);
|
||||
int idx = offset + id + (id / cols) * invalid_cols;
|
||||
|
||||
__local VEC_TYPE localmem_max[128], localmem_min[128];
|
||||
VEC_TYPE minval, maxval, temp;
|
||||
__local VEC_TYPE localmem_max[128], localmem_min[128];
|
||||
VEC_TYPE minval, maxval, temp;
|
||||
|
||||
__local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128];
|
||||
VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1;
|
||||
__local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128];
|
||||
VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1;
|
||||
|
||||
int idx_c;
|
||||
int idx_c;
|
||||
|
||||
if (id < elemnum)
|
||||
{
|
||||
temp = src[idx];
|
||||
idx_c = idx << 2;
|
||||
temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3);
|
||||
if (id < elemnum)
|
||||
{
|
||||
temp = src[idx];
|
||||
idx_c = idx << 2;
|
||||
temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3);
|
||||
|
||||
if (id % cols == 0 )
|
||||
{
|
||||
repeat_s(temp);
|
||||
repeat_s(temploc);
|
||||
}
|
||||
if (id % cols == cols - 1)
|
||||
{
|
||||
repeat_e(temp);
|
||||
repeat_e(temploc);
|
||||
}
|
||||
minval = temp;
|
||||
maxval = temp;
|
||||
minloc = temploc;
|
||||
maxloc = temploc;
|
||||
}
|
||||
else
|
||||
{
|
||||
minval = MAX_VAL;
|
||||
maxval = MIN_VAL;
|
||||
minloc = negative;
|
||||
maxloc = negative;
|
||||
}
|
||||
if (id % cols == 0 )
|
||||
{
|
||||
repeat_s(temp);
|
||||
repeat_s(temploc);
|
||||
}
|
||||
if (id % cols == cols - 1)
|
||||
{
|
||||
repeat_e(temp);
|
||||
repeat_e(temploc);
|
||||
}
|
||||
minval = temp;
|
||||
maxval = temp;
|
||||
minloc = temploc;
|
||||
maxloc = temploc;
|
||||
}
|
||||
else
|
||||
{
|
||||
minval = MAX_VAL;
|
||||
maxval = MIN_VAL;
|
||||
minloc = negative;
|
||||
maxloc = negative;
|
||||
}
|
||||
|
||||
int grainSize = (groupnum << 8);
|
||||
for (id = id + grainSize; id < elemnum; id = id + grainSize)
|
||||
{
|
||||
idx = offset + id + (id / cols) * invalid_cols;
|
||||
temp = src[idx];
|
||||
idx_c = idx << 2;
|
||||
temploc = (VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3);
|
||||
int grainSize = (groupnum << 8);
|
||||
for (id = id + grainSize; id < elemnum; id = id + grainSize)
|
||||
{
|
||||
idx = offset + id + (id / cols) * invalid_cols;
|
||||
temp = src[idx];
|
||||
idx_c = idx << 2;
|
||||
temploc = (VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3);
|
||||
|
||||
if (id % cols == 0 )
|
||||
{
|
||||
repeat_s(temp);
|
||||
repeat_s(temploc);
|
||||
}
|
||||
if (id % cols == cols - 1)
|
||||
{
|
||||
repeat_e(temp);
|
||||
repeat_e(temploc);
|
||||
}
|
||||
if (id % cols == 0 )
|
||||
{
|
||||
repeat_s(temp);
|
||||
repeat_s(temploc);
|
||||
}
|
||||
if (id % cols == cols - 1)
|
||||
{
|
||||
repeat_e(temp);
|
||||
repeat_e(temploc);
|
||||
}
|
||||
|
||||
minval = min(minval, temp);
|
||||
maxval = max(maxval, temp);
|
||||
minloc = CONDITION_FUNC(minval == temp, temploc, minloc);
|
||||
maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc);
|
||||
}
|
||||
minval = min(minval, temp);
|
||||
maxval = max(maxval, temp);
|
||||
minloc = CONDITION_FUNC(minval == temp, temploc, minloc);
|
||||
maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc);
|
||||
}
|
||||
|
||||
if (lid > 127)
|
||||
{
|
||||
localmem_min[lid - 128] = minval;
|
||||
localmem_max[lid - 128] = maxval;
|
||||
localmem_minloc[lid - 128] = minloc;
|
||||
localmem_maxloc[lid - 128] = maxloc;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid > 127)
|
||||
{
|
||||
localmem_min[lid - 128] = minval;
|
||||
localmem_max[lid - 128] = maxval;
|
||||
localmem_minloc[lid - 128] = minloc;
|
||||
localmem_maxloc[lid - 128] = maxloc;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lid < 128)
|
||||
{
|
||||
localmem_min[lid] = min(minval,localmem_min[lid]);
|
||||
localmem_max[lid] = max(maxval,localmem_max[lid]);
|
||||
localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc, localmem_minloc[lid]);
|
||||
localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc, localmem_maxloc[lid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid < 128)
|
||||
{
|
||||
localmem_min[lid] = min(minval,localmem_min[lid]);
|
||||
localmem_max[lid] = max(maxval,localmem_max[lid]);
|
||||
localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc, localmem_minloc[lid]);
|
||||
localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc, localmem_maxloc[lid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int lsize = 64; lsize > 0; lsize >>= 1)
|
||||
{
|
||||
for (int lsize = 64; lsize > 0; lsize >>= 1)
|
||||
{
|
||||
if (lid < lsize)
|
||||
{
|
||||
int lid2 = lsize + lid;
|
||||
localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
|
||||
localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
|
||||
localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2], localmem_minloc[lid]);
|
||||
localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2], localmem_maxloc[lid]);
|
||||
int lid2 = lsize + lid;
|
||||
localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
|
||||
localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
|
||||
localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2], localmem_minloc[lid]);
|
||||
localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2], localmem_maxloc[lid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
if ( lid == 0)
|
||||
{
|
||||
dst[gid] = CONVERT_RES_TYPE(localmem_min[0]);
|
||||
dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]);
|
||||
dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]);
|
||||
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]);
|
||||
}
|
||||
if ( lid == 0)
|
||||
{
|
||||
dst[gid] = CONVERT_RES_TYPE(localmem_min[0]);
|
||||
dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]);
|
||||
dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]);
|
||||
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]);
|
||||
}
|
||||
}
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -147,96 +147,96 @@
|
||||
__kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global TYPE *src,
|
||||
int minvalid_cols,int moffset,__global uchar *mask,__global RES_TYPE *dst)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
unsigned int id = get_global_id(0);
|
||||
unsigned int idx = id + (id / cols) * invalid_cols;
|
||||
unsigned int midx = id + (id / cols) * minvalid_cols;
|
||||
__local VEC_TYPE lm_max[128],lm_min[128];
|
||||
VEC_TYPE minval,maxval,temp,m_temp;
|
||||
__local VEC_TYPE_LOC lm_maxloc[128],lm_minloc[128];
|
||||
VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1,one = 1,zero = 0;
|
||||
if(id < elemnum)
|
||||
{
|
||||
temp = vload4(idx, &src[offset]);
|
||||
m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset]));
|
||||
int idx_c = (idx << 2) + offset;
|
||||
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
|
||||
if(id % cols == cols - 1)
|
||||
{
|
||||
repeat_me(m_temp);
|
||||
repeat_e(temploc);
|
||||
}
|
||||
minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL;
|
||||
maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL;
|
||||
minloc = CONDITION_FUNC(m_temp != (VEC_TYPE)0, temploc , negative);
|
||||
maxloc = minloc;
|
||||
}
|
||||
else
|
||||
{
|
||||
minval = MAX_VAL;
|
||||
maxval = MIN_VAL;
|
||||
minloc = negative;
|
||||
maxloc = negative;
|
||||
}
|
||||
for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
|
||||
{
|
||||
idx = id + (id / cols) * invalid_cols;
|
||||
midx = id + (id / cols) * minvalid_cols;
|
||||
temp = vload4(idx, &src[offset]);
|
||||
m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset]));
|
||||
int idx_c = (idx << 2) + offset;
|
||||
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
|
||||
if(id % cols == cols - 1)
|
||||
{
|
||||
repeat_me(m_temp);
|
||||
repeat_e(temploc);
|
||||
}
|
||||
minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval);
|
||||
maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval);
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
int id = get_global_id(0);
|
||||
int idx = id + (id / cols) * invalid_cols;
|
||||
int midx = id + (id / cols) * minvalid_cols;
|
||||
__local VEC_TYPE lm_max[128],lm_min[128];
|
||||
VEC_TYPE minval,maxval,temp,m_temp;
|
||||
__local VEC_TYPE_LOC lm_maxloc[128],lm_minloc[128];
|
||||
VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1,one = 1,zero = 0;
|
||||
if(id < elemnum)
|
||||
{
|
||||
temp = vload4(idx, &src[offset]);
|
||||
m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset]));
|
||||
int idx_c = (idx << 2) + offset;
|
||||
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
|
||||
if(id % cols == cols - 1)
|
||||
{
|
||||
repeat_me(m_temp);
|
||||
repeat_e(temploc);
|
||||
}
|
||||
minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL;
|
||||
maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL;
|
||||
minloc = CONDITION_FUNC(m_temp != (VEC_TYPE)0, temploc , negative);
|
||||
maxloc = minloc;
|
||||
}
|
||||
else
|
||||
{
|
||||
minval = MAX_VAL;
|
||||
maxval = MIN_VAL;
|
||||
minloc = negative;
|
||||
maxloc = negative;
|
||||
}
|
||||
for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
|
||||
{
|
||||
idx = id + (id / cols) * invalid_cols;
|
||||
midx = id + (id / cols) * minvalid_cols;
|
||||
temp = vload4(idx, &src[offset]);
|
||||
m_temp = CONVERT_TYPE(vload4(midx,&mask[moffset]));
|
||||
int idx_c = (idx << 2) + offset;
|
||||
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
|
||||
if(id % cols == cols - 1)
|
||||
{
|
||||
repeat_me(m_temp);
|
||||
repeat_e(temploc);
|
||||
}
|
||||
minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval);
|
||||
maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval);
|
||||
|
||||
minloc = CONDITION_FUNC((minval == temp) && (m_temp != (VEC_TYPE)0), temploc , minloc);
|
||||
maxloc = CONDITION_FUNC((maxval == temp) && (m_temp != (VEC_TYPE)0), temploc , maxloc);
|
||||
}
|
||||
if(lid > 127)
|
||||
{
|
||||
lm_min[lid - 128] = minval;
|
||||
lm_max[lid - 128] = maxval;
|
||||
lm_minloc[lid - 128] = minloc;
|
||||
lm_maxloc[lid - 128] = maxloc;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < 128)
|
||||
{
|
||||
lm_min[lid] = min(minval,lm_min[lid]);
|
||||
lm_max[lid] = max(maxval,lm_max[lid]);
|
||||
VEC_TYPE con_min = CONVERT_TYPE(minloc != negative ? one : zero);
|
||||
VEC_TYPE con_max = CONVERT_TYPE(maxloc != negative ? one : zero);
|
||||
lm_minloc[lid] = CONDITION_FUNC((lm_min[lid] == minval) && (con_min != (VEC_TYPE)0), minloc , lm_minloc[lid]);
|
||||
lm_maxloc[lid] = CONDITION_FUNC((lm_max[lid] == maxval) && (con_max != (VEC_TYPE)0), maxloc , lm_maxloc[lid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int lsize = 64; lsize > 0; lsize >>= 1)
|
||||
{
|
||||
if(lid < lsize)
|
||||
{
|
||||
int lid2 = lsize + lid;
|
||||
lm_min[lid] = min(lm_min[lid] , lm_min[lid2]);
|
||||
lm_max[lid] = max(lm_max[lid] , lm_max[lid2]);
|
||||
VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero);
|
||||
VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? one : zero);
|
||||
lm_minloc[lid] =
|
||||
CONDITION_FUNC((lm_min[lid] == lm_min[lid2]) && (con_min != (VEC_TYPE)0), lm_minloc[lid2] , lm_minloc[lid]);
|
||||
lm_maxloc[lid] =
|
||||
CONDITION_FUNC((lm_max[lid] == lm_max[lid2]) && (con_max != (VEC_TYPE)0), lm_maxloc[lid2] , lm_maxloc[lid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if( lid == 0)
|
||||
{
|
||||
dst[gid] = CONVERT_RES_TYPE(lm_min[0]);
|
||||
dst[gid + groupnum] = CONVERT_RES_TYPE(lm_max[0]);
|
||||
dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(lm_minloc[0]);
|
||||
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]);
|
||||
}
|
||||
minloc = CONDITION_FUNC((minval == temp) && (m_temp != (VEC_TYPE)0), temploc , minloc);
|
||||
maxloc = CONDITION_FUNC((maxval == temp) && (m_temp != (VEC_TYPE)0), temploc , maxloc);
|
||||
}
|
||||
if(lid > 127)
|
||||
{
|
||||
lm_min[lid - 128] = minval;
|
||||
lm_max[lid - 128] = maxval;
|
||||
lm_minloc[lid - 128] = minloc;
|
||||
lm_maxloc[lid - 128] = maxloc;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < 128)
|
||||
{
|
||||
lm_min[lid] = min(minval,lm_min[lid]);
|
||||
lm_max[lid] = max(maxval,lm_max[lid]);
|
||||
VEC_TYPE con_min = CONVERT_TYPE(minloc != negative ? one : zero);
|
||||
VEC_TYPE con_max = CONVERT_TYPE(maxloc != negative ? one : zero);
|
||||
lm_minloc[lid] = CONDITION_FUNC((lm_min[lid] == minval) && (con_min != (VEC_TYPE)0), minloc , lm_minloc[lid]);
|
||||
lm_maxloc[lid] = CONDITION_FUNC((lm_max[lid] == maxval) && (con_max != (VEC_TYPE)0), maxloc , lm_maxloc[lid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int lsize = 64; lsize > 0; lsize >>= 1)
|
||||
{
|
||||
if(lid < lsize)
|
||||
{
|
||||
int lid2 = lsize + lid;
|
||||
lm_min[lid] = min(lm_min[lid] , lm_min[lid2]);
|
||||
lm_max[lid] = max(lm_max[lid] , lm_max[lid2]);
|
||||
VEC_TYPE con_min = CONVERT_TYPE(lm_minloc[lid2] != negative ? one : zero);
|
||||
VEC_TYPE con_max = CONVERT_TYPE(lm_maxloc[lid2] != negative ? one : zero);
|
||||
lm_minloc[lid] =
|
||||
CONDITION_FUNC((lm_min[lid] == lm_min[lid2]) && (con_min != (VEC_TYPE)0), lm_minloc[lid2] , lm_minloc[lid]);
|
||||
lm_maxloc[lid] =
|
||||
CONDITION_FUNC((lm_max[lid] == lm_max[lid2]) && (con_max != (VEC_TYPE)0), lm_maxloc[lid2] , lm_maxloc[lid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if( lid == 0)
|
||||
{
|
||||
dst[gid] = CONVERT_RES_TYPE(lm_min[0]);
|
||||
dst[gid + groupnum] = CONVERT_RES_TYPE(lm_max[0]);
|
||||
dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(lm_minloc[0]);
|
||||
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]);
|
||||
}
|
||||
}
|
||||
|
@ -1,196 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Shengen Yan,yanshengen@gmail.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
/**************************************PUBLICFUNC*************************************/
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#endif
|
||||
|
||||
#if defined (DEPTH_0)
|
||||
#define VEC_TYPE uchar8
|
||||
#define TYPE uchar
|
||||
#define CONVERT_TYPE convert_uchar8
|
||||
#define MIN_VAL 0
|
||||
#define MAX_VAL 255
|
||||
#endif
|
||||
#if defined (DEPTH_1)
|
||||
#define VEC_TYPE char8
|
||||
#define TYPE char
|
||||
#define CONVERT_TYPE convert_char8
|
||||
#define MIN_VAL -128
|
||||
#define MAX_VAL 127
|
||||
#endif
|
||||
#if defined (DEPTH_2)
|
||||
#define VEC_TYPE ushort8
|
||||
#define TYPE ushort
|
||||
#define CONVERT_TYPE convert_ushort8
|
||||
#define MIN_VAL 0
|
||||
#define MAX_VAL 65535
|
||||
#endif
|
||||
#if defined (DEPTH_3)
|
||||
#define VEC_TYPE short8
|
||||
#define TYPE short
|
||||
#define CONVERT_TYPE convert_short8
|
||||
#define MIN_VAL -32768
|
||||
#define MAX_VAL 32767
|
||||
#endif
|
||||
#if defined (DEPTH_4)
|
||||
#define VEC_TYPE int8
|
||||
#define TYPE int
|
||||
#define CONVERT_TYPE convert_int8
|
||||
#define MIN_VAL INT_MIN
|
||||
#define MAX_VAL INT_MAX
|
||||
#endif
|
||||
#if defined (DEPTH_5)
|
||||
#define VEC_TYPE float8
|
||||
#define TYPE float
|
||||
#define CONVERT_TYPE convert_float8
|
||||
#define MIN_VAL (-FLT_MAX)
|
||||
#define MAX_VAL FLT_MAX
|
||||
#endif
|
||||
#if defined (DEPTH_6)
|
||||
#define VEC_TYPE double8
|
||||
#define TYPE double
|
||||
#define CONVERT_TYPE convert_double8
|
||||
#define MIN_VAL (-DBL_MAX)
|
||||
#define MAX_VAL DBL_MAX
|
||||
#endif
|
||||
|
||||
#if defined (REPEAT_E0)
|
||||
#define repeat_me(a) a = a;
|
||||
#endif
|
||||
#if defined (REPEAT_E1)
|
||||
#define repeat_me(a) a.s7 = 0;
|
||||
#endif
|
||||
#if defined (REPEAT_E2)
|
||||
#define repeat_me(a) a.s7 = 0;a.s6 = 0;
|
||||
#endif
|
||||
#if defined (REPEAT_E3)
|
||||
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;
|
||||
#endif
|
||||
#if defined (REPEAT_E4)
|
||||
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;
|
||||
#endif
|
||||
#if defined (REPEAT_E5)
|
||||
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;
|
||||
#endif
|
||||
#if defined (REPEAT_E6)
|
||||
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;
|
||||
#endif
|
||||
#if defined (REPEAT_E7)
|
||||
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0;
|
||||
#endif
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
|
||||
|
||||
/**************************************Array minMax mask**************************************/
|
||||
__kernel void arithm_op_minMax_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum, __global TYPE *src,
|
||||
int minvalid_cols,int moffset, __global uchar *mask,__global VEC_TYPE *dst)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
unsigned int id = get_global_id(0);
|
||||
unsigned int idx = id + (id / cols) * invalid_cols;
|
||||
unsigned int midx = id + (id / cols) * minvalid_cols;
|
||||
__local VEC_TYPE localmem_max[128],localmem_min[128];
|
||||
VEC_TYPE minval,maxval,temp,m_temp;
|
||||
if(id < elemnum)
|
||||
{
|
||||
temp = vload8(idx, &src[offset]);
|
||||
m_temp = CONVERT_TYPE(vload8(midx,&mask[moffset]));
|
||||
if(id % cols == cols - 1)
|
||||
{
|
||||
repeat_me(m_temp);
|
||||
}
|
||||
minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL;
|
||||
maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL;
|
||||
}
|
||||
else
|
||||
{
|
||||
minval = MAX_VAL;
|
||||
maxval = MIN_VAL;
|
||||
}
|
||||
for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
|
||||
{
|
||||
idx = id + (id / cols) * invalid_cols;
|
||||
midx = id + (id / cols) * minvalid_cols;
|
||||
temp = vload8(idx, &src[offset]);
|
||||
m_temp = CONVERT_TYPE(vload8(midx,&mask[moffset]));
|
||||
if(id % cols == cols - 1)
|
||||
{
|
||||
repeat_me(m_temp);
|
||||
}
|
||||
minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval);
|
||||
maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval);
|
||||
}
|
||||
if(lid > 127)
|
||||
{
|
||||
localmem_min[lid - 128] = minval;
|
||||
localmem_max[lid - 128] = maxval;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < 128)
|
||||
{
|
||||
localmem_min[lid] = min(minval,localmem_min[lid]);
|
||||
localmem_max[lid] = max(maxval,localmem_max[lid]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int lsize = 64; lsize > 0; lsize >>= 1)
|
||||
{
|
||||
if(lid < lsize)
|
||||
{
|
||||
int lid2 = lsize + lid;
|
||||
localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
|
||||
localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if( lid == 0)
|
||||
{
|
||||
dst[gid] = localmem_min[0];
|
||||
dst[gid + groupnum] = localmem_max[0];
|
||||
}
|
||||
}
|
@ -55,11 +55,11 @@
|
||||
__kernel void arithm_op_nonzero(int cols, int invalid_cols, int offset, int elemnum, int groupnum,
|
||||
__global srcT *src, __global dstT *dst)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
unsigned int id = get_global_id(0);
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
int id = get_global_id(0);
|
||||
|
||||
unsigned int idx = offset + id + (id / cols) * invalid_cols;
|
||||
int idx = offset + id + (id / cols) * invalid_cols;
|
||||
__local dstT localmem_nonzero[128];
|
||||
dstT nonzero = (dstT)(0);
|
||||
srcT zero = (srcT)(0), one = (srcT)(1);
|
||||
|
@ -45,15 +45,17 @@
|
||||
//
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#elif defined (cl_amd_fp64)
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#endif
|
||||
#define CV_PI 3.1415926535897932384626433832795
|
||||
#define CV_2PI 2*CV_PI
|
||||
#else
|
||||
#define CV_PI 3.1415926535897932384626433832795f
|
||||
#define CV_2PI 2*CV_PI
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define CV_PI 3.1415926535898
|
||||
#define CV_2PI 2*3.1415926535898
|
||||
|
||||
/**************************************phase inradians**************************************/
|
||||
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -43,12 +43,13 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#ifdef DOUBLE_SUPPORT
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#define CV_PI 3.1415926535897932384626433832795
|
||||
#else
|
||||
#define CV_PI 3.1415926535897932384626433832795f
|
||||
#endif
|
||||
|
||||
#define CV_PI 3.1415926535897932384626433832795
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////polarToCart with magnitude//////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -72,7 +73,7 @@ __kernel void arithm_polarToCart_mag_D5 (__global float *src1, int src1_step, in
|
||||
float x = *((__global float *)((__global char *)src1 + src1_index));
|
||||
float y = *((__global float *)((__global char *)src2 + src2_index));
|
||||
|
||||
float ascale = CV_PI/180.0;
|
||||
float ascale = CV_PI/180.0f;
|
||||
float alpha = angInDegree == 1 ? y * ascale : y;
|
||||
float a = cos(alpha) * x;
|
||||
float b = sin(alpha) * x;
|
||||
@ -134,7 +135,7 @@ __kernel void arithm_polarToCart_D5 (__global float *src, int src_step, int sr
|
||||
|
||||
float y = *((__global float *)((__global char *)src + src_index));
|
||||
|
||||
float ascale = CV_PI/180.0;
|
||||
float ascale = CV_PI/180.0f;
|
||||
float alpha = angInDegree == 1 ? y * ascale : y;
|
||||
float a = cos(alpha);
|
||||
float b = sin(alpha);
|
||||
|
@ -66,39 +66,39 @@
|
||||
__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum,
|
||||
__global srcT *src, __global dstT *dst)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
unsigned int id = get_global_id(0);
|
||||
unsigned int idx = offset + id + (id / cols) * invalid_cols;
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
int id = get_global_id(0);
|
||||
int idx = offset + id + (id / cols) * invalid_cols;
|
||||
|
||||
__local dstT localmem_sum[128];
|
||||
dstT sum = (dstT)(0), temp;
|
||||
__local dstT localmem_sum[128];
|
||||
dstT sum = (dstT)(0), temp;
|
||||
|
||||
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
|
||||
{
|
||||
idx = offset + id + (id / cols) * invalid_cols;
|
||||
temp = convertToDstT(src[idx]);
|
||||
FUNC(temp, sum);
|
||||
}
|
||||
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
|
||||
{
|
||||
idx = offset + id + (id / cols) * invalid_cols;
|
||||
temp = convertToDstT(src[idx]);
|
||||
FUNC(temp, sum);
|
||||
}
|
||||
|
||||
if (lid > 127)
|
||||
localmem_sum[lid - 128] = sum;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid > 127)
|
||||
localmem_sum[lid - 128] = sum;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lid < 128)
|
||||
localmem_sum[lid] = sum + localmem_sum[lid];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (lid < 128)
|
||||
localmem_sum[lid] = sum + localmem_sum[lid];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int lsize = 64; lsize > 0; lsize >>= 1)
|
||||
{
|
||||
if (lid < lsize)
|
||||
{
|
||||
int lid2 = lsize + lid;
|
||||
localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
for (int lsize = 64; lsize > 0; lsize >>= 1)
|
||||
{
|
||||
if (lid < lsize)
|
||||
{
|
||||
int lid2 = lsize + lid;
|
||||
localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (lid == 0)
|
||||
dst[gid] = localmem_sum[0];
|
||||
if (lid == 0)
|
||||
dst[gid] = localmem_sum[0];
|
||||
}
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -42,99 +42,37 @@
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
__kernel void BlendLinear_C1_D0(
|
||||
__global uchar4 *dst,
|
||||
__global uchar4 *img1,
|
||||
__global uchar4 *img2,
|
||||
__global float4 *weight1,
|
||||
__global float4 *weight2,
|
||||
int rows,
|
||||
int cols,
|
||||
int istep,
|
||||
int wstep
|
||||
)
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_amd_fp64
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#elif defined (cl_khr_fp64)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
__kernel void blendLinear(__global const T * src1, int src1_offset, int src1_step,
|
||||
__global const T * src2, int src2_offset, int src2_step,
|
||||
__global const float * weight1, int weight1_offset, int weight1_step,
|
||||
__global const float * weight2, int weight2_offset, int weight2_step,
|
||||
__global T * dst, int dst_offset, int dst_step,
|
||||
int rows, int cols)
|
||||
{
|
||||
int idx = get_global_id(0);
|
||||
int idy = get_global_id(1);
|
||||
if (idx << 2 < cols && idy < rows)
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
int pos = mad24(idy,istep >> 2,idx);
|
||||
int wpos = mad24(idy,wstep >> 2,idx);
|
||||
float4 w1 = weight1[wpos], w2 = weight2[wpos];
|
||||
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
|
||||
convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BlendLinear_C4_D0(
|
||||
__global uchar4 *dst,
|
||||
__global uchar4 *img1,
|
||||
__global uchar4 *img2,
|
||||
__global float *weight1,
|
||||
__global float *weight2,
|
||||
int rows,
|
||||
int cols,
|
||||
int istep,
|
||||
int wstep
|
||||
)
|
||||
{
|
||||
int idx = get_global_id(0);
|
||||
int idy = get_global_id(1);
|
||||
if (idx < cols && idy < rows)
|
||||
{
|
||||
int pos = mad24(idy,istep >> 2,idx);
|
||||
int wpos = mad24(idy,wstep, idx);
|
||||
float w1 = weight1[wpos];
|
||||
float w2 = weight2[wpos];
|
||||
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
|
||||
convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void BlendLinear_C1_D5(
|
||||
__global float4 *dst,
|
||||
__global float4 *img1,
|
||||
__global float4 *img2,
|
||||
__global float4 *weight1,
|
||||
__global float4 *weight2,
|
||||
int rows,
|
||||
int cols,
|
||||
int istep,
|
||||
int wstep
|
||||
)
|
||||
{
|
||||
int idx = get_global_id(0);
|
||||
int idy = get_global_id(1);
|
||||
if (idx << 2 < cols && idy < rows)
|
||||
{
|
||||
int pos = mad24(idy,istep >> 2,idx);
|
||||
int wpos = mad24(idy,wstep >> 2,idx);
|
||||
float4 w1 = weight1[wpos], w2 = weight2[wpos];
|
||||
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BlendLinear_C4_D5(
|
||||
__global float4 *dst,
|
||||
__global float4 *img1,
|
||||
__global float4 *img2,
|
||||
__global float *weight1,
|
||||
__global float *weight2,
|
||||
int rows,
|
||||
int cols,
|
||||
int istep,
|
||||
int wstep
|
||||
)
|
||||
{
|
||||
int idx = get_global_id(0);
|
||||
int idy = get_global_id(1);
|
||||
if (idx < cols && idy < rows)
|
||||
{
|
||||
int pos = mad24(idy,istep >> 2,idx);
|
||||
int wpos = mad24(idy,wstep, idx);
|
||||
float w1 = weight1[wpos];
|
||||
float w2 = weight2[wpos];
|
||||
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
|
||||
int src1_index = mad24(y, src1_step, src1_offset + x);
|
||||
int src2_index = mad24(y, src2_step, src2_offset + x);
|
||||
int weight1_index = mad24(y, weight1_step, weight1_offset + x);
|
||||
int weight2_index = mad24(y, weight2_step, weight2_offset + x);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x);
|
||||
|
||||
FT w1 = (FT)(weight1[weight1_index]), w2 = (FT)(weight2[weight2_index]);
|
||||
FT den = w1 + w2 + (FT)(1e-5f);
|
||||
FT num = w1 * convertToFT(src1[src1_index]) + w2 * convertToFT(src2[src2_index]);
|
||||
|
||||
dst[dst_index] = convertToT(num / den);
|
||||
}
|
||||
}
|
||||
|
@ -17,6 +17,7 @@
|
||||
// @Authors
|
||||
// Nathan, liujun@multicorewareinc.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
// Baichuan Su, baichuan@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -63,7 +64,7 @@
|
||||
#endif
|
||||
|
||||
//http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
|
||||
int bit1Count(int v)
|
||||
static int bit1Count(int v)
|
||||
{
|
||||
v = v - ((v >> 1) & 0x55555555); // reuse input as temporary
|
||||
v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp
|
||||
@ -94,7 +95,7 @@ typedef int result_type;
|
||||
#define DIST_RES(x) (x)
|
||||
#endif
|
||||
|
||||
result_type reduce_block(
|
||||
static result_type reduce_block(
|
||||
__local value_type *s_query,
|
||||
__local value_type *s_train,
|
||||
int lidx,
|
||||
@ -112,7 +113,25 @@ result_type reduce_block(
|
||||
return DIST_RES(result);
|
||||
}
|
||||
|
||||
result_type reduce_multi_block(
|
||||
static result_type reduce_block_match(
|
||||
__local value_type *s_query,
|
||||
__local value_type *s_train,
|
||||
int lidx,
|
||||
int lidy
|
||||
)
|
||||
{
|
||||
result_type result = 0;
|
||||
#pragma unroll
|
||||
for (int j = 0 ; j < BLOCK_SIZE ; j++)
|
||||
{
|
||||
result += DIST(
|
||||
s_query[lidy * BLOCK_SIZE + j],
|
||||
s_train[j * BLOCK_SIZE + lidx]);
|
||||
}
|
||||
return (result);
|
||||
}
|
||||
|
||||
static result_type reduce_multi_block(
|
||||
__local value_type *s_query,
|
||||
__local value_type *s_train,
|
||||
int block_index,
|
||||
@ -128,7 +147,7 @@ result_type reduce_multi_block(
|
||||
s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j],
|
||||
s_train[j * BLOCK_SIZE + lidx]);
|
||||
}
|
||||
return DIST_RES(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE
|
||||
@ -168,7 +187,6 @@ __kernel void BruteForceMatch_UnrollMatch(
|
||||
int myBestTrainIdx = -1;
|
||||
|
||||
// loopUnrolledCached to find the best trainIdx and best distance.
|
||||
volatile int imgIdx = 0;
|
||||
for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
|
||||
{
|
||||
result_type result = 0;
|
||||
@ -187,11 +205,12 @@ __kernel void BruteForceMatch_UnrollMatch(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
result = DIST_RES(result);
|
||||
|
||||
int trainIdx = t * BLOCK_SIZE + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
myBestDistance = result;
|
||||
myBestTrainIdx = trainIdx;
|
||||
}
|
||||
@ -272,16 +291,17 @@ __kernel void BruteForceMatch_Match(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
result += reduce_block(s_query, s_train, lidx, lidy);
|
||||
result += reduce_block_match(s_query, s_train, lidx, lidy);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
result = DIST_RES(result);
|
||||
|
||||
const int trainIdx = t * BLOCK_SIZE + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
//myBestImgidx = imgIdx;
|
||||
myBestDistance = result;
|
||||
myBestTrainIdx = trainIdx;
|
||||
}
|
||||
@ -367,11 +387,10 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
|
||||
if (queryIdx < query_rows && trainIdx < train_rows &&
|
||||
convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||
int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||
|
||||
if(ind < bestTrainIdx_cols)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
|
||||
}
|
||||
@ -428,11 +447,10 @@ __kernel void BruteForceMatch_RadiusMatch(
|
||||
if (queryIdx < query_rows && trainIdx < train_rows &&
|
||||
convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
unsigned int ind = atom_inc(nMatches + queryIdx);
|
||||
int ind = atom_inc(nMatches + queryIdx);
|
||||
|
||||
if(ind < bestTrainIdx_cols)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
|
||||
}
|
||||
@ -475,7 +493,6 @@ __kernel void BruteForceMatch_knnUnrollMatch(
|
||||
int myBestTrainIdx2 = -1;
|
||||
|
||||
//loopUnrolledCached
|
||||
volatile int imgIdx = 0;
|
||||
for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
|
||||
{
|
||||
result_type result = 0;
|
||||
@ -493,6 +510,8 @@ __kernel void BruteForceMatch_knnUnrollMatch(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
result = DIST_RES(result);
|
||||
|
||||
const int trainIdx = t * BLOCK_SIZE + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows)
|
||||
@ -631,11 +650,13 @@ __kernel void BruteForceMatch_knnMatch(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
result += reduce_block(s_query, s_train, lidx, lidy);
|
||||
result += reduce_block_match(s_query, s_train, lidx, lidy);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
result = DIST_RES(result);
|
||||
|
||||
const int trainIdx = t * BLOCK_SIZE + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
|
||||
|
@ -50,8 +50,6 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#endif
|
||||
|
||||
#define DATA_TYPE UNDEFINED
|
||||
|
||||
#if defined (DEPTH_0)
|
||||
#define DATA_TYPE uchar
|
||||
#define MAX_NUM 255
|
||||
@ -73,6 +71,10 @@
|
||||
#define SAT_CAST(num) (num)
|
||||
#endif
|
||||
|
||||
#ifndef DATA_TYPE
|
||||
#define DATA_TYPE UNDEFINED
|
||||
#endif
|
||||
|
||||
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))
|
||||
|
||||
enum
|
||||
|
@ -16,7 +16,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
|
@ -10,13 +10,9 @@
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Zhang Ying, zhangying913@gmail.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
@ -79,400 +75,298 @@
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
|
||||
#endif
|
||||
|
||||
#define THREADS 256
|
||||
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
|
||||
|
||||
inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp,
|
||||
int dst_rows, int dst_cols,
|
||||
int dst_startX, int dst_x_off,
|
||||
float alpha)
|
||||
{
|
||||
if(get_local_id(0) < anX || get_local_id(0) >= (THREADS-ksX+anX+1))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
uint4 tmp_sum = 0;
|
||||
int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4;
|
||||
int posY = (get_group_id(1) << 1);
|
||||
|
||||
for(int i=-anX; i<=anX; i++)
|
||||
{
|
||||
tmp_sum += vload4(get_local_id(0), temp+i);
|
||||
}
|
||||
|
||||
if(posY < dst_rows && posX < dst_cols)
|
||||
{
|
||||
tmp_sum /= (uint4) alpha;
|
||||
if(posX >= 0 && posX < dst_cols)
|
||||
*(dst) = tmp_sum.x;
|
||||
if(posX+1 >= 0 && posX+1 < dst_cols)
|
||||
*(dst + 1) = tmp_sum.y;
|
||||
if(posX+2 >= 0 && posX+2 < dst_cols)
|
||||
*(dst + 2) = tmp_sum.z;
|
||||
if(posX+3 >= 0 && posX+3 < dst_cols)
|
||||
*(dst + 3) = tmp_sum.w;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp,
|
||||
int dst_rows, int dst_cols,
|
||||
int dst_startX, int dst_x_off,
|
||||
float alpha)
|
||||
{
|
||||
if(get_local_id(0) >= (THREADS-ksX+1))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
int posX = dst_startX - dst_x_off + get_local_id(0);
|
||||
int posY = (get_group_id(1) << 1);
|
||||
|
||||
uint4 temp_sum = 0;
|
||||
for(int i=-anX; i<=anX; i++)
|
||||
{
|
||||
temp_sum += temp[get_local_id(0) + anX + i];
|
||||
}
|
||||
|
||||
if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows)
|
||||
*dst = convert_uchar4(convert_float4(temp_sum)/alpha);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
|
||||
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step
|
||||
)
|
||||
{
|
||||
|
||||
int col = get_local_id(0);
|
||||
const int gX = get_group_id(0);
|
||||
const int gY = get_group_id(1);
|
||||
int src_x_off = src_offset % src_step;
|
||||
int src_y_off = src_offset / src_step;
|
||||
int dst_x_off = dst_offset % dst_step;
|
||||
int dst_y_off = dst_offset / dst_step;
|
||||
|
||||
int head_off = dst_x_off%4;
|
||||
int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off;
|
||||
int startY = (gY << 1) - anY + src_y_off;
|
||||
int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
|
||||
int dst_startY = (gY << 1) + dst_y_off;
|
||||
|
||||
uint4 data[ksY+1];
|
||||
__local uint4 temp[2][THREADS];
|
||||
|
||||
#ifdef EXTRA_EXTRAPOLATION // border > src image size
|
||||
#ifdef BORDER_CONSTANT
|
||||
// None
|
||||
#elif defined BORDER_REPLICATE
|
||||
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
|
||||
{ \
|
||||
x = max(min(x, maxX - 1), minX); \
|
||||
y = max(min(y, maxY - 1), minY); \
|
||||
}
|
||||
#elif defined BORDER_WRAP
|
||||
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
|
||||
{ \
|
||||
if (x < minX) \
|
||||
x -= ((x - maxX + 1) / maxX) * maxX; \
|
||||
if (x >= maxX) \
|
||||
x %= maxX; \
|
||||
if (y < minY) \
|
||||
y -= ((y - maxY + 1) / maxY) * maxY; \
|
||||
if (y >= maxY) \
|
||||
y %= maxY; \
|
||||
}
|
||||
#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
|
||||
#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \
|
||||
{ \
|
||||
if (maxX - minX == 1) \
|
||||
x = minX; \
|
||||
else \
|
||||
do \
|
||||
{ \
|
||||
if (x < minX) \
|
||||
x = -(x - minX) - 1 + delta; \
|
||||
else \
|
||||
x = maxX - 1 - (x - maxX) - delta; \
|
||||
} \
|
||||
while (x >= maxX || x < minX); \
|
||||
\
|
||||
if (maxY - minY == 1) \
|
||||
y = minY; \
|
||||
else \
|
||||
do \
|
||||
{ \
|
||||
if (y < minY) \
|
||||
y = -(y - minY) - 1 + delta; \
|
||||
else \
|
||||
y = maxY - 1 - (y - maxY) - delta; \
|
||||
} \
|
||||
while (y >= maxY || y < minY); \
|
||||
}
|
||||
#ifdef BORDER_REFLECT
|
||||
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
|
||||
#elif defined(BORDER_REFLECT_101)
|
||||
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
|
||||
#endif
|
||||
#else
|
||||
#error No extrapolation method
|
||||
#endif
|
||||
#else
|
||||
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
|
||||
{ \
|
||||
int _row = y - minY, _col = x - minX; \
|
||||
_row = ADDR_H(_row, 0, maxY - minY); \
|
||||
_row = ADDR_B(_row, maxY - minY, _row); \
|
||||
y = _row + minY; \
|
||||
\
|
||||
_col = ADDR_L(_col, 0, maxX - minX); \
|
||||
_col = ADDR_R(_col, maxX - minX, _col); \
|
||||
x = _col + minX; \
|
||||
}
|
||||
#endif
|
||||
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
#if USE_DOUBLE
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#define FPTYPE double
|
||||
#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE)
|
||||
#else
|
||||
#define FPTYPE float
|
||||
#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE)
|
||||
#endif
|
||||
|
||||
#if DATA_DEPTH == 0
|
||||
#define BASE_TYPE uchar
|
||||
#elif DATA_DEPTH == 1
|
||||
#define BASE_TYPE char
|
||||
#elif DATA_DEPTH == 2
|
||||
#define BASE_TYPE ushort
|
||||
#elif DATA_DEPTH == 3
|
||||
#define BASE_TYPE short
|
||||
#elif DATA_DEPTH == 4
|
||||
#define BASE_TYPE int
|
||||
#elif DATA_DEPTH == 5
|
||||
#define BASE_TYPE float
|
||||
#elif DATA_DEPTH == 6
|
||||
#define BASE_TYPE double
|
||||
#else
|
||||
#error data_depth
|
||||
#endif
|
||||
|
||||
#define __CAT(x, y) x##y
|
||||
#define CAT(x, y) __CAT(x, y)
|
||||
|
||||
#define uchar1 uchar
|
||||
#define char1 char
|
||||
#define ushort1 ushort
|
||||
#define short1 short
|
||||
#define int1 int
|
||||
#define float1 float
|
||||
#define double1 double
|
||||
|
||||
#define convert_uchar1_sat_rte convert_uchar_sat_rte
|
||||
#define convert_char1_sat_rte convert_char_sat_rte
|
||||
#define convert_ushort1_sat_rte convert_ushort_sat_rte
|
||||
#define convert_short1_sat_rte convert_short_sat_rte
|
||||
#define convert_int1_sat_rte convert_int_sat_rte
|
||||
#define convert_float1
|
||||
#define convert_double1
|
||||
|
||||
#if DATA_DEPTH == 5 || DATA_DEPTH == 6
|
||||
#define CONVERT_TO_TYPE CAT(CAT(convert_, BASE_TYPE), VEC_SIZE)
|
||||
#else
|
||||
#define CONVERT_TO_TYPE CAT(CAT(CAT(convert_, BASE_TYPE), VEC_SIZE), _sat_rte)
|
||||
#endif
|
||||
|
||||
#define VEC_SIZE DATA_CHAN
|
||||
|
||||
#define VEC_TYPE CAT(BASE_TYPE, VEC_SIZE)
|
||||
#define TYPE VEC_TYPE
|
||||
|
||||
#define SCALAR_TYPE CAT(FPTYPE, VEC_SIZE)
|
||||
|
||||
#define INTERMEDIATE_TYPE CAT(FPTYPE, VEC_SIZE)
|
||||
|
||||
struct RectCoords
|
||||
{
|
||||
int x1, y1, x2, y2;
|
||||
};
|
||||
|
||||
//#define DEBUG
|
||||
#ifdef DEBUG
|
||||
#define DEBUG_ONLY(x) x
|
||||
#define ASSERT(condition) do { if (!(condition)) { printf("BUG in boxFilter kernel (global=%d,%d): " #condition "\n", get_global_id(0), get_global_id(1)); } } while (0)
|
||||
#else
|
||||
#define DEBUG_ONLY(x)
|
||||
#define ASSERT(condition)
|
||||
#endif
|
||||
|
||||
|
||||
inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global TYPE *src, const unsigned int srcStepBytes, const struct RectCoords srcCoords
|
||||
#ifdef BORDER_CONSTANT
|
||||
, SCALAR_TYPE borderValue
|
||||
#endif
|
||||
)
|
||||
{
|
||||
#ifdef BORDER_ISOLATED
|
||||
if(pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
|
||||
#else
|
||||
if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
|
||||
#endif
|
||||
{
|
||||
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
|
||||
__global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
|
||||
return CONVERT_TO_FPTYPE(*ptr);
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef BORDER_CONSTANT
|
||||
return borderValue;
|
||||
#else
|
||||
int selected_col = pos.x;
|
||||
int selected_row = pos.y;
|
||||
|
||||
EXTRAPOLATE(selected_col, selected_row,
|
||||
#ifdef BORDER_ISOLATED
|
||||
srcCoords.x1, srcCoords.y1,
|
||||
#else
|
||||
0, 0,
|
||||
#endif
|
||||
srcCoords.x2, srcCoords.y2
|
||||
);
|
||||
|
||||
// debug border mapping
|
||||
//printf("pos=%d,%d --> %d, %d\n", pos.x, pos.y, selected_col, selected_row);
|
||||
|
||||
pos = (int2)(selected_col, selected_row);
|
||||
if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
|
||||
{
|
||||
data[i].x = *(src+(startY+i)*src_step + startX + col * 4);
|
||||
data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1);
|
||||
data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2);
|
||||
data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3);
|
||||
__global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
|
||||
return CONVERT_TO_FPTYPE(*ptr);
|
||||
}
|
||||
else
|
||||
{
|
||||
data[i]=0;
|
||||
int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
|
||||
if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
|
||||
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
|
||||
if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
|
||||
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
|
||||
if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
|
||||
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
|
||||
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
|
||||
// for debug only
|
||||
DEBUG_ONLY(printf("BUG in boxFilter kernel\n"));
|
||||
return (FPTYPE)(0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
int not_all_in_range;
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
|
||||
| (startY+i<0) | (startY+i>src_whole_rows-1);
|
||||
if(not_all_in_range)
|
||||
{
|
||||
int selected_row;
|
||||
int4 selected_col;
|
||||
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
|
||||
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
|
||||
|
||||
selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
|
||||
selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
|
||||
|
||||
selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
|
||||
selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
|
||||
|
||||
selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
|
||||
selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
|
||||
|
||||
selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
|
||||
selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
|
||||
|
||||
data[i].x = *(src + selected_row * src_step + selected_col.x);
|
||||
data[i].y = *(src + selected_row * src_step + selected_col.y);
|
||||
data[i].z = *(src + selected_row * src_step + selected_col.z);
|
||||
data[i].w = *(src + selected_row * src_step + selected_col.w);
|
||||
}
|
||||
else
|
||||
{
|
||||
data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
uint4 tmp_sum = 0;
|
||||
for(int i=1; i < ksY; i++)
|
||||
{
|
||||
tmp_sum += (data[i]);
|
||||
}
|
||||
|
||||
int index = dst_startY * dst_step + dst_startX + (col-anX)*4;
|
||||
|
||||
temp[0][col] = tmp_sum + (data[0]);
|
||||
temp[1][col] = tmp_sum + (data[ksY]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
update_dst_C1_D0(dst+index, (__local uint *)(temp[0]),
|
||||
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
|
||||
update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]),
|
||||
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
|
||||
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha,
|
||||
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step
|
||||
)
|
||||
{
|
||||
int col = get_local_id(0);
|
||||
const int gX = get_group_id(0);
|
||||
const int gY = get_group_id(1);
|
||||
|
||||
int src_x_off = (src_offset % src_step) >> 2;
|
||||
int src_y_off = src_offset / src_step;
|
||||
int dst_x_off = (dst_offset % dst_step) >> 2;
|
||||
int dst_y_off = dst_offset / dst_step;
|
||||
|
||||
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
|
||||
int startY = (gY << 1) - anY + src_y_off;
|
||||
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
|
||||
int dst_startY = (gY << 1) + dst_y_off;
|
||||
|
||||
uint4 data[ksY+1];
|
||||
__local uint4 temp[2][THREADS];
|
||||
// INPUT PARAMETER: BLOCK_SIZE_Y (via defines)
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
|
||||
void boxFilter(__global TYPE *src, const unsigned int srcStepBytes, const int4 srcRC,
|
||||
__global TYPE *dst, const unsigned int dstStepBytes, const int4 dstRC,
|
||||
#ifdef BORDER_CONSTANT
|
||||
bool con;
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
|
||||
int cur_col = clamp(startX + col, 0, src_whole_cols);
|
||||
|
||||
data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0;
|
||||
data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0;
|
||||
data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0;
|
||||
data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0;
|
||||
}
|
||||
#else
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
int selected_row;
|
||||
int selected_col;
|
||||
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
|
||||
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
|
||||
|
||||
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
|
||||
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
|
||||
|
||||
|
||||
data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
|
||||
}
|
||||
|
||||
SCALAR_TYPE borderValue,
|
||||
#endif
|
||||
uint4 tmp_sum = 0;
|
||||
for(int i=1; i < ksY; i++)
|
||||
{
|
||||
tmp_sum += (data[i]);
|
||||
}
|
||||
|
||||
int index = dst_startY * (dst_step>>2)+ dst_startX + col;
|
||||
|
||||
temp[0][col] = tmp_sum + (data[0]);
|
||||
temp[1][col] = tmp_sum + (data[ksY]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]),
|
||||
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
|
||||
update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]),
|
||||
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
|
||||
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////32fC1////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha,
|
||||
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step
|
||||
)
|
||||
FPTYPE alpha
|
||||
)
|
||||
{
|
||||
int col = get_local_id(0);
|
||||
const int gX = get_group_id(0);
|
||||
const int gY = get_group_id(1);
|
||||
const struct RectCoords srcCoords = {srcRC.s0, srcRC.s1, srcRC.s2, srcRC.s3}; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
|
||||
const struct RectCoords dstCoords = {dstRC.s0, dstRC.s1, dstRC.s2, dstRC.s3};
|
||||
|
||||
int src_x_off = (src_offset % src_step) >> 2;
|
||||
int src_y_off = src_offset / src_step;
|
||||
int dst_x_off = (dst_offset % dst_step) >> 2;
|
||||
int dst_y_off = dst_offset / dst_step;
|
||||
const int x = get_local_id(0) + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
|
||||
const int y = get_global_id(1) * BLOCK_SIZE_Y;
|
||||
|
||||
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
|
||||
int startY = (gY << 1) - anY + src_y_off;
|
||||
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
|
||||
int dst_startY = (gY << 1) + dst_y_off;
|
||||
float data[ksY+1];
|
||||
__local float temp[2][THREADS];
|
||||
const int local_id = get_local_id(0);
|
||||
|
||||
INTERMEDIATE_TYPE data[KERNEL_SIZE_Y];
|
||||
__local INTERMEDIATE_TYPE sumOfCols[LOCAL_SIZE];
|
||||
|
||||
int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - ANCHOR_Y);
|
||||
for(int sy = 0; sy < KERNEL_SIZE_Y; sy++, srcPos.y++)
|
||||
{
|
||||
data[sy] = readSrcPixel(srcPos, src, srcStepBytes, srcCoords
|
||||
#ifdef BORDER_CONSTANT
|
||||
bool con;
|
||||
float ss;
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
|
||||
|
||||
int cur_col = clamp(startX + col, 0, src_whole_cols);
|
||||
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:(float)0;
|
||||
|
||||
data[i] = con ? ss : 0.f;
|
||||
}
|
||||
#else
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
int selected_row;
|
||||
int selected_col;
|
||||
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
|
||||
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
|
||||
|
||||
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
|
||||
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
|
||||
|
||||
data[i] = src[selected_row * (src_step>>2) + selected_col];
|
||||
}
|
||||
|
||||
, borderValue
|
||||
#endif
|
||||
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
|
||||
for(int i=1; i < ksY; i++)
|
||||
{
|
||||
sum0 += (data[i]);
|
||||
);
|
||||
}
|
||||
sum1 = sum0 + (data[0]);
|
||||
sum2 = sum0 + (data[ksY]);
|
||||
temp[0][col] = sum1;
|
||||
temp[1][col] = sum2;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(col < (THREADS-(ksX-1)))
|
||||
{
|
||||
col += anX;
|
||||
int posX = dst_startX - dst_x_off + col - anX;
|
||||
int posY = (gY << 1);
|
||||
|
||||
float tmp_sum[2]= {0.0, 0.0};
|
||||
for(int k=0; k<2; k++)
|
||||
for(int i=-anX; i<=anX; i++)
|
||||
INTERMEDIATE_TYPE tmp_sum = 0;
|
||||
for(int sy = 0; sy < KERNEL_SIZE_Y; sy++)
|
||||
{
|
||||
tmp_sum += (data[sy]);
|
||||
}
|
||||
|
||||
sumOfCols[local_id] = tmp_sum;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int2 pos = (int2)(dstCoords.x1 + x, dstCoords.y1 + y);
|
||||
__global TYPE* dstPtr = (__global TYPE*)((__global char*)dst + pos.x * sizeof(TYPE) + pos.y * dstStepBytes); // Pointer can be out of bounds!
|
||||
|
||||
int sy_index = 0; // current index in data[] array
|
||||
int stepsY = min(dstCoords.y2 - pos.y, BLOCK_SIZE_Y);
|
||||
ASSERT(stepsY > 0);
|
||||
for (; ;)
|
||||
{
|
||||
ASSERT(pos.y < dstCoords.y2);
|
||||
|
||||
if(local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&
|
||||
pos.x >= dstCoords.x1 && pos.x < dstCoords.x2)
|
||||
{
|
||||
ASSERT(pos.y >= dstCoords.y1 && pos.y < dstCoords.y2);
|
||||
|
||||
INTERMEDIATE_TYPE total_sum = 0;
|
||||
#pragma unroll
|
||||
for (int sx = 0; sx < KERNEL_SIZE_X; sx++)
|
||||
{
|
||||
tmp_sum[k] += temp[k][col+i];
|
||||
total_sum += sumOfCols[local_id + sx - ANCHOR_X];
|
||||
}
|
||||
for(int i=0; i<2; i++)
|
||||
{
|
||||
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
|
||||
dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
|
||||
*dstPtr = CONVERT_TO_TYPE(((INTERMEDIATE_TYPE)alpha) * total_sum);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////32fC4////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha,
|
||||
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step
|
||||
)
|
||||
{
|
||||
int col = get_local_id(0);
|
||||
const int gX = get_group_id(0);
|
||||
const int gY = get_group_id(1);
|
||||
|
||||
int src_x_off = (src_offset % src_step) >> 4;
|
||||
int src_y_off = src_offset / src_step;
|
||||
int dst_x_off = (dst_offset % dst_step) >> 4;
|
||||
int dst_y_off = dst_offset / dst_step;
|
||||
|
||||
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
|
||||
int startY = (gY << 1) - anY + src_y_off;
|
||||
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
|
||||
int dst_startY = (gY << 1) + dst_y_off;
|
||||
float4 data[ksY+1];
|
||||
__local float4 temp[2][THREADS];
|
||||
#ifdef BORDER_CONSTANT
|
||||
bool con;
|
||||
float4 ss;
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
|
||||
|
||||
int cur_col = clamp(startX + col, 0, src_whole_cols);
|
||||
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:(float4)0;
|
||||
|
||||
data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
|
||||
}
|
||||
#if BLOCK_SIZE_Y == 1
|
||||
break;
|
||||
#else
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
{
|
||||
int selected_row;
|
||||
int selected_col;
|
||||
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
|
||||
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
|
||||
if (--stepsY == 0)
|
||||
break;
|
||||
|
||||
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
|
||||
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
data[i] = src[selected_row * (src_step>>4) + selected_col];
|
||||
}
|
||||
tmp_sum = sumOfCols[local_id]; // TODO FIX IT: workaround for BUG in OpenCL compiler
|
||||
// only works with scalars: ASSERT(fabs(tmp_sum - sumOfCols[local_id]) < (INTERMEDIATE_TYPE)1e-6);
|
||||
tmp_sum -= data[sy_index];
|
||||
|
||||
data[sy_index] = readSrcPixel(srcPos, src, srcStepBytes, srcCoords
|
||||
#ifdef BORDER_CONSTANT
|
||||
, borderValue
|
||||
#endif
|
||||
float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
|
||||
for(int i=1; i < ksY; i++)
|
||||
{
|
||||
sum0 += (data[i]);
|
||||
}
|
||||
sum1 = sum0 + (data[0]);
|
||||
sum2 = sum0 + (data[ksY]);
|
||||
temp[0][col] = sum1;
|
||||
temp[1][col] = sum2;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(col < (THREADS-(ksX-1)))
|
||||
{
|
||||
col += anX;
|
||||
int posX = dst_startX - dst_x_off + col - anX;
|
||||
int posY = (gY << 1);
|
||||
);
|
||||
srcPos.y++;
|
||||
|
||||
float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
|
||||
for(int k=0; k<2; k++)
|
||||
for(int i=-anX; i<=anX; i++)
|
||||
{
|
||||
tmp_sum[k] += temp[k][col+i];
|
||||
}
|
||||
for(int i=0; i<2; i++)
|
||||
{
|
||||
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
|
||||
dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
|
||||
}
|
||||
tmp_sum += data[sy_index];
|
||||
sumOfCols[local_id] = tmp_sum;
|
||||
|
||||
sy_index = (sy_index + 1 < KERNEL_SIZE_Y) ? sy_index + 1 : 0;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// next line
|
||||
DEBUG_ONLY(pos.y++);
|
||||
dstPtr = (__global TYPE*)((__global char*)dstPtr + dstStepBytes); // Pointer can be out of bounds!
|
||||
#endif // BLOCK_SIZE_Y == 1
|
||||
}
|
||||
}
|
||||
|
370
modules/ocl/src/opencl/filtering_filter2D.cl
Normal file
370
modules/ocl/src/opencl/filtering_filter2D.cl
Normal file
@ -0,0 +1,370 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined(BORDER_REPLICATE)
// BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
// Out-of-range indices are clamped to the nearest valid index.
//   ADDR_L / ADDR_H handle the low (left / top) side and return the index
//   unchanged when it is not below the edge.
//   ADDR_R / ADDR_B handle the high (right / bottom) side; the edge is
//   exclusive, and `prev` (normally the ADDR_L / ADDR_H result) is passed
//   through when the index is below it.
// The third parameter of ADDR_L / ADDR_H is not used in this mode.
#define ADDR_L(idx, lo, hi)    (((idx) >= (lo)) ? (idx) : (lo))
#define ADDR_R(idx, hi, prev)  (((idx) < (hi)) ? (prev) : ((hi) - 1))
#define ADDR_H(idx, lo, hi)    (((idx) >= (lo)) ? (idx) : (lo))
#define ADDR_B(idx, hi, prev)  (((idx) < (hi)) ? (prev) : ((hi) - 1))
#endif
|
||||
|
||||
#if defined(BORDER_REFLECT)
// BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
// Indices are mirrored about the image edge, with the edge pixel repeated.
//   ADDR_L / ADDR_H mirror an index that is below the low edge. The formula
//   does not use the edge value itself, so it is only meaningful for a low
//   edge of 0 (which is how the macros are invoked in this file).
//   ADDR_R / ADDR_B mirror an index at or beyond the exclusive high edge and
//   otherwise pass `prev` through.
#define ADDR_L(idx, lo, hi)    (((idx) < (lo)) ? (-(idx) - 1) : (idx))
#define ADDR_R(idx, hi, prev)  (((idx) >= (hi)) ? (-(idx) - 1 + ((hi) << 1)) : (prev))
#define ADDR_H(idx, lo, hi)    (((idx) < (lo)) ? (-(idx) - 1) : (idx))
#define ADDR_B(idx, hi, prev)  (((idx) >= (hi)) ? (-(idx) - 1 + ((hi) << 1)) : (prev))
#endif
|
||||
|
||||
#if defined(BORDER_REFLECT_101)
// BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
// Indices are mirrored about the image edge without repeating the edge pixel.
//   ADDR_L / ADDR_H negate an index that is below the low edge. The edge
//   value is not part of the formula, so this is only meaningful for a low
//   edge of 0 (which is how the macros are invoked in this file).
//   ADDR_R / ADDR_B mirror an index at or beyond the exclusive high edge and
//   otherwise pass `prev` through.
#define ADDR_L(idx, lo, hi)    (((idx) < (lo)) ? (-(idx)) : (idx))
#define ADDR_R(idx, hi, prev)  (((idx) >= (hi)) ? (-(idx) - 2 + ((hi) << 1)) : (prev))
#define ADDR_H(idx, lo, hi)    (((idx) < (lo)) ? (-(idx)) : (idx))
#define ADDR_B(idx, hi, prev)  (((idx) >= (hi)) ? (-(idx) - 2 + ((hi) << 1)) : (prev))
#endif
|
||||
|
||||
// NOTE(review): the original remark here reads "blur function does not
// support BORDER_WRAP"; it appears to be carried over from the box-filter
// kernel - confirm whether it applies to filter2D.
#if defined(BORDER_WRAP)
// BORDER_WRAP: cdefgh|abcdefgh|abcdefg
// Indices wrap around by one period.
//   ADDR_L / ADDR_H add the high edge to an index below the low edge.
//   ADDR_R / ADDR_B subtract the high edge from an index at or beyond it and
//   otherwise pass `prev` through.
// Both formulas treat the high edge as the period, i.e. they assume a low
// edge of 0 (which is how the macros are invoked in this file).
#define ADDR_L(idx, lo, hi)    (((idx) < (lo)) ? ((idx) + (hi)) : (idx))
#define ADDR_R(idx, hi, prev)  (((idx) >= (hi)) ? ((idx) - (hi)) : (prev))
#define ADDR_H(idx, lo, hi)    (((idx) < (lo)) ? ((idx) + (hi)) : (idx))
#define ADDR_B(idx, hi, prev)  (((idx) >= (hi)) ? ((idx) - (hi)) : (prev))
#endif
|
||||
|
||||
// EXTRAPOLATE(x, y, minX, minY, maxX, maxY)
//
// Statement macro that rewrites the lvalues x and y in place, mapping a
// coordinate that lies outside [minX, maxX) x [minY, maxY) back into the
// source according to the selected border mode. It expands to a brace block,
// so invoke it as a statement: EXTRAPOLATE(...);
//
// Two implementations are provided:
//   * EXTRA_EXTRAPOLATION defined ("border > src image size"): the coordinate
//     may be further away than one image size, so the mapping is clamped
//     or repeated until the result is in range.
//   * otherwise: a single application of the ADDR_H/ADDR_B/ADDR_L/ADDR_R
//     macros on coordinates made relative to (minX, minY).
#ifdef EXTRA_EXTRAPOLATION // border > src image size
#ifdef BORDER_CONSTANT
// None: in constant-border mode the caller substitutes the border value
// and never extrapolates a coordinate.
#elif defined BORDER_REPLICATE
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
    { \
        x = max(min(x, (maxX) - 1), (minX)); \
        y = max(min(y, (maxY) - 1), (minY)); \
    }
#elif defined BORDER_WRAP
// NOTE(review): the wrap arithmetic uses maxX / maxY as the period and only
// consults minX / minY in the comparison, so it looks correct only for a
// zero minimum - confirm the intended behaviour for BORDER_ISOLATED ROIs.
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
    { \
        if (x < (minX)) \
            x -= ((x - (maxX) + 1) / (maxX)) * (maxX); \
        if (x >= (maxX)) \
            x %= (maxX); \
        if (y < (minY)) \
            y -= ((y - (maxY) + 1) / (maxY)) * (maxY); \
        if (y >= (maxY)) \
            y %= (maxY); \
    }
#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
// delta is 0 for BORDER_REFLECT (edge pixel repeated) and 1 for
// BORDER_REFLECT_101 (edge pixel not repeated).
//
// The low-side reflection is taken about minX / minY: the mirrored distance
// is added back onto the minimum. Without that offset a non-zero minimum
// produces an index that is still below the minimum, and the do/while loop
// can bounce between two out-of-range values forever.
#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \
    { \
        if ((maxX) - (minX) == 1) \
            x = (minX); \
        else \
            do \
            { \
                if (x < (minX)) \
                    x = (minX) - (x - (minX)) - 1 + (delta); \
                else \
                    x = (maxX) - 1 - (x - (maxX)) - (delta); \
            } \
            while (x >= (maxX) || x < (minX)); \
        \
        if ((maxY) - (minY) == 1) \
            y = (minY); \
        else \
            do \
            { \
                if (y < (minY)) \
                    y = (minY) - (y - (minY)) - 1 + (delta); \
                else \
                    y = (maxY) - 1 - (y - (maxY)) - (delta); \
            } \
            while (y >= (maxY) || y < (minY)); \
    }
#ifdef BORDER_REFLECT
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
#elif defined(BORDER_REFLECT_101)
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
#endif
#else
#error No extrapolation method
#endif
#else
// Single-step variant: shift into ROI-relative coordinates, apply the
// border-mode address macros once per axis, then shift back.
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
    { \
        int _row = (y) - (minY), _col = (x) - (minX); \
        _row = ADDR_H(_row, 0, (maxY) - (minY)); \
        _row = ADDR_B(_row, (maxY) - (minY), _row); \
        y = _row + (minY); \
        \
        _col = ADDR_L(_col, 0, (maxX) - (minX)); \
        _col = ADDR_R(_col, (maxX) - (minX), _col); \
        x = _col + (minX); \
    }
#endif
|
||||
|
||||
// Working floating-point precision.
//   FPTYPE            - scalar type used for accumulation and for the
//                       filter coefficients: double when USE_DOUBLE is
//                       non-zero (the fp64 extension is enabled first),
//                       float otherwise.
//   CONVERT_TO_FPTYPE - name of the conversion to the FPTYPE vector that has
//                       VEC_SIZE lanes (convert_float4, convert_double3, ...).
// CAT and VEC_SIZE are defined further down; being macros, they are only
// needed at the point where CONVERT_TO_FPTYPE is expanded.
#if USE_DOUBLE
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define FPTYPE double
#else
#define FPTYPE float
#endif

#define CONVERT_TO_FPTYPE CAT(CAT(convert_, FPTYPE), VEC_SIZE)
|
||||
|
||||
// BASE_TYPE: scalar element type of one image channel, chosen by the
// DATA_DEPTH build define. Any value outside 0..6 stops compilation.
// NOTE(review): the 0..6 codes look like OpenCV depth constants
// (8U, 8S, 16U, 16S, 32S, 32F, 64F) - confirm against the host code.
#if DATA_DEPTH == 0
#  define BASE_TYPE uchar
#elif DATA_DEPTH == 1
#  define BASE_TYPE char
#elif DATA_DEPTH == 2
#  define BASE_TYPE ushort
#elif DATA_DEPTH == 3
#  define BASE_TYPE short
#elif DATA_DEPTH == 4
#  define BASE_TYPE int
#elif DATA_DEPTH == 5
#  define BASE_TYPE float
#elif DATA_DEPTH == 6
#  define BASE_TYPE double
#else
#  error data_depth
#endif
|
||||
|
||||
// Token pasting. CAT expands its arguments before pasting them; __CAT pastes
// them as written.
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)

// Number of lanes per pixel, taken from the DATA_CHAN build define.
#define VEC_SIZE DATA_CHAN

// One-channel aliases: pasting a scalar type name with VEC_SIZE == 1 yields
// names such as "uchar1", which are mapped back to the plain scalar types.
#define uchar1  uchar
#define char1   char
#define ushort1 ushort
#define short1  short
#define int1    int
#define float1  float
#define double1 double

// Same idea for the conversion names. The one-lane floating-point
// conversions expand to nothing, so CONVERT(x) degenerates to (x).
#define convert_uchar1_sat_rte  convert_uchar_sat_rte
#define convert_char1_sat_rte   convert_char_sat_rte
#define convert_ushort1_sat_rte convert_ushort_sat_rte
#define convert_short1_sat_rte  convert_short_sat_rte
#define convert_int1_sat_rte    convert_int_sat_rte
#define convert_float1
#define convert_double1

// CONVERT_TO_TYPE: conversion from the working type to the pixel type.
// Integer depths use the "_sat_rte" form of the conversion; depths 5 and 6
// (float / double) use the plain form.
#if !(DATA_DEPTH == 5 || DATA_DEPTH == 6)
#define CONVERT_TO_TYPE CAT(CAT(CAT(convert_, BASE_TYPE), VEC_SIZE), _sat_rte)
#else
#define CONVERT_TO_TYPE CAT(CAT(convert_, BASE_TYPE), VEC_SIZE)
#endif

// Pixel type as stored in memory (BASE_TYPE with VEC_SIZE lanes).
#define VEC_TYPE CAT(BASE_TYPE, VEC_SIZE)
#define TYPE VEC_TYPE

// Type of the border value passed to the kernel.
#define SCALAR_TYPE CAT(FPTYPE, VEC_SIZE)

// Type used for accumulation inside the kernel.
#define INTERMEDIATE_TYPE CAT(FPTYPE, VEC_SIZE)
|
||||
|
||||
// Integer rectangle used for the source and destination regions.
// In this file (x1, y1) is added to a work-item offset to obtain a pixel
// position, and (x2, y2) is compared with "<", i.e. it is an exclusive bound.
struct RectCoords
{
    int x1; // left
    int y1; // top
    int x2; // right (exclusive)
    int y2; // bottom (exclusive)
};
|
||||
|
||||
// Debug helpers. They are active only when DEBUG is defined before this
// point (for example through the program build options); otherwise both
// macros expand to a no-op expression and their argument is not evaluated.
//
//   DEBUG_ONLY(x)     - emits x in debug builds only.
//   ASSERT(condition) - in debug builds, prints the failed condition together
//                       with the work-item's global id. It does not abort.
//
// get_global_id() returns size_t, so the ids are cast to int to match the
// "%d" conversions in the format string.
#ifdef DEBUG
#define DEBUG_ONLY(x) x
#define ASSERT(condition) \
    do \
    { \
        if (!(condition)) \
        { \
            printf("BUG in filter2D kernel (global=%d,%d): " #condition "\n", \
                   (int)get_global_id(0), (int)get_global_id(1)); \
        } \
    } while (0)
#else
#define DEBUG_ONLY(x) (void)0
#define ASSERT(condition) (void)0
#endif
|
||||
|
||||
|
||||
// Reads the source pixel at `pos` and returns it converted to the working
// floating-point type, applying the border mode when `pos` is out of range.
//
//   pos          - pixel position (x = column, y = row) relative to `src`.
//   src          - base pointer of the source image.
//   srcStepBytes - distance between consecutive rows, in bytes.
//   srcCoords    - valid region. With BORDER_ISOLATED all four bounds are
//                  honoured; otherwise the lower bound is (0, 0) and only
//                  x2 / y2 are used as exclusive upper bounds.
//   borderValue  - (BORDER_CONSTANT builds only) value returned for every
//                  out-of-range position.
//
// For the other border modes the position is remapped with EXTRAPOLATE and
// the pixel at the remapped position is returned. If the remapped position
// is still out of range, zero is returned (and a message is printed in
// DEBUG builds).
inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global TYPE *src, const unsigned int srcStepBytes, const struct RectCoords srcCoords
#ifdef BORDER_CONSTANT
                                      , SCALAR_TYPE borderValue
#endif
                                      )
{
#ifdef BORDER_ISOLATED
    if(pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
#else
    if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
#endif
    {
        __global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
        return CONVERT_TO_FPTYPE(*ptr);
    }
    else
    {
#ifdef BORDER_CONSTANT
        return borderValue;
#else
        int selected_col = pos.x;
        int selected_row = pos.y;

        // The lower bounds are chosen here, outside the macro invocation:
        // placing #ifdef / #else / #endif inside the argument list of a
        // function-like macro is undefined behaviour in C99.
#ifdef BORDER_ISOLATED
        const int borderMinX = srcCoords.x1;
        const int borderMinY = srcCoords.y1;
#else
        const int borderMinX = 0;
        const int borderMinY = 0;
#endif
        EXTRAPOLATE(selected_col, selected_row,
                    borderMinX, borderMinY,
                    srcCoords.x2, srcCoords.y2);

        // debug border mapping
        //printf("pos=%d,%d --> %d, %d\n", pos.x, pos.y, selected_col, selected_row);

        pos = (int2)(selected_col, selected_row);
        if(pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
        {
            __global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
            return CONVERT_TO_FPTYPE(*ptr);
        }
        else
        {
            // Extrapolation left the position out of range; report it in
            // debug builds and fall back to zero.
            DEBUG_ONLY(printf("BUG in filter2D kernel\n"));
            return (INTERMEDIATE_TYPE)(0);
        }
#endif
    }
}
|
||||
|
||||
// INPUT PARAMETER: BLOCK_SIZE_Y (via defines)
|
||||
|
||||
__kernel
|
||||
__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
|
||||
void filter2D(__global TYPE *src, const unsigned int srcStepBytes, const int4 srcRC,
|
||||
__global TYPE *dst, const unsigned int dstStepBytes, const int4 dstRC,
|
||||
#ifdef BORDER_CONSTANT
|
||||
SCALAR_TYPE borderValue,
|
||||
#endif
|
||||
__constant FPTYPE* kernelData // transposed: [KERNEL_SIZE_X][KERNEL_SIZE_Y2_ALIGNED]
|
||||
)
|
||||
{
|
||||
const struct RectCoords srcCoords = {srcRC.s0, srcRC.s1, srcRC.s2, srcRC.s3}; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
|
||||
struct RectCoords dstCoords = {dstRC.s0, dstRC.s1, dstRC.s2, dstRC.s3};
|
||||
|
||||
const int local_id = get_local_id(0);
|
||||
const int x = local_id + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
|
||||
const int y = get_global_id(1) * BLOCK_SIZE_Y;
|
||||
|
||||
INTERMEDIATE_TYPE data[KERNEL_SIZE_Y];
|
||||
__local INTERMEDIATE_TYPE sumOfCols[LOCAL_SIZE];
|
||||
|
||||
int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - ANCHOR_Y);
|
||||
|
||||
int2 pos = (int2)(dstCoords.x1 + x, dstCoords.y1 + y);
|
||||
__global TYPE* dstPtr = (__global TYPE*)((__global char*)dst + pos.x * sizeof(TYPE) + pos.y * dstStepBytes); // Pointer can be out of bounds!
|
||||
bool writeResult = (local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&
|
||||
pos.x >= dstCoords.x1 && pos.x < dstCoords.x2);
|
||||
|
||||
#if BLOCK_SIZE_Y > 1
|
||||
bool readAllpixels = true;
|
||||
int sy_index = 0; // current index in data[] array
|
||||
|
||||
dstCoords.y2 = min(dstCoords.y2, pos.y + BLOCK_SIZE_Y);
|
||||
for (;
|
||||
pos.y < dstCoords.y2;
|
||||
pos.y++,
|
||||
dstPtr = (__global TYPE*)((__global char*)dstPtr + dstStepBytes))
|
||||
#endif
|
||||
{
|
||||
ASSERT(pos.y < dstCoords.y2);
|
||||
|
||||
for (
|
||||
#if BLOCK_SIZE_Y > 1
|
||||
int sy = readAllpixels ? 0 : -1; sy < (readAllpixels ? KERNEL_SIZE_Y : 0);
|
||||
#else
|
||||
int sy = 0, sy_index = 0; sy < KERNEL_SIZE_Y;
|
||||
#endif
|
||||
sy++, srcPos.y++)
|
||||
{
|
||||
data[sy + sy_index] = readSrcPixel(srcPos, src, srcStepBytes, srcCoords
|
||||
#ifdef BORDER_CONSTANT
|
||||
, borderValue
|
||||
#endif
|
||||
);
|
||||
}
|
||||
|
||||
INTERMEDIATE_TYPE total_sum = 0;
|
||||
for (int sx = 0; sx < KERNEL_SIZE_X; sx++)
|
||||
{
|
||||
{
|
||||
__constant FPTYPE* k = &kernelData[KERNEL_SIZE_Y2_ALIGNED * sx
|
||||
#if BLOCK_SIZE_Y > 1
|
||||
+ KERNEL_SIZE_Y - sy_index
|
||||
#endif
|
||||
];
|
||||
INTERMEDIATE_TYPE tmp_sum = 0;
|
||||
for (int sy = 0; sy < KERNEL_SIZE_Y; sy++)
|
||||
{
|
||||
tmp_sum += data[sy] * k[sy];
|
||||
}
|
||||
|
||||
sumOfCols[local_id] = tmp_sum;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
int id = local_id + sx - ANCHOR_X;
|
||||
if (id >= 0 && id < LOCAL_SIZE)
|
||||
total_sum += sumOfCols[id];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (writeResult)
|
||||
{
|
||||
ASSERT(pos.y >= dstCoords.y1 && pos.y < dstCoords.y2);
|
||||
*dstPtr = CONVERT_TO_TYPE(total_sum);
|
||||
}
|
||||
|
||||
#if BLOCK_SIZE_Y > 1
|
||||
readAllpixels = false;
|
||||
#if BLOCK_SIZE_Y > KERNEL_SIZE_Y
|
||||
sy_index = (sy_index + 1 <= KERNEL_SIZE_Y) ? sy_index + 1 : 1;
|
||||
#else
|
||||
sy_index++;
|
||||
#endif
|
||||
#endif // BLOCK_SIZE_Y > 1
|
||||
}
|
||||
}
|
@ -1,381 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Pang Erping, erping@multicorewareinc.com
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////Macro for border type////////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef BORDER_REPLICATE
|
||||
|
||||
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_REFLECT
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_REFLECT_101
|
||||
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
|
||||
#endif
|
||||
|
||||
#ifdef IMG_C_1_0
|
||||
#define T_IMG uchar
|
||||
#define T_IMGx4 uchar4
|
||||
#define T_IMG_C1 uchar
|
||||
#define CONVERT_TYPE convert_uchar_sat
|
||||
#define CONVERT_TYPEx4 convert_uchar4_sat
|
||||
#endif
|
||||
#ifdef IMG_C_4_0
|
||||
#define T_IMG uchar4
|
||||
#define T_IMGx4 uchar16
|
||||
#define T_IMG_C1 uchar
|
||||
#define CONVERT_TYPE convert_uchar4_sat
|
||||
#define CONVERT_TYPEx4 convert_uchar16_sat
|
||||
#endif
|
||||
#ifdef IMG_C_1_5
|
||||
#define T_IMG float
|
||||
#define T_IMGx4 float4
|
||||
#define T_IMG_C1 float
|
||||
#define CONVERT_TYPE convert_float
|
||||
#define CONVERT_TYPEx4 convert_float4
|
||||
#endif
|
||||
#ifdef IMG_C_4_5
|
||||
#define T_IMG float4
|
||||
#define T_IMGx4 float16
|
||||
#define T_IMG_C1 float
|
||||
#define CONVERT_TYPE convert_float4
|
||||
#define CONVERT_TYPEx4 convert_float16
|
||||
#endif
|
||||
|
||||
#ifndef CN
|
||||
#define CN 1
|
||||
#endif
|
||||
|
||||
#if CN == 1
|
||||
#define T_SUM float
|
||||
#define T_SUMx4 float4
|
||||
#define CONVERT_TYPE_SUM convert_float
|
||||
#define CONVERT_TYPE_SUMx4 convert_float4
|
||||
#define SUM_ZERO (0.0f)
|
||||
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f)
|
||||
#define VLOAD4 vload4
|
||||
#define SX x
|
||||
#define SY y
|
||||
#define SZ z
|
||||
#define SW w
|
||||
#elif CN == 4
|
||||
#define T_SUM float4
|
||||
#define T_SUMx4 float16
|
||||
#define CONVERT_TYPE_SUM convert_float4
|
||||
#define CONVERT_TYPE_SUMx4 convert_float16
|
||||
#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f)
|
||||
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)
|
||||
#define VLOAD4 vload16
|
||||
#define SX s0123
|
||||
#define SY s4567
|
||||
#define SZ s89ab
|
||||
#define SW scdef
|
||||
#endif
|
||||
|
||||
#ifndef FILTER_SIZE
|
||||
#define FILTER_SIZE 3
|
||||
#endif
|
||||
|
||||
#define LOCAL_GROUP_SIZE 16
|
||||
|
||||
#define LOCAL_WIDTH ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
|
||||
#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
|
||||
|
||||
#define FILTER_RADIUS (FILTER_SIZE >> 1)
|
||||
|
||||
/*
 * Generic FILTER_SIZE x FILTER_SIZE filter, one output pixel per work-item.
 *
 * Phase 1: the work-group cooperatively copies a LOCAL_HEIGHT x LOCAL_WIDTH
 *          tile of the source (its own footprint plus a FILTER_RADIUS halo on
 *          every side) into local_data.
 * Phase 2: after a local barrier each work-item accumulates the weighted sum
 *          of its FILTER_SIZE x FILTER_SIZE neighbourhood from that tile and
 *          stores the converted result in dst.
 *
 * src, dst            images, indexed in T_IMG elements
 * src_step, dst_step  row pitch, used directly as an element index multiplier
 * mat_kernel          filter weights, read sequentially row by row
 * local_data          work-group scratch, indexed up to
 *                     LOCAL_HEIGHT * LOCAL_WIDTH elements
 *                     (presumably sized by the host - confirm at the call site)
 * wholerows/wholecols extent used for border handling
 * src_offset_x/y      origin of the processed region inside src
 * dst_offset_x/y      origin of the written region inside dst
 * cols, rows          size of the output region; work-items outside it write nothing
 * operate_cols        not referenced by this kernel
 */
__kernel void filter2D(
    __global T_IMG *src,
    __global T_IMG *dst,
    int src_step,
    int dst_step,
    __constant float *mat_kernel,
    __local T_IMG *local_data,
    int wholerows,
    int wholecols,
    int src_offset_x,
    int src_offset_y,
    int dst_offset_x,
    int dst_offset_y,
    int cols,
    int rows,
    int operate_cols
)
{
    const int tileStepX = get_local_size(0);
    const int tileStepY = get_local_size(1);
    const int tileOriginX = get_group_id(0) * tileStepX;
    const int tileOriginY = get_group_id(1) * tileStepY;

    const int lx = get_local_id(0);
    const int ly = get_local_id(1);
    const int outX = tileOriginX + lx;
    const int outY = tileOriginY + ly;

    const int src_offset = mad24(src_offset_y, src_step, src_offset_x);
    const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x);

    // Phase 1: strided cooperative load of the tile (footprint + halo).
    for (int ty = ly; ty < LOCAL_HEIGHT; ty += tileStepY)
    {
        // Tile row expressed in region coordinates, still shifted by +FILTER_RADIUS.
        int srcY = tileOriginY + ty;
#ifndef BORDER_CONSTANT
        // Fold rows that fall above / below the image back inside it.
        srcY = ADDR_H(srcY, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y);
        srcY = ADDR_B(srcY - FILTER_RADIUS, wholerows - src_offset_y, srcY - FILTER_RADIUS);
#endif

        for (int tx = lx; tx < LOCAL_WIDTH; tx += tileStepX)
        {
            const int slot = ty * LOCAL_WIDTH + tx;
            int srcX = tileOriginX + tx;

#ifdef BORDER_CONSTANT
            // Constant border: anything outside the whole image becomes zero.
            const bool insideImage =
                srcY >= FILTER_RADIUS - src_offset_y &&
                (srcY - FILTER_RADIUS) < wholerows - src_offset_y &&
                srcX >= FILTER_RADIUS - src_offset_x &&
                (srcX - FILTER_RADIUS) < wholecols - src_offset_x;

            if (insideImage)
            {
                local_data[slot] = src[(srcY - FILTER_RADIUS) * src_step + srcX - FILTER_RADIUS + src_offset];
            }
            else
            {
                local_data[slot] = 0;
            }
#else
            // Fold columns that fall left / right of the image back inside it.
            srcX = ADDR_L(srcX, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x);
            srcX = ADDR_R(srcX - FILTER_RADIUS, wholecols - src_offset_x, srcX - FILTER_RADIUS);

            if (srcY < wholerows && srcX < wholecols)
            {
                local_data[slot] = src[(srcY) * src_step + srcX + src_offset];
            }
#endif
        }
    }

    // Every work-item must reach this barrier before any of them reads the tile.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Work-items that only helped with loading have nothing to write.
    if (globalRowOutOfRange(outY, rows) || outX >= cols)
    {
        return;
    }

    // Phase 2: weighted sum over the neighbourhood held in local memory.
    T_SUM acc = (T_SUM)(SUM_ZERO);
    int tap = 0;
    for (int ky = 0; ky < FILTER_SIZE; ky++)
    {
        const int rowBase = (ky + ly) * LOCAL_WIDTH;

        for (int kx = 0; kx < FILTER_SIZE; kx++)
        {
            acc += CONVERT_TYPE_SUM(local_data[rowBase + kx + lx]) * mat_kernel[tap++];
        }
    }

    dst[(outY)*dst_step + (outX) + dst_offset] = CONVERT_TYPE(acc);
}
|
||||
|
||||
/// following is specific for 3x3 kernels
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////Macros defining the number of elements per thread/////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define ANX 1
|
||||
#define ANY 1
|
||||
|
||||
#define ROWS_PER_GROUP 4
|
||||
#define ROWS_PER_GROUP_BITS 2
|
||||
#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
|
||||
|
||||
#define THREADS_PER_ROW 64
|
||||
#define THREADS_PER_ROW_BIT 6
|
||||
|
||||
#define ELEMENTS_PER_THREAD 4
|
||||
#define ELEMENTS_PER_THREAD_BIT 2
|
||||
|
||||
#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/*
 * 3x3-specialised filter: each work-item produces ELEMENTS_PER_THREAD (4)
 * horizontally adjacent output pixels with one vector store.
 *
 * Phase 1: the work-group stages ROWS_FETCH source rows (ROWS_PER_GROUP rows
 *          plus an ANY-row halo above and below) into local_data, laid out as
 *          rows of LOCAL_MEM_STEP elements. Work-items with lX < 2*ANX also
 *          fetch the extra columns that follow the group's own span.
 * Phase 2: after a local barrier each work-item accumulates the
 *          FILTER_SIZE x FILTER_SIZE taps for its 4 pixels and merges the
 *          result with the existing destination vector, so lanes that fall
 *          outside [dst_offset_x, dst_offset_x + cols) keep their old value.
 *
 * src, dst            images, indexed in T_IMG elements
 * src_step, dst_step  row pitch, used directly as an element index multiplier
 * mat_kernel          filter weights, indexed as [i * FILTER_SIZE + j]
 * local_data          work-group scratch, rows of LOCAL_MEM_STEP elements
 *                     (presumably ROWS_FETCH rows are allocated by the host - confirm)
 * wholerows/wholecols image extent used for border handling
 * src_offset_x/y      origin of the processed region inside src
 * dst_offset_x/y      origin of the written region inside dst
 * cols, rows          size of the output region
 * operate_cols        number of columns (from the aligned start) that are processed
 *
 * NOTE(review): the lX >> THREADS_PER_ROW_BIT / lX % THREADS_PER_ROW split
 * looks like it expects a 1-D work-group of THREADS_PER_ROW * ROWS_PER_GROUP
 * work-items - confirm against the host launch code.
 */
__kernel void filter2D_3x3(
    __global T_IMG *src,
    __global T_IMG *dst,
    int src_step,
    int dst_step,
    __constant float *mat_kernel,
    __local T_IMG *local_data,
    int wholerows,
    int wholecols,
    int src_offset_x,
    int src_offset_y,
    int dst_offset_x,
    int dst_offset_y,
    int cols,
    int rows,
    int operate_cols
)
{
    int gY = get_global_id(1);

    int lX = get_local_id(0);

    int groupX_size = get_local_size(0);
    int groupX_id = get_group_id(0);

    // Misalignment of the destination start column relative to a 4-element vector.
#define dst_align (dst_offset_x & 3)
    int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
    int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;

    if((gY << ROWS_PER_GROUP_BITS) < rows)
    {
        for(int i = 0; i < ROWS_FETCH; ++i)
        {
            // Skip halo rows that lie more than ANY rows below the output region.
            if((rows_start_index - src_offset_y) + i < rows + ANY)
            {
#ifdef BORDER_CONSTANT
                int selected_row = rows_start_index + i;
                int selected_cols = cols_start_index_group + lX;
                int row_inside = selected_row >= 0 && selected_row < wholerows;

                // Only touch global memory when the coordinate is inside the
                // image; border pixels are zero and must not be read.
                T_IMG data = (T_IMG)(0);
                if(row_inside && selected_cols >= 0 && selected_cols < wholecols)
                {
                    data = src[mad24(selected_row, src_step, selected_cols)];
                }
                local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;

                // The first 2*ANX work-items also load the columns that follow
                // the group's own span.
                if(lX < (ANX << 1))
                {
                    selected_cols = cols_start_index_group + lX + groupX_size;

                    data = (T_IMG)(0);
                    if(row_inside && selected_cols >= 0 && selected_cols < wholecols)
                    {
                        data = src[mad24(selected_row, src_step, selected_cols)];
                    }
                    local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
                }
#else
                // Fold out-of-image coordinates back inside via the border macros.
                int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
                selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);

                int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
                selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);

                T_IMG data = src[mad24(selected_row, src_step, selected_cols)];

                local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;

                // The first 2*ANX work-items also load the columns that follow
                // the group's own span.
                if(lX < (ANX << 1))
                {
                    selected_cols = cols_start_index_group + lX + groupX_size;
                    selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);

                    data = src[mad24(selected_row, src_step, selected_cols)];
                    local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
                }
#endif
            }
        }
    }

    // Reached by every work-item, including those that loaded nothing.
    barrier(CLK_LOCAL_MEM_FENCE);

    int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT);
    if(((gY << ROWS_PER_GROUP_BITS) < rows) && (process_col < operate_cols))
    {
        int dst_cols_start = dst_offset_x;
        int dst_cols_end = dst_offset_x + cols;
        // Round the first column down to a multiple of 4 for the vector access.
        int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;

        int dst_rows_end = dst_offset_y + rows;
        int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);

        // Rows past the end of the output are neither read nor written.
        if(dst_rows_index < dst_rows_end)
        {
            dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index);

            // Existing destination values, kept for lanes outside the column range.
            T_IMGx4 dst_data = *(__global T_IMGx4 *)dst;

            T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4;
            T_IMGx4 data;

            for(int i = 0; i < FILTER_SIZE; i++)
            {
#pragma unroll
                for(int j = 0; j < FILTER_SIZE; j++)
                {
                    int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
                    int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;

                    data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols));
                    sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data));
                }
            }

            T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum);
            tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ?
                         tmp_dst.SX : dst_data.SX;
            tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ?
                         tmp_dst.SY : dst_data.SY;
            tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ?
                         tmp_dst.SZ : dst_data.SZ;
            tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ?
                         tmp_dst.SW : dst_data.SW;
            *(__global T_IMGx4 *)dst = tmp_dst;
        }
    }
}
|
@ -11,6 +11,7 @@
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Nathan, liujun@multicorewareinc.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
// Erping Pang, erping@multicorewareinc.com
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
@ -37,7 +38,6 @@
|
||||
//
|
||||
//
|
||||
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
#define CV_HAAR_FEATURE_MAX 3
|
||||
|
||||
#define calc_sum(rect,offset) (sum[(rect).p0+offset] - sum[(rect).p1+offset] - sum[(rect).p2+offset] + sum[(rect).p3+offset])
|
||||
@ -101,6 +101,144 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
|
||||
float inv_window_area __attribute__((aligned (4)));
|
||||
} GpuHidHaarClassifierCascade;
|
||||
|
||||
|
||||
#ifdef PACKED_CLASSIFIER
|
||||
// this code is scalar, one pixel -> one workitem
|
||||
/*
 * Scalar Haar cascade evaluation: one work-item tests one window position.
 *
 * Each work-group reads its own descriptor from pWGInfo:
 *   .x = (width << 16) | height, .y = (groupX << 16) | groupY,
 *   .z = element offset of the image inside sum / sqsum,
 *   .w = bit pattern of the float scale factor.
 * The group stages a (LSx + WND_SIZE_X) x (LSy + WND_SIZE_Y) window of the
 * integral image into local memory, derives the variance normalisation factor
 * for its position, then walks stages start_stage .. end_stage-1 until one
 * rejects. Surviving positions are appended to candidate[], whose first int
 * is used as the running count.
 *
 * LSx, LSy, WND_SIZE_X, WND_SIZE_Y, NODE_SIZE and OUTPUTSZ are not defined in
 * this block - presumably supplied as build options; confirm on the host side.
 *
 * info, nodeptr, loopcount, split_stage and splitnode are not referenced by
 * this kernel body.
 */
__kernel void gpuRunHaarClassifierCascadePacked(
    global const GpuHidHaarStageClassifier * stagecascadeptr,
    global const int4 * info,
    global const GpuHidHaarTreeNode * nodeptr,
    global const int * restrict sum,
    global const float * restrict sqsum,
    volatile global int4 * candidate,
    const int pixelstep,
    const int loopcount,
    const int start_stage,
    const int split_stage,
    const int end_stage,
    const int startnode,
    const int splitnode,
    const int4 p,
    const int4 pq,
    const float correction,
    global const int* pNodesPK,
    global const int4* pWGInfo
    )
{
// Low / high 16-bit halves of a packed int.
#define M0(_t) ((_t)&0xFFFF)
#define M1(_t) (((_t)>>16)&0xFFFF)

// Extent of the integral-image window staged in local memory.
#define DATA_SIZE_X (LSx+WND_SIZE_X)
#define DATA_SIZE_Y (LSy+WND_SIZE_Y)
#define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y)

    const int lid_x = (int)get_local_id(0);
    const int lid_y = (int)get_local_id(1);
    const int lid = lid_y*LSx+lid_x;

    // Per-work-group descriptor; every group listed has work to do.
    const int4 WGInfo = pWGInfo[(int)get_group_id(0)];
    const int GroupX = M1(WGInfo.y);
    const int GroupY = M0(WGInfo.y);
    const int Width = M1(WGInfo.x);
    const int Height = M0(WGInfo.x);
    const int ImgOffset = WGInfo.z;
    const float ScaleFactor = as_float(WGInfo.w);

    local int SumL[DATA_SIZE];

    // Cooperative load: work-item `lid` fills slots lid, lid + LSx*LSy, ...
    // Coordinates are clamped to the last column / row of the image.
    for(int index = lid; index < DATA_SIZE; index += (LSx*LSy))
    {
        const int x = min(GroupX + (index % (DATA_SIZE_X)), Width-1);
        const int y = min(GroupY + (index / (DATA_SIZE_X)), Height-1);
        SumL[index] = sum[ImgOffset+y*pixelstep+x];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Position of this work-item inside the local window and inside the
    // (offset) global image.
    const int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
    const int glb_off = (GroupY+lid_y)* pixelstep + (GroupX+lid_x) + ImgOffset;

    // Variance normalisation factor, shared by all stages. `p` addresses the
    // four corners in the local integral window, `pq` in the global sqsum.
    float mean =
        SumL[p.y*DATA_SIZE_X+p.x+lcl_off] -
        SumL[p.y*DATA_SIZE_X+p.z+lcl_off] -
        SumL[p.w*DATA_SIZE_X+p.x+lcl_off] +
        SumL[p.w*DATA_SIZE_X+p.z+lcl_off];
    float sq =
        sqsum[pq.y*pixelstep+pq.x+glb_off] -
        sqsum[pq.y*pixelstep+pq.z+glb_off] -
        sqsum[pq.w*pixelstep+pq.x+glb_off] +
        sqsum[pq.w*pixelstep+pq.z+glb_off];

    mean *= correction;
    sq *= correction;

    float variance_norm_factor = sq - mean * mean;
    if(variance_norm_factor >= 0.f)
    {
        variance_norm_factor = sqrt(variance_norm_factor);
    }
    else
    {
        variance_norm_factor = 1.f;
    }

    // Walk the stages until one rejects this position.
    int nodecounter = startnode;
    int result = 1;
    int stageloop = start_stage;
    while((stageloop < end_stage) && result)
    {
        // Stage header viewed as two ints: .x = node count, .y = threshold bits.
        const int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
        const float stagethreshold = as_float(stageinfo.y);
        float stage_sum = 0.0f;

        for(int nodeloop = 0; nodeloop < stageinfo.x; ++nodeloop, ++nodecounter)
        {
            // Packed node: three int4 words read from global memory.
            global const int4* pN = (__global int4*)(pNodesPK+nodecounter*NODE_SIZE);
            const int4 n0 = pN[0];
            const int4 n1 = pN[1];
            const int4 n2 = pN[2];

            const float nodethreshold = as_float(n2.y) * variance_norm_factor;

            // Integral sums of the node's three rectangles; each int packs two
            // 16-bit corner offsets into the local window.
            const int rect0 = SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off];
            const int rect1 = SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off];
            const int rect2 = SumL[M0(n1.x)+lcl_off] - SumL[M1(n1.x)+lcl_off] - SumL[M0(n1.y)+lcl_off] + SumL[M1(n1.y)+lcl_off];

            const float classsum =
                rect0 * as_float(n1.z) +
                rect1 * as_float(n1.w) +
                rect2 * as_float(n2.x);

            // Accumulate the stage response: n2.w when the node fires, n2.z otherwise.
            stage_sum += (classsum >= nodethreshold) ? as_float(n2.w) : as_float(n2.z);
        }

        result = (stage_sum >= stagethreshold);
        stageloop++;
    }

    // Rejected by some stage: nothing to report.
    if(!result)
    {
        return;
    }

    // All stages passed. Reserve an output slot; element 0 of candidate holds
    // the counter, so slots start at 1.
    const int index = 1+atomic_inc((volatile global int*)candidate);
    if(index<OUTPUTSZ)
    {
        const int x = GroupX+lid_x;
        const int y = GroupY+lid_y;
        // Window origin and size mapped back through the scale factor.
        candidate[index] = (int4)(convert_int_rtn(x*ScaleFactor),
                                  convert_int_rtn(y*ScaleFactor),
                                  convert_int_rtn(ScaleFactor*WND_SIZE_X),
                                  convert_int_rtn(ScaleFactor*WND_SIZE_Y));
    }
} // end gpuRunHaarClassifierCascadePacked
|
||||
#else
|
||||
|
||||
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
|
||||
global GpuHidHaarStageClassifier * stagecascadeptr,
|
||||
global int4 * info,
|
||||
@ -183,7 +321,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
int glb_x = grpoffx + (lcl_x<<2);
|
||||
int glb_y = grpoffy + lcl_y;
|
||||
|
||||
int glb_off = mad24(min(glb_y, height - 1),pixelstep,glb_x);
|
||||
int glb_off = mad24(min(glb_y, height + WINDOWSIZE - 1),pixelstep,glb_x);
|
||||
int4 data = *(__global int4*)&sum[glb_off];
|
||||
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
|
||||
|
||||
@ -283,12 +421,23 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
|
||||
result = (stage_sum >= stagethreshold);
|
||||
}
|
||||
|
||||
if(result && (x < width) && (y < height))
|
||||
if(factor < 2)
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
|
||||
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
|
||||
if(result && lclidx %2 ==0 && lclidy %2 ==0 )
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
|
||||
lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(result)
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
|
||||
lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor);
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int queuecount = lclcount[0];
|
||||
@ -411,13 +560,30 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
|
||||
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
|
||||
temp = glboutindex[0];
|
||||
int4 candidate_result;
|
||||
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
|
||||
candidate_result.x = convert_int_rtn(x*factor);
|
||||
candidate_result.y = convert_int_rtn(y*factor);
|
||||
candidate_result.zw = (int2)convert_int_rte(factor*20.f);
|
||||
candidate_result.x = convert_int_rte(x*factor);
|
||||
candidate_result.y = convert_int_rte(y*factor);
|
||||
atomic_inc(glboutindex);
|
||||
candidate[outputoff+temp+lcl_id] = candidate_result;
|
||||
|
||||
int i = outputoff+temp+lcl_id;
|
||||
if(candidate[i].z == 0)
|
||||
{
|
||||
candidate[i] = candidate_result;
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i=i+1;;i++)
|
||||
{
|
||||
if(candidate[i].z == 0)
|
||||
{
|
||||
candidate[i] = candidate_result;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
|
||||
}//end for(int scalei = 0; scalei <loopcount; scalei++)
|
||||
}
|
||||
#endif
|
||||
|
@ -18,6 +18,7 @@
|
||||
// Wu Xinglong, wxl370@126.com
|
||||
// Sen Liu, swjtuls1987@126.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
// Erping Pang, erping@multicorewareinc.com
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
@ -120,7 +121,6 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
|
||||
int grpidx = get_group_id(0);
|
||||
int lclidx = get_local_id(0);
|
||||
int lclidy = get_local_id(1);
|
||||
int lcl_sz = mul24(grpszx, grpszy);
|
||||
int lcl_id = mad24(lclidy, grpszx, lclidx);
|
||||
__local int glboutindex[1];
|
||||
__local int lclcount[1];
|
||||
@ -142,7 +142,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
|
||||
int totalgrp = scaleinfo1.y & 0xffff;
|
||||
float factor = as_float(scaleinfo1.w);
|
||||
float correction_t = correction[scalei];
|
||||
int ystep = (int)(max(2.0f, factor) + 0.5f);
|
||||
float ystep = max(2.0f, factor);
|
||||
|
||||
for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
|
||||
{
|
||||
@ -151,8 +151,8 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
|
||||
int grpidx = grploop - mul24(grpidy, grpnumperline);
|
||||
int ix = mad24(grpidx, grpszx, lclidx);
|
||||
int iy = mad24(grpidy, grpszy, lclidy);
|
||||
int x = ix * ystep;
|
||||
int y = iy * ystep;
|
||||
int x = round(ix * ystep);
|
||||
int y = round(iy * ystep);
|
||||
lcloutindex[lcl_id] = 0;
|
||||
lclcount[0] = 0;
|
||||
int nodecounter;
|
||||
@ -243,7 +243,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (result && (ix < width) && (iy < height))
|
||||
if (result)
|
||||
{
|
||||
int queueindex = atomic_inc(lclcount);
|
||||
lcloutindex[queueindex] = (y << 16) | x;
|
||||
@ -258,10 +258,26 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
|
||||
int y = (temp & (int)0xffff0000) >> 16;
|
||||
temp = atomic_inc(glboutindex);
|
||||
int4 candidate_result;
|
||||
candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
|
||||
candidate_result.zw = (int2)convert_int_rte(factor * 20.f);
|
||||
candidate_result.x = x;
|
||||
candidate_result.y = y;
|
||||
candidate[outputoff + temp + lcl_id] = candidate_result;
|
||||
|
||||
int i = outputoff+temp+lcl_id;
|
||||
if(candidate[i].z == 0)
|
||||
{
|
||||
candidate[i] = candidate_result;
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i=i+1;;i++)
|
||||
{
|
||||
if(candidate[i].z == 0)
|
||||
{
|
||||
candidate[i] = candidate_result;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -284,7 +300,7 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
|
||||
tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
|
||||
}
|
||||
|
||||
t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);
|
||||
t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]);
|
||||
counter += nodenum;
|
||||
#pragma unroll
|
||||
|
||||
|
@ -16,7 +16,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -43,60 +43,63 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#endif
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////Macro for border type////////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef BORDER_REPLICATE
|
||||
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_CONSTANT
|
||||
#elif defined BORDER_REPLICATE
|
||||
#define EXTRAPOLATE(x, maxV) \
|
||||
{ \
|
||||
x = max(min(x, maxV - 1), 0); \
|
||||
}
|
||||
#elif defined BORDER_WRAP
|
||||
#define EXTRAPOLATE(x, maxV) \
|
||||
{ \
|
||||
if (x < 0) \
|
||||
x -= ((x - maxV + 1) / maxV) * maxV; \
|
||||
if (x >= maxV) \
|
||||
x %= maxV; \
|
||||
}
|
||||
#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT101)
|
||||
#define EXTRAPOLATE_(x, maxV, delta) \
|
||||
{ \
|
||||
if (maxV == 1) \
|
||||
x = 0; \
|
||||
else \
|
||||
do \
|
||||
{ \
|
||||
if ( x < 0 ) \
|
||||
x = -x - 1 + delta; \
|
||||
else \
|
||||
x = maxV - 1 - (x - maxV) - delta; \
|
||||
} \
|
||||
while (x >= maxV || x < 0); \
|
||||
}
|
||||
#ifdef BORDER_REFLECT
|
||||
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
|
||||
#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0)
|
||||
#else
|
||||
#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1)
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_REFLECT101
|
||||
//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_WRAP
|
||||
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
|
||||
#else
|
||||
#error No extrapolation method
|
||||
#endif
|
||||
|
||||
#define THREADS 256
|
||||
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////calcHarris////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void calcHarris(__global const float *Dx,__global const float *Dy, __global float *dst,
|
||||
int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
|
||||
int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step,
|
||||
float k)
|
||||
|
||||
__kernel void calcHarris(__global const float *Dx, __global const float *Dy, __global float *dst,
|
||||
int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
|
||||
int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step, float k)
|
||||
{
|
||||
int col = get_local_id(0);
|
||||
const int gX = get_group_id(0);
|
||||
const int gY = get_group_id(1);
|
||||
const int glx = get_global_id(0);
|
||||
const int gly = get_global_id(1);
|
||||
int gX = get_group_id(0);
|
||||
int gY = get_group_id(1);
|
||||
int gly = get_global_id(1);
|
||||
|
||||
int dx_x_off = (dx_offset % dx_step) >> 2;
|
||||
int dx_y_off = dx_offset / dx_step;
|
||||
@ -112,41 +115,36 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
|
||||
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
|
||||
int dst_startY = (gY << 1) + dst_y_off;
|
||||
|
||||
float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1];
|
||||
float dx_data[ksY+1],dy_data[ksY+1], data[3][ksY+1];
|
||||
__local float temp[6][THREADS];
|
||||
|
||||
#ifdef BORDER_CONSTANT
|
||||
bool dx_con,dy_con;
|
||||
float dx_s,dy_s;
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
for (int i=0; i < ksY+1; i++)
|
||||
{
|
||||
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
|
||||
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
|
||||
dx_data[i] = dx_con ? dx_s : 0.0;
|
||||
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
|
||||
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
|
||||
dy_data[i] = dy_con ? dy_s : 0.0;
|
||||
bool dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
|
||||
int indexDx = (dx_startY+i)*(dx_step>>2)+(dx_startX+col);
|
||||
float dx_s = dx_con ? Dx[indexDx] : 0.0f;
|
||||
dx_data[i] = dx_s;
|
||||
bool dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
|
||||
int indexDy = (dy_startY+i)*(dy_step>>2)+(dy_startX+col);
|
||||
float dy_s = dx_con ? Dy[indexDy] : 0.0f;
|
||||
dy_data[i] = dy_s;
|
||||
data[0][i] = dx_data[i] * dx_data[i];
|
||||
data[1][i] = dx_data[i] * dy_data[i];
|
||||
data[2][i] = dy_data[i] * dy_data[i];
|
||||
}
|
||||
#else
|
||||
int clamped_col = min(dst_cols, col);
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
for (int i=0; i < ksY+1; i++)
|
||||
{
|
||||
int dx_selected_row;
|
||||
int dx_selected_col;
|
||||
dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
|
||||
dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
|
||||
dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
|
||||
dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
|
||||
int dx_selected_row = dx_startY+i, dx_selected_col = dx_startX+clamped_col;
|
||||
EXTRAPOLATE(dx_selected_row, dx_whole_rows)
|
||||
EXTRAPOLATE(dx_selected_col, dx_whole_cols)
|
||||
dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
|
||||
|
||||
int dy_selected_row;
|
||||
int dy_selected_col;
|
||||
dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
|
||||
dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
|
||||
dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
|
||||
dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
|
||||
int dy_selected_row = dy_startY+i, dy_selected_col = dy_startX+clamped_col;
|
||||
EXTRAPOLATE(dy_selected_row, dy_whole_rows)
|
||||
EXTRAPOLATE(dy_selected_col, dy_whole_cols)
|
||||
dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
|
||||
|
||||
data[0][i] = dx_data[i] * dx_data[i];
|
||||
@ -154,46 +152,45 @@ __kernel void calcHarris(__global const float *Dx,__global const float *Dy, __gl
|
||||
data[2][i] = dy_data[i] * dy_data[i];
|
||||
}
|
||||
#endif
|
||||
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
|
||||
for(int i=1; i < ksY; i++)
|
||||
float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;
|
||||
for (int i=1; i < ksY; i++)
|
||||
{
|
||||
sum0 += (data[0][i]);
|
||||
sum1 += (data[1][i]);
|
||||
sum2 += (data[2][i]);
|
||||
sum0 += data[0][i];
|
||||
sum1 += data[1][i];
|
||||
sum2 += data[2][i];
|
||||
}
|
||||
float sum01,sum02,sum11,sum12,sum21,sum22;
|
||||
sum01 = sum0 + (data[0][0]);
|
||||
sum02 = sum0 + (data[0][ksY]);
|
||||
|
||||
float sum01 = sum0 + data[0][0];
|
||||
float sum02 = sum0 + data[0][ksY];
|
||||
temp[0][col] = sum01;
|
||||
temp[1][col] = sum02;
|
||||
sum11 = sum1 + (data[1][0]);
|
||||
sum12 = sum1 + (data[1][ksY]);
|
||||
float sum11 = sum1 + data[1][0];
|
||||
float sum12 = sum1 + data[1][ksY];
|
||||
temp[2][col] = sum11;
|
||||
temp[3][col] = sum12;
|
||||
sum21 = sum2 + (data[2][0]);
|
||||
sum22 = sum2 + (data[2][ksY]);
|
||||
float sum21 = sum2 + data[2][0];
|
||||
float sum22 = sum2 + data[2][ksY];
|
||||
temp[4][col] = sum21;
|
||||
temp[5][col] = sum22;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(col < (THREADS-(ksX-1)))
|
||||
|
||||
if (col < (THREADS- (ksX - 1)))
|
||||
{
|
||||
col += anX;
|
||||
int posX = dst_startX - dst_x_off + col - anX;
|
||||
int posY = (gly << 1);
|
||||
int till = (ksX + 1)%2;
|
||||
float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 };
|
||||
for(int k=0; k<6; k++)
|
||||
for(int i=-anX; i<=anX - till; i++)
|
||||
{
|
||||
float tmp_sum[6] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
|
||||
for (int k=0; k<6; k++)
|
||||
for (int i=-anX; i<=anX - till; i++)
|
||||
tmp_sum[k] += temp[k][col+i];
|
||||
}
|
||||
|
||||
if(posX < dst_cols && (posY) < dst_rows)
|
||||
if (posX < dst_cols && (posY) < dst_rows)
|
||||
{
|
||||
dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] =
|
||||
tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);
|
||||
}
|
||||
if(posX < dst_cols && (posY + 1) < dst_rows)
|
||||
if (posX < dst_cols && (posY + 1) < dst_rows)
|
||||
{
|
||||
dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] =
|
||||
tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -43,60 +43,62 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#endif
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////Macro for border type////////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef BORDER_REPLICATE
|
||||
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_CONSTANT
|
||||
#elif defined BORDER_REPLICATE
|
||||
#define EXTRAPOLATE(x, maxV) \
|
||||
{ \
|
||||
x = max(min(x, maxV - 1), 0); \
|
||||
}
|
||||
#elif defined BORDER_WRAP
|
||||
#define EXTRAPOLATE(x, maxV) \
|
||||
{ \
|
||||
if (x < 0) \
|
||||
x -= ((x - maxV + 1) / maxV) * maxV; \
|
||||
if (x >= maxV) \
|
||||
x %= maxV; \
|
||||
}
|
||||
#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT101)
|
||||
#define EXTRAPOLATE_(x, maxV, delta) \
|
||||
{ \
|
||||
if (maxV == 1) \
|
||||
x = 0; \
|
||||
else \
|
||||
do \
|
||||
{ \
|
||||
if ( x < 0 ) \
|
||||
x = -x - 1 + delta; \
|
||||
else \
|
||||
x = maxV - 1 - (x - maxV) - delta; \
|
||||
} \
|
||||
while (x >= maxV || x < 0); \
|
||||
}
|
||||
#ifdef BORDER_REFLECT
|
||||
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
|
||||
#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0)
|
||||
#else
|
||||
#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1)
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_REFLECT101
|
||||
//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
|
||||
#endif
|
||||
|
||||
#ifdef BORDER_WRAP
|
||||
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
|
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
|
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
|
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
|
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
|
||||
#else
|
||||
#error No extrapolation method
|
||||
#endif
|
||||
|
||||
#define THREADS 256
|
||||
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////calcMinEigenVal///////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
__kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, __global float *dst,
|
||||
int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
|
||||
int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step,
|
||||
float k)
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step, float k)
|
||||
{
|
||||
int col = get_local_id(0);
|
||||
const int gX = get_group_id(0);
|
||||
const int gY = get_group_id(1);
|
||||
const int glx = get_global_id(0);
|
||||
const int gly = get_global_id(1);
|
||||
int gX = get_group_id(0);
|
||||
int gY = get_group_id(1);
|
||||
int gly = get_global_id(1);
|
||||
|
||||
int dx_x_off = (dx_offset % dx_step) >> 2;
|
||||
int dx_y_off = dx_offset / dx_step;
|
||||
@ -112,42 +114,36 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
|
||||
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
|
||||
int dst_startY = (gY << 1) + dst_y_off;
|
||||
|
||||
float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1];
|
||||
float dx_data[ksY+1], dy_data[ksY+1], data[3][ksY+1];
|
||||
__local float temp[6][THREADS];
|
||||
|
||||
#ifdef BORDER_CONSTANT
|
||||
bool dx_con,dy_con;
|
||||
float dx_s,dy_s;
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
for (int i=0; i < ksY+1; i++)
|
||||
{
|
||||
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
|
||||
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
|
||||
dx_data[i] = dx_con ? dx_s : 0.0;
|
||||
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
|
||||
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
|
||||
dy_data[i] = dy_con ? dy_s : 0.0;
|
||||
bool dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
|
||||
int indexDx = (dx_startY+i)*(dx_step>>2)+(dx_startX+col);
|
||||
float dx_s = dx_con ? Dx[indexDx] : 0.0f;
|
||||
dx_data[i] = dx_s;
|
||||
bool dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
|
||||
int indexDy = (dy_startY+i)*(dy_step>>2)+(dy_startX+col);
|
||||
float dy_s = dx_con ? Dy[indexDy] : 0.0f;
|
||||
dy_data[i] = dy_s;
|
||||
data[0][i] = dx_data[i] * dx_data[i];
|
||||
data[1][i] = dx_data[i] * dy_data[i];
|
||||
data[2][i] = dy_data[i] * dy_data[i];
|
||||
}
|
||||
#else
|
||||
int clamped_col = min(dst_cols, col);
|
||||
|
||||
for(int i=0; i < ksY+1; i++)
|
||||
for (int i=0; i < ksY+1; i++)
|
||||
{
|
||||
int dx_selected_row;
|
||||
int dx_selected_col;
|
||||
dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
|
||||
dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
|
||||
dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
|
||||
dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
|
||||
int dx_selected_row = dx_startY+i, dx_selected_col = dx_startX+clamped_col;
|
||||
EXTRAPOLATE(dx_selected_row, dx_whole_rows)
|
||||
EXTRAPOLATE(dx_selected_col, dx_whole_cols)
|
||||
dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
|
||||
|
||||
int dy_selected_row;
|
||||
int dy_selected_col;
|
||||
dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
|
||||
dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
|
||||
dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
|
||||
dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
|
||||
int dy_selected_row = dy_startY+i, dy_selected_col = dy_startX+clamped_col;
|
||||
EXTRAPOLATE(dy_selected_row, dy_whole_rows)
|
||||
EXTRAPOLATE(dy_selected_col, dy_whole_cols)
|
||||
dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
|
||||
|
||||
data[0][i] = dx_data[i] * dx_data[i];
|
||||
@ -155,39 +151,38 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
|
||||
data[2][i] = dy_data[i] * dy_data[i];
|
||||
}
|
||||
#endif
|
||||
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
|
||||
for(int i=1; i < ksY; i++)
|
||||
float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;
|
||||
for (int i=1; i < ksY; i++)
|
||||
{
|
||||
sum0 += (data[0][i]);
|
||||
sum1 += (data[1][i]);
|
||||
sum2 += (data[2][i]);
|
||||
}
|
||||
float sum01,sum02,sum11,sum12,sum21,sum22;
|
||||
sum01 = sum0 + (data[0][0]);
|
||||
sum02 = sum0 + (data[0][ksY]);
|
||||
|
||||
float sum01 = sum0 + (data[0][0]);
|
||||
float sum02 = sum0 + (data[0][ksY]);
|
||||
temp[0][col] = sum01;
|
||||
temp[1][col] = sum02;
|
||||
sum11 = sum1 + (data[1][0]);
|
||||
sum12 = sum1 + (data[1][ksY]);
|
||||
float sum11 = sum1 + (data[1][0]);
|
||||
float sum12 = sum1 + (data[1][ksY]);
|
||||
temp[2][col] = sum11;
|
||||
temp[3][col] = sum12;
|
||||
sum21 = sum2 + (data[2][0]);
|
||||
sum22 = sum2 + (data[2][ksY]);
|
||||
float sum21 = sum2 + (data[2][0]);
|
||||
float sum22 = sum2 + (data[2][ksY]);
|
||||
temp[4][col] = sum21;
|
||||
temp[5][col] = sum22;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(col < (THREADS-(ksX-1)))
|
||||
{
|
||||
col += anX;
|
||||
int posX = dst_startX - dst_x_off + col - anX;
|
||||
int posY = (gly << 1);
|
||||
int till = (ksX + 1)%2;
|
||||
float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 };
|
||||
for(int k=0; k<6; k++)
|
||||
for(int i=-anX; i<=anX - till; i++)
|
||||
{
|
||||
float tmp_sum[6] = { 0.0f, 0.0f , 0.0f, 0.0f, 0.0f, 0.0f };
|
||||
for (int k=0; k<6; k++)
|
||||
for (int i=-anX; i<=anX - till; i++)
|
||||
tmp_sum[k] += temp[k][col+i];
|
||||
}
|
||||
|
||||
if(posX < dst_cols && (posY) < dst_rows)
|
||||
{
|
||||
@ -196,7 +191,7 @@ __kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy,
|
||||
float c = tmp_sum[4] * 0.5f;
|
||||
dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
|
||||
}
|
||||
if(posX < dst_cols && (posY + 1) < dst_rows)
|
||||
if (posX < dst_cols && (posY + 1) < dst_rows)
|
||||
{
|
||||
float a = tmp_sum[1] * 0.5f;
|
||||
float b = tmp_sum[3];
|
||||
|
@ -43,9 +43,6 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
||||
|
||||
#ifdef L2GRAD
|
||||
inline float calc(int x, int y)
|
||||
{
|
||||
@ -248,7 +245,12 @@ void calcMagnitude
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// 0.4142135623730950488016887242097 is tan(22.5)
|
||||
#define CANNY_SHIFT 15
|
||||
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
|
||||
|
||||
#ifdef DOUBLE_SUPPORT
|
||||
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
|
||||
#else
|
||||
#define TG22 (int)(0.4142135623730950488016887242097f*(1<<CANNY_SHIFT) + 0.5f)
|
||||
#endif
|
||||
|
||||
//First pass of edge detection and non-maximum suppression
|
||||
// edgetype is set to for each pixel:
|
||||
@ -374,6 +376,14 @@ calcMap
|
||||
#undef CANNY_SHIFT
|
||||
#undef TG22
|
||||
|
||||
// Bundles a pointer to int data in global memory with its row stride and
// dimensions, so the get()/set() helpers can address element (y, x).
struct PtrStepSz {
|
||||
// Base address of the first element.
__global int *ptr;
|
||||
// Row stride; get()/set() apply it as a byte offset (step * y on a char pointer).
int step;
|
||||
// Extent of the addressed region: number of rows and columns.
int rows, cols;
|
||||
};
|
||||
// Returns the int stored at row y, column x of `data`.
// The address is data.ptr advanced by data.step * y bytes plus x ints.
// No bounds check is performed here; callers must validate y and x themselves.
inline int get(struct PtrStepSz data, int y, int x) { return *((__global int *)((__global char*)data.ptr + data.step * y + sizeof(int) * x)); }
|
||||
// Stores `value` at row y, column x of `data`.
// Uses the same addressing as get(): data.ptr advanced by data.step * y bytes plus x ints.
// No bounds check is performed here; callers must validate y and x themselves.
inline void set(struct PtrStepSz data, int y, int x, int value) { *((__global int *)((__global char*)data.ptr + data.step * y + sizeof(int) * x)) = value; }
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// do Hysteresis for pixel whose edge type is 1
|
||||
//
|
||||
@ -390,7 +400,7 @@ void
|
||||
__attribute__((reqd_work_group_size(16,16,1)))
|
||||
edgesHysteresisLocal
|
||||
(
|
||||
__global int * map,
|
||||
__global int * map_ptr,
|
||||
__global ushort2 * st,
|
||||
__global unsigned int * counter,
|
||||
int rows,
|
||||
@ -399,10 +409,11 @@ edgesHysteresisLocal
|
||||
int map_offset
|
||||
)
|
||||
{
|
||||
#if 0
|
||||
map_step /= sizeof(*map);
|
||||
map_offset /= sizeof(*map);
|
||||
|
||||
map += map_offset;
|
||||
const __global int* map = map_ptr + map_offset;
|
||||
|
||||
__local int smem[18][18];
|
||||
|
||||
@ -482,6 +493,92 @@ edgesHysteresisLocal
|
||||
st[ind] = (ushort2)(gidx + 1, gidy + 1);
|
||||
}
|
||||
}
|
||||
#else
|
||||
struct PtrStepSz map = {((__global int *)((__global char*)map_ptr + map_offset)), map_step, rows, cols};
|
||||
|
||||
__local int smem[18][18];
|
||||
|
||||
int2 blockIdx = (int2)(get_group_id(0), get_group_id(1));
|
||||
int2 blockDim = (int2)(get_local_size(0), get_local_size(1));
|
||||
int2 threadIdx = (int2)(get_local_id(0), get_local_id(1));
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? get(map, y, x) : 0;
|
||||
if (threadIdx.y == 0)
|
||||
smem[0][threadIdx.x + 1] = y > 0 ? get(map, y - 1, x) : 0;
|
||||
if (threadIdx.y == blockDim.y - 1)
|
||||
smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? get(map, y + 1, x) : 0;
|
||||
if (threadIdx.x == 0)
|
||||
smem[threadIdx.y + 1][0] = x > 0 ? get(map, y, x - 1) : 0;
|
||||
if (threadIdx.x == blockDim.x - 1)
|
||||
smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? get(map, y, x + 1) : 0;
|
||||
if (threadIdx.x == 0 && threadIdx.y == 0)
|
||||
smem[0][0] = y > 0 && x > 0 ? get(map, y - 1, x - 1) : 0;
|
||||
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
|
||||
smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? get(map, y - 1, x + 1) : 0;
|
||||
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
|
||||
smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? get(map, y + 1, x - 1) : 0;
|
||||
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
|
||||
smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? get(map, y + 1, x + 1) : 0;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (x >= map.cols || y >= map.rows)
|
||||
return;
|
||||
|
||||
int n;
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < 16; ++k)
|
||||
{
|
||||
n = 0;
|
||||
|
||||
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
|
||||
{
|
||||
n += smem[threadIdx.y ][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
|
||||
|
||||
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
|
||||
|
||||
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
|
||||
}
|
||||
|
||||
if (n > 0)
|
||||
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
|
||||
}
|
||||
|
||||
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
|
||||
|
||||
set(map, y, x, e);
|
||||
|
||||
n = 0;
|
||||
|
||||
if (e == 2)
|
||||
{
|
||||
n += smem[threadIdx.y ][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
|
||||
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
|
||||
|
||||
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
|
||||
|
||||
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
|
||||
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
|
||||
}
|
||||
|
||||
if (n > 0)
|
||||
{
|
||||
const int ind = atomic_inc(counter);
|
||||
st[ind] = (ushort2)(x, y);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
__constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
|
||||
@ -505,17 +602,12 @@ edgesHysteresisGlobal
|
||||
int map_offset
|
||||
)
|
||||
{
|
||||
|
||||
map_step /= sizeof(*map);
|
||||
map_offset /= sizeof(*map);
|
||||
|
||||
map += map_offset;
|
||||
|
||||
int gidx = get_global_id(0);
|
||||
int gidy = get_global_id(1);
|
||||
|
||||
int lidx = get_local_id(0);
|
||||
int lidy = get_local_id(1);
|
||||
|
||||
int grp_idx = get_group_id(0);
|
||||
int grp_idy = get_group_id(1);
|
||||
@ -536,14 +628,39 @@ edgesHysteresisGlobal
|
||||
if(ind < count)
|
||||
{
|
||||
ushort2 pos = st1[ind];
|
||||
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
|
||||
if (lidx < 8)
|
||||
{
|
||||
if (lidx < 8)
|
||||
pos.x += c_dx[lidx];
|
||||
pos.y += c_dy[lidx];
|
||||
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && map[pos.x + pos.y * map_step] == 1)
|
||||
{
|
||||
pos.x += c_dx[lidx];
|
||||
pos.y += c_dy[lidx];
|
||||
map[pos.x + pos.y * map_step] = 2;
|
||||
|
||||
if (map[pos.x + pos.y * map_step] == 1)
|
||||
ind = atomic_inc(&s_counter);
|
||||
|
||||
s_st[ind] = pos;
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
while (s_counter > 0 && s_counter <= stack_size - get_local_size(0))
|
||||
{
|
||||
const int subTaskIdx = lidx >> 3;
|
||||
const int portion = min(s_counter, (uint)(get_local_size(0)>> 3));
|
||||
|
||||
if (subTaskIdx < portion)
|
||||
pos = s_st[s_counter - 1 - subTaskIdx];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
s_counter -= portion;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (subTaskIdx < portion)
|
||||
{
|
||||
pos.x += c_dx[lidx & 7];
|
||||
pos.y += c_dy[lidx & 7];
|
||||
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && map[pos.x + pos.y * map_step] == 1)
|
||||
{
|
||||
map[pos.x + pos.y * map_step] = 2;
|
||||
|
||||
@ -553,54 +670,22 @@ edgesHysteresisGlobal
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
while (s_counter > 0 && s_counter <= stack_size - get_local_size(0))
|
||||
if (s_counter > 0)
|
||||
{
|
||||
if (lidx == 0)
|
||||
{
|
||||
const int subTaskIdx = lidx >> 3;
|
||||
const int portion = min(s_counter, (uint)(get_local_size(0)>> 3));
|
||||
|
||||
pos.x = pos.y = 0;
|
||||
|
||||
if (subTaskIdx < portion)
|
||||
pos = s_st[s_counter - 1 - subTaskIdx];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
s_counter -= portion;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
|
||||
{
|
||||
pos.x += c_dx[lidx & 7];
|
||||
pos.y += c_dy[lidx & 7];
|
||||
|
||||
if (map[pos.x + pos.y * map_step] == 1)
|
||||
{
|
||||
map[pos.x + pos.y * map_step] = 2;
|
||||
|
||||
ind = atomic_inc(&s_counter);
|
||||
|
||||
s_st[ind] = pos;
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
ind = atomic_add(counter, s_counter);
|
||||
s_ind = ind - s_counter;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (s_counter > 0)
|
||||
ind = s_ind;
|
||||
|
||||
for (int i = lidx; i < (int)s_counter; i += get_local_size(0))
|
||||
{
|
||||
if (lidx == 0)
|
||||
{
|
||||
ind = atomic_add(counter, s_counter);
|
||||
s_ind = ind - s_counter;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
ind = s_ind;
|
||||
|
||||
for (int i = lidx; i < s_counter; i += get_local_size(0))
|
||||
{
|
||||
st2[ind + i] = s_st[i];
|
||||
}
|
||||
st2[ind + i] = s_st[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -47,7 +47,7 @@
|
||||
#define WAVE_SIZE 1
|
||||
#endif
|
||||
|
||||
int calc_lut(__local int* smem, int val, int tid)
|
||||
static int calc_lut(__local int* smem, int val, int tid)
|
||||
{
|
||||
smem[tid] = val;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -61,7 +61,7 @@ int calc_lut(__local int* smem, int val, int tid)
|
||||
}
|
||||
|
||||
#ifdef CPU
|
||||
void reduce(volatile __local int* smem, int val, int tid)
|
||||
static void reduce(volatile __local int* smem, int val, int tid)
|
||||
{
|
||||
smem[tid] = val;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -101,7 +101,7 @@ void reduce(volatile __local int* smem, int val, int tid)
|
||||
|
||||
#else
|
||||
|
||||
void reduce(__local volatile int* smem, int val, int tid)
|
||||
static void reduce(__local volatile int* smem, int val, int tid)
|
||||
{
|
||||
smem[tid] = val;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -147,9 +147,9 @@ __kernel void calcLut(__global __const uchar * src, __global uchar * lut,
|
||||
{
|
||||
__local int smem[512];
|
||||
|
||||
const int tx = get_group_id(0);
|
||||
const int ty = get_group_id(1);
|
||||
const unsigned int tid = get_local_id(1) * get_local_size(0)
|
||||
int tx = get_group_id(0);
|
||||
int ty = get_group_id(1);
|
||||
int tid = get_local_id(1) * get_local_size(0)
|
||||
+ get_local_id(0);
|
||||
|
||||
smem[tid] = 0;
|
||||
|
@ -19,7 +19,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -63,8 +63,8 @@
|
||||
kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum,
|
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
int4 src_t[2], sum_t[2];
|
||||
float4 sqsum_t[2];
|
||||
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
@ -75,8 +75,8 @@ kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global fl
|
||||
gid = gid << 1;
|
||||
for(int i = 0; i < rows; i =i + LSIZE_1)
|
||||
{
|
||||
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0);
|
||||
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0);
|
||||
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, cols - 1)]) : 0);
|
||||
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, cols - 1)]) : 0);
|
||||
|
||||
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
@ -163,8 +163,8 @@ kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__
|
||||
__global float *sqsum,int rows,int cols,int src_step,int sum_step,
|
||||
int sqsum_step,int sum_offset,int sqsum_offset)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
int4 src_t[2], sum_t[2];
|
||||
float4 sqsrc_t[2],sqsum_t[2];
|
||||
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
@ -279,8 +279,8 @@ kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__
|
||||
kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum,
|
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
float4 src_t[2], sum_t[2];
|
||||
float4 sqsum_t[2];
|
||||
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
@ -291,8 +291,8 @@ kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global
|
||||
gid = gid << 1;
|
||||
for(int i = 0; i < rows; i =i + LSIZE_1)
|
||||
{
|
||||
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0);
|
||||
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0);
|
||||
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, cols - 1)]) : (float4)0);
|
||||
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, cols - 1)]) : (float4)0);
|
||||
|
||||
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
|
||||
@ -379,8 +379,8 @@ kernel void integral_rows_D5(__global float4 *srcsum,__global float4 * srcsqsum,
|
||||
__global float *sqsum,int rows,int cols,int src_step,int sum_step,
|
||||
int sqsum_step,int sum_offset,int sqsum_offset)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
float4 src_t[2], sum_t[2];
|
||||
float4 sqsrc_t[2],sqsum_t[2];
|
||||
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -64,8 +64,8 @@
|
||||
kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum ,
|
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
int4 src_t[2], sum_t[2];
|
||||
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
__local int* sum_p;
|
||||
@ -146,8 +146,8 @@ kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum ,
|
||||
int rows,int cols,int src_step,int sum_step,
|
||||
int sum_offset)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
int4 src_t[2], sum_t[2];
|
||||
__local int4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
__local int *sum_p;
|
||||
@ -239,8 +239,8 @@ kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum ,
|
||||
kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum ,
|
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
float4 src_t[2], sum_t[2];
|
||||
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
__local float* sum_p;
|
||||
@ -321,8 +321,8 @@ kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum ,
|
||||
int rows,int cols,int src_step,int sum_step,
|
||||
int sum_offset)
|
||||
{
|
||||
unsigned int lid = get_local_id(0);
|
||||
unsigned int gid = get_group_id(0);
|
||||
int lid = get_local_id(0);
|
||||
int gid = get_group_id(0);
|
||||
float4 src_t[2], sum_t[2];
|
||||
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
|
||||
__local float *sum_p;
|
||||
|
@ -16,7 +16,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -106,10 +106,10 @@ __kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst,
|
||||
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
|
||||
op(p4, p2); op(p6, p4); op(p4, p2);
|
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols)
|
||||
if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
|
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
|
||||
}
|
||||
#undef op(a,b)
|
||||
#undef op
|
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
|
||||
__kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
|
||||
@ -148,10 +148,10 @@ __kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, i
|
||||
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
|
||||
op(p4, p2); op(p6, p4); op(p4, p2);
|
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols)
|
||||
if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
|
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
|
||||
}
|
||||
#undef op(a,b)
|
||||
#undef op
|
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
|
||||
__kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
|
||||
@ -190,10 +190,10 @@ __kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, i
|
||||
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
|
||||
op(p4, p2); op(p6, p4); op(p4, p2);
|
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols)
|
||||
if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
|
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
|
||||
}
|
||||
#undef op(a,b)
|
||||
#undef op
|
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
|
||||
__kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
|
||||
@ -232,10 +232,10 @@ __kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst,
|
||||
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
|
||||
op(p4, p2); op(p6, p4); op(p4, p2);
|
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols)
|
||||
if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
|
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
|
||||
}
|
||||
#undef op(a,b)
|
||||
#undef op
|
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
|
||||
__kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
|
||||
@ -294,10 +294,10 @@ __kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst,
|
||||
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
|
||||
op(p7, p11); op(p11, p13); op(p11, p12);
|
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols)
|
||||
if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
|
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
|
||||
}
|
||||
#undef op(a,b)
|
||||
#undef op
|
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
|
||||
__kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
|
||||
@ -356,10 +356,10 @@ __kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, i
|
||||
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
|
||||
op(p7, p11); op(p11, p13); op(p11, p12);
|
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols)
|
||||
if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
|
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
|
||||
}
|
||||
#undef op(a,b)
|
||||
#undef op
|
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
|
||||
__kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
|
||||
@ -418,10 +418,10 @@ __kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst,
|
||||
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
|
||||
op(p7, p11); op(p11, p13); op(p11, p12);
|
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols)
|
||||
if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
|
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
|
||||
}
|
||||
#undef op(a,b)
|
||||
#undef op
|
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
|
||||
__kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
|
||||
@ -480,7 +480,7 @@ __kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, i
|
||||
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
|
||||
op(p7, p11); op(p11, p13); op(p11, p12);
|
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols)
|
||||
if((int)get_global_id(1)<rows && (int)get_global_id(0)<cols)
|
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
|
||||
}
|
||||
#undef op(a,b)
|
||||
#undef op
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -60,7 +60,7 @@
|
||||
#elif defined BORDER_REPLICATE
|
||||
#define EXTRAPOLATE(v2, v) \
|
||||
{ \
|
||||
v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), zero); \
|
||||
v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), (int2)(0)); \
|
||||
v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
|
||||
}
|
||||
#elif defined BORDER_WRAP
|
||||
@ -139,7 +139,9 @@ __kernel void remap_2_32FC1(__global const T * restrict src, __global T * dst,
|
||||
|
||||
if (NEED_EXTRAPOLATION(gx, gy))
|
||||
{
|
||||
int2 gxy = (int2)(gx, gy), zero = (int2)(0);
|
||||
#ifndef BORDER_CONSTANT
|
||||
int2 gxy = (int2)(gx, gy);
|
||||
#endif
|
||||
EXTRAPOLATE(gxy, dst[dstIdx]);
|
||||
}
|
||||
else
|
||||
@ -167,10 +169,7 @@ __kernel void remap_32FC2(__global const T * restrict src, __global T * dst, __g
|
||||
int gx = gxy.x, gy = gxy.y;
|
||||
|
||||
if (NEED_EXTRAPOLATION(gx, gy))
|
||||
{
|
||||
int2 zero = (int2)(0);
|
||||
EXTRAPOLATE(gxy, dst[dstIdx]);
|
||||
}
|
||||
EXTRAPOLATE(gxy, dst[dstIdx])
|
||||
else
|
||||
{
|
||||
int srcIdx = mad24(gy, src_step, gx + src_offset);
|
||||
@ -196,10 +195,7 @@ __kernel void remap_16SC2(__global const T * restrict src, __global T * dst, __g
|
||||
int gx = gxy.x, gy = gxy.y;
|
||||
|
||||
if (NEED_EXTRAPOLATION(gx, gy))
|
||||
{
|
||||
int2 zero = (int2)(0);
|
||||
EXTRAPOLATE(gxy, dst[dstIdx]);
|
||||
}
|
||||
EXTRAPOLATE(gxy, dst[dstIdx])
|
||||
else
|
||||
{
|
||||
int srcIdx = mad24(gy, src_step, gx + src_offset);
|
||||
@ -231,7 +227,6 @@ __kernel void remap_2_32FC1(__global T const * restrict src, __global T * dst,
|
||||
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
|
||||
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
|
||||
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1);
|
||||
int2 zero = (int2)(0);
|
||||
|
||||
float2 _u = map_data - convert_float2(map_dataA);
|
||||
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32;
|
||||
@ -285,7 +280,6 @@ __kernel void remap_32FC2(__global T const * restrict src, __global T * dst,
|
||||
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
|
||||
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
|
||||
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
|
||||
int2 zero = (int2)(0);
|
||||
|
||||
float2 _u = map_data - convert_float2(map_dataA);
|
||||
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32;
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -182,10 +182,10 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
|
||||
int x = floor(sx), y = floor(sy);
|
||||
float u = sx - x, v = sy - y;
|
||||
|
||||
x<0 ? x=0,u=0 : x,u;
|
||||
x>=src_cols ? x=src_cols-1,u=0 : x,u;
|
||||
y<0 ? y=0,v=0 : y,v;
|
||||
y>=src_rows ? y=src_rows-1,v=0 : y,v;
|
||||
if ( x<0 ) x=0,u=0;
|
||||
if ( x>=src_cols ) x=src_cols-1,u=0;
|
||||
if ( y<0 ) y=0,v=0;
|
||||
if (y>=src_rows ) y=src_rows-1,v=0;
|
||||
|
||||
u = u * INTER_RESIZE_COEF_SCALE;
|
||||
v = v * INTER_RESIZE_COEF_SCALE;
|
||||
@ -225,10 +225,10 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
|
||||
int x = floor(sx), y = floor(sy);
|
||||
float u = sx - x, v = sy - y;
|
||||
|
||||
x<0 ? x=0,u=0 : x,u;
|
||||
x>=src_cols ? x=src_cols-1,u=0 : x,u;
|
||||
y<0 ? y=0,v=0 : y,v;
|
||||
y>=src_rows ? y=src_rows-1,v=0 : y,v;
|
||||
if ( x<0 ) x=0,u=0;
|
||||
if ( x>=src_cols ) x=src_cols-1,u=0;
|
||||
if ( y<0 ) y=0,v=0;
|
||||
if (y>=src_rows ) y=src_rows-1,v=0;
|
||||
|
||||
int y_ = INC(y,src_rows);
|
||||
int x_ = INC(x,src_cols);
|
||||
@ -264,10 +264,10 @@ __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src,
|
||||
int x = floor(sx), y = floor(sy);
|
||||
float u = sx - x, v = sy - y;
|
||||
|
||||
x<0 ? x=0,u=0 : x;
|
||||
x>=src_cols ? x=src_cols-1,u=0 : x;
|
||||
y<0 ? y=0,v=0 : y;
|
||||
y>=src_rows ? y=src_rows-1,v=0 : y;
|
||||
if ( x<0 ) x=0,u=0;
|
||||
if ( x>=src_cols ) x=src_cols-1,u=0;
|
||||
if ( y<0 ) y=0,v=0;
|
||||
if (y>=src_rows ) y=src_rows-1,v=0;
|
||||
|
||||
int y_ = INC(y,src_rows);
|
||||
int x_ = INC(x,src_cols);
|
||||
|
108
modules/ocl/src/opencl/imgproc_sobel3.cl
Normal file
108
modules/ocl/src/opencl/imgproc_sobel3.cl
Normal file
@ -0,0 +1,108 @@
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////Macro for border type////////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Border extrapolation of a single coordinate, one macro set per border mode.
// The set that gets compiled is the one whose BORDER_* symbol is defined.
//
//   ADDR_L(pos, lo, hi)   - low side, horizontal: remap pos when pos <  lo
//   ADDR_H(pos, lo, hi)   - low side, vertical:   remap pos when pos <  lo
//   ADDR_R(pos, hi, keep) - high side, horizontal: remap pos when pos >= hi,
//                           otherwise yield `keep`
//   ADDR_B(pos, hi, keep) - high side, vertical:   remap pos when pos >= hi,
//                           otherwise yield `keep`
//
// Arguments are expanded more than once, so they must be side-effect free.

#ifdef BORDER_REPLICATE
// BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(pos, lo, hi)   ((pos) >= (lo) ? (pos) : (lo))
#define ADDR_R(pos, hi, keep) ((pos) < (hi) ? (keep) : (hi)-1)
#define ADDR_H(pos, lo, hi)   ((pos) >= (lo) ? (pos) : (lo))
#define ADDR_B(pos, hi, keep) ((pos) < (hi) ? (keep) : (hi)-1)
#endif

#ifdef BORDER_REFLECT
// BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(pos, lo, hi)   ((pos) >= (lo) ? (pos) : -(pos)-1)
#define ADDR_R(pos, hi, keep) ((pos) < (hi) ? (keep) : -(pos)-1+((hi)<<1))
#define ADDR_H(pos, lo, hi)   ((pos) >= (lo) ? (pos) : -(pos)-1)
#define ADDR_B(pos, hi, keep) ((pos) < (hi) ? (keep) : -(pos)-1+((hi)<<1))
#endif

#ifdef BORDER_REFLECT101
// BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(pos, lo, hi)   ((pos) >= (lo) ? (pos) : -(pos))
#define ADDR_R(pos, hi, keep) ((pos) < (hi) ? (keep) : -(pos)-2+((hi)<<1))
#define ADDR_H(pos, lo, hi)   ((pos) >= (lo) ? (pos) : -(pos))
#define ADDR_B(pos, hi, keep) ((pos) < (hi) ? (keep) : -(pos)-2+((hi)<<1))
#endif

#ifdef BORDER_WRAP
// BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(pos, lo, hi)   ((pos) >= (lo) ? (pos) : (pos)+(hi))
#define ADDR_R(pos, hi, keep) ((pos) < (hi) ? (keep) : (pos)-(hi))
#define ADDR_H(pos, lo, hi)   ((pos) >= (lo) ? (pos) : (pos)+(hi))
#define ADDR_B(pos, hi, keep) ((pos) < (hi) ? (keep) : (pos)-(hi))
#endif
|
||||
|
||||
/*
 * 3x3 Sobel derivatives of an 8-bit single-channel image.
 *
 * Each work-item handles the pixel at (get_global_id(0), get_global_id(1)).
 * A work-group stages a (BLK_Y+2) x (BLK_X+2) tile - its own block plus a
 * one-pixel halo - into local memory as floats; after a barrier every
 * work-item evaluates both derivatives from its 3x3 neighbourhood.
 *
 *   Src                   source pixels, read as Src[y * srcStride + x]
 *   DstX, DstY            horizontal / vertical derivative, written as
 *                         Dst[y * dstStride + x]
 *   width, height         image extent; used for border extrapolation and for
 *                         the bounds check on the stores
 *   srcStride, dstStride  row pitch of source / destinations, in elements
 *   scale                 factor applied to both derivatives before storing
 *
 * Build-time requirements: BLK_X, BLK_Y and the ADDR_L / ADDR_R / ADDR_H /
 * ADDR_B border macros must be defined.
 * NOTE(review): the halo logic assumes the local work size is exactly
 * BLK_X x BLK_Y - confirm against the host launch code.
 */
__kernel void sobel3(
        __global uchar* Src,
        __global float* DstX,
        __global float* DstY,
        int width, int height,
        uint srcStride, uint dstStride,
        float scale
        )
{
    __local float lsmem[BLK_Y+2][BLK_X+2];

    int lix = get_local_id(0);
    int liy = get_local_id(1);

    int id_x = get_global_id(0);
    int id_y = get_global_id(1);

    // Coordinates used for loading. Work-items that lie past the right/bottom
    // image edge (possible when the global size is rounded up to the block
    // size) are folded back through the border macros, so the values they
    // stage are the correctly extrapolated neighbours of the last in-image
    // pixels. The final min/max keeps every index inside the image even for
    // extents smaller than the reach of the extrapolation formula.
    int x_c = max(min(ADDR_R(id_x,   width,  id_x), width  - 1), 0);
    int y_c = max(min(ADDR_B(id_y,   height, id_y), height - 1), 0);

    int x_l = max(min(ADDR_R(id_x-1, width,  ADDR_L(id_x-1, 0, width)),  width  - 1), 0);
    int x_r = max(min(ADDR_R(id_x+1, width,  id_x+1),                    width  - 1), 0);

    int y_h = max(min(ADDR_B(id_y-1, height, ADDR_H(id_y-1, 0, height)), height - 1), 0);
    int y_b = max(min(ADDR_B(id_y+1, height, id_y+1),                    height - 1), 0);

    // Centre of the tile: every work-item stages its own pixel.
    lsmem[liy+1][lix+1] = convert_float(Src[ y_c * srcStride + x_c ]);

    // Top halo row (and its two corners). The edge tests are independent
    // `if`s rather than `else if`s so that a block dimension of 1, where one
    // work-item is both the first and the last of its row/column, still fills
    // both sides of the halo.
    if (liy == 0)
    {
        lsmem[0][lix+1] = convert_float(Src[ y_h * srcStride + x_c ]);

        if (lix == 0)
            lsmem[0][0] = convert_float(Src[ y_h * srcStride + x_l ]);
        if (lix == BLK_X-1)
            lsmem[0][BLK_X+1] = convert_float(Src[ y_h * srcStride + x_r ]);
    }

    // Bottom halo row (and its two corners).
    if (liy == BLK_Y-1)
    {
        lsmem[BLK_Y+1][lix+1] = convert_float(Src[ y_b * srcStride + x_c ]);

        if (lix == 0)
            lsmem[BLK_Y+1][0] = convert_float(Src[ y_b * srcStride + x_l ]);
        if (lix == BLK_X-1)
            lsmem[BLK_Y+1][BLK_X+1] = convert_float(Src[ y_b * srcStride + x_r ]);
    }

    // Left and right halo columns.
    if (lix == 0)
        lsmem[liy+1][0] = convert_float(Src[ y_c * srcStride + x_l ]);
    if (lix == BLK_X-1)
        lsmem[liy+1][BLK_X+1] = convert_float(Src[ y_c * srcStride + x_r ]);

    // Every work-item of the group must reach this barrier, which is why the
    // bounds check is applied only to the stores below and not as an early
    // return above.
    barrier(CLK_LOCAL_MEM_FENCE);

    // 3x3 neighbourhood: u = row above, m = own row, b = row below;
    // 1/2/3 = left / centre / right column.
    float u1 = lsmem[liy][lix];
    float u2 = lsmem[liy][lix+1];
    float u3 = lsmem[liy][lix+2];

    float m1 = lsmem[liy+1][lix];
    float m3 = lsmem[liy+1][lix+2];

    float b1 = lsmem[liy+2][lix];
    float b2 = lsmem[liy+2][lix+1];
    float b3 = lsmem[liy+2][lix+2];

    // Sobel kernels [-1 0 1; -2 0 2; -1 0 1] and its transpose.
    float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 );
    float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3);

    // Only in-image work-items own an output element.
    if (id_x < width && id_y < height)
    {
        DstX[ id_y * dstStride + id_x ] = dx * scale;
        DstY[ id_y * dstStride + id_x ] = dy * scale;
    }
}
|
@ -44,109 +44,93 @@
|
||||
//M*/
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
#ifdef cl_amd_fp64
|
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable
|
||||
#elif defined (cl_khr_fp64)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// threshold type:
|
||||
// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3,
|
||||
// THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 };
|
||||
#ifdef VECTORIZED
|
||||
|
||||
__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst,
|
||||
int src_offset, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step,
|
||||
uchar thresh, uchar max_val, int thresh_type
|
||||
)
|
||||
__kernel void threshold(__global const T * restrict src, int src_offset, int src_step,
|
||||
__global T * dst, int dst_offset, int dst_step,
|
||||
T thresh, T max_val, int max_index, int rows, int cols)
|
||||
{
|
||||
int gx = get_global_id(0);
|
||||
const int gy = get_global_id(1);
|
||||
int gy = get_global_id(1);
|
||||
|
||||
int offset = (dst_offset & 15);
|
||||
src_offset -= offset;
|
||||
|
||||
int dstart = (gx << 4) - offset;
|
||||
if(dstart < dst_cols && gy < dst_rows)
|
||||
if (gx < cols && gy < rows)
|
||||
{
|
||||
uchar16 sdata = vload16(gx, src+src_offset+gy*src_step);
|
||||
uchar16 ddata;
|
||||
uchar16 zero = 0;
|
||||
switch (thresh_type)
|
||||
gx *= VECSIZE;
|
||||
int src_index = mad24(gy, src_step, src_offset + gx);
|
||||
int dst_index = mad24(gy, dst_step, dst_offset + gx);
|
||||
|
||||
#ifdef SRC_ALIGNED
|
||||
VT sdata = *((__global VT *)(src + src_index));
|
||||
#else
|
||||
VT sdata = VLOADN(0, src + src_index);
|
||||
#endif
|
||||
VT vthresh = (VT)(thresh);
|
||||
|
||||
#ifdef THRESH_BINARY
|
||||
VT vecValue = sdata > vthresh ? max_val : (VT)(0);
|
||||
#elif defined THRESH_BINARY_INV
|
||||
VT vecValue = sdata > vthresh ? (VT)(0) : max_val;
|
||||
#elif defined THRESH_TRUNC
|
||||
VT vecValue = sdata > vthresh ? thresh : sdata;
|
||||
#elif defined THRESH_TOZERO
|
||||
VT vecValue = sdata > vthresh ? sdata : (VT)(0);
|
||||
#elif defined THRESH_TOZERO_INV
|
||||
VT vecValue = sdata > vthresh ? (VT)(0) : sdata;
|
||||
#endif
|
||||
|
||||
if (gx + VECSIZE <= max_index)
|
||||
#ifdef DST_ALIGNED
|
||||
*(__global VT*)(dst + dst_index) = vecValue;
|
||||
#else
|
||||
VSTOREN(vecValue, 0, dst + dst_index);
|
||||
#endif
|
||||
else
|
||||
{
|
||||
case 0:
|
||||
ddata = ((sdata > thresh) ) ? (uchar16)(max_val) : (uchar16)(0);
|
||||
break;
|
||||
case 1:
|
||||
ddata = ((sdata > thresh)) ? zero : (uchar16)(max_val);
|
||||
break;
|
||||
case 2:
|
||||
ddata = ((sdata > thresh)) ? (uchar16)(thresh) : sdata;
|
||||
break;
|
||||
case 3:
|
||||
ddata = ((sdata > thresh)) ? sdata : zero;
|
||||
break;
|
||||
case 4:
|
||||
ddata = ((sdata > thresh)) ? zero : sdata;
|
||||
break;
|
||||
default:
|
||||
ddata = sdata;
|
||||
}
|
||||
int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8,
|
||||
dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15);
|
||||
uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart);
|
||||
int16 con = dpos >= 0 && dpos < dst_cols;
|
||||
ddata = convert_uchar16(con != 0) ? ddata : dVal;
|
||||
if(dstart < dst_cols)
|
||||
{
|
||||
*(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
|
||||
T array[VECSIZE];
|
||||
VSTOREN(vecValue, 0, array);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VECSIZE; ++i)
|
||||
if (gx + i < max_index)
|
||||
dst[dst_index + i] = array[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst,
|
||||
int src_offset, int src_step,
|
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step,
|
||||
float thresh, float max_val, int thresh_type
|
||||
)
|
||||
__kernel void threshold(__global const T * restrict src, int src_offset, int src_step,
|
||||
__global T * dst, int dst_offset, int dst_step,
|
||||
T thresh, T max_val, int rows, int cols)
|
||||
{
|
||||
const int gx = get_global_id(0);
|
||||
const int gy = get_global_id(1);
|
||||
int gx = get_global_id(0);
|
||||
int gy = get_global_id(1);
|
||||
|
||||
int offset = (dst_offset & 3);
|
||||
src_offset -= offset;
|
||||
|
||||
int dstart = (gx << 2) - offset;
|
||||
if(dstart < dst_cols && gy < dst_rows)
|
||||
if (gx < cols && gy < rows)
|
||||
{
|
||||
float4 sdata = vload4(gx, src+src_offset+gy*src_step);
|
||||
float4 ddata;
|
||||
float4 zero = 0;
|
||||
switch (thresh_type)
|
||||
{
|
||||
case 0:
|
||||
ddata = sdata > thresh ? (float4)(max_val) : (float4)(0.f);
|
||||
break;
|
||||
case 1:
|
||||
ddata = sdata > thresh ? zero : (float4)max_val;
|
||||
break;
|
||||
case 2:
|
||||
ddata = sdata > thresh ? (float4)thresh : sdata;
|
||||
break;
|
||||
case 3:
|
||||
ddata = sdata > thresh ? sdata : (float4)(0.f);
|
||||
break;
|
||||
case 4:
|
||||
ddata = sdata > thresh ? (float4)(0.f) : sdata;
|
||||
break;
|
||||
default:
|
||||
ddata = sdata;
|
||||
}
|
||||
int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
|
||||
float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
|
||||
int4 con = dpos >= 0 && dpos < dst_cols;
|
||||
ddata = convert_float4(con) != (float4)(0) ? ddata : dVal;
|
||||
if(dstart < dst_cols)
|
||||
{
|
||||
*(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
|
||||
}
|
||||
int src_index = mad24(gy, src_step, src_offset + gx);
|
||||
int dst_index = mad24(gy, dst_step, dst_offset + gx);
|
||||
|
||||
T sdata = src[src_index];
|
||||
|
||||
#ifdef THRESH_BINARY
|
||||
dst[dst_index] = sdata > thresh ? max_val : (T)(0);
|
||||
#elif defined THRESH_BINARY_INV
|
||||
dst[dst_index] = sdata > thresh ? (T)(0) : max_val;
|
||||
#elif defined THRESH_TRUNC
|
||||
dst[dst_index] = sdata > thresh ? thresh : sdata;
|
||||
#elif defined THRESH_TOZERO
|
||||
dst[dst_index] = sdata > thresh ? sdata : (T)(0);
|
||||
#elif defined THRESH_TOZERO_INV
|
||||
dst[dst_index] = sdata > thresh ? (T)(0) : sdata;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -537,9 +537,9 @@ __kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst,
|
||||
|
||||
float tab[4];
|
||||
float taby[2], tabx[2];
|
||||
taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0;
|
||||
taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0;
|
||||
taby[1] = 1.f/INTER_TAB_SIZE*ay0;
|
||||
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
|
||||
tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0;
|
||||
tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
|
||||
|
||||
tab[0] = taby[0] * tabx[0];
|
||||
@ -680,9 +680,9 @@ __kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * ds
|
||||
|
||||
float tab[4];
|
||||
float taby[2], tabx[2];
|
||||
taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0;
|
||||
taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0;
|
||||
taby[1] = 1.f/INTER_TAB_SIZE*ay0;
|
||||
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
|
||||
tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0;
|
||||
tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
|
||||
|
||||
tab[0] = taby[0] * tabx[0];
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -100,8 +100,8 @@ __kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __glo
|
||||
F4 Y0 = M[3]*DX + M[4]*dy + M[5];
|
||||
F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0;
|
||||
W = (W!=zero) ? one/W : zero;
|
||||
short4 X = convert_short4(rint(X0*W));
|
||||
short4 Y = convert_short4(rint(Y0*W));
|
||||
short4 X = convert_short4_sat_rte(X0*W);
|
||||
short4 Y = convert_short4_sat_rte(Y0*W);
|
||||
int4 sx = convert_int4(X);
|
||||
int4 sy = convert_int4(Y);
|
||||
|
||||
@ -133,12 +133,12 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2];
|
||||
F Y0 = M[3]*dx + M[4]*dy + M[5];
|
||||
F W = M[6]*dx + M[7]*dy + M[8];
|
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
|
||||
W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
|
||||
int X = rint(X0*W);
|
||||
int Y = rint(Y0*W);
|
||||
|
||||
int sx = (short)(X >> INTER_BITS);
|
||||
int sy = (short)(Y >> INTER_BITS);
|
||||
int sx = convert_short_sat(X >> INTER_BITS);
|
||||
int sy = convert_short_sat(Y >> INTER_BITS);
|
||||
int ay = (short)(Y & (INTER_TAB_SIZE-1));
|
||||
int ax = (short)(X & (INTER_TAB_SIZE-1));
|
||||
|
||||
@ -150,16 +150,16 @@ __kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, _
|
||||
|
||||
short itab[4];
|
||||
float tab1y[2], tab1x[2];
|
||||
tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay;
|
||||
tab1y[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay;
|
||||
tab1y[1] = 1.f/INTER_TAB_SIZE*ay;
|
||||
tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax;
|
||||
tab1x[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax;
|
||||
tab1x[1] = 1.f/INTER_TAB_SIZE*ax;
|
||||
|
||||
#pragma unroll 4
|
||||
for(i=0; i<4; i++)
|
||||
{
|
||||
float v = tab1y[(i>>1)] * tab1x[(i&1)];
|
||||
itab[i] = convert_short_sat(rint( v * INTER_REMAP_COEF_SCALE ));
|
||||
itab[i] = convert_short_sat_rte( v * INTER_REMAP_COEF_SCALE );
|
||||
}
|
||||
if(dx >=0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
|
||||
{
|
||||
@ -185,12 +185,12 @@ __kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar *
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2];
|
||||
F Y0 = M[3]*dx + M[4]*dy + M[5];
|
||||
F W = M[6]*dx + M[7]*dy + M[8];
|
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
|
||||
W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
|
||||
int X = rint(X0*W);
|
||||
int Y = rint(Y0*W);
|
||||
|
||||
short sx = (short)(X >> INTER_BITS) - 1;
|
||||
short sy = (short)(Y >> INTER_BITS) - 1;
|
||||
short sx = convert_short_sat(X >> INTER_BITS) - 1;
|
||||
short sy = convert_short_sat(Y >> INTER_BITS) - 1;
|
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1));
|
||||
short ax = (short)(X & (INTER_TAB_SIZE-1));
|
||||
|
||||
@ -265,11 +265,9 @@ __kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __gl
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2];
|
||||
F Y0 = M[3]*dx + M[4]*dy + M[5];
|
||||
F W = M[6]*dx + M[7]*dy + M[8];
|
||||
W = (W != 0.0) ? 1./W : 0.0;
|
||||
int X = rint(X0*W);
|
||||
int Y = rint(Y0*W);
|
||||
short sx = (short)X;
|
||||
short sy = (short)Y;
|
||||
W = (W != 0.0f) ? 1.f/W : 0.0f;
|
||||
short sx = convert_short_sat_rte(X0*W);
|
||||
short sy = convert_short_sat_rte(Y0*W);
|
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
|
||||
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*(srcStep>>2)+sx] : (uchar4)0;
|
||||
@ -291,12 +289,12 @@ __kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src,
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2];
|
||||
F Y0 = M[3]*dx + M[4]*dy + M[5];
|
||||
F W = M[6]*dx + M[7]*dy + M[8];
|
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
|
||||
W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
|
||||
int X = rint(X0*W);
|
||||
int Y = rint(Y0*W);
|
||||
|
||||
short sx = (short)(X >> INTER_BITS);
|
||||
short sy = (short)(Y >> INTER_BITS);
|
||||
short sx = convert_short_sat(X >> INTER_BITS);
|
||||
short sy = convert_short_sat(Y >> INTER_BITS);
|
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1));
|
||||
short ax = (short)(X & (INTER_TAB_SIZE-1));
|
||||
|
||||
@ -343,12 +341,12 @@ __kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, _
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2];
|
||||
F Y0 = M[3]*dx + M[4]*dy + M[5];
|
||||
F W = M[6]*dx + M[7]*dy + M[8];
|
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
|
||||
W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
|
||||
int X = rint(X0*W);
|
||||
int Y = rint(Y0*W);
|
||||
|
||||
short sx = (short)(X >> INTER_BITS) - 1;
|
||||
short sy = (short)(Y >> INTER_BITS) - 1;
|
||||
short sx = convert_short_sat(X >> INTER_BITS) - 1;
|
||||
short sy = convert_short_sat(Y >> INTER_BITS) - 1;
|
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1));
|
||||
short ax = (short)(X & (INTER_TAB_SIZE-1));
|
||||
|
||||
@ -426,11 +424,9 @@ __kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2];
|
||||
F Y0 = M[3]*dx + M[4]*dy + M[5];
|
||||
F W = M[6]*dx + M[7]*dy + M[8];
|
||||
W = (W != 0.0) ? 1./W : 0.0;
|
||||
int X = rint(X0*W);
|
||||
int Y = rint(Y0*W);
|
||||
short sx = (short)X;
|
||||
short sy = (short)Y;
|
||||
W = (W != 0.0f) ? 1.f/W : 0.0f;
|
||||
short sx = convert_short_sat_rte(X0*W);
|
||||
short sy = convert_short_sat_rte(Y0*W);
|
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
|
||||
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0;
|
||||
@ -451,12 +447,12 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2];
|
||||
F Y0 = M[3]*dx + M[4]*dy + M[5];
|
||||
F W = M[6]*dx + M[7]*dy + M[8];
|
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
|
||||
W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
|
||||
int X = rint(X0*W);
|
||||
int Y = rint(Y0*W);
|
||||
|
||||
short sx = (short)(X >> INTER_BITS);
|
||||
short sy = (short)(Y >> INTER_BITS);
|
||||
short sx = convert_short_sat(X >> INTER_BITS);
|
||||
short sy = convert_short_sat(Y >> INTER_BITS);
|
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1));
|
||||
short ax = (short)(X & (INTER_TAB_SIZE-1));
|
||||
|
||||
@ -469,9 +465,9 @@ __kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float *
|
||||
|
||||
float tab[4];
|
||||
float taby[2], tabx[2];
|
||||
taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay;
|
||||
taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay;
|
||||
taby[1] = 1.f/INTER_TAB_SIZE*ay;
|
||||
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax;
|
||||
tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax;
|
||||
tabx[1] = 1.f/INTER_TAB_SIZE*ax;
|
||||
|
||||
tab[0] = taby[0] * tabx[0];
|
||||
@ -501,12 +497,12 @@ __kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float *
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2];
|
||||
F Y0 = M[3]*dx + M[4]*dy + M[5];
|
||||
F W = M[6]*dx + M[7]*dy + M[8];
|
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
|
||||
W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
|
||||
int X = rint(X0*W);
|
||||
int Y = rint(Y0*W);
|
||||
|
||||
short sx = (short)(X >> INTER_BITS) - 1;
|
||||
short sy = (short)(Y >> INTER_BITS) - 1;
|
||||
short sx = convert_short_sat(X >> INTER_BITS) - 1;
|
||||
short sy = convert_short_sat(Y >> INTER_BITS) - 1;
|
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1));
|
||||
short ax = (short)(X & (INTER_TAB_SIZE-1));
|
||||
|
||||
@ -561,11 +557,9 @@ __kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * d
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2];
|
||||
F Y0 = M[3]*dx + M[4]*dy + M[5];
|
||||
F W = M[6]*dx + M[7]*dy + M[8];
|
||||
W =(W != 0.0)? 1./W : 0.0;
|
||||
int X = rint(X0*W);
|
||||
int Y = rint(Y0*W);
|
||||
short sx = (short)X;
|
||||
short sy = (short)Y;
|
||||
W =(W != 0.0f)? 1.f/W : 0.0f;
|
||||
short sx = convert_short_sat_rte(X0*W);
|
||||
short sy = convert_short_sat_rte(Y0*W);
|
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
|
||||
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : (float)0;
|
||||
@ -589,12 +583,12 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2];
|
||||
F Y0 = M[3]*dx + M[4]*dy + M[5];
|
||||
F W = M[6]*dx + M[7]*dy + M[8];
|
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
|
||||
W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
|
||||
int X = rint(X0*W);
|
||||
int Y = rint(Y0*W);
|
||||
|
||||
short sx0 = (short)(X >> INTER_BITS);
|
||||
short sy0 = (short)(Y >> INTER_BITS);
|
||||
short sx0 = convert_short_sat(X >> INTER_BITS);
|
||||
short sy0 = convert_short_sat(Y >> INTER_BITS);
|
||||
short ay0 = (short)(Y & (INTER_TAB_SIZE-1));
|
||||
short ax0 = (short)(X & (INTER_TAB_SIZE-1));
|
||||
|
||||
@ -608,9 +602,9 @@ __kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4
|
||||
|
||||
float tab[4];
|
||||
float taby[2], tabx[2];
|
||||
taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0;
|
||||
taby[0] = 1.0f - 1.f/INTER_TAB_SIZE*ay0;
|
||||
taby[1] = 1.f/INTER_TAB_SIZE*ay0;
|
||||
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
|
||||
tabx[0] = 1.0f - 1.f/INTER_TAB_SIZE*ax0;
|
||||
tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
|
||||
|
||||
tab[0] = taby[0] * tabx[0];
|
||||
@ -642,12 +636,12 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2];
|
||||
F Y0 = M[3]*dx + M[4]*dy + M[5];
|
||||
F W = M[6]*dx + M[7]*dy + M[8];
|
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0;
|
||||
W = (W != 0.0f) ? INTER_TAB_SIZE/W : 0.0f;
|
||||
int X = rint(X0*W);
|
||||
int Y = rint(Y0*W);
|
||||
|
||||
short sx = (short)(X >> INTER_BITS)-1;
|
||||
short sy = (short)(Y >> INTER_BITS)-1;
|
||||
short sx = convert_short_sat(X >> INTER_BITS)-1;
|
||||
short sy = convert_short_sat(Y >> INTER_BITS)-1;
|
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1));
|
||||
short ax = (short)(X & (INTER_TAB_SIZE-1));
|
||||
|
||||
|
@ -192,7 +192,6 @@ __kernel
|
||||
{
|
||||
const int i = get_local_id(0); // index in workgroup
|
||||
const int numOfGroups = get_num_groups(0); // index in workgroup
|
||||
const int groupID = get_group_id(0);
|
||||
const int wg = get_local_size(0); // workgroup size = block size
|
||||
int pos = 0, same = 0;
|
||||
const int offset = get_group_id(0) * wg;
|
||||
|
@ -63,7 +63,7 @@
|
||||
|
||||
///////////// parallel merge sort ///////////////
|
||||
// ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl
|
||||
uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal)
|
||||
static uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal)
|
||||
{
|
||||
// The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
|
||||
uint firstIndex = left;
|
||||
@ -94,7 +94,7 @@ uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal)
|
||||
// by a base pointer and left and right index for a particular candidate value. The comparison operator is
|
||||
// passed as a functor parameter my_comp
|
||||
// This function returns an index that is the first index whos value would be equal to the searched value
|
||||
uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
|
||||
static uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
|
||||
{
|
||||
// The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
|
||||
uint firstIndex = left;
|
||||
@ -130,7 +130,7 @@ uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
|
||||
// passed as a functor parameter my_comp
|
||||
// This function returns an index that is the first index whos value would be greater than the searched value
|
||||
// If the search value is not found in the sequence, upperbound returns the same result as lowerbound
|
||||
uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
|
||||
static uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
|
||||
{
|
||||
uint upperBound = lowerBoundBinary( data, left, right, searchVal );
|
||||
|
||||
@ -167,9 +167,6 @@ kernel void merge(
|
||||
)
|
||||
{
|
||||
size_t globalID = get_global_id( 0 );
|
||||
size_t groupID = get_group_id( 0 );
|
||||
size_t localID = get_local_id( 0 );
|
||||
size_t wgSize = get_local_size( 0 );
|
||||
|
||||
// Abort threads that are passed the end of the input vector
|
||||
if( globalID >= srcVecSize )
|
||||
@ -230,12 +227,12 @@ kernel void blockInsertionSort(
|
||||
local V_T* val_lds
|
||||
)
|
||||
{
|
||||
size_t gloId = get_global_id( 0 );
|
||||
size_t groId = get_group_id( 0 );
|
||||
size_t locId = get_local_id( 0 );
|
||||
size_t wgSize = get_local_size( 0 );
|
||||
int gloId = get_global_id( 0 );
|
||||
int groId = get_group_id( 0 );
|
||||
int locId = get_local_id( 0 );
|
||||
int wgSize = get_local_size( 0 );
|
||||
|
||||
bool in_range = gloId < vecSize;
|
||||
bool in_range = gloId < (int)vecSize;
|
||||
K_T key;
|
||||
V_T val;
|
||||
// Abort threads that are passed the end of the input vector
|
||||
@ -254,7 +251,7 @@ kernel void blockInsertionSort(
|
||||
{
|
||||
// The last workgroup may have an irregular size, so we calculate a per-block endIndex
|
||||
// endIndex is essentially emulating a mod operator with subtraction and multiply
|
||||
size_t endIndex = vecSize - ( groId * wgSize );
|
||||
int endIndex = vecSize - ( groId * wgSize );
|
||||
endIndex = min( endIndex, wgSize );
|
||||
|
||||
// printf( "Debug: endIndex[%i]=%i\n", groId, endIndex );
|
||||
|
@ -16,6 +16,7 @@
|
||||
//
|
||||
// @Authors
|
||||
// Xiaopeng Fu, fuxiaopeng2222@163.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -25,7 +26,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -43,42 +44,81 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
__kernel void distanceToCenters(
|
||||
int label_step, int K,
|
||||
__global float *src,
|
||||
__global int *labels, int dims, int rows,
|
||||
__global float *centers,
|
||||
__global float *dists)
|
||||
#ifdef L1_DIST
|
||||
# define DISTANCE(A, B) fabs((A) - (B))
|
||||
#elif defined L2SQR_DIST
|
||||
# define DISTANCE(A, B) ((A) - (B)) * ((A) - (B))
|
||||
#else
|
||||
# define DISTANCE(A, B) ((A) - (B)) * ((A) - (B))
|
||||
#endif
|
||||
|
||||
inline float dist(__global const float * center, __global const float * src, int feature_cols)
|
||||
{
|
||||
int gid = get_global_id(1);
|
||||
|
||||
float dist, euDist, min;
|
||||
int minCentroid;
|
||||
|
||||
if(gid >= rows)
|
||||
return;
|
||||
|
||||
for(int i = 0 ; i < K; i++)
|
||||
float res = 0;
|
||||
float4 tmp4;
|
||||
int i;
|
||||
for(i = 0; i < feature_cols / 4; i += 4, center += 4, src += 4)
|
||||
{
|
||||
euDist = 0;
|
||||
for(int j = 0; j < dims; j++)
|
||||
{
|
||||
dist = (src[j + gid * dims]
|
||||
- centers[j + i * dims]);
|
||||
euDist += dist * dist;
|
||||
}
|
||||
tmp4 = vload4(0, center) - vload4(0, src);
|
||||
#ifdef L1_DIST
|
||||
tmp4 = fabs(tmp4);
|
||||
#else
|
||||
tmp4 *= tmp4;
|
||||
#endif
|
||||
res += tmp4.x + tmp4.y + tmp4.z + tmp4.w;
|
||||
}
|
||||
|
||||
if(i == 0)
|
||||
for(; i < feature_cols; ++i, ++center, ++src)
|
||||
{
|
||||
res += DISTANCE(*src, *center);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
// to be distinguished with distanceToCenters in kmeans_kernel.cl
|
||||
__kernel void distanceToCenters(
|
||||
__global const float *src,
|
||||
__global const float *centers,
|
||||
#ifdef USE_INDEX
|
||||
__global const int *indices,
|
||||
#endif
|
||||
__global int *labels,
|
||||
__global float *dists,
|
||||
int feature_cols,
|
||||
int src_step,
|
||||
int centers_step,
|
||||
int label_step,
|
||||
int input_size,
|
||||
int K,
|
||||
int offset_src,
|
||||
int offset_centers
|
||||
)
|
||||
{
|
||||
int gid = get_global_id(0);
|
||||
float euDist, minval;
|
||||
int minCentroid;
|
||||
if(gid >= input_size)
|
||||
{
|
||||
return;
|
||||
}
|
||||
src += offset_src;
|
||||
centers += offset_centers;
|
||||
#ifdef USE_INDEX
|
||||
src += indices[gid] * src_step;
|
||||
#else
|
||||
src += gid * src_step;
|
||||
#endif
|
||||
minval = dist(centers, src, feature_cols);
|
||||
minCentroid = 0;
|
||||
for(int i = 1 ; i < K; i++)
|
||||
{
|
||||
euDist = dist(centers + i * centers_step, src, feature_cols);
|
||||
if(euDist < minval)
|
||||
{
|
||||
min = euDist;
|
||||
minCentroid = 0;
|
||||
}
|
||||
else if(euDist < min)
|
||||
{
|
||||
min = euDist;
|
||||
minval = euDist;
|
||||
minCentroid = i;
|
||||
}
|
||||
}
|
||||
dists[gid] = min;
|
||||
labels[label_step * gid] = minCentroid;
|
||||
labels[gid * label_step] = minCentroid;
|
||||
dists[gid] = minval;
|
||||
}
|
||||
|
@ -129,58 +129,53 @@ __kernel void knn_find_nearest(__global float* sample, int sample_row, int sampl
|
||||
}
|
||||
/*! find_nearest_neighbor done!*/
|
||||
/*! write_results start!*/
|
||||
switch (regression)
|
||||
if (regression)
|
||||
{
|
||||
case true:
|
||||
{
|
||||
TYPE s;
|
||||
TYPE s;
|
||||
#ifdef DOUBLE_SUPPORT
|
||||
s = 0.0;
|
||||
s = 0.0;
|
||||
#else
|
||||
s = 0.0f;
|
||||
s = 0.0f;
|
||||
#endif
|
||||
for(j = 0; j < K1; j++)
|
||||
s += nr[j * nThreads + threadY];
|
||||
for(j = 0; j < K1; j++)
|
||||
s += nr[j * nThreads + threadY];
|
||||
|
||||
_results[y * _results_step] = (float)(s * inv_scale);
|
||||
}
|
||||
break;
|
||||
case false:
|
||||
_results[y * _results_step] = (float)(s * inv_scale);
|
||||
}
|
||||
else
|
||||
{
|
||||
int prev_start = 0, best_count = 0, cur_count;
|
||||
float best_val;
|
||||
|
||||
for(j = K1 - 1; j > 0; j--)
|
||||
{
|
||||
int prev_start = 0, best_count = 0, cur_count;
|
||||
float best_val;
|
||||
|
||||
for(j = K1 - 1; j > 0; j--)
|
||||
bool swap_f1 = false;
|
||||
for(j1 = 0; j1 < j; j1++)
|
||||
{
|
||||
bool swap_f1 = false;
|
||||
for(j1 = 0; j1 < j; j1++)
|
||||
if(nr[j1 * nThreads + threadY] > nr[(j1 + 1) * nThreads + threadY])
|
||||
{
|
||||
if(nr[j1 * nThreads + threadY] > nr[(j1 + 1) * nThreads + threadY])
|
||||
{
|
||||
int t;
|
||||
CV_SWAP(nr[j1 * nThreads + threadY], nr[(j1 + 1) * nThreads + threadY], t);
|
||||
swap_f1 = true;
|
||||
}
|
||||
int t;
|
||||
CV_SWAP(nr[j1 * nThreads + threadY], nr[(j1 + 1) * nThreads + threadY], t);
|
||||
swap_f1 = true;
|
||||
}
|
||||
if(!swap_f1)
|
||||
break;
|
||||
}
|
||||
|
||||
best_val = 0;
|
||||
for(j = 1; j <= K1; j++)
|
||||
if(j == K1 || nr[j * nThreads + threadY] != nr[(j - 1) * nThreads + threadY])
|
||||
{
|
||||
cur_count = j - prev_start;
|
||||
if(best_count < cur_count)
|
||||
{
|
||||
best_count = cur_count;
|
||||
best_val = nr[(j - 1) * nThreads + threadY];
|
||||
}
|
||||
prev_start = j;
|
||||
}
|
||||
_results[y * _results_step] = best_val;
|
||||
if(!swap_f1)
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
best_val = 0;
|
||||
for(j = 1; j <= K1; j++)
|
||||
if(j == K1 || nr[j * nThreads + threadY] != nr[(j - 1) * nThreads + threadY])
|
||||
{
|
||||
cur_count = j - prev_start;
|
||||
if(best_count < cur_count)
|
||||
{
|
||||
best_count = cur_count;
|
||||
best_val = nr[(j - 1) * nThreads + threadY];
|
||||
}
|
||||
prev_start = j;
|
||||
}
|
||||
_results[y * _results_step] = best_val;
|
||||
}
|
||||
///*! write_results done!*/
|
||||
}
|
||||
|
@ -43,8 +43,6 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
|
||||
#if defined (DOUBLE_SUPPORT)
|
||||
|
||||
#ifdef cl_khr_fp64
|
||||
@ -70,7 +68,7 @@
|
||||
#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox)
|
||||
// normAcc* are accurate normalization routines which make GPU matchTemplate
|
||||
// consistent with CPU one
|
||||
float normAcc(float num, float denum)
|
||||
inline float normAcc(float num, float denum)
|
||||
{
|
||||
if(fabs(num) < denum)
|
||||
{
|
||||
@ -83,7 +81,7 @@ float normAcc(float num, float denum)
|
||||
return 0;
|
||||
}
|
||||
|
||||
float normAcc_SQDIFF(float num, float denum)
|
||||
inline float normAcc_SQDIFF(float num, float denum)
|
||||
{
|
||||
if(fabs(num) < denum)
|
||||
{
|
||||
|
@ -28,7 +28,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -46,7 +46,7 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
|
||||
static short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
|
||||
__global uchar4* in, int in_step, int dst_off, int src_off,
|
||||
int cols, int rows, int sp, int sr, int maxIter, float eps)
|
||||
{
|
||||
@ -56,7 +56,6 @@ short2 do_mean_shift(int x0, int y0, __global uchar4* out,int out_step,
|
||||
src_off = src_off >> 2;
|
||||
dst_off = dst_off >> 2;
|
||||
int idx = src_off + y0 * in_step + x0;
|
||||
// uchar4 c = vload4(0, (__global uchar*)in+idx);
|
||||
uchar4 c = in[idx];
|
||||
int base = dst_off + get_global_id(1)*out_step + get_global_id(0) ;
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -200,7 +200,7 @@ __kernel void normalize_hists_36_kernel(__global float* block_hists,
|
||||
//-------------------------------------------------------------
|
||||
// Normalization of histograms via L2Hys_norm
|
||||
//
|
||||
float reduce_smem(volatile __local float* smem, int size)
|
||||
static float reduce_smem(volatile __local float* smem, int size)
|
||||
{
|
||||
unsigned int tid = get_local_id(0);
|
||||
float sum = smem[tid];
|
||||
@ -564,7 +564,6 @@ __kernel void compute_gradients_8UC4_kernel(
|
||||
const int x = get_global_id(0);
|
||||
const int tid = get_local_id(0);
|
||||
const int gSizeX = get_local_size(0);
|
||||
const int gidX = get_group_id(0);
|
||||
const int gidY = get_group_id(1);
|
||||
|
||||
__global const uchar4* row = img + gidY * img_step;
|
||||
@ -667,7 +666,6 @@ __kernel void compute_gradients_8UC1_kernel(
|
||||
const int x = get_global_id(0);
|
||||
const int tid = get_local_id(0);
|
||||
const int gSizeX = get_local_size(0);
|
||||
const int gidX = get_group_id(0);
|
||||
const int gidY = get_group_id(1);
|
||||
|
||||
__global const uchar* row = img + gidY * img_step;
|
||||
|
@ -16,7 +16,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
|
@ -16,7 +16,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
|
@ -16,7 +16,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
|
@ -44,10 +44,10 @@
|
||||
//M*/
|
||||
|
||||
|
||||
#define tx get_local_id(0)
|
||||
#define tx (int)get_local_id(0)
|
||||
#define ty get_local_id(1)
|
||||
#define bx get_group_id(0)
|
||||
#define bdx get_local_size(0)
|
||||
#define bdx (int)get_local_size(0)
|
||||
|
||||
#define BORDER_SIZE 5
|
||||
#define MAX_KSIZE_HALF 100
|
||||
|
@ -43,32 +43,32 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
int idx_row_low(int y, int last_row)
|
||||
inline int idx_row_low(int y, int last_row)
|
||||
{
|
||||
return abs(y) % (last_row + 1);
|
||||
}
|
||||
|
||||
int idx_row_high(int y, int last_row)
|
||||
inline int idx_row_high(int y, int last_row)
|
||||
{
|
||||
return abs(last_row - (int)abs(last_row - y)) % (last_row + 1);
|
||||
}
|
||||
|
||||
int idx_row(int y, int last_row)
|
||||
inline int idx_row(int y, int last_row)
|
||||
{
|
||||
return idx_row_low(idx_row_high(y, last_row), last_row);
|
||||
}
|
||||
|
||||
int idx_col_low(int x, int last_col)
|
||||
inline int idx_col_low(int x, int last_col)
|
||||
{
|
||||
return abs(x) % (last_col + 1);
|
||||
}
|
||||
|
||||
int idx_col_high(int x, int last_col)
|
||||
inline int idx_col_high(int x, int last_col)
|
||||
{
|
||||
return abs(last_col - (int)abs(last_col - x)) % (last_col + 1);
|
||||
}
|
||||
|
||||
int idx_col(int x, int last_col)
|
||||
inline int idx_col(int x, int last_col)
|
||||
{
|
||||
return idx_col_low(idx_col_high(x, last_col), last_col);
|
||||
}
|
||||
|
@ -53,7 +53,8 @@
|
||||
#define WAVE_SIZE 1
|
||||
#endif
|
||||
#ifdef CPU
|
||||
void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
|
||||
|
||||
static void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
|
||||
{
|
||||
smem1[tid] = val1;
|
||||
smem2[tid] = val2;
|
||||
@ -72,7 +73,7 @@ void reduce3(float val1, float val2, float val3, __local float* smem1, __local
|
||||
}
|
||||
}
|
||||
|
||||
void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
|
||||
static void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)
|
||||
{
|
||||
smem1[tid] = val1;
|
||||
smem2[tid] = val2;
|
||||
@ -89,7 +90,7 @@ void reduce2(float val1, float val2, volatile __local float* smem1, volatile __l
|
||||
}
|
||||
}
|
||||
|
||||
void reduce1(float val1, volatile __local float* smem1, int tid)
|
||||
static void reduce1(float val1, volatile __local float* smem1, int tid)
|
||||
{
|
||||
smem1[tid] = val1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -104,7 +105,7 @@ void reduce1(float val1, volatile __local float* smem1, int tid)
|
||||
}
|
||||
}
|
||||
#else
|
||||
void reduce3(float val1, float val2, float val3,
|
||||
static void reduce3(float val1, float val2, float val3,
|
||||
__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)
|
||||
{
|
||||
smem1[tid] = val1;
|
||||
@ -151,7 +152,7 @@ void reduce3(float val1, float val2, float val3,
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
|
||||
static void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)
|
||||
{
|
||||
smem1[tid] = val1;
|
||||
smem2[tid] = val2;
|
||||
@ -190,7 +191,7 @@ void reduce2(float val1, float val2, __local volatile float* smem1, __local vola
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
void reduce1(float val1, __local volatile float* smem1, int tid)
|
||||
static void reduce1(float val1, __local volatile float* smem1, int tid)
|
||||
{
|
||||
smem1[tid] = val1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@ -226,7 +227,7 @@ void reduce1(float val1, __local volatile float* smem1, int tid)
|
||||
// Image read mode
|
||||
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
|
||||
|
||||
void SetPatch(image2d_t I, float x, float y,
|
||||
static void SetPatch(image2d_t I, float x, float y,
|
||||
float* Pch, float* Dx, float* Dy,
|
||||
float* A11, float* A12, float* A22)
|
||||
{
|
||||
@ -247,7 +248,7 @@ void SetPatch(image2d_t I, float x, float y,
|
||||
*A22 += dIdy * dIdy;
|
||||
}
|
||||
|
||||
void GetPatch(image2d_t J, float x, float y,
|
||||
inline void GetPatch(image2d_t J, float x, float y,
|
||||
float* Pch, float* Dx, float* Dy,
|
||||
float* b1, float* b2)
|
||||
{
|
||||
@ -257,13 +258,13 @@ void GetPatch(image2d_t J, float x, float y,
|
||||
*b2 += diff**Dy;
|
||||
}
|
||||
|
||||
void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval)
|
||||
inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval)
|
||||
{
|
||||
float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;
|
||||
*errval += fabs(diff);
|
||||
}
|
||||
|
||||
void SetPatch4(image2d_t I, const float x, const float y,
|
||||
static void SetPatch4(image2d_t I, const float x, const float y,
|
||||
float4* Pch, float4* Dx, float4* Dy,
|
||||
float* A11, float* A12, float* A22)
|
||||
{
|
||||
@ -286,7 +287,7 @@ void SetPatch4(image2d_t I, const float x, const float y,
|
||||
*A22 += sqIdx.x + sqIdx.y + sqIdx.z;
|
||||
}
|
||||
|
||||
void GetPatch4(image2d_t J, const float x, const float y,
|
||||
static void GetPatch4(image2d_t J, const float x, const float y,
|
||||
const float4* Pch, const float4* Dx, const float4* Dy,
|
||||
float* b1, float* b2)
|
||||
{
|
||||
@ -298,7 +299,7 @@ void GetPatch4(image2d_t J, const float x, const float y,
|
||||
*b2 += xdiff.x + xdiff.y + xdiff.z;
|
||||
}
|
||||
|
||||
void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval)
|
||||
static void GetError4(image2d_t J, const float x, const float y, const float4* Pch, float* errval)
|
||||
{
|
||||
float4 diff = read_imagef(J, sampler, (float2)(x,y))-*Pch;
|
||||
*errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
|
||||
@ -318,7 +319,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
|
||||
unsigned int gid=get_group_id(0);
|
||||
unsigned int xsize=get_local_size(0);
|
||||
unsigned int ysize=get_local_size(1);
|
||||
int xBase, yBase, i, j, k;
|
||||
int xBase, yBase, k;
|
||||
|
||||
float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
|
||||
|
||||
@ -597,7 +598,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
|
||||
unsigned int gid=get_group_id(0);
|
||||
unsigned int xsize=get_local_size(0);
|
||||
unsigned int ysize=get_local_size(1);
|
||||
int xBase, yBase, i, j, k;
|
||||
int xBase, yBase, k;
|
||||
|
||||
float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -56,7 +56,7 @@
|
||||
#define radius 64
|
||||
#endif
|
||||
|
||||
unsigned int CalcSSD(__local unsigned int *col_ssd)
|
||||
static unsigned int CalcSSD(__local unsigned int *col_ssd)
|
||||
{
|
||||
unsigned int cache = col_ssd[0];
|
||||
|
||||
@ -67,7 +67,7 @@ unsigned int CalcSSD(__local unsigned int *col_ssd)
|
||||
return cache;
|
||||
}
|
||||
|
||||
uint2 MinSSD(__local unsigned int *col_ssd)
|
||||
static uint2 MinSSD(__local unsigned int *col_ssd)
|
||||
{
|
||||
unsigned int ssd[N_DISPARITIES];
|
||||
const int win_size = (radius << 1);
|
||||
@ -95,7 +95,7 @@ uint2 MinSSD(__local unsigned int *col_ssd)
|
||||
return (uint2)(mssd, bestIdx);
|
||||
}
|
||||
|
||||
void StepDown(int idx1, int idx2, __global unsigned char* imageL,
|
||||
static void StepDown(int idx1, int idx2, __global unsigned char* imageL,
|
||||
__global unsigned char* imageR, int d, __local unsigned int *col_ssd)
|
||||
{
|
||||
uint8 imgR1 = convert_uint8(vload8(0, imageR + (idx1 - d - 7)));
|
||||
@ -114,7 +114,7 @@ void StepDown(int idx1, int idx2, __global unsigned char* imageL,
|
||||
col_ssd[7 * (BLOCK_W + win_size)] += res.s0;
|
||||
}
|
||||
|
||||
void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
|
||||
static void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
|
||||
__global unsigned char* imageR, int d,
|
||||
__local unsigned int *col_ssd)
|
||||
{
|
||||
@ -153,7 +153,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
|
||||
|
||||
int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius;
|
||||
|
||||
#define Y (get_group_id(1) * ROWSperTHREAD + radius)
|
||||
#define Y (int)(get_group_id(1) * ROWSperTHREAD + radius)
|
||||
|
||||
__global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
|
||||
__global unsigned char* disparImage = disp + X + Y * disp_step;
|
||||
@ -241,7 +241,7 @@ __kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned
|
||||
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
float sobel(__global unsigned char *input, int x, int y, int rows, int cols)
|
||||
static float sobel(__global unsigned char *input, int x, int y, int rows, int cols)
|
||||
{
|
||||
float conv = 0;
|
||||
int y1 = y==0? 0 : y-1;
|
||||
@ -256,7 +256,7 @@ float sobel(__global unsigned char *input, int x, int y, int rows, int cols)
|
||||
return fabs(conv);
|
||||
}
|
||||
|
||||
float CalcSums(__local float *cols, __local float *cols_cache, int winsz)
|
||||
static float CalcSums(__local float *cols, __local float *cols_cache, int winsz)
|
||||
{
|
||||
unsigned int cache = cols[0];
|
||||
|
||||
|
@ -26,7 +26,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -65,7 +65,7 @@
|
||||
///////////////////////////////////////////////////////////////
|
||||
/////////////////common///////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////
|
||||
T saturate_cast(float v){
|
||||
inline T saturate_cast(float v){
|
||||
#ifdef T_SHORT
|
||||
return convert_short_sat_rte(v);
|
||||
#else
|
||||
@ -73,7 +73,7 @@ T saturate_cast(float v){
|
||||
#endif
|
||||
}
|
||||
|
||||
T4 saturate_cast4(float4 v){
|
||||
inline T4 saturate_cast4(float4 v){
|
||||
#ifdef T_SHORT
|
||||
return convert_short4_sat_rte(v);
|
||||
#else
|
||||
@ -99,7 +99,7 @@ inline float pix_diff_1(const uchar4 l, __global const uchar *rs)
|
||||
return abs((int)(l.x) - *rs);
|
||||
}
|
||||
|
||||
float pix_diff_4(const uchar4 l, __global const uchar *rs)
|
||||
static float pix_diff_4(const uchar4 l, __global const uchar *rs)
|
||||
{
|
||||
uchar4 r;
|
||||
r = *((__global uchar4 *)rs);
|
||||
@ -235,7 +235,7 @@ __kernel void level_up_message(__global T *src, int src_rows, int src_step,
|
||||
///////////////////////////////////////////////////////////////
|
||||
//////////////////// calc all iterations /////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_,
|
||||
static void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_,
|
||||
const __global T *dt,
|
||||
int u_step, int msg_disp_step, int data_disp_step,
|
||||
float4 cmax_disc_term, float4 cdisc_single_jump)
|
||||
|
@ -248,7 +248,7 @@ __kernel void get_first_k_initial_local_1(__global float *data_cost_selected_, _
|
||||
///////////////////////////////////////////////////////////////
|
||||
/////////////////////// init data cost ////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
float compute_3(__global uchar* left, __global uchar* right,
|
||||
inline float compute_3(__global uchar* left, __global uchar* right,
|
||||
float cdata_weight, float cmax_data_term)
|
||||
{
|
||||
float tb = 0.114f * abs((int)left[0] - right[0]);
|
||||
@ -257,17 +257,21 @@ float compute_3(__global uchar* left, __global uchar* right,
|
||||
|
||||
return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
|
||||
}
|
||||
float compute_1(__global uchar* left, __global uchar* right,
|
||||
inline float compute_1(__global uchar* left, __global uchar* right,
|
||||
float cdata_weight, float cmax_data_term)
|
||||
{
|
||||
return fmin(cdata_weight * abs((int)*left - (int)*right), cdata_weight * cmax_data_term);
|
||||
}
|
||||
short round_short(float v){
|
||||
|
||||
inline short round_short(float v)
|
||||
{
|
||||
return convert_short_sat_rte(v);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////init_data_cost///////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__kernel void init_data_cost_0(__global short *ctemp, __global uchar *cleft, __global uchar *cright,
|
||||
int h, int w, int level, int channels,
|
||||
int cmsg_step1, float cdata_weight, float cmax_data_term, int cdisp_step1,
|
||||
@ -993,7 +997,8 @@ __kernel void compute_data_cost_reduce_1(__global const float *selected_disp_pyr
|
||||
///////////////////////////////////////////////////////////////
|
||||
//////////////////////// init message /////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
void get_first_k_element_increase_0(__global short* u_new, __global short *d_new, __global short *l_new,
|
||||
|
||||
static void get_first_k_element_increase_0(__global short* u_new, __global short *d_new, __global short *l_new,
|
||||
__global short *r_new, __global const short *u_cur, __global const short *d_cur,
|
||||
__global const short *l_cur, __global const short *r_cur,
|
||||
__global short *data_cost_selected, __global short *disparity_selected_new,
|
||||
@ -1027,7 +1032,8 @@ void get_first_k_element_increase_0(__global short* u_new, __global short *d_new
|
||||
data_cost_new[id * cdisp_step1] = SHRT_MAX;
|
||||
}
|
||||
}
|
||||
void get_first_k_element_increase_1(__global float *u_new, __global float *d_new, __global float *l_new,
|
||||
|
||||
static void get_first_k_element_increase_1(__global float *u_new, __global float *d_new, __global float *l_new,
|
||||
__global float *r_new, __global const float *u_cur, __global const float *d_cur,
|
||||
__global const float *l_cur, __global const float *r_cur,
|
||||
__global float *data_cost_selected, __global float *disparity_selected_new,
|
||||
@ -1190,7 +1196,8 @@ __kernel void init_message_1(__global float *u_new_, __global float *d_new_, __g
|
||||
///////////////////////////////////////////////////////////////
|
||||
//////////////////// calc all iterations /////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
void message_per_pixel_0(__global const short *data, __global short *msg_dst, __global const short *msg1,
|
||||
|
||||
static void message_per_pixel_0(__global const short *data, __global short *msg_dst, __global const short *msg1,
|
||||
__global const short *msg2, __global const short *msg3,
|
||||
__global const short *dst_disp, __global const short *src_disp,
|
||||
int nr_plane, __global short *temp,
|
||||
@ -1226,7 +1233,8 @@ void message_per_pixel_0(__global const short *data, __global short *msg_dst, __
|
||||
for(int d = 0; d < nr_plane; d++)
|
||||
msg_dst[d * cdisp_step1] = convert_short_sat_rte(temp[d * cdisp_step1] - sum);
|
||||
}
|
||||
void message_per_pixel_1(__global const float *data, __global float *msg_dst, __global const float *msg1,
|
||||
|
||||
static void message_per_pixel_1(__global const float *data, __global float *msg_dst, __global const float *msg1,
|
||||
__global const float *msg2, __global const float *msg3,
|
||||
__global const float *dst_disp, __global const float *src_disp,
|
||||
int nr_plane, __global float *temp,
|
||||
@ -1262,6 +1270,7 @@ void message_per_pixel_1(__global const float *data, __global float *msg_dst, __
|
||||
for(int d = 0; d < nr_plane; d++)
|
||||
msg_dst[d * cdisp_step1] = temp[d * cdisp_step1] - sum;
|
||||
}
|
||||
|
||||
__kernel void compute_message_0(__global short *u_, __global short *d_, __global short *l_, __global short *r_,
|
||||
__global const short *data_cost_selected, __global const short *selected_disp_pyr_cur,
|
||||
__global short *ctemp, int h, int w, int nr_plane, int i,
|
||||
@ -1293,6 +1302,7 @@ __kernel void compute_message_0(__global short *u_, __global short *d_, __global
|
||||
cmax_disc_term, cdisp_step1, cdisc_single_jump);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void compute_message_1(__global float *u_, __global float *d_, __global float *l_, __global float *r_,
|
||||
__global const float *data_cost_selected, __global const float *selected_disp_pyr_cur,
|
||||
__global float *ctemp, int h, int w, int nr_plane, int i,
|
||||
@ -1327,6 +1337,7 @@ __kernel void compute_message_1(__global float *u_, __global float *d_, __global
|
||||
///////////////////////////////////////////////////////////////
|
||||
/////////////////////////// output ////////////////////////////
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
||||
__kernel void compute_disp_0(__global const short *u_, __global const short *d_, __global const short *l_,
|
||||
__global const short *r_, __global const short * data_cost_selected,
|
||||
__global const short *disp_selected_pyr,
|
||||
@ -1364,6 +1375,7 @@ __kernel void compute_disp_0(__global const short *u_, __global const short *d_,
|
||||
disp[res_step * y + x] = best;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void compute_disp_1(__global const float *u_, __global const float *d_, __global const float *l_,
|
||||
__global const float *r_, __global const float *data_cost_selected,
|
||||
__global const float *disp_selected_pyr,
|
||||
|
@ -25,7 +25,7 @@
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other GpuMaterials provided with the distribution.
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
@ -65,7 +65,7 @@ namespace cv
|
||||
|
||||
static inline void ___openCLSafeCall(int err, const char *file, const int line, const char *func = "")
|
||||
{
|
||||
if( CL_SUCCESS != err)
|
||||
if (CL_SUCCESS != err)
|
||||
cv::error(Error::OpenCLApiCallError, getOpenCLErrorString(err), func, file, line);
|
||||
}
|
||||
}
|
||||
|
@ -148,90 +148,128 @@ namespace cv
|
||||
mat_dst.create(size, CV_MAKETYPE(depth, total_channels));
|
||||
merge_vector_run(mat_src, n, mat_dst);
|
||||
}
|
||||
static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst)
|
||||
static void split_vector_run(const oclMat &src, oclMat *dst)
|
||||
{
|
||||
|
||||
if(!mat_src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_src.type() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
|
||||
{
|
||||
CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
|
||||
return;
|
||||
}
|
||||
|
||||
Context *clCxt = mat_src.clCxt;
|
||||
int channels = mat_src.oclchannels();
|
||||
int depth = mat_src.depth();
|
||||
Context *clCtx = src.clCxt;
|
||||
int channels = src.channels();
|
||||
int depth = src.depth();
|
||||
depth = (depth == CV_8S) ? CV_8U : depth;
|
||||
depth = (depth == CV_16S) ? CV_16U : depth;
|
||||
|
||||
String kernelName = "split_vector";
|
||||
|
||||
int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0},
|
||||
{4, 4, 2, 2, 1, 1, 1},
|
||||
{4, 4, 2, 2 , 1, 1, 1},
|
||||
{4, 4, 2, 2, 1, 1, 1}
|
||||
};
|
||||
size_t VEC_SIZE = 4;
|
||||
|
||||
size_t vector_length = vector_lengths[channels - 1][mat_dst[0].depth()];
|
||||
|
||||
int max_offset_cols = 0;
|
||||
for(int i = 0; i < channels; i++)
|
||||
{
|
||||
int offset_cols = (mat_dst[i].offset / mat_dst[i].elemSize()) & (vector_length - 1);
|
||||
if(max_offset_cols < offset_cols)
|
||||
max_offset_cols = offset_cols;
|
||||
}
|
||||
|
||||
int cols = vector_length == 1 ? divUp(mat_src.cols, vector_length)
|
||||
: divUp(mat_src.cols + max_offset_cols, vector_length);
|
||||
|
||||
size_t localThreads[3] = { 64, 4, 1 };
|
||||
size_t globalThreads[3] = { cols, mat_src.rows, 1 };
|
||||
|
||||
int dst_step1 = mat_dst[0].cols * mat_dst[0].elemSize();
|
||||
std::vector<std::pair<size_t , const void *> > args;
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.step));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.offset));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[0].data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].step));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].offset));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[1].data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].step));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].offset));
|
||||
if(channels >= 3)
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step));
|
||||
int srcOffsetXBytes = src.offset % src.step;
|
||||
int srcOffsetY = src.offset / src.step;
|
||||
cl_int2 srcOffset = {{srcOffsetXBytes, srcOffsetY}};
|
||||
args.push_back( std::make_pair( sizeof(cl_int2), (void *)&srcOffset));
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[2].data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].step));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].offset));
|
||||
}
|
||||
if(channels >= 4)
|
||||
bool dst0Aligned = false, dst1Aligned = false, dst2Aligned = false, dst3Aligned = false;
|
||||
int alignSize = dst[0].elemSize1() * VEC_SIZE;
|
||||
int alignMask = alignSize - 1;
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[0].data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[0].step));
|
||||
int dst0OffsetXBytes = dst[0].offset % dst[0].step;
|
||||
int dst0OffsetY = dst[0].offset / dst[0].step;
|
||||
cl_int2 dst0Offset = {{dst0OffsetXBytes, dst0OffsetY}};
|
||||
args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst0Offset));
|
||||
if ((dst0OffsetXBytes & alignMask) == 0)
|
||||
dst0Aligned = true;
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[1].data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[1].step));
|
||||
int dst1OffsetXBytes = dst[1].offset % dst[1].step;
|
||||
int dst1OffsetY = dst[1].offset / dst[1].step;
|
||||
cl_int2 dst1Offset = {{dst1OffsetXBytes, dst1OffsetY}};
|
||||
args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst1Offset));
|
||||
if ((dst1OffsetXBytes & alignMask) == 0)
|
||||
dst1Aligned = true;
|
||||
|
||||
// DON'T MOVE VARIABLES INTO 'IF' BODY
|
||||
int dst2OffsetXBytes, dst2OffsetY;
|
||||
cl_int2 dst2Offset;
|
||||
int dst3OffsetXBytes, dst3OffsetY;
|
||||
cl_int2 dst3Offset;
|
||||
if (channels >= 3)
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[3].data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].step));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].offset));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[2].data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[2].step));
|
||||
dst2OffsetXBytes = dst[2].offset % dst[2].step;
|
||||
dst2OffsetY = dst[2].offset / dst[2].step;
|
||||
dst2Offset.s[0] = dst2OffsetXBytes; dst2Offset.s[1] = dst2OffsetY;
|
||||
args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst2Offset));
|
||||
if ((dst2OffsetXBytes & alignMask) == 0)
|
||||
dst2Aligned = true;
|
||||
}
|
||||
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.rows));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1));
|
||||
if (channels >= 4)
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[3].data));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[3].step));
|
||||
dst3OffsetXBytes = dst[3].offset % dst[3].step;
|
||||
dst3OffsetY = dst[3].offset / dst[3].step;
|
||||
dst3Offset.s[0] = dst3OffsetXBytes; dst3Offset.s[1] = dst3OffsetY;
|
||||
args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst3Offset));
|
||||
if ((dst3OffsetXBytes & alignMask) == 0)
|
||||
dst3Aligned = true;
|
||||
}
|
||||
|
||||
openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth);
|
||||
cl_int2 size = {{ src.cols, src.rows }};
|
||||
args.push_back( std::make_pair( sizeof(cl_int2), (void *)&size));
|
||||
|
||||
String build_options =
|
||||
cv::format("-D VEC_SIZE=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d",
|
||||
(int)VEC_SIZE, depth, channels);
|
||||
|
||||
if (dst0Aligned)
|
||||
build_options = build_options + " -D DST0_ALIGNED";
|
||||
if (dst1Aligned)
|
||||
build_options = build_options + " -D DST1_ALIGNED";
|
||||
if (dst2Aligned)
|
||||
build_options = build_options + " -D DST2_ALIGNED";
|
||||
if (dst3Aligned)
|
||||
build_options = build_options + " -D DST3_ALIGNED";
|
||||
|
||||
const DeviceInfo& devInfo = clCtx->getDeviceInfo();
|
||||
|
||||
// TODO Workaround for issues. Need to investigate a problem.
|
||||
if (channels == 2
|
||||
&& devInfo.deviceType == CVCL_DEVICE_TYPE_CPU
|
||||
&& devInfo.platform->platformVendor.find("Intel") != std::string::npos
|
||||
&& (devInfo.deviceVersion.find("Build 56860") != std::string::npos
|
||||
|| devInfo.deviceVersion.find("Build 76921") != std::string::npos))
|
||||
build_options = build_options + " -D BYPASS_VSTORE=true";
|
||||
|
||||
size_t globalThreads[3] = { divUp(src.cols, VEC_SIZE), src.rows, 1 };
|
||||
openCLExecuteKernel(clCtx, &split_mat, kernelName, globalThreads, NULL, args, -1, -1, build_options.c_str());
|
||||
}
|
||||
static void split(const oclMat &mat_src, oclMat *mat_dst)
|
||||
{
|
||||
CV_Assert(mat_dst);
|
||||
|
||||
int depth = mat_src.depth();
|
||||
int num_channels = mat_src.oclchannels();
|
||||
int num_channels = mat_src.channels();
|
||||
Size size = mat_src.size();
|
||||
|
||||
if(num_channels == 1)
|
||||
if (num_channels == 1)
|
||||
{
|
||||
mat_src.copyTo(mat_dst[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
int i;
|
||||
for(i = 0; i < num_channels; i++)
|
||||
for (int i = 0; i < mat_src.oclchannels(); i++)
|
||||
mat_dst[i].create(size, CV_MAKETYPE(depth, 1));
|
||||
|
||||
split_vector_run(mat_src, mat_dst);
|
||||
@ -255,7 +293,7 @@ void cv::ocl::split(const oclMat &src, oclMat *dst)
|
||||
}
|
||||
void cv::ocl::split(const oclMat &src, std::vector<oclMat> &dst)
|
||||
{
|
||||
dst.resize(src.oclchannels());
|
||||
dst.resize(src.oclchannels()); // TODO Why oclchannels?
|
||||
if(src.oclchannels() > 0)
|
||||
split_merge::split(src, &dst[0]);
|
||||
}
|
||||
|
@ -121,10 +121,8 @@ void cv::ocl::OpticalFlowDual_TVL1_OCL::operator()(const oclMat& I0, const oclMa
|
||||
ocl::pyrDown(u1s[s - 1], u1s[s]);
|
||||
ocl::pyrDown(u2s[s - 1], u2s[s]);
|
||||
|
||||
//ocl::multiply(u1s[s], Scalar::all(0.5), u1s[s]);
|
||||
multiply(0.5, u1s[s], u1s[s]);
|
||||
//ocl::multiply(u2s[s], Scalar::all(0.5), u2s[s]);
|
||||
multiply(0.5, u1s[s], u2s[s]);
|
||||
ocl::multiply(0.5, u1s[s], u1s[s]);
|
||||
ocl::multiply(0.5, u2s[s], u2s[s]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -126,8 +126,12 @@ PARAM_TEST_CASE(Lut, MatDepth, MatDepth, bool, bool)
|
||||
|
||||
void Near(double threshold = 0.)
|
||||
{
|
||||
EXPECT_MAT_NEAR(dst, Mat(gdst_whole), threshold);
|
||||
EXPECT_MAT_NEAR(dst_roi, Mat(gdst_roi), threshold);
|
||||
Mat whole, roi;
|
||||
gdst_whole.download(whole);
|
||||
gdst_roi.download(roi);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, whole, threshold);
|
||||
EXPECT_MAT_NEAR(dst_roi, roi, threshold);
|
||||
}
|
||||
};
|
||||
|
||||
@ -222,14 +226,22 @@ PARAM_TEST_CASE(ArithmTestBase, MatDepth, Channels, bool)
|
||||
|
||||
void Near(double threshold = 0.)
|
||||
{
|
||||
EXPECT_MAT_NEAR(dst1, Mat(gdst1_whole), threshold);
|
||||
EXPECT_MAT_NEAR(dst1_roi, Mat(gdst1_roi), threshold);
|
||||
Mat whole, roi;
|
||||
gdst1_whole.download(whole);
|
||||
gdst1_roi.download(roi);
|
||||
|
||||
EXPECT_MAT_NEAR(dst1, whole, threshold);
|
||||
EXPECT_MAT_NEAR(dst1_roi, roi, threshold);
|
||||
}
|
||||
|
||||
void Near1(double threshold = 0.)
|
||||
{
|
||||
EXPECT_MAT_NEAR(dst2, Mat(gdst2_whole), threshold);
|
||||
EXPECT_MAT_NEAR(dst2_roi, Mat(gdst2_roi), threshold);
|
||||
Mat whole, roi;
|
||||
gdst2_whole.download(whole);
|
||||
gdst2_roi.download(roi);
|
||||
|
||||
EXPECT_MAT_NEAR(dst2, whole, threshold);
|
||||
EXPECT_MAT_NEAR(dst2_roi, roi, threshold);
|
||||
}
|
||||
};
|
||||
|
||||
@ -724,6 +736,15 @@ OCL_TEST_P(MinMax, MAT)
|
||||
|
||||
OCL_TEST_P(MinMax, MASK)
|
||||
{
|
||||
enum { MAX_IDX = 0, MIN_IDX };
|
||||
static const double minMaxGolds[2][7] =
|
||||
{
|
||||
{ std::numeric_limits<uchar>::min(), std::numeric_limits<char>::min(), std::numeric_limits<ushort>::min(),
|
||||
std::numeric_limits<short>::min(), std::numeric_limits<int>::min(), -std::numeric_limits<float>::max(), -std::numeric_limits<double>::max() },
|
||||
{ std::numeric_limits<uchar>::max(), std::numeric_limits<char>::max(), std::numeric_limits<ushort>::max(),
|
||||
std::numeric_limits<short>::max(), std::numeric_limits<int>::max(), std::numeric_limits<float>::max(), std::numeric_limits<double>::max() },
|
||||
};
|
||||
|
||||
for (int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
@ -750,8 +771,16 @@ OCL_TEST_P(MinMax, MASK)
|
||||
double minVal_, maxVal_;
|
||||
cv::ocl::minMax(gsrc1_roi, &minVal_, &maxVal_, gmask_roi);
|
||||
|
||||
EXPECT_DOUBLE_EQ(minVal, minVal_);
|
||||
EXPECT_DOUBLE_EQ(maxVal, maxVal_);
|
||||
if (cv::countNonZero(mask_roi) == 0)
|
||||
{
|
||||
EXPECT_DOUBLE_EQ(minMaxGolds[MIN_IDX][depth], minVal_);
|
||||
EXPECT_DOUBLE_EQ(minMaxGolds[MAX_IDX][depth], maxVal_);
|
||||
}
|
||||
else
|
||||
{
|
||||
EXPECT_DOUBLE_EQ(minVal, minVal_);
|
||||
EXPECT_DOUBLE_EQ(maxVal, maxVal_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -47,73 +47,130 @@
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
using namespace cvtest;
|
||||
using namespace testing;
|
||||
using namespace std;
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
template <typename T>
|
||||
void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold)
|
||||
static void blendLinearGold(const Mat &img1, const Mat &img2,
|
||||
const Mat &weights1, const Mat &weights2,
|
||||
Mat &result_gold)
|
||||
{
|
||||
CV_Assert(img1.size() == img2.size() && img1.type() == img2.type());
|
||||
CV_Assert(weights1.size() == weights2.size() && weights1.size() == img1.size() &&
|
||||
weights1.type() == CV_32FC1 && weights2.type() == CV_32FC1);
|
||||
|
||||
result_gold.create(img1.size(), img1.type());
|
||||
|
||||
int cn = img1.channels();
|
||||
int step1 = img1.cols * img1.channels();
|
||||
|
||||
for (int y = 0; y < img1.rows; ++y)
|
||||
{
|
||||
const float *weights1_row = weights1.ptr<float>(y);
|
||||
const float *weights2_row = weights2.ptr<float>(y);
|
||||
const T *img1_row = img1.ptr<T>(y);
|
||||
const T *img2_row = img2.ptr<T>(y);
|
||||
T *result_gold_row = result_gold.ptr<T>(y);
|
||||
const float * const weights1_row = weights1.ptr<float>(y);
|
||||
const float * const weights2_row = weights2.ptr<float>(y);
|
||||
const T * const img1_row = img1.ptr<T>(y);
|
||||
const T * const img2_row = img2.ptr<T>(y);
|
||||
T * const result_gold_row = result_gold.ptr<T>(y);
|
||||
|
||||
for (int x = 0; x < img1.cols * cn; ++x)
|
||||
for (int x = 0; x < step1; ++x)
|
||||
{
|
||||
float w1 = weights1_row[x / cn];
|
||||
float w2 = weights2_row[x / cn];
|
||||
result_gold_row[x] = static_cast<T>((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f));
|
||||
int x1 = x / cn;
|
||||
float w1 = weights1_row[x1], w2 = weights2_row[x1];
|
||||
result_gold_row[x] = saturate_cast<T>(((float)img1_row[x] * w1
|
||||
+ (float)img2_row[x] * w2) / (w1 + w2 + 1e-5f));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
PARAM_TEST_CASE(Blend, cv::Size, MatType/*, UseRoi*/)
|
||||
PARAM_TEST_CASE(Blend, MatDepth, int, bool)
|
||||
{
|
||||
cv::Size size;
|
||||
int type;
|
||||
int depth, channels;
|
||||
bool useRoi;
|
||||
|
||||
Mat src1, src2, weights1, weights2, dst;
|
||||
Mat src1_roi, src2_roi, weights1_roi, weights2_roi, dst_roi;
|
||||
oclMat gsrc1, gsrc2, gweights1, gweights2, gdst, gst;
|
||||
oclMat gsrc1_roi, gsrc2_roi, gweights1_roi, gweights2_roi, gdst_roi;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
size = GET_PARAM(0);
|
||||
type = GET_PARAM(1);
|
||||
depth = GET_PARAM(0);
|
||||
channels = GET_PARAM(1);
|
||||
useRoi = GET_PARAM(2);
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
{
|
||||
const int type = CV_MAKE_TYPE(depth, channels);
|
||||
|
||||
const double upValue = 256;
|
||||
const double sumMinValue = 0.01; // we don't want to divide by "zero"
|
||||
|
||||
Size roiSize = randomSize(1, 20);
|
||||
Border src1Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
|
||||
randomSubMat(src1, src1_roi, roiSize, src1Border, type, -upValue, upValue);
|
||||
|
||||
Border src2Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
|
||||
randomSubMat(src2, src2_roi, roiSize, src2Border, type, -upValue, upValue);
|
||||
|
||||
Border weights1Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
|
||||
randomSubMat(weights1, weights1_roi, roiSize, weights1Border, CV_32FC1, -upValue, upValue);
|
||||
|
||||
Border weights2Border = randomBorder(0, useRoi ? MAX_VALUE : 0);
|
||||
randomSubMat(weights2, weights2_roi, roiSize, weights2Border, CV_32FC1, sumMinValue, upValue); // fill it as a (w1 + w12)
|
||||
|
||||
weights2_roi = weights2_roi - weights1_roi;
|
||||
// check that weights2_roi is still a part of weights2 (not a new matrix)
|
||||
CV_Assert(checkNorm(weights2_roi,
|
||||
weights2(Rect(weights2Border.lef, weights2Border.top, roiSize.width, roiSize.height))) < 1e-6);
|
||||
|
||||
Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
|
||||
randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 5, 16);
|
||||
|
||||
generateOclMat(gsrc1, gsrc1_roi, src1, roiSize, src1Border);
|
||||
generateOclMat(gsrc2, gsrc2_roi, src2, roiSize, src2Border);
|
||||
generateOclMat(gweights1, gweights1_roi, weights1, roiSize, weights1Border);
|
||||
generateOclMat(gweights2, gweights2_roi, weights2, roiSize, weights2Border);
|
||||
generateOclMat(gdst, gdst_roi, dst, roiSize, dstBorder);
|
||||
}
|
||||
|
||||
void Near(double eps = 0.0)
|
||||
{
|
||||
Mat whole, roi;
|
||||
gdst.download(whole);
|
||||
gdst_roi.download(roi);
|
||||
|
||||
EXPECT_MAT_NEAR(dst, whole, eps);
|
||||
EXPECT_MAT_NEAR(dst_roi, roi, eps);
|
||||
}
|
||||
};
|
||||
|
||||
typedef void (*blendLinearFunc)(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold);
|
||||
|
||||
OCL_TEST_P(Blend, Accuracy)
|
||||
{
|
||||
int depth = CV_MAT_DEPTH(type);
|
||||
for (int i = 0; i < LOOP_TIMES; ++i)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
cv::Mat img1 = randomMat(size, type, 0.0, depth == CV_8U ? 255.0 : 1.0);
|
||||
cv::Mat img2 = randomMat(size, type, 0.0, depth == CV_8U ? 255.0 : 1.0);
|
||||
cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
|
||||
cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
|
||||
cv::ocl::blendLinear(gsrc1_roi, gsrc2_roi, gweights1_roi, gweights2_roi, gdst_roi);
|
||||
|
||||
cv::ocl::oclMat gimg1(img1), gimg2(img2), gweights1(weights1), gweights2(weights2);
|
||||
cv::ocl::oclMat dst;
|
||||
static blendLinearFunc funcs[] = {
|
||||
blendLinearGold<uchar>,
|
||||
blendLinearGold<schar>,
|
||||
blendLinearGold<ushort>,
|
||||
blendLinearGold<short>,
|
||||
blendLinearGold<int>,
|
||||
blendLinearGold<float>,
|
||||
};
|
||||
|
||||
cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, dst);
|
||||
cv::Mat result;
|
||||
cv::Mat result_gold;
|
||||
dst.download(result);
|
||||
if (depth == CV_8U)
|
||||
blendLinearGold<uchar>(img1, img2, weights1, weights2, result_gold);
|
||||
else
|
||||
blendLinearGold<float>(img1, img2, weights1, weights2, result_gold);
|
||||
blendLinearFunc func = funcs[depth];
|
||||
func(src1_roi, src2_roi, weights1_roi, weights2_roi, dst_roi);
|
||||
|
||||
EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1.f : 1e-5f);
|
||||
Near(depth <= CV_32S ? 1.0 : 0.2);
|
||||
}
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Blend, Combine(
|
||||
DIFFERENT_SIZES,
|
||||
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4))
|
||||
));
|
||||
#endif
|
||||
INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Blend,
|
||||
Combine(testing::Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F),
|
||||
testing::Range(1, 5), Bool()));
|
||||
|
@ -59,10 +59,15 @@ using namespace cv;
|
||||
PARAM_TEST_CASE(FilterTestBase, MatType,
|
||||
int, // kernel size
|
||||
Size, // dx, dy
|
||||
int, // border type, or iteration
|
||||
int, // border type
|
||||
double, // optional parameter
|
||||
bool) // roi or not
|
||||
{
|
||||
bool isFP;
|
||||
|
||||
int type, borderType, ksize;
|
||||
Size size;
|
||||
double param;
|
||||
bool useRoi;
|
||||
|
||||
Mat src, dst_whole, src_roi, dst_roi;
|
||||
@ -72,31 +77,53 @@ PARAM_TEST_CASE(FilterTestBase, MatType,
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
ksize = GET_PARAM(1);
|
||||
size = GET_PARAM(2);
|
||||
borderType = GET_PARAM(3);
|
||||
useRoi = GET_PARAM(4);
|
||||
param = GET_PARAM(4);
|
||||
useRoi = GET_PARAM(5);
|
||||
|
||||
isFP = (CV_MAT_DEPTH(type) == CV_32F || CV_MAT_DEPTH(type) == CV_64F);
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
void random_roi(int minSize = 1)
|
||||
{
|
||||
Size roiSize = randomSize(1, MAX_VALUE);
|
||||
if (minSize == 0)
|
||||
minSize = ksize;
|
||||
Size roiSize = randomSize(minSize, MAX_VALUE);
|
||||
Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
|
||||
randomSubMat(src, src_roi, roiSize, srcBorder, type, 5, 256);
|
||||
randomSubMat(src, src_roi, roiSize, srcBorder, type, isFP ? 0 : 5, isFP ? 1 : 256);
|
||||
|
||||
Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
|
||||
randomSubMat(dst_whole, dst_roi, roiSize, dstBorder, type, 5, 16);
|
||||
randomSubMat(dst_whole, dst_roi, roiSize, dstBorder, type, isFP ? 0.20 : 60, isFP ? 0.25 : 70);
|
||||
|
||||
generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
|
||||
generateOclMat(gdst_whole, gdst_roi, dst_whole, roiSize, dstBorder);
|
||||
}
|
||||
|
||||
void Near(double threshold = 0.0)
|
||||
void Near()
|
||||
{
|
||||
if (isFP)
|
||||
Near(1e-6, true);
|
||||
else
|
||||
Near(1, false);
|
||||
}
|
||||
|
||||
void Near(double threshold, bool relative)
|
||||
{
|
||||
Mat roi, whole;
|
||||
gdst_whole.download(whole);
|
||||
gdst_roi.download(roi);
|
||||
|
||||
EXPECT_MAT_NEAR(dst_whole, whole, threshold);
|
||||
EXPECT_MAT_NEAR(dst_roi, roi, threshold);
|
||||
if (relative)
|
||||
{
|
||||
EXPECT_MAT_NEAR_RELATIVE(dst_whole, whole, threshold);
|
||||
EXPECT_MAT_NEAR_RELATIVE(dst_roi, roi, threshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
EXPECT_MAT_NEAR(dst_whole, whole, threshold);
|
||||
EXPECT_MAT_NEAR(dst_roi, roi, threshold);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@ -111,12 +138,12 @@ OCL_TEST_P(Blur, Mat)
|
||||
|
||||
for (int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
random_roi(0); // TODO NOTE: min value for size is kernel size (temporary bypass border issues in CPU implementation)
|
||||
|
||||
blur(src_roi, dst_roi, kernelSize, Point(-1, -1), borderType);
|
||||
ocl::blur(gsrc_roi, gdst_roi, kernelSize, Point(-1, -1), borderType); // TODO anchor
|
||||
|
||||
Near(1.0);
|
||||
Near();
|
||||
}
|
||||
}
|
||||
|
||||
@ -127,64 +154,51 @@ typedef FilterTestBase LaplacianTest;
|
||||
|
||||
OCL_TEST_P(LaplacianTest, Accuracy)
|
||||
{
|
||||
double scale = param;
|
||||
|
||||
for (int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
// border type is used as a scale factor for the Laplacian kernel
|
||||
double scale = static_cast<double>(borderType);
|
||||
Laplacian(src_roi, dst_roi, -1, ksize, scale, 0, borderType);
|
||||
ocl::Laplacian(gsrc_roi, gdst_roi, -1, ksize, scale, 0, borderType);
|
||||
|
||||
Laplacian(src_roi, dst_roi, -1, ksize, scale);
|
||||
ocl::Laplacian(gsrc_roi, gdst_roi, -1, ksize, scale);
|
||||
|
||||
Near(1e-5);
|
||||
Near();
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// erode & dilate
|
||||
|
||||
struct ErodeDilate :
|
||||
public FilterTestBase
|
||||
{
|
||||
int iterations;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
ksize = GET_PARAM(1);
|
||||
iterations = GET_PARAM(3);
|
||||
useRoi = GET_PARAM(4);
|
||||
}
|
||||
};
|
||||
|
||||
typedef ErodeDilate Erode;
|
||||
typedef FilterTestBase Erode;
|
||||
|
||||
OCL_TEST_P(Erode, Mat)
|
||||
{
|
||||
// erode or dilate kernel
|
||||
Size kernelSize(ksize, ksize);
|
||||
Mat kernel;
|
||||
int iterations = (int)param;
|
||||
|
||||
for (int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
kernel = randomMat(kernelSize, CV_8UC1, 0, 3);
|
||||
|
||||
random_roi();
|
||||
|
||||
cv::erode(src_roi, dst_roi, kernel, Point(-1, -1), iterations);
|
||||
ocl::erode(gsrc_roi, gdst_roi, kernel, Point(-1, -1), iterations); // TODO iterations, borderType
|
||||
kernel = randomMat(kernelSize, CV_8UC1, 0, 3);
|
||||
|
||||
Near(1e-5);
|
||||
cv::erode(src_roi, dst_roi, kernel, Point(-1, -1), iterations);//, borderType);
|
||||
ocl::erode(gsrc_roi, gdst_roi, kernel, Point(-1, -1), iterations);//, borderType);
|
||||
|
||||
Near();
|
||||
}
|
||||
}
|
||||
|
||||
typedef ErodeDilate Dilate;
|
||||
typedef FilterTestBase Dilate;
|
||||
|
||||
OCL_TEST_P(Dilate, Mat)
|
||||
{
|
||||
// erode or dilate kernel
|
||||
Mat kernel;
|
||||
int iterations = (int)param;
|
||||
|
||||
for (int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
@ -195,79 +209,56 @@ OCL_TEST_P(Dilate, Mat)
|
||||
cv::dilate(src_roi, dst_roi, kernel, Point(-1, -1), iterations);
|
||||
ocl::dilate(gsrc_roi, gdst_roi, kernel, Point(-1, -1), iterations); // TODO iterations, borderType
|
||||
|
||||
Near(1e-5);
|
||||
Near();
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Sobel
|
||||
|
||||
struct SobelTest :
|
||||
public FilterTestBase
|
||||
{
|
||||
int dx, dy;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
ksize = GET_PARAM(1);
|
||||
borderType = GET_PARAM(3);
|
||||
useRoi = GET_PARAM(4);
|
||||
|
||||
Size d = GET_PARAM(2);
|
||||
dx = d.width, dy = d.height;
|
||||
}
|
||||
};
|
||||
typedef FilterTestBase SobelTest;
|
||||
|
||||
OCL_TEST_P(SobelTest, Mat)
|
||||
{
|
||||
int dx = size.width, dy = size.height;
|
||||
double scale = param;
|
||||
|
||||
for (int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
Sobel(src_roi, dst_roi, -1, dx, dy, ksize, /* scale */ 0.00001, /* delta */0, borderType);
|
||||
ocl::Sobel(gsrc_roi, gdst_roi, -1, dx, dy, ksize, /* scale */ 0.00001, /* delta */ 0, borderType);
|
||||
Sobel(src_roi, dst_roi, -1, dx, dy, ksize, scale, /* delta */0, borderType);
|
||||
ocl::Sobel(gsrc_roi, gdst_roi, -1, dx, dy, ksize, scale, /* delta */0, borderType);
|
||||
|
||||
Near(1);
|
||||
Near();
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Scharr
|
||||
|
||||
typedef SobelTest ScharrTest;
|
||||
typedef FilterTestBase ScharrTest;
|
||||
|
||||
OCL_TEST_P(ScharrTest, Mat)
|
||||
{
|
||||
int dx = size.width, dy = size.height;
|
||||
double scale = param;
|
||||
|
||||
for (int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
Scharr(src_roi, dst_roi, -1, dx, dy, /* scale */ 1, /* delta */ 0, borderType);
|
||||
ocl::Scharr(gsrc_roi, gdst_roi, -1, dx, dy, /* scale */ 1, /* delta */ 0, borderType);
|
||||
Scharr(src_roi, dst_roi, -1, dx, dy, scale, /* delta */ 0, borderType);
|
||||
ocl::Scharr(gsrc_roi, gdst_roi, -1, dx, dy, scale, /* delta */ 0, borderType);
|
||||
|
||||
Near(1);
|
||||
Near();
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// GaussianBlur
|
||||
|
||||
struct GaussianBlurTest :
|
||||
public FilterTestBase
|
||||
{
|
||||
double sigma1, sigma2;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
type = GET_PARAM(0);
|
||||
ksize = GET_PARAM(1);
|
||||
borderType = GET_PARAM(3);
|
||||
|
||||
sigma1 = rng.uniform(0.1, 1.0);
|
||||
sigma2 = rng.uniform(0.1, 1.0);
|
||||
}
|
||||
};
|
||||
typedef FilterTestBase GaussianBlurTest;
|
||||
|
||||
OCL_TEST_P(GaussianBlurTest, Mat)
|
||||
{
|
||||
@ -275,10 +266,13 @@ OCL_TEST_P(GaussianBlurTest, Mat)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
double sigma1 = rng.uniform(0.1, 1.0);
|
||||
double sigma2 = rng.uniform(0.1, 1.0);
|
||||
|
||||
GaussianBlur(src_roi, dst_roi, Size(ksize, ksize), sigma1, sigma2, borderType);
|
||||
ocl::GaussianBlur(gsrc_roi, gdst_roi, Size(ksize, ksize), sigma1, sigma2, borderType);
|
||||
|
||||
Near(1);
|
||||
Near(CV_MAT_DEPTH(type) == CV_8U ? 3 : 1e-6, false);
|
||||
}
|
||||
}
|
||||
|
||||
@ -289,19 +283,24 @@ typedef FilterTestBase Filter2D;
|
||||
|
||||
OCL_TEST_P(Filter2D, Mat)
|
||||
{
|
||||
const Size kernelSize(ksize, ksize);
|
||||
Mat kernel;
|
||||
|
||||
for (int j = 0; j < LOOP_TIMES; j++)
|
||||
{
|
||||
kernel = randomMat(kernelSize, CV_32FC1, 0.0, 1.0);
|
||||
|
||||
random_roi();
|
||||
|
||||
cv::filter2D(src_roi, dst_roi, -1, kernel, Point(-1, -1), 0.0, borderType); // TODO anchor
|
||||
ocl::filter2D(gsrc_roi, gdst_roi, -1, kernel, Point(-1, -1), borderType);
|
||||
Point anchor(-1, -1);
|
||||
if (size.width >= 0)
|
||||
anchor.x = size.width % ksize;
|
||||
if (size.height >= 0)
|
||||
anchor.y = size.height % ksize;
|
||||
|
||||
Near(1);
|
||||
const Size kernelSize(ksize, ksize);
|
||||
Mat kernel = randomMat(kernelSize, CV_32FC1, 0, 1.0);
|
||||
kernel *= 1.0 / (double)(ksize * ksize);
|
||||
|
||||
cv::filter2D(src_roi, dst_roi, -1, kernel, anchor, 0.0, borderType);
|
||||
ocl::filter2D(gsrc_roi, gdst_roi, -1, kernel, anchor, 0.0, borderType);
|
||||
|
||||
Near();
|
||||
}
|
||||
}
|
||||
|
||||
@ -322,7 +321,7 @@ OCL_TEST_P(Bilateral, Mat)
|
||||
cv::bilateralFilter(src_roi, dst_roi, ksize, sigmacolor, sigmaspace, borderType);
|
||||
ocl::bilateralFilter(gsrc_roi, gdst_roi, ksize, sigmacolor, sigmaspace, borderType);
|
||||
|
||||
Near(1);
|
||||
Near();
|
||||
}
|
||||
}
|
||||
|
||||
@ -342,7 +341,7 @@ OCL_TEST_P(AdaptiveBilateral, Mat)
|
||||
adaptiveBilateralFilter(src_roi, dst_roi, kernelSize, 5, Point(-1, -1), borderType); // TODO anchor
|
||||
ocl::adaptiveBilateralFilter(gsrc_roi, gdst_roi, kernelSize, 5, Point(-1, -1), borderType);
|
||||
|
||||
Near(1);
|
||||
Near();
|
||||
}
|
||||
}
|
||||
|
||||
@ -366,87 +365,108 @@ OCL_TEST_P(MedianFilter, Mat)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define FILTER_BORDER_SET_NO_ISOLATED \
|
||||
Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_WRAP, (int)BORDER_REFLECT_101/*, \
|
||||
(int)BORDER_CONSTANT|BORDER_ISOLATED, (int)BORDER_REPLICATE|BORDER_ISOLATED, \
|
||||
(int)BORDER_REFLECT|BORDER_ISOLATED, (int)BORDER_WRAP|BORDER_ISOLATED, \
|
||||
(int)BORDER_REFLECT_101|BORDER_ISOLATED*/) // WRAP and ISOLATED are not supported by cv:: version
|
||||
|
||||
#define FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED \
|
||||
Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, /*(int)BORDER_WRAP,*/ (int)BORDER_REFLECT_101/*, \
|
||||
(int)BORDER_CONSTANT|BORDER_ISOLATED, (int)BORDER_REPLICATE|BORDER_ISOLATED, \
|
||||
(int)BORDER_REFLECT|BORDER_ISOLATED, (int)BORDER_WRAP|BORDER_ISOLATED, \
|
||||
(int)BORDER_REFLECT_101|BORDER_ISOLATED*/) // WRAP and ISOLATED are not supported by cv:: version
|
||||
|
||||
#define FILTER_DATATYPES Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4, \
|
||||
CV_32FC1, CV_32FC3, CV_32FC4, \
|
||||
CV_64FC1, CV_64FC3, CV_64FC4)
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
|
||||
FILTER_DATATYPES,
|
||||
Values(3, 5, 7),
|
||||
Values(Size(0, 0)), // not used
|
||||
Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_REFLECT_101),
|
||||
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
|
||||
Values(0.0), // not used
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, LaplacianTest, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||
FILTER_DATATYPES,
|
||||
Values(1, 3),
|
||||
Values(Size(0, 0)), // not used
|
||||
Values(1, 2), // value is used as scale factor for kernel
|
||||
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
|
||||
Values(1.0, 0.2, 3.0), // scalar
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||
Values(3, 5, 7),
|
||||
Values(Size(0, 0)), // not used
|
||||
testing::Range(1, 2),
|
||||
Values(0), // not used
|
||||
Values(1.0, 2.0, 3.0),
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||
Values(3, 5, 7),
|
||||
Values(Size(0, 0)), // not used
|
||||
testing::Range(1, 2),
|
||||
Values(0), // not used
|
||||
Values(1.0, 2.0, 3.0),
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, SobelTest, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||
Values(3, 5),
|
||||
Values(Size(1, 0), Size(1, 1), Size(2, 0), Size(2, 1)),
|
||||
Values((int)BORDER_CONSTANT, (int)BORDER_REFLECT101,
|
||||
(int)BORDER_REPLICATE, (int)BORDER_REFLECT),
|
||||
Values(Size(1, 0), Size(1, 1), Size(2, 0), Size(2, 1)), // dx, dy
|
||||
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
|
||||
Values(0.0), // not used
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, ScharrTest, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||
Values(0), // not used
|
||||
Values(Size(0, 1), Size(1, 0)),
|
||||
Values((int)BORDER_CONSTANT, (int)BORDER_REFLECT101,
|
||||
(int)BORDER_REPLICATE, (int)BORDER_REFLECT),
|
||||
Values(1),
|
||||
Values(Size(0, 1), Size(1, 0)), // dx, dy
|
||||
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
|
||||
Values(1.0, 0.2), // scalar
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, GaussianBlurTest, Combine(
|
||||
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
|
||||
Values(3, 5),
|
||||
Values(Size(0, 0)), // not used
|
||||
Values((int)BORDER_CONSTANT, (int)BORDER_REFLECT101,
|
||||
(int)BORDER_REPLICATE, (int)BORDER_REFLECT),
|
||||
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
|
||||
Values(0.0), // not used
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Filter2D, testing::Combine(
|
||||
Values(CV_8UC1, CV_32FC1, CV_32FC4),
|
||||
Values(3, 15, 25),
|
||||
Values(Size(0, 0)), // not used
|
||||
Values((int)BORDER_CONSTANT, (int)BORDER_REFLECT101,
|
||||
(int)BORDER_REPLICATE, (int)BORDER_REFLECT),
|
||||
FILTER_DATATYPES,
|
||||
Values(3, 15), // TODO 25: CPU implementation has some issues
|
||||
Values(Size(-1, -1), Size(0, 0), Size(2, 1)), // anchor
|
||||
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
|
||||
Values(0.0), // not used
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, Bilateral, Combine(
|
||||
Values(CV_8UC1, CV_8UC3),
|
||||
Values(5, 9),
|
||||
Values(Size(0, 0)), // not used
|
||||
Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE,
|
||||
(int)BORDER_REFLECT, (int)BORDER_WRAP, (int)BORDER_REFLECT_101),
|
||||
FILTER_BORDER_SET_NO_ISOLATED,
|
||||
Values(0.0), // not used
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, AdaptiveBilateral, Combine(
|
||||
Values(CV_8UC1, CV_8UC3),
|
||||
Values(5, 9),
|
||||
Values(Size(0, 0)), // not used
|
||||
Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE,
|
||||
(int)BORDER_REFLECT, (int)BORDER_REFLECT_101),
|
||||
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
|
||||
Values(0.0), // not used
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Filter, MedianFilter, Combine(
|
||||
Values((MatType)CV_8UC1, (MatType)CV_8UC4, (MatType)CV_32FC1, (MatType)CV_32FC4),
|
||||
Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
|
||||
Values(3, 5),
|
||||
Values(Size(0, 0)), // not used
|
||||
Values(0), // not used
|
||||
Values(0.0), // not used
|
||||
Bool()));
|
||||
|
||||
#endif // HAVE_OPENCL
|
||||
|
@ -80,7 +80,7 @@ PARAM_TEST_CASE(ImgprocTestBase, MatType,
|
||||
useRoi = GET_PARAM(3);
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
virtual void random_roi()
|
||||
{
|
||||
Size roiSize = randomSize(1, MAX_VALUE);
|
||||
Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
|
||||
@ -93,14 +93,22 @@ PARAM_TEST_CASE(ImgprocTestBase, MatType,
|
||||
generateOclMat(gdst_whole, gdst_roi, dst_whole, roiSize, dstBorder);
|
||||
}
|
||||
|
||||
void Near(double threshold = 0.0)
|
||||
void Near(double threshold = 0.0, bool relative = false)
|
||||
{
|
||||
Mat whole, roi;
|
||||
Mat roi, whole;
|
||||
gdst_whole.download(whole);
|
||||
gdst_roi.download(roi);
|
||||
|
||||
EXPECT_MAT_NEAR(dst_whole, whole, threshold);
|
||||
EXPECT_MAT_NEAR(dst_roi, roi, threshold);
|
||||
if (relative)
|
||||
{
|
||||
EXPECT_MAT_NEAR_RELATIVE(dst_whole, whole, threshold);
|
||||
EXPECT_MAT_NEAR_RELATIVE(dst_roi, roi, threshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
EXPECT_MAT_NEAR(dst_whole, whole, threshold);
|
||||
EXPECT_MAT_NEAR(dst_roi, roi, threshold);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@ -191,7 +199,31 @@ OCL_TEST_P(EqualizeHist, Mat)
|
||||
|
||||
////////////////////////////////cornerMinEigenVal//////////////////////////////////////////
|
||||
|
||||
typedef ImgprocTestBase CornerMinEigenVal;
|
||||
struct CornerTestBase :
|
||||
public ImgprocTestBase
|
||||
{
|
||||
virtual void random_roi()
|
||||
{
|
||||
Mat image = readImageType("gpu/stereobm/aloe-L.png", type);
|
||||
ASSERT_FALSE(image.empty());
|
||||
|
||||
Size roiSize = image.size();
|
||||
Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
|
||||
|
||||
Size wholeSize = Size(roiSize.width + srcBorder.lef + srcBorder.rig, roiSize.height + srcBorder.top + srcBorder.bot);
|
||||
src = randomMat(wholeSize, type, -255, 255, false);
|
||||
src_roi = src(Rect(srcBorder.lef, srcBorder.top, roiSize.width, roiSize.height));
|
||||
image.copyTo(src_roi);
|
||||
|
||||
Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
|
||||
randomSubMat(dst_whole, dst_roi, roiSize, dstBorder, CV_32FC1, 5, 16);
|
||||
|
||||
generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
|
||||
generateOclMat(gdst_whole, gdst_roi, dst_whole, roiSize, dstBorder);
|
||||
}
|
||||
};
|
||||
|
||||
typedef CornerTestBase CornerMinEigenVal;
|
||||
|
||||
OCL_TEST_P(CornerMinEigenVal, Mat)
|
||||
{
|
||||
@ -204,13 +236,13 @@ OCL_TEST_P(CornerMinEigenVal, Mat)
|
||||
cornerMinEigenVal(src_roi, dst_roi, blockSize, apertureSize, borderType);
|
||||
ocl::cornerMinEigenVal(gsrc_roi, gdst_roi, blockSize, apertureSize, borderType);
|
||||
|
||||
Near(1.0);
|
||||
Near(1e-5, true);
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////cornerHarris//////////////////////////////////////////
|
||||
|
||||
typedef ImgprocTestBase CornerHarris;
|
||||
typedef CornerTestBase CornerHarris;
|
||||
|
||||
OCL_TEST_P(CornerHarris, Mat)
|
||||
{
|
||||
@ -219,12 +251,12 @@ OCL_TEST_P(CornerHarris, Mat)
|
||||
random_roi();
|
||||
|
||||
int apertureSize = 3;
|
||||
double k = 2.0;
|
||||
double k = randomDouble(0.01, 0.9);
|
||||
|
||||
cornerHarris(src_roi, dst_roi, blockSize, apertureSize, k, borderType);
|
||||
ocl::cornerHarris(gsrc_roi, gdst_roi, blockSize, apertureSize, k, borderType);
|
||||
|
||||
Near(1.0);
|
||||
Near(1e-5, true);
|
||||
}
|
||||
}
|
||||
|
||||
@ -484,25 +516,27 @@ INSTANTIATE_TEST_CASE_P(Imgproc, EqualizeHist, Combine(
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Imgproc, CornerMinEigenVal, Combine(
|
||||
Values(CV_8UC1, CV_32FC1),
|
||||
Values(3), // TODO some fails when blockSize != 3 (for example 5)
|
||||
Values((int)BORDER_REFLECT, (int)BORDER_CONSTANT, (int)BORDER_REPLICATE), // TODO does not work with (int)BORDER_REFLECT101
|
||||
Values((MatType)CV_8UC1, (MatType)CV_32FC1),
|
||||
Values(3, 5),
|
||||
Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_REFLECT101),
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Imgproc, CornerHarris, Combine(
|
||||
Values((MatType)CV_8UC1), // TODO does not work properly with CV_32FC1
|
||||
Values(3, 5),
|
||||
Values((int)BORDER_REFLECT101, (int)BORDER_REFLECT, (int)BORDER_CONSTANT, (int)BORDER_REPLICATE),
|
||||
Values( (int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_REFLECT_101),
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Imgproc, Integral, Combine(
|
||||
Values((MatType)CV_8UC1), // TODO does work with CV_32F, CV_64F
|
||||
Values((MatType)CV_8UC1), // TODO does not work with CV_32F, CV_64F
|
||||
Values(0), // not used
|
||||
Values(0), // not used
|
||||
Bool()));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(Imgproc, Threshold, Combine(
|
||||
Values(CV_8UC1, CV_32FC1),
|
||||
Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4,
|
||||
CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4,
|
||||
CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4),
|
||||
Values(0),
|
||||
Values(ThreshOp(THRESH_BINARY),
|
||||
ThreshOp(THRESH_BINARY_INV), ThreshOp(THRESH_TRUNC),
|
||||
|
@ -99,7 +99,6 @@ PARAM_TEST_CASE(Kmeans, int, int, int)
|
||||
}
|
||||
};
|
||||
OCL_TEST_P(Kmeans, Mat){
|
||||
|
||||
if(flags & KMEANS_USE_INITIAL_LABELS)
|
||||
{
|
||||
// inital a given labels
|
||||
@ -116,11 +115,9 @@ OCL_TEST_P(Kmeans, Mat){
|
||||
kmeans(src, K, labels,
|
||||
TermCriteria( TermCriteria::EPS + TermCriteria::MAX_ITER, 100, 0),
|
||||
1, flags, centers);
|
||||
|
||||
ocl::kmeans(d_src, K, d_labels,
|
||||
TermCriteria( TermCriteria::EPS + TermCriteria::MAX_ITER, 100, 0),
|
||||
1, flags, d_centers);
|
||||
|
||||
Mat dd_labels(d_labels);
|
||||
Mat dd_centers(d_centers);
|
||||
if(flags & KMEANS_USE_INITIAL_LABELS)
|
||||
@ -153,9 +150,97 @@ OCL_TEST_P(Kmeans, Mat){
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(OCL_ML, Kmeans, Combine(
|
||||
Values(3, 5, 8),
|
||||
Values(CV_32FC1, CV_32FC2, CV_32FC4),
|
||||
Values(OCL_KMEANS_USE_INITIAL_LABELS/*, OCL_KMEANS_PP_CENTERS*/)));
|
||||
|
||||
|
||||
/////////////////////////////// DistanceToCenters //////////////////////////////////////////
|
||||
|
||||
CV_ENUM(DistType, NORM_L1, NORM_L2SQR);
|
||||
|
||||
PARAM_TEST_CASE(distanceToCenters, DistType, bool)
|
||||
{
|
||||
cv::Size size;
|
||||
int distType;
|
||||
bool useRoi;
|
||||
cv::Mat src, centers, src_roi, centers_roi;
|
||||
cv::ocl::oclMat ocl_src, ocl_centers, ocl_src_roi, ocl_centers_roi;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
distType = GET_PARAM(0);
|
||||
useRoi = GET_PARAM(1);
|
||||
}
|
||||
|
||||
void random_roi()
|
||||
{
|
||||
Size roiSize_src = randomSize(10,1000);
|
||||
Size roiSize_centers = randomSize(10, 1000);
|
||||
roiSize_src.width = roiSize_centers.width;
|
||||
|
||||
Border srcBorder = randomBorder(0, useRoi ? 500 : 0);
|
||||
randomSubMat(src, src_roi, roiSize_src, srcBorder, CV_32FC1, -SHRT_MAX, SHRT_MAX);
|
||||
|
||||
Border centersBorder = randomBorder(0, useRoi ? 500 : 0);
|
||||
randomSubMat(centers, centers_roi, roiSize_centers, centersBorder, CV_32FC1, -SHRT_MAX, SHRT_MAX);
|
||||
|
||||
for(int i = 0; i<centers.rows; i++)
|
||||
centers.at<float>(i, randomInt(0,centers.cols-1)) = (float)randomDouble(SHRT_MAX, INT_MAX);
|
||||
|
||||
generateOclMat(ocl_src, ocl_src_roi, src, roiSize_src, srcBorder);
|
||||
generateOclMat(ocl_centers, ocl_centers_roi, centers, roiSize_centers, centersBorder);
|
||||
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
OCL_TEST_P(distanceToCenters, Accuracy)
|
||||
{
|
||||
for(int j = 0; j< LOOP_TIMES; j++)
|
||||
{
|
||||
random_roi();
|
||||
|
||||
cv::ocl::oclMat ocl_dists;
|
||||
cv::ocl::oclMat ocl_labels;
|
||||
|
||||
cv::ocl::distanceToCenters(ocl_dists,ocl_labels,ocl_src_roi, ocl_centers_roi, distType);
|
||||
|
||||
Mat labels, dists;
|
||||
ocl_labels.download(labels);
|
||||
ocl_dists.download(dists);
|
||||
|
||||
ASSERT_EQ(ocl_dists.cols, ocl_labels.rows);
|
||||
|
||||
Mat batch_dists;
|
||||
|
||||
cv::batchDistance(src_roi, centers_roi, batch_dists, CV_32FC1, noArray(), distType);
|
||||
|
||||
std::vector<double> gold_dists_v;
|
||||
|
||||
for(int i = 0; i<batch_dists.rows; i++)
|
||||
{
|
||||
Mat r = batch_dists.row(i);
|
||||
double mVal;
|
||||
Point mLoc;
|
||||
minMaxLoc(r, &mVal, NULL, &mLoc, NULL);
|
||||
|
||||
int ocl_label = *(int*)labels.row(i).col(0).data;
|
||||
ASSERT_EQ(mLoc.x, ocl_label);
|
||||
|
||||
gold_dists_v.push_back(mVal);
|
||||
}
|
||||
Mat gold_dists(gold_dists_v);
|
||||
dists.convertTo(dists, CV_64FC1);
|
||||
double relative_error = cv::norm(gold_dists.t(), dists, NORM_INF|NORM_RELATIVE);
|
||||
ASSERT_LE(relative_error, 1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P (OCL_ML, distanceToCenters, Combine(DistType::all(), Bool()) );
|
||||
|
||||
|
||||
#endif
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user