Merge pull request #664 from taka-no-me/ocl
Move OpenCL SURF to nonfree module
This commit is contained in:
commit
b6365699ee
@ -141,9 +141,9 @@ OCV_OPTION(WITH_V4L "Include Video 4 Linux support" ON
|
||||
OCV_OPTION(WITH_VIDEOINPUT "Build HighGUI with DirectShow support" ON IF WIN32 )
|
||||
OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF IF (NOT ANDROID AND NOT APPLE) )
|
||||
OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) )
|
||||
OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS) )
|
||||
OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS) )
|
||||
OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS) )
|
||||
OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" ON IF (NOT ANDROID AND NOT IOS) )
|
||||
OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" ON IF (NOT ANDROID AND NOT IOS) )
|
||||
OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" ON IF (NOT ANDROID AND NOT IOS) )
|
||||
|
||||
|
||||
# OpenCV build components
|
||||
@ -412,15 +412,6 @@ endif()
|
||||
# --- OpenCL ---
|
||||
if(WITH_OPENCL)
|
||||
include(cmake/OpenCVDetectOpenCL.cmake)
|
||||
if(OPENCL_FOUND)
|
||||
set(HAVE_OPENCL 1)
|
||||
endif()
|
||||
if(WITH_OPENCLAMDFFT AND CLAMDFFT_INCLUDE_DIR)
|
||||
set(HAVE_CLAMDFFT 1)
|
||||
endif()
|
||||
if(WITH_OPENCLAMDBLAS AND CLAMDBLAS_INCLUDE_DIR)
|
||||
set(HAVE_CLAMDBLAS 1)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
@ -791,17 +782,17 @@ if(HAVE_CUDA)
|
||||
status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO)
|
||||
endif()
|
||||
|
||||
if(HAVE_OPENCL AND BUILD_opencv_ocl)
|
||||
if(HAVE_OPENCL)
|
||||
status("")
|
||||
status(" OpenCL")
|
||||
if(OPENCL_INCLUDE_DIR)
|
||||
status(" Include:" ${OPENCL_INCLUDE_DIR})
|
||||
status(" Include path:" ${OPENCL_INCLUDE_DIRS})
|
||||
endif()
|
||||
if(OPENCL_LIBRARIES)
|
||||
status(" libraries:" ${OPENCL_LIBRARIES})
|
||||
endif()
|
||||
status(" Use AMDFFT:" HAVE_CLAMDFFT THEN YES ELSE NO)
|
||||
status(" Use AMDBLAS:" HAVE_CLAMDBLAS THEN YES ELSE NO)
|
||||
status(" Use AMD FFT:" HAVE_CLAMDFFT THEN YES ELSE NO)
|
||||
status(" Use AMD BLAS:" HAVE_CLAMDBLAS THEN YES ELSE NO)
|
||||
endif()
|
||||
|
||||
# ========================== python ==========================
|
||||
|
@ -1,154 +1,104 @@
|
||||
if(APPLE)
|
||||
set(OPENCL_FOUND YES)
|
||||
set(OPENCL_LIBRARIES "-framework OpenCL")
|
||||
else()
|
||||
set(OPENCL_LIBRARY "-framework OpenCL" CACHE STRING "OpenCL library")
|
||||
set(OPENCL_INCLUDE_DIR "" CACHE STRING "OpenCL include directory")
|
||||
mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
|
||||
else(APPLE)
|
||||
find_package(OpenCL QUIET)
|
||||
if(WITH_OPENCLAMDFFT)
|
||||
set(CLAMDFFT_SEARCH_PATH $ENV{CLAMDFFT_PATH})
|
||||
if(NOT CLAMDFFT_SEARCH_PATH)
|
||||
if(WIN32)
|
||||
set( CLAMDFFT_SEARCH_PATH "C:\\Program Files (x86)\\AMD\\clAmdFft" )
|
||||
endif()
|
||||
endif()
|
||||
set( CLAMDFFT_INCLUDE_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}/include )
|
||||
if(UNIX)
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(CLAMDFFT_LIB_SEARCH_PATH /usr/lib)
|
||||
else()
|
||||
set(CLAMDFFT_LIB_SEARCH_PATH /usr/lib64)
|
||||
endif()
|
||||
else()
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(CLAMDFFT_LIB_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}\\lib32\\import)
|
||||
else()
|
||||
set(CLAMDFFT_LIB_SEARCH_PATH ${CLAMDFFT_SEARCH_PATH}\\lib64\\import)
|
||||
endif()
|
||||
endif()
|
||||
find_path(CLAMDFFT_INCLUDE_DIR
|
||||
NAMES clAmdFft.h
|
||||
PATHS ${CLAMDFFT_INCLUDE_SEARCH_PATH}
|
||||
PATH_SUFFIXES clAmdFft
|
||||
NO_DEFAULT_PATH)
|
||||
find_library(CLAMDFFT_LIBRARY
|
||||
NAMES clAmdFft.Runtime
|
||||
PATHS ${CLAMDFFT_LIB_SEARCH_PATH}
|
||||
NO_DEFAULT_PATH)
|
||||
if(CLAMDFFT_LIBRARY)
|
||||
set(CLAMDFFT_LIBRARIES ${CLAMDFFT_LIBRARY})
|
||||
else()
|
||||
set(CLAMDFFT_LIBRARIES "")
|
||||
endif()
|
||||
endif()
|
||||
if(WITH_OPENCLAMDBLAS)
|
||||
set(CLAMDBLAS_SEARCH_PATH $ENV{CLAMDBLAS_PATH})
|
||||
if(NOT CLAMDBLAS_SEARCH_PATH)
|
||||
if(WIN32)
|
||||
set( CLAMDBLAS_SEARCH_PATH "C:\\Program Files (x86)\\AMD\\clAmdBlas" )
|
||||
endif()
|
||||
endif()
|
||||
set( CLAMDBLAS_INCLUDE_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}/include )
|
||||
if(UNIX)
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(CLAMDBLAS_LIB_SEARCH_PATH /usr/lib)
|
||||
else()
|
||||
set(CLAMDBLAS_LIB_SEARCH_PATH /usr/lib64)
|
||||
endif()
|
||||
else()
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(CLAMDBLAS_LIB_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}\\lib32\\import)
|
||||
else()
|
||||
set(CLAMDBLAS_LIB_SEARCH_PATH ${CLAMDBLAS_SEARCH_PATH}\\lib64\\import)
|
||||
endif()
|
||||
endif()
|
||||
find_path(CLAMDBLAS_INCLUDE_DIR
|
||||
NAMES clAmdBlas.h
|
||||
PATHS ${CLAMDBLAS_INCLUDE_SEARCH_PATH}
|
||||
PATH_SUFFIXES clAmdBlas
|
||||
NO_DEFAULT_PATH)
|
||||
find_library(CLAMDBLAS_LIBRARY
|
||||
NAMES clAmdBlas
|
||||
PATHS ${CLAMDBLAS_LIB_SEARCH_PATH}
|
||||
NO_DEFAULT_PATH)
|
||||
if(CLAMDBLAS_LIBRARY)
|
||||
set(CLAMDBLAS_LIBRARIES ${CLAMDBLAS_LIBRARY})
|
||||
else()
|
||||
set(CLAMDBLAS_LIBRARIES "")
|
||||
endif()
|
||||
endif()
|
||||
# Try AMD/ATI Stream SDK
|
||||
|
||||
if (NOT OPENCL_FOUND)
|
||||
set(ENV_AMDSTREAMSDKROOT $ENV{AMDAPPSDKROOT})
|
||||
set(ENV_AMDAPPSDKROOT $ENV{AMDAPPSDKROOT})
|
||||
set(ENV_OPENCLROOT $ENV{OPENCLROOT})
|
||||
set(ENV_CUDA_PATH $ENV{CUDA_PATH})
|
||||
set(ENV_INTELOCLSDKROOT $ENV{INTELOCLSDKROOT})
|
||||
if(ENV_AMDSTREAMSDKROOT)
|
||||
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDAPPSDKROOT}/include)
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDAPPSDKROOT}/lib/x86)
|
||||
else()
|
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDAPPSDKROOT}/lib/x86_64)
|
||||
endif()
|
||||
elseif(ENV_AMDSTREAMSDKROOT)
|
||||
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_AMDSTREAMSDKROOT}/include)
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86)
|
||||
else()
|
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_AMDSTREAMSDKROOT}/lib/x86_64)
|
||||
endif()
|
||||
elseif(ENV_CUDA_PATH AND WIN32)
|
||||
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_CUDA_PATH}/include)
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/Win32)
|
||||
else()
|
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_CUDA_PATH}/lib/x64)
|
||||
endif()
|
||||
elseif(ENV_OPENCLROOT AND UNIX)
|
||||
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_OPENCLROOT}/inc)
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib)
|
||||
else()
|
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} /usr/lib64)
|
||||
endif()
|
||||
elseif(ENV_INTELOCLSDKROOT)
|
||||
set(OPENCL_INCLUDE_SEARCH_PATH ${ENV_INTELOCLSDKROOT}/include)
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_INTELOCLSDKROOT}/lib/x86)
|
||||
else()
|
||||
set(OPENCL_LIB_SEARCH_PATH ${OPENCL_LIB_SEARCH_PATH} ${ENV_INTELOCLSDKROOT}/lib/x64)
|
||||
endif()
|
||||
find_path(OPENCL_ROOT_DIR
|
||||
NAMES OpenCL/cl.h CL/cl.h include/CL/cl.h include/nvidia-current/CL/cl.h
|
||||
PATHS ENV OCLROOT ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT
|
||||
DOC "OpenCL root directory"
|
||||
NO_DEFAULT_PATH)
|
||||
|
||||
find_path(OPENCL_INCLUDE_DIR
|
||||
NAMES OpenCL/cl.h CL/cl.h
|
||||
HINTS ${OPENCL_ROOT_DIR}
|
||||
PATH_SUFFIXES include include/nvidia-current
|
||||
DOC "OpenCL include directory")
|
||||
|
||||
if (X86_64)
|
||||
set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win64 lib/x86_64 lib/x64)
|
||||
elseif (X86)
|
||||
set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win32 lib/x86)
|
||||
endif()
|
||||
|
||||
if(OPENCL_INCLUDE_SEARCH_PATH)
|
||||
find_path(OPENCL_INCLUDE_DIR
|
||||
NAMES CL/cl.h OpenCL/cl.h
|
||||
PATHS ${OPENCL_INCLUDE_SEARCH_PATH}
|
||||
NO_DEFAULT_PATH)
|
||||
else()
|
||||
find_path(OPENCL_INCLUDE_DIR
|
||||
NAMES CL/cl.h OpenCL/cl.h)
|
||||
endif()
|
||||
|
||||
if(OPENCL_LIB_SEARCH_PATH)
|
||||
find_library(OPENCL_LIBRARY NAMES OpenCL PATHS ${OPENCL_LIB_SEARCH_PATH} NO_DEFAULT_PATH)
|
||||
else()
|
||||
find_library(OPENCL_LIBRARY NAMES OpenCL)
|
||||
endif()
|
||||
find_library(OPENCL_LIBRARY
|
||||
NAMES OpenCL
|
||||
HINTS ${OPENCL_ROOT_DIR}
|
||||
PATH_SUFFIXES ${OPENCL_POSSIBLE_LIB_SUFFIXES}
|
||||
DOC "OpenCL library")
|
||||
|
||||
mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(
|
||||
OPENCL
|
||||
DEFAULT_MSG
|
||||
OPENCL_LIBRARY OPENCL_INCLUDE_DIR
|
||||
)
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL DEFAULT_MSG OPENCL_LIBRARY OPENCL_INCLUDE_DIR )
|
||||
endif()
|
||||
endif(APPLE)
|
||||
|
||||
if(OPENCL_FOUND)
|
||||
set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
|
||||
set(HAVE_OPENCL 1)
|
||||
else()
|
||||
set(OPENCL_LIBRARIES)
|
||||
if(OPENCL_FOUND)
|
||||
set(HAVE_OPENCL 1)
|
||||
set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
|
||||
set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
|
||||
|
||||
if (X86_64)
|
||||
set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import)
|
||||
elseif (X86)
|
||||
set(CLAMD_POSSIBLE_LIB_SUFFIXES lib32/import)
|
||||
endif()
|
||||
|
||||
if(WITH_OPENCLAMDFFT)
|
||||
find_path(CLAMDFFT_ROOT_DIR
|
||||
NAMES include/clAmdFft.h
|
||||
PATHS ENV CLAMDFFT_PATH ENV ProgramFiles
|
||||
PATH_SUFFIXES clAmdFft AMD/clAmdFft
|
||||
DOC "AMD FFT root directory"
|
||||
NO_DEFAULT_PATH)
|
||||
|
||||
find_path(CLAMDFFT_INCLUDE_DIR
|
||||
NAMES clAmdFft.h
|
||||
HINTS ${CLAMDFFT_ROOT_DIR}
|
||||
PATH_SUFFIXES include
|
||||
DOC "clAmdFft include directory")
|
||||
|
||||
find_library(CLAMDFFT_LIBRARY
|
||||
NAMES clAmdFft.Runtime
|
||||
HINTS ${CLAMDFFT_ROOT_DIR}
|
||||
PATH_SUFFIXES ${CLAMD_POSSIBLE_LIB_SUFFIXES}
|
||||
DOC "clAmdFft library")
|
||||
|
||||
if(CLAMDFFT_LIBRARY AND CLAMDFFT_INCLUDE_DIR)
|
||||
set(HAVE_CLAMDFFT 1)
|
||||
list(APPEND OPENCL_INCLUDE_DIRS "${CLAMDFFT_INCLUDE_DIR}")
|
||||
list(APPEND OPENCL_LIBRARIES "${CLAMDFFT_LIBRARY}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_OPENCLAMDBLAS)
|
||||
find_path(CLAMDBLAS_ROOT_DIR
|
||||
NAMES include/clAmdBlas.h
|
||||
PATHS ENV CLAMDFFT_PATH ENV ProgramFiles
|
||||
PATH_SUFFIXES clAmdBlas AMD/clAmdBlas
|
||||
DOC "AMD FFT root directory"
|
||||
NO_DEFAULT_PATH)
|
||||
|
||||
find_path(CLAMDBLAS_INCLUDE_DIR
|
||||
NAMES clAmdBlas.h
|
||||
HINTS ${CLAMDBLAS_ROOT_DIR}
|
||||
PATH_SUFFIXES include
|
||||
DOC "clAmdFft include directory")
|
||||
|
||||
find_library(CLAMDBLAS_LIBRARY
|
||||
NAMES clAmdBlas
|
||||
HINTS ${CLAMDBLAS_ROOT_DIR}
|
||||
PATH_SUFFIXES ${CLAMD_POSSIBLE_LIB_SUFFIXES}
|
||||
DOC "clAmdBlas library")
|
||||
|
||||
if(CLAMDBLAS_LIBRARY AND CLAMDBLAS_INCLUDE_DIR)
|
||||
set(HAVE_CLAMDBLAS 1)
|
||||
list(APPEND OPENCL_INCLUDE_DIRS "${CLAMDBLAS_INCLUDE_DIR}")
|
||||
list(APPEND OPENCL_LIBRARIES "${CLAMDBLAS_LIBRARY}")
|
||||
endif()
|
||||
else()
|
||||
set(HAVE_OPENCL 1)
|
||||
endif()
|
||||
endif()
|
||||
|
@ -432,10 +432,22 @@ macro(ocv_glob_module_sources)
|
||||
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
|
||||
file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
|
||||
|
||||
file(GLOB cl_kernels "src/opencl/*.cl")
|
||||
|
||||
source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
|
||||
source_group("Include" FILES ${lib_hdrs})
|
||||
source_group("Include\\detail" FILES ${lib_hdrs_detail})
|
||||
|
||||
if(HAVE_OPENCL AND cl_kernels)
|
||||
ocv_include_directories(${OPENCL_INCLUDE_DIRS})
|
||||
add_custom_command(
|
||||
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp"
|
||||
COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
|
||||
DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
|
||||
source_group("Src\\OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
|
||||
list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
|
||||
endif()
|
||||
|
||||
ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} SOURCES ${lib_srcs} ${lib_int_hdrs})
|
||||
endmacro()
|
||||
|
||||
@ -449,6 +461,9 @@ macro(ocv_create_module)
|
||||
|
||||
if(NOT "${ARGN}" STREQUAL "SKIP_LINK")
|
||||
target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN})
|
||||
if(HAVE_OPENCL AND OPENCL_LIBRARIES)
|
||||
target_link_libraries(${the_module} ${OPENCL_LIBRARIES})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
add_dependencies(opencv_modules ${the_module})
|
||||
|
@ -3,7 +3,7 @@ if(BUILD_ANDROID_PACKAGE)
|
||||
endif()
|
||||
|
||||
set(the_description "Functionality with possible limitations on the use")
|
||||
ocv_add_module(nonfree opencv_imgproc opencv_features2d opencv_calib3d OPTIONAL opencv_gpu)
|
||||
ocv_add_module(nonfree opencv_imgproc opencv_features2d opencv_calib3d OPTIONAL opencv_gpu opencv_ocl)
|
||||
ocv_module_include_directories()
|
||||
|
||||
if(HAVE_CUDA AND HAVE_opencv_gpu)
|
||||
|
@ -129,7 +129,6 @@ The function is parallelized with the TBB library.
|
||||
If you are using the C version, make sure you call ``cv::initModule_nonfree()`` from ``nonfree/nonfree.hpp``.
|
||||
|
||||
|
||||
|
||||
gpu::SURF_GPU
|
||||
-------------
|
||||
.. ocv:class:: gpu::SURF_GPU
|
||||
@ -230,3 +229,102 @@ The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descripto
|
||||
The class ``SURF_GPU`` uses some buffers and provides access to it. All buffers can be safely released between function calls.
|
||||
|
||||
.. seealso:: :ocv:class:`SURF`
|
||||
|
||||
|
||||
ocl::SURF_OCL
|
||||
-------------
|
||||
.. ocv:class:: ocl::SURF_OCL
|
||||
|
||||
Class used for extracting Speeded Up Robust Features (SURF) from an image. ::
|
||||
|
||||
class SURF_OCL
|
||||
{
|
||||
public:
|
||||
enum KeypointLayout
|
||||
{
|
||||
X_ROW = 0,
|
||||
Y_ROW,
|
||||
LAPLACIAN_ROW,
|
||||
OCTAVE_ROW,
|
||||
SIZE_ROW,
|
||||
ANGLE_ROW,
|
||||
HESSIAN_ROW,
|
||||
ROWS_COUNT
|
||||
};
|
||||
|
||||
//! the default constructor
|
||||
SURF_OCL();
|
||||
//! the full constructor taking all the necessary parameters
|
||||
explicit SURF_OCL(double _hessianThreshold, int _nOctaves=4,
|
||||
int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
|
||||
|
||||
//! returns the descriptor size in float's (64 or 128)
|
||||
int descriptorSize() const;
|
||||
|
||||
//! upload host keypoints to device memory
|
||||
void uploadKeypoints(const vector<KeyPoint>& keypoints,
|
||||
oclMat& keypointsocl);
|
||||
//! download keypoints from device to host memory
|
||||
void downloadKeypoints(const oclMat& keypointsocl,
|
||||
vector<KeyPoint>& keypoints);
|
||||
|
||||
//! download descriptors from device to host memory
|
||||
void downloadDescriptors(const oclMat& descriptorsocl,
|
||||
vector<float>& descriptors);
|
||||
|
||||
void operator()(const oclMat& img, const oclMat& mask,
|
||||
oclMat& keypoints);
|
||||
|
||||
void operator()(const oclMat& img, const oclMat& mask,
|
||||
oclMat& keypoints, oclMat& descriptors,
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
void operator()(const oclMat& img, const oclMat& mask,
|
||||
std::vector<KeyPoint>& keypoints);
|
||||
|
||||
void operator()(const oclMat& img, const oclMat& mask,
|
||||
std::vector<KeyPoint>& keypoints, oclMat& descriptors,
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
void operator()(const oclMat& img, const oclMat& mask,
|
||||
std::vector<KeyPoint>& keypoints,
|
||||
std::vector<float>& descriptors,
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
void releaseMemory();
|
||||
|
||||
// SURF parameters
|
||||
double hessianThreshold;
|
||||
int nOctaves;
|
||||
int nOctaveLayers;
|
||||
bool extended;
|
||||
bool upright;
|
||||
|
||||
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
|
||||
float keypointsRatio;
|
||||
|
||||
oclMat sum, mask1, maskSum, intBuffer;
|
||||
|
||||
oclMat det, trace;
|
||||
|
||||
oclMat maxPosBuffer;
|
||||
};
|
||||
|
||||
|
||||
The class ``SURF_OCL`` implements Speeded Up Robust Features descriptor. There is a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option). But the descriptors can also be computed for the user-specified keypoints. Only 8-bit grayscale images are supported.
|
||||
|
||||
The class ``SURF_OCL`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``oclMat``. The ``keypoints`` matrix is :math:`\texttt{nFeatures} \times 7` matrix with the ``CV_32FC1`` type.
|
||||
|
||||
* ``keypoints.ptr<float>(X_ROW)[i]`` contains x coordinate of the i-th feature.
|
||||
* ``keypoints.ptr<float>(Y_ROW)[i]`` contains y coordinate of the i-th feature.
|
||||
* ``keypoints.ptr<float>(LAPLACIAN_ROW)[i]`` contains the laplacian sign of the i-th feature.
|
||||
* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i-th feature.
|
||||
* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i-th feature.
|
||||
* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contain orientation of the i-th feature.
|
||||
* ``keypoints.ptr<float>(HESSIAN_ROW)[i]`` contains the response of the i-th feature.
|
||||
|
||||
The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type.
|
||||
|
||||
The class ``SURF_OCL`` uses some buffers and provides access to it. All buffers can be safely released between function calls.
|
||||
|
||||
.. seealso:: :ocv:class:`SURF`
|
124
modules/nonfree/include/opencv2/nonfree/ocl.hpp
Normal file
124
modules/nonfree/include/opencv2/nonfree/ocl.hpp
Normal file
@ -0,0 +1,124 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_NONFREE_OCL_HPP__
|
||||
#define __OPENCV_NONFREE_OCL_HPP__
|
||||
|
||||
#include "opencv2/ocl/ocl.hpp"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
{
|
||||
//! Speeded up robust features, port from GPU module.
|
||||
////////////////////////////////// SURF //////////////////////////////////////////
|
||||
|
||||
class CV_EXPORTS SURF_OCL
|
||||
{
|
||||
public:
|
||||
enum KeypointLayout
|
||||
{
|
||||
X_ROW = 0,
|
||||
Y_ROW,
|
||||
LAPLACIAN_ROW,
|
||||
OCTAVE_ROW,
|
||||
SIZE_ROW,
|
||||
ANGLE_ROW,
|
||||
HESSIAN_ROW,
|
||||
ROWS_COUNT
|
||||
};
|
||||
|
||||
//! the default constructor
|
||||
SURF_OCL();
|
||||
//! the full constructor taking all the necessary parameters
|
||||
explicit SURF_OCL(double _hessianThreshold, int _nOctaves = 4,
|
||||
int _nOctaveLayers = 2, bool _extended = false, float _keypointsRatio = 0.01f, bool _upright = false);
|
||||
|
||||
//! returns the descriptor size in float's (64 or 128)
|
||||
int descriptorSize() const;
|
||||
//! upload host keypoints to device memory
|
||||
void uploadKeypoints(const vector<cv::KeyPoint> &keypoints, oclMat &keypointsocl);
|
||||
//! download keypoints from device to host memory
|
||||
void downloadKeypoints(const oclMat &keypointsocl, vector<KeyPoint> &keypoints);
|
||||
//! download descriptors from device to host memory
|
||||
void downloadDescriptors(const oclMat &descriptorsocl, vector<float> &descriptors);
|
||||
//! finds the keypoints using fast hessian detector used in SURF
|
||||
//! supports CV_8UC1 images
|
||||
//! keypoints will have nFeature cols and 6 rows
|
||||
//! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
|
||||
//! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
|
||||
//! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
|
||||
//! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
|
||||
//! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
|
||||
//! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
|
||||
//! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
|
||||
void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints);
|
||||
//! finds the keypoints and computes their descriptors.
|
||||
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
|
||||
void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
|
||||
bool useProvidedKeypoints = false);
|
||||
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints);
|
||||
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, oclMat &descriptors,
|
||||
bool useProvidedKeypoints = false);
|
||||
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, std::vector<float> &descriptors,
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
void releaseMemory();
|
||||
|
||||
// SURF parameters
|
||||
float hessianThreshold;
|
||||
int nOctaves;
|
||||
int nOctaveLayers;
|
||||
bool extended;
|
||||
bool upright;
|
||||
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
|
||||
float keypointsRatio;
|
||||
oclMat sum, mask1, maskSum, intBuffer;
|
||||
oclMat det, trace;
|
||||
oclMat maxPosBuffer;
|
||||
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#endif //__OPENCV_NONFREE_OCL_HPP__
|
@ -1,3 +1,4 @@
|
||||
#include "perf_precomp.hpp"
|
||||
#include "opencv2/ts/gpu_perf.hpp"
|
||||
|
||||
CV_PERF_TEST_MAIN(nonfree, perf::printCudaInfo())
|
||||
|
@ -9,14 +9,15 @@
|
||||
#ifndef __OPENCV_PERF_PRECOMP_HPP__
|
||||
#define __OPENCV_PERF_PRECOMP_HPP__
|
||||
|
||||
#include "cvconfig.h"
|
||||
#include "opencv2/opencv_modules.hpp"
|
||||
|
||||
#include "opencv2/ts/ts.hpp"
|
||||
#include "opencv2/ts/gpu_perf.hpp"
|
||||
#include "opencv2/nonfree/nonfree.hpp"
|
||||
#include "opencv2/highgui/highgui.hpp"
|
||||
|
||||
#include "opencv2/opencv_modules.hpp"
|
||||
#ifdef HAVE_OPENCV_OCL
|
||||
# include "opencv2/nonfree/ocl.hpp"
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_OPENCV_GPU) && defined(HAVE_CUDA)
|
||||
#include "opencv2/nonfree/gpu.hpp"
|
||||
#endif
|
||||
|
@ -43,61 +43,69 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include <iomanip>
|
||||
#include "perf_precomp.hpp"
|
||||
|
||||
#ifdef HAVE_OPENCL
|
||||
#ifdef HAVE_OPENCV_OCL
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
using namespace cvtest;
|
||||
using namespace testing;
|
||||
using namespace std;
|
||||
|
||||
#define FILTER_IMAGE "../../../samples/gpu/road.png"
|
||||
typedef perf::TestBaseWithParam<std::string> OCL_SURF;
|
||||
|
||||
TEST(SURF, Performance)
|
||||
#define SURF_IMAGES \
|
||||
"cv/detectors_descriptors_evaluation/images_datasets/leuven/img1.png",\
|
||||
"stitching/a3.png"
|
||||
|
||||
PERF_TEST_P(OCL_SURF, DISABLED_with_data_transfer, testing::Values(SURF_IMAGES))
|
||||
{
|
||||
cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE);
|
||||
string filename = getDataPath(GetParam());
|
||||
Mat img = imread(filename, IMREAD_GRAYSCALE);
|
||||
ASSERT_FALSE(img.empty());
|
||||
|
||||
ocl::SURF_OCL d_surf;
|
||||
ocl::oclMat d_keypoints;
|
||||
ocl::oclMat d_descriptors;
|
||||
SURF_OCL d_surf;
|
||||
oclMat d_keypoints;
|
||||
oclMat d_descriptors;
|
||||
Mat cpu_kp;
|
||||
Mat cpu_dp;
|
||||
|
||||
double totalgputick = 0;
|
||||
double totalgputick_kernel = 0;
|
||||
declare.time(60);
|
||||
|
||||
double t1 = 0;
|
||||
double t2 = 0;
|
||||
for(int j = 0; j < LOOP_TIMES + 1; j ++)
|
||||
TEST_CYCLE()
|
||||
{
|
||||
t1 = (double)cvGetTickCount();//gpu start1
|
||||
oclMat d_src(img);
|
||||
|
||||
ocl::oclMat d_src(img);//upload
|
||||
|
||||
t2 = (double)cvGetTickCount(); //kernel
|
||||
d_surf(d_src, ocl::oclMat(), d_keypoints, d_descriptors);
|
||||
t2 = (double)cvGetTickCount() - t2;//kernel
|
||||
|
||||
cv::Mat cpu_kp, cpu_dp;
|
||||
d_keypoints.download (cpu_kp);//download
|
||||
d_descriptors.download (cpu_dp);//download
|
||||
|
||||
t1 = (double)cvGetTickCount() - t1;//gpu end1
|
||||
|
||||
if(j == 0)
|
||||
continue;
|
||||
|
||||
totalgputick = t1 + totalgputick;
|
||||
|
||||
totalgputick_kernel = t2 + totalgputick_kernel;
|
||||
d_surf(d_src, oclMat(), d_keypoints, d_descriptors);
|
||||
|
||||
d_keypoints.download(cpu_kp);
|
||||
d_descriptors.download(cpu_dp);
|
||||
}
|
||||
|
||||
cout << "average gpu runtime is " << totalgputick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
|
||||
cout << "average gpu runtime without data transfer is " << totalgputick_kernel / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
|
||||
|
||||
|
||||
SANITY_CHECK(cpu_kp, 1);
|
||||
SANITY_CHECK(cpu_dp, 1);
|
||||
}
|
||||
#endif //Have opencl
|
||||
|
||||
PERF_TEST_P(OCL_SURF, DISABLED_without_data_transfer, testing::Values(SURF_IMAGES))
|
||||
{
|
||||
string filename = getDataPath(GetParam());
|
||||
Mat img = imread(filename, IMREAD_GRAYSCALE);
|
||||
ASSERT_FALSE(img.empty());
|
||||
|
||||
SURF_OCL d_surf;
|
||||
oclMat d_keypoints;
|
||||
oclMat d_descriptors;
|
||||
oclMat d_src(img);
|
||||
|
||||
declare.time(60);
|
||||
|
||||
TEST_CYCLE() d_surf(d_src, oclMat(), d_keypoints, d_descriptors);
|
||||
|
||||
Mat cpu_kp;
|
||||
Mat cpu_dp;
|
||||
d_keypoints.download(cpu_kp);
|
||||
d_descriptors.download(cpu_dp);
|
||||
SANITY_CHECK(cpu_kp, 1);
|
||||
SANITY_CHECK(cpu_dp, 1);
|
||||
}
|
||||
|
||||
#endif // HAVE_OPENCV_OCL
|
@ -104,11 +104,11 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
|
||||
// N = 2
|
||||
// for simple haar paatern
|
||||
float icvCalcHaarPatternSum_2(
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
int rows, int cols, int elemPerRow)
|
||||
{
|
||||
|
||||
@ -137,11 +137,11 @@ float icvCalcHaarPatternSum_2(
|
||||
|
||||
// N = 3
|
||||
float icvCalcHaarPatternSum_3(
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
int rows, int cols, int elemPerRow)
|
||||
{
|
||||
|
||||
@ -170,11 +170,11 @@ float icvCalcHaarPatternSum_3(
|
||||
|
||||
// N = 4
|
||||
float icvCalcHaarPatternSum_4(
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
IMAGE_INT32 sumTex,
|
||||
__constant float src[2][5],
|
||||
int oldSize,
|
||||
int newSize,
|
||||
int y, int x,
|
||||
int rows, int cols, int elemPerRow)
|
||||
{
|
||||
|
||||
@ -265,7 +265,7 @@ __kernel void icvCalcLayerDetAndTrace(
|
||||
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
|
||||
|
||||
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
|
||||
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
|
||||
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
|
||||
}
|
||||
}
|
||||
|
||||
@ -301,9 +301,9 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro
|
||||
// Non-maximal suppression to further filtering the candidates from previous step
|
||||
__kernel
|
||||
void icvFindMaximaInLayer_withmask(
|
||||
__global const float * det,
|
||||
__global const float * trace,
|
||||
__global int4 * maxPosBuffer,
|
||||
__global const float * det,
|
||||
__global const float * trace,
|
||||
__global int4 * maxPosBuffer,
|
||||
volatile __global int* maxCounter,
|
||||
int counter_offset,
|
||||
int det_step, // the step of det in bytes
|
||||
@ -345,26 +345,26 @@ __kernel
|
||||
// Is this thread within the hessian buffer?
|
||||
const int zoff = get_local_size(0) * get_local_size(1);
|
||||
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
|
||||
N9[localLin - zoff] =
|
||||
det[det_step *
|
||||
N9[localLin - zoff] =
|
||||
det[det_step *
|
||||
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
|
||||
+ min(max(j, 0), c_img_cols - 1)]; // x
|
||||
N9[localLin ] =
|
||||
det[det_step *
|
||||
N9[localLin ] =
|
||||
det[det_step *
|
||||
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
|
||||
+ min(max(j, 0), c_img_cols - 1)]; // x
|
||||
N9[localLin + zoff] =
|
||||
det[det_step *
|
||||
N9[localLin + zoff] =
|
||||
det[det_step *
|
||||
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
|
||||
+ min(max(j, 0), c_img_cols - 1)]; // x
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (i < c_layer_rows - margin
|
||||
if (i < c_layer_rows - margin
|
||||
&& j < c_layer_cols - margin
|
||||
&& get_local_id(0) > 0
|
||||
&& get_local_id(0) > 0
|
||||
&& get_local_id(0) < get_local_size(0) - 1
|
||||
&& get_local_id(1) > 0
|
||||
&& get_local_id(1) > 0
|
||||
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
|
||||
)
|
||||
{
|
||||
@ -429,9 +429,9 @@ __kernel
|
||||
|
||||
__kernel
|
||||
void icvFindMaximaInLayer(
|
||||
__global float * det,
|
||||
__global float * trace,
|
||||
__global int4 * maxPosBuffer,
|
||||
__global float * det,
|
||||
__global float * trace,
|
||||
__global int4 * maxPosBuffer,
|
||||
volatile __global int* maxCounter,
|
||||
int counter_offset,
|
||||
int det_step, // the step of det in bytes
|
||||
@ -474,19 +474,19 @@ __kernel
|
||||
int l_x = min(max(j, 0), c_img_cols - 1);
|
||||
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
|
||||
|
||||
N9[localLin - zoff] =
|
||||
N9[localLin - zoff] =
|
||||
det[det_step * (l_y - c_layer_rows) + l_x];
|
||||
N9[localLin ] =
|
||||
N9[localLin ] =
|
||||
det[det_step * (l_y ) + l_x];
|
||||
N9[localLin + zoff] =
|
||||
N9[localLin + zoff] =
|
||||
det[det_step * (l_y + c_layer_rows) + l_x];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (i < c_layer_rows - margin
|
||||
if (i < c_layer_rows - margin
|
||||
&& j < c_layer_cols - margin
|
||||
&& get_local_id(0) > 0
|
||||
&& get_local_id(0) > 0
|
||||
&& get_local_id(0) < get_local_size(0) - 1
|
||||
&& get_local_id(1) > 0
|
||||
&& get_local_id(1) > 0
|
||||
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
|
||||
)
|
||||
{
|
||||
@ -554,17 +554,17 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
|
||||
{
|
||||
F invdet = 1.0 / det;
|
||||
|
||||
x[0] = invdet *
|
||||
x[0] = invdet *
|
||||
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
|
||||
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
|
||||
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] ));
|
||||
|
||||
x[1] = invdet *
|
||||
x[1] = invdet *
|
||||
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
|
||||
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
|
||||
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0]));
|
||||
|
||||
x[2] = invdet *
|
||||
x[2] = invdet *
|
||||
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
|
||||
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
|
||||
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
|
||||
@ -585,9 +585,9 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// INTERPOLATION
|
||||
__kernel
|
||||
__kernel
|
||||
void icvInterpolateKeypoint(
|
||||
__global const float * det,
|
||||
__global const float * det,
|
||||
__global const int4 * maxPosBuffer,
|
||||
__global float * keypoints,
|
||||
volatile __global int * featureCounter,
|
||||
@ -617,7 +617,7 @@ __kernel
|
||||
|
||||
volatile __local float N9[3][3][3];
|
||||
|
||||
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
|
||||
N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
|
||||
det[det_step * (c_layer_rows * layer + i) + j];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
@ -715,27 +715,27 @@ __kernel
|
||||
|
||||
__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
|
||||
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
|
||||
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
|
||||
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
|
||||
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
|
||||
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
|
||||
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
|
||||
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
|
||||
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
|
||||
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
|
||||
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
|
||||
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
|
||||
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
|
||||
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
|
||||
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
|
||||
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
|
||||
__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
|
||||
0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
|
||||
0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
|
||||
0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
|
||||
0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
|
||||
0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
|
||||
0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
|
||||
0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
|
||||
0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
|
||||
0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
|
||||
0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
|
||||
0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
|
||||
0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
|
||||
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
|
||||
0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
|
||||
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
|
||||
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
|
||||
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
|
||||
0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
|
||||
0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
|
||||
0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
|
||||
0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
|
||||
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
|
||||
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
|
||||
0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
|
||||
0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
|
||||
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
|
||||
0.001707611023448408f, 0.001455130288377404f};
|
||||
|
||||
@ -748,14 +748,20 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc
|
||||
data[tid] = *partial_reduction;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (tid < 16)
|
||||
{
|
||||
if (tid < 16)
|
||||
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 8)
|
||||
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 4)
|
||||
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 2)
|
||||
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 1)
|
||||
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
#undef op
|
||||
}
|
||||
|
||||
@ -958,8 +964,8 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
|
||||
|
||||
// utility for linear filter
|
||||
inline uchar readerGet(
|
||||
IMAGE_INT8 src,
|
||||
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
|
||||
IMAGE_INT8 src,
|
||||
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
|
||||
int i, int j, int rows, int cols, int elemPerRow
|
||||
)
|
||||
{
|
||||
@ -969,8 +975,8 @@ inline uchar readerGet(
|
||||
}
|
||||
|
||||
inline float linearFilter(
|
||||
IMAGE_INT8 src,
|
||||
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
|
||||
IMAGE_INT8 src,
|
||||
const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
|
||||
float y, float x, int rows, int cols, int elemPerRow
|
||||
)
|
||||
{
|
||||
@ -1004,9 +1010,9 @@ void calc_dx_dy(
|
||||
volatile __local float s_dx_bin[25],
|
||||
volatile __local float s_dy_bin[25],
|
||||
volatile __local float s_PATCH[6][6],
|
||||
__global const float* featureX,
|
||||
__global const float* featureY,
|
||||
__global const float* featureSize,
|
||||
__global const float* featureX,
|
||||
__global const float* featureY,
|
||||
__global const float* featureSize,
|
||||
__global const float* featureDir,
|
||||
int rows,
|
||||
int cols,
|
||||
@ -1058,26 +1064,26 @@ void calc_dx_dy(
|
||||
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
|
||||
|
||||
const float vx = (
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
|
||||
* dw;
|
||||
const float vy = (
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) ] +
|
||||
s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
|
||||
s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
|
||||
* dw;
|
||||
s_dx_bin[tid] = vx;
|
||||
s_dy_bin[tid] = vy;
|
||||
}
|
||||
}
|
||||
void reduce_sum25(
|
||||
volatile __local float* sdata1,
|
||||
volatile __local float* sdata2,
|
||||
volatile __local float* sdata3,
|
||||
volatile __local float* sdata4,
|
||||
volatile __local float* sdata1,
|
||||
volatile __local float* sdata2,
|
||||
volatile __local float* sdata3,
|
||||
volatile __local float* sdata4,
|
||||
int tid
|
||||
)
|
||||
{
|
||||
@ -1115,13 +1121,13 @@ void reduce_sum25(
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
__kernel
|
||||
void compute_descriptors64(
|
||||
IMAGE_INT8 imgTex,
|
||||
volatile __global float * descriptors,
|
||||
volatile __global float * descriptors,
|
||||
__global const float * keypoints,
|
||||
int descriptors_step,
|
||||
int keypoints_step,
|
||||
int keypoints_step,
|
||||
int rows,
|
||||
int cols,
|
||||
int img_step
|
||||
@ -1155,7 +1161,7 @@ __kernel
|
||||
if (tid < 25)
|
||||
{
|
||||
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (tid < 25)
|
||||
{
|
||||
@ -1171,10 +1177,10 @@ __kernel
|
||||
}
|
||||
}
|
||||
}
|
||||
__kernel
|
||||
__kernel
|
||||
void compute_descriptors128(
|
||||
IMAGE_INT8 imgTex,
|
||||
__global volatile float * descriptors,
|
||||
__global volatile float * descriptors,
|
||||
__global float * keypoints,
|
||||
int descriptors_step,
|
||||
int keypoints_step,
|
||||
@ -1269,7 +1275,7 @@ __kernel
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
__kernel
|
||||
void normalize_descriptors128(__global float * descriptors, int descriptors_step)
|
||||
{
|
||||
descriptors_step /= sizeof(*descriptors);
|
||||
@ -1310,7 +1316,7 @@ __kernel
|
||||
// normalize and store in output
|
||||
descriptor_base[get_local_id(0)] = lookup / len;
|
||||
}
|
||||
__kernel
|
||||
__kernel
|
||||
void normalize_descriptors64(__global float * descriptors, int descriptors_step)
|
||||
{
|
||||
descriptors_step /= sizeof(*descriptors);
|
@ -66,4 +66,9 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_OPENCV_OCL
|
||||
# include "opencv2/nonfree/ocl.hpp"
|
||||
# include "opencv2/ocl/private/util.hpp"
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -42,10 +42,9 @@
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
#include <iomanip>
|
||||
#include "precomp.hpp"
|
||||
#include "mcwutil.hpp"
|
||||
//#include "opencv2/highgui/highgui.hpp"
|
||||
|
||||
#ifdef HAVE_OPENCV_OCL
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
@ -56,7 +55,7 @@ namespace cv
|
||||
namespace ocl
|
||||
{
|
||||
///////////////////////////OpenCL kernel strings///////////////////////////
|
||||
extern const char *nonfree_surf;
|
||||
extern const char *surf;
|
||||
|
||||
const char* noImage2dOption = "-D DISABLE_IMAGE2D";
|
||||
|
||||
@ -76,10 +75,11 @@ namespace cv
|
||||
}
|
||||
|
||||
|
||||
static inline int divUp(int total, int grain)
|
||||
static inline int divUp(size_t total, size_t grain)
|
||||
{
|
||||
return (total + grain - 1) / grain;
|
||||
}
|
||||
|
||||
static inline int calcSize(int octave, int layer)
|
||||
{
|
||||
/* Wavelet size at first layer of first octave. */
|
||||
@ -268,7 +268,7 @@ private:
|
||||
int maxFeatures;
|
||||
|
||||
oclMat counters;
|
||||
|
||||
|
||||
// texture buffers
|
||||
cl_mem imgTex;
|
||||
cl_mem sumTex;
|
||||
@ -506,20 +506,20 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
|
||||
size_t localThreads[3] = {16, 16, 1};
|
||||
size_t globalThreads[3] =
|
||||
{
|
||||
divUp(max_samples_j, localThreads[0]) *localThreads[0],
|
||||
divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
|
||||
divUp(max_samples_j, localThreads[0]) * localThreads[0],
|
||||
divUp(max_samples_i, localThreads[1]) * localThreads[1] *(nOctaveLayers + 2),
|
||||
1
|
||||
};
|
||||
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
|
||||
int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols)
|
||||
int octave, bool useMask, int nLayers, int layer_rows, int layer_cols)
|
||||
{
|
||||
const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
|
||||
|
||||
Context *clCxt = det.clCxt;
|
||||
string kernelName = use_mask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
|
||||
string kernelName = useMask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
|
||||
vector< pair<size_t, const void *> > args;
|
||||
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
|
||||
@ -538,7 +538,7 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&maxCandidates));
|
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));
|
||||
|
||||
if(use_mask)
|
||||
if(useMask)
|
||||
{
|
||||
if(maskSumTex)
|
||||
{
|
||||
@ -556,11 +556,11 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
|
||||
1
|
||||
};
|
||||
|
||||
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
|
||||
oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures)
|
||||
oclMat &keypoints, oclMat &counters_, int octave, int layer_rows, int max_features)
|
||||
{
|
||||
Context *clCxt = det.clCxt;
|
||||
string kernelName = "icvInterpolateKeypoint";
|
||||
@ -569,19 +569,19 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&counters.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&counters_.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&maxFeatures));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&max_features));
|
||||
|
||||
size_t localThreads[3] = {3, 3, 3};
|
||||
size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
|
||||
|
||||
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
|
||||
@ -608,7 +608,7 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
|
||||
size_t localThreads[3] = {32, 4, 1};
|
||||
size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
|
||||
|
||||
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
|
||||
@ -625,7 +625,7 @@ void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
|
||||
size_t localThreads[3] = {256, 1, 1};
|
||||
size_t globalThreads[3] = {saturate_cast<size_t>(nFeatures), 1, 1};
|
||||
|
||||
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
|
||||
|
||||
@ -633,7 +633,7 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
|
||||
{
|
||||
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
|
||||
Context *clCxt = descriptors.clCxt;
|
||||
string kernelName = "";
|
||||
string kernelName;
|
||||
vector< pair<size_t, const void *> > args;
|
||||
size_t localThreads[3] = {1, 1, 1};
|
||||
size_t globalThreads[3] = {1, 1, 1};
|
||||
@ -665,7 +665,7 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
|
||||
|
||||
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
kernelName = "normalize_descriptors64";
|
||||
|
||||
@ -679,7 +679,7 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
|
||||
|
||||
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -707,8 +707,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.rows));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.cols));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&_img.step));
|
||||
|
||||
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
kernelName = "normalize_descriptors128";
|
||||
|
||||
@ -721,7 +721,9 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
|
||||
args.clear();
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
|
||||
|
||||
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
}
|
||||
|
||||
#endif //HAVE_OPENCV_OCL
|
@ -7,7 +7,7 @@ using namespace cv::gpu;
|
||||
using namespace cvtest;
|
||||
using namespace testing;
|
||||
|
||||
int main(int argc, char** argv)
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
try
|
||||
{
|
||||
@ -23,42 +23,42 @@ int main(int argc, char** argv)
|
||||
{
|
||||
cmd.printParams();
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
printCudaInfo();
|
||||
|
||||
if (cmd.get<bool>("info"))
|
||||
{
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
int device = cmd.get<int>("device");
|
||||
if (device < 0)
|
||||
{
|
||||
{
|
||||
DeviceManager::instance().loadAll();
|
||||
|
||||
std::cout << "Run tests on all supported devices \n" << std::endl;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
{
|
||||
DeviceManager::instance().load(device);
|
||||
|
||||
DeviceInfo info(device);
|
||||
std::cout << "Run tests on device " << device << " [" << info.name() << "] \n" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
TS::ptr()->init("cv");
|
||||
InitGoogleTest(&argc, argv);
|
||||
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
catch (const std::exception& e)
|
||||
{
|
||||
std::cerr << e.what() << std::endl;
|
||||
return -1;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
{
|
||||
std::cerr << "Unknown error" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
@ -9,16 +9,16 @@
|
||||
#ifndef __OPENCV_TEST_PRECOMP_HPP__
|
||||
#define __OPENCV_TEST_PRECOMP_HPP__
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cvconfig.h"
|
||||
#include "opencv2/opencv_modules.hpp"
|
||||
|
||||
#include "opencv2/ts/ts.hpp"
|
||||
#include "opencv2/imgproc/imgproc.hpp"
|
||||
#include "opencv2/highgui/highgui.hpp"
|
||||
#include "opencv2/nonfree/nonfree.hpp"
|
||||
|
||||
#include "opencv2/opencv_modules.hpp"
|
||||
#ifdef HAVE_OPENCV_OCL
|
||||
# include "opencv2/nonfree/ocl.hpp"
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_OPENCV_GPU) && defined(HAVE_CUDA)
|
||||
#include "opencv2/ts/gpu_test.hpp"
|
||||
#include "opencv2/nonfree/gpu.hpp"
|
||||
|
@ -43,20 +43,19 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "test_precomp.hpp"
|
||||
|
||||
#include "precomp.hpp"
|
||||
#ifdef HAVE_OPENCL
|
||||
|
||||
extern std::string workdir;
|
||||
#ifdef HAVE_OPENCV_OCL
|
||||
|
||||
using namespace std;
|
||||
using std::tr1::get;
|
||||
|
||||
static bool keyPointsEquals(const cv::KeyPoint& p1, const cv::KeyPoint& p2)
|
||||
{
|
||||
const double maxPtDif = 1.0;
|
||||
const double maxSizeDif = 1.0;
|
||||
const double maxAngleDif = 2.0;
|
||||
const double maxResponseDif = 0.1;
|
||||
const double maxPtDif = 0.1;
|
||||
const double maxSizeDif = 0.1;
|
||||
const double maxAngleDif = 0.1;
|
||||
const double maxResponseDif = 0.01;
|
||||
|
||||
double dist = cv::norm(p1.pt - p2.pt);
|
||||
|
||||
@ -73,22 +72,10 @@ static bool keyPointsEquals(const cv::KeyPoint& p1, const cv::KeyPoint& p2)
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
struct KeyPointLess : std::binary_function<cv::KeyPoint, cv::KeyPoint, bool>
|
||||
{
|
||||
bool operator()(const cv::KeyPoint& kp1, const cv::KeyPoint& kp2) const
|
||||
{
|
||||
return kp1.pt.y < kp2.pt.y || (kp1.pt.y == kp2.pt.y && kp1.pt.x < kp2.pt.x);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#define ASSERT_KEYPOINTS_EQ(gold, actual) EXPECT_PRED_FORMAT2(assertKeyPointsEquals, gold, actual);
|
||||
|
||||
static int getMatchedPointsCount(std::vector<cv::KeyPoint>& gold, std::vector<cv::KeyPoint>& actual)
|
||||
{
|
||||
std::sort(actual.begin(), actual.end(), KeyPointLess());
|
||||
std::sort(gold.begin(), gold.end(), KeyPointLess());
|
||||
std::sort(actual.begin(), actual.end(), perf::comparators::KeypointGreater());
|
||||
std::sort(gold.begin(), gold.end(), perf::comparators::KeypointGreater());
|
||||
|
||||
int validCount = 0;
|
||||
|
||||
@ -122,13 +109,24 @@ static int getMatchedPointsCount(const std::vector<cv::KeyPoint>& keypoints1, co
|
||||
return validCount;
|
||||
}
|
||||
|
||||
IMPLEMENT_PARAM_CLASS(SURF_HessianThreshold, double)
|
||||
IMPLEMENT_PARAM_CLASS(SURF_Octaves, int)
|
||||
IMPLEMENT_PARAM_CLASS(SURF_OctaveLayers, int)
|
||||
IMPLEMENT_PARAM_CLASS(SURF_Extended, bool)
|
||||
IMPLEMENT_PARAM_CLASS(SURF_Upright, bool)
|
||||
#define PARAM_TEST_CASE(name, ...) struct name : testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
|
||||
#define IMPLEMENT_PARAM_CLASS(name, type) \
|
||||
namespace { class name { \
|
||||
public: \
|
||||
name ( type arg = type ()) : val_(arg) {} \
|
||||
operator type () const {return val_;} \
|
||||
private: \
|
||||
type val_; \
|
||||
}; \
|
||||
inline void PrintTo( name param, std::ostream* os) {*os << #name << "=" << testing::PrintToString(static_cast< type >(param));}}
|
||||
|
||||
PARAM_TEST_CASE(SURF, SURF_HessianThreshold, SURF_Octaves, SURF_OctaveLayers, SURF_Extended, SURF_Upright)
|
||||
IMPLEMENT_PARAM_CLASS(HessianThreshold, double)
|
||||
IMPLEMENT_PARAM_CLASS(Octaves, int)
|
||||
IMPLEMENT_PARAM_CLASS(OctaveLayers, int)
|
||||
IMPLEMENT_PARAM_CLASS(Extended, bool)
|
||||
IMPLEMENT_PARAM_CLASS(Upright, bool)
|
||||
|
||||
PARAM_TEST_CASE(SURF, HessianThreshold, Octaves, OctaveLayers, Extended, Upright)
|
||||
{
|
||||
double hessianThreshold;
|
||||
int nOctaves;
|
||||
@ -138,16 +136,17 @@ PARAM_TEST_CASE(SURF, SURF_HessianThreshold, SURF_Octaves, SURF_OctaveLayers, SU
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
hessianThreshold = GET_PARAM(0);
|
||||
nOctaves = GET_PARAM(1);
|
||||
nOctaveLayers = GET_PARAM(2);
|
||||
extended = GET_PARAM(3);
|
||||
upright = GET_PARAM(4);
|
||||
hessianThreshold = get<0>(GetParam());
|
||||
nOctaves = get<1>(GetParam());
|
||||
nOctaveLayers = get<2>(GetParam());
|
||||
extended = get<3>(GetParam());
|
||||
upright = get<4>(GetParam());
|
||||
}
|
||||
};
|
||||
TEST_P(SURF, Detector)
|
||||
|
||||
TEST_P(SURF, DISABLED_Detector)
|
||||
{
|
||||
cv::Mat image = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE);
|
||||
cv::Mat image = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "shared/fruits.png", cv::IMREAD_GRAYSCALE);
|
||||
ASSERT_FALSE(image.empty());
|
||||
|
||||
cv::ocl::SURF_OCL surf;
|
||||
@ -175,12 +174,12 @@ TEST_P(SURF, Detector)
|
||||
int matchedCount = getMatchedPointsCount(keypoints_gold, keypoints);
|
||||
double matchedRatio = static_cast<double>(matchedCount) / keypoints_gold.size();
|
||||
|
||||
EXPECT_GT(matchedRatio, 0.95);
|
||||
EXPECT_GT(matchedRatio, 0.99);
|
||||
}
|
||||
|
||||
TEST_P(SURF, Descriptor)
|
||||
TEST_P(SURF, DISABLED_Descriptor)
|
||||
{
|
||||
cv::Mat image = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE);
|
||||
cv::Mat image = cv::imread(string(cvtest::TS::ptr()->get_data_path()) + "shared/fruits.png", cv::IMREAD_GRAYSCALE);
|
||||
ASSERT_FALSE(image.empty());
|
||||
|
||||
cv::ocl::SURF_OCL surf;
|
||||
@ -218,10 +217,10 @@ TEST_P(SURF, Descriptor)
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(OCL_Features2D, SURF, testing::Combine(
|
||||
testing::Values(/*SURF_HessianThreshold(100.0), */SURF_HessianThreshold(500.0), SURF_HessianThreshold(1000.0)),
|
||||
testing::Values(SURF_Octaves(3), SURF_Octaves(4)),
|
||||
testing::Values(SURF_OctaveLayers(2), SURF_OctaveLayers(3)),
|
||||
testing::Values(SURF_Extended(false), SURF_Extended(true)),
|
||||
testing::Values(SURF_Upright(false), SURF_Upright(true))));
|
||||
testing::Values(HessianThreshold(500.0), HessianThreshold(1000.0)),
|
||||
testing::Values(Octaves(3), Octaves(4)),
|
||||
testing::Values(OctaveLayers(2), OctaveLayers(3)),
|
||||
testing::Values(Extended(false), Extended(true)),
|
||||
testing::Values(Upright(false), Upright(true))));
|
||||
|
||||
#endif
|
||||
#endif // HAVE_OPENCV_OCL
|
@ -1,69 +1,7 @@
|
||||
# Will be modified later
|
||||
if(NOT HAVE_OPENCL)
|
||||
ocv_module_disable(ocl)
|
||||
endif()
|
||||
|
||||
set(the_description "OpenCL-accelerated Computer Vision")
|
||||
ocv_add_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree)
|
||||
|
||||
ocv_module_include_directories()
|
||||
|
||||
file(GLOB CL_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/kernels/*.cl")
|
||||
set(kernels_cpp "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
|
||||
set(cl2cpp_script "${CMAKE_CURRENT_SOURCE_DIR}/cl2cpp.cmake")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${kernels_cpp}
|
||||
COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/kernels" -DOUTPUT="${kernels_cpp}" -P ${cl2cpp_script}
|
||||
DEPENDS ${CL_FILES} ${cl2cpp_script})
|
||||
|
||||
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
|
||||
file(GLOB lib_srcs "src/*.cpp")
|
||||
file(GLOB lib_int_hdrs "src/*.h*")
|
||||
|
||||
source_group("Include" FILES ${lib_hdrs})
|
||||
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs} ${kernels_cpp})
|
||||
|
||||
if (HAVE_OPENCL)
|
||||
set(ocl_link_libs ${OPENCL_LIBRARIES})
|
||||
if(OPENCL_INCLUDE_DIR)
|
||||
ocv_include_directories(${OPENCL_INCLUDE_DIR})
|
||||
endif()
|
||||
if (HAVE_CLAMDFFT)
|
||||
set(ocl_link_libs ${ocl_link_libs} ${CLAMDFFT_LIBRARIES})
|
||||
ocv_include_directories(${CLAMDFFT_INCLUDE_DIR})
|
||||
endif()
|
||||
if (HAVE_CLAMDBLAS)
|
||||
set(ocl_link_libs ${ocl_link_libs} ${CLAMDBLAS_LIBRARIES})
|
||||
ocv_include_directories(${CLAMDBLAS_INCLUDE_DIR})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video)
|
||||
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
|
||||
|
||||
ocv_set_module_sources(HEADERS ${lib_hdrs} SOURCES ${lib_int_hdrs} ${lib_srcs} ${kernels_cpp})
|
||||
ocv_create_module(${ocl_link_libs})
|
||||
|
||||
install(FILES ${lib_hdrs}
|
||||
DESTINATION include/opencv2/${name}
|
||||
COMPONENT main)
|
||||
|
||||
ocv_add_precompiled_headers(${the_module})
|
||||
|
||||
################################################################################################################
|
||||
################################ OpenCL Module Tests ##################################################
|
||||
################################################################################################################
|
||||
file(GLOB test_srcs "test/*.cpp")
|
||||
file(GLOB test_hdrs "test/*.hpp" "test/*.h")
|
||||
|
||||
ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
|
||||
FILES "Src" ${test_srcs})
|
||||
|
||||
################################################################################################################
|
||||
################################ OpenCL Module Performance ##################################################
|
||||
################################################################################################################
|
||||
file(GLOB perf_srcs "perf/*.cpp")
|
||||
file(GLOB perf_hdrs "perf/*.hpp" "perf/*.h")
|
||||
|
||||
ocv_add_perf_tests(FILES "Include" ${perf_hdrs}
|
||||
FILES "Src" ${perf_srcs})
|
||||
|
@ -88,102 +88,3 @@ Computes a proximity map for a raster template and an image where the template i
|
||||
* ``CV_TM_CCORR``
|
||||
|
||||
.. seealso:: :ocv:func:`matchTemplate`
|
||||
|
||||
|
||||
ocl::SURF_OCL
|
||||
-------------
|
||||
.. ocv:class:: ocl::SURF_OCL
|
||||
|
||||
Class used for extracting Speeded Up Robust Features (SURF) from an image. ::
|
||||
|
||||
class SURF_OCL
|
||||
{
|
||||
public:
|
||||
enum KeypointLayout
|
||||
{
|
||||
X_ROW = 0,
|
||||
Y_ROW,
|
||||
LAPLACIAN_ROW,
|
||||
OCTAVE_ROW,
|
||||
SIZE_ROW,
|
||||
ANGLE_ROW,
|
||||
HESSIAN_ROW,
|
||||
ROWS_COUNT
|
||||
};
|
||||
|
||||
//! the default constructor
|
||||
SURF_OCL();
|
||||
//! the full constructor taking all the necessary parameters
|
||||
explicit SURF_OCL(double _hessianThreshold, int _nOctaves=4,
|
||||
int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
|
||||
|
||||
//! returns the descriptor size in float's (64 or 128)
|
||||
int descriptorSize() const;
|
||||
|
||||
//! upload host keypoints to device memory
|
||||
void uploadKeypoints(const vector<KeyPoint>& keypoints,
|
||||
oclMat& keypointsocl);
|
||||
//! download keypoints from device to host memory
|
||||
void downloadKeypoints(const oclMat& keypointsocl,
|
||||
vector<KeyPoint>& keypoints);
|
||||
|
||||
//! download descriptors from device to host memory
|
||||
void downloadDescriptors(const oclMat& descriptorsocl,
|
||||
vector<float>& descriptors);
|
||||
|
||||
void operator()(const oclMat& img, const oclMat& mask,
|
||||
oclMat& keypoints);
|
||||
|
||||
void operator()(const oclMat& img, const oclMat& mask,
|
||||
oclMat& keypoints, oclMat& descriptors,
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
void operator()(const oclMat& img, const oclMat& mask,
|
||||
std::vector<KeyPoint>& keypoints);
|
||||
|
||||
void operator()(const oclMat& img, const oclMat& mask,
|
||||
std::vector<KeyPoint>& keypoints, oclMat& descriptors,
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
void operator()(const oclMat& img, const oclMat& mask,
|
||||
std::vector<KeyPoint>& keypoints,
|
||||
std::vector<float>& descriptors,
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
void releaseMemory();
|
||||
|
||||
// SURF parameters
|
||||
double hessianThreshold;
|
||||
int nOctaves;
|
||||
int nOctaveLayers;
|
||||
bool extended;
|
||||
bool upright;
|
||||
|
||||
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
|
||||
float keypointsRatio;
|
||||
|
||||
oclMat sum, mask1, maskSum, intBuffer;
|
||||
|
||||
oclMat det, trace;
|
||||
|
||||
oclMat maxPosBuffer;
|
||||
};
|
||||
|
||||
|
||||
The class ``SURF_OCL`` implements Speeded Up Robust Features descriptor. There is a fast multi-scale Hessian keypoint detector that can be used to find the keypoints (which is the default option). But the descriptors can also be computed for the user-specified keypoints. Only 8-bit grayscale images are supported.
|
||||
|
||||
The class ``SURF_OCL`` can store results in the GPU and CPU memory. It provides functions to convert results between CPU and GPU version ( ``uploadKeypoints``, ``downloadKeypoints``, ``downloadDescriptors`` ). The format of CPU results is the same as ``SURF`` results. GPU results are stored in ``oclMat``. The ``keypoints`` matrix is :math:`\texttt{nFeatures} \times 7` matrix with the ``CV_32FC1`` type.
|
||||
|
||||
* ``keypoints.ptr<float>(X_ROW)[i]`` contains x coordinate of the i-th feature.
|
||||
* ``keypoints.ptr<float>(Y_ROW)[i]`` contains y coordinate of the i-th feature.
|
||||
* ``keypoints.ptr<float>(LAPLACIAN_ROW)[i]`` contains the laplacian sign of the i-th feature.
|
||||
* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i-th feature.
|
||||
* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i-th feature.
|
||||
* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contain orientation of the i-th feature.
|
||||
* ``keypoints.ptr<float>(HESSIAN_ROW)[i]`` contains the response of the i-th feature.
|
||||
|
||||
The ``descriptors`` matrix is :math:`\texttt{nFeatures} \times \texttt{descriptorSize}` matrix with the ``CV_32FC1`` type.
|
||||
|
||||
The class ``SURF_OCL`` uses some buffers and provides access to it. All buffers can be safely released between function calls.
|
||||
|
||||
.. seealso:: :ocv:class:`SURF`
|
@ -69,28 +69,28 @@ namespace cv
|
||||
|
||||
enum DevMemRW
|
||||
{
|
||||
DEVICE_MEM_R_W = 0,
|
||||
DEVICE_MEM_R_ONLY,
|
||||
DEVICE_MEM_R_W = 0,
|
||||
DEVICE_MEM_R_ONLY,
|
||||
DEVICE_MEM_W_ONLY
|
||||
};
|
||||
|
||||
|
||||
enum DevMemType
|
||||
{
|
||||
DEVICE_MEM_DEFAULT = 0,
|
||||
{
|
||||
DEVICE_MEM_DEFAULT = 0,
|
||||
DEVICE_MEM_AHP, //alloc host pointer
|
||||
DEVICE_MEM_UHP, //use host pointer
|
||||
DEVICE_MEM_CHP, //copy host pointer
|
||||
DEVICE_MEM_PM //persistent memory
|
||||
};
|
||||
|
||||
//Get the global device memory and read/write type
|
||||
//Get the global device memory and read/write type
|
||||
//return 1 if unified memory system supported, otherwise return 0
|
||||
CV_EXPORTS int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type);
|
||||
|
||||
//Set the global device memory and read/write type,
|
||||
//Set the global device memory and read/write type,
|
||||
//the newly generated oclMat will all use this type
|
||||
//return -1 if the target type is unsupported, otherwise return 0
|
||||
CV_EXPORTS int setDevMemType(DevMemRW rw_type = DEVICE_MEM_R_W, DevMemType mem_type = DEVICE_MEM_DEFAULT);
|
||||
CV_EXPORTS int setDevMemType(DevMemRW rw_type = DEVICE_MEM_R_W, DevMemType mem_type = DEVICE_MEM_DEFAULT);
|
||||
|
||||
//this class contains ocl runtime information
|
||||
class CV_EXPORTS Info
|
||||
@ -135,20 +135,28 @@ namespace cv
|
||||
|
||||
//////////////////////////////// OpenCL context ////////////////////////
|
||||
//This is a global singleton class used to represent a OpenCL context.
|
||||
class Context
|
||||
class CV_EXPORTS Context
|
||||
{
|
||||
protected:
|
||||
Context();
|
||||
friend class auto_ptr<Context>;
|
||||
static auto_ptr<Context> clCxt;
|
||||
|
||||
private:
|
||||
static auto_ptr<Context> clCxt;
|
||||
static int val;
|
||||
public:
|
||||
~Context();
|
||||
static int val;
|
||||
static Context *getContext();
|
||||
void release();
|
||||
Info::Impl* impl;
|
||||
|
||||
static Context* getContext();
|
||||
static void setContext(Info &oclinfo);
|
||||
struct Impl;
|
||||
Impl *impl;
|
||||
|
||||
enum {CL_DOUBLE, CL_UNIFIED_MEM};
|
||||
bool supportsFeature(int ftype);
|
||||
size_t computeUnits();
|
||||
void* oclContext();
|
||||
void* oclCommandQueue();
|
||||
};
|
||||
|
||||
//! Calls a kernel, by string. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing.
|
||||
@ -1073,156 +1081,6 @@ namespace cv
|
||||
};
|
||||
|
||||
|
||||
|
||||
//! Speeded up robust features, port from GPU module.
|
||||
////////////////////////////////// SURF //////////////////////////////////////////
|
||||
|
||||
class CV_EXPORTS SURF_OCL
|
||||
|
||||
{
|
||||
|
||||
public:
|
||||
|
||||
enum KeypointLayout
|
||||
|
||||
{
|
||||
|
||||
X_ROW = 0,
|
||||
|
||||
Y_ROW,
|
||||
|
||||
LAPLACIAN_ROW,
|
||||
|
||||
OCTAVE_ROW,
|
||||
|
||||
SIZE_ROW,
|
||||
|
||||
ANGLE_ROW,
|
||||
|
||||
HESSIAN_ROW,
|
||||
|
||||
ROWS_COUNT
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
//! the default constructor
|
||||
|
||||
SURF_OCL();
|
||||
|
||||
//! the full constructor taking all the necessary parameters
|
||||
|
||||
explicit SURF_OCL(double _hessianThreshold, int _nOctaves = 4,
|
||||
|
||||
int _nOctaveLayers = 2, bool _extended = false, float _keypointsRatio = 0.01f, bool _upright = false);
|
||||
|
||||
|
||||
|
||||
//! returns the descriptor size in float's (64 or 128)
|
||||
|
||||
int descriptorSize() const;
|
||||
|
||||
|
||||
|
||||
//! upload host keypoints to device memory
|
||||
|
||||
void uploadKeypoints(const vector<cv::KeyPoint> &keypoints, oclMat &keypointsocl);
|
||||
|
||||
//! download keypoints from device to host memory
|
||||
|
||||
void downloadKeypoints(const oclMat &keypointsocl, vector<KeyPoint> &keypoints);
|
||||
|
||||
|
||||
|
||||
//! download descriptors from device to host memory
|
||||
|
||||
void downloadDescriptors(const oclMat &descriptorsocl, vector<float> &descriptors);
|
||||
|
||||
|
||||
|
||||
//! finds the keypoints using fast hessian detector used in SURF
|
||||
|
||||
//! supports CV_8UC1 images
|
||||
|
||||
//! keypoints will have nFeature cols and 6 rows
|
||||
|
||||
//! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
|
||||
|
||||
//! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
|
||||
|
||||
//! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
|
||||
|
||||
//! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
|
||||
|
||||
//! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
|
||||
|
||||
//! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
|
||||
|
||||
//! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
|
||||
|
||||
void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints);
|
||||
|
||||
//! finds the keypoints and computes their descriptors.
|
||||
|
||||
//! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
|
||||
|
||||
void operator()(const oclMat &img, const oclMat &mask, oclMat &keypoints, oclMat &descriptors,
|
||||
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
|
||||
|
||||
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints);
|
||||
|
||||
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, oclMat &descriptors,
|
||||
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
|
||||
|
||||
void operator()(const oclMat &img, const oclMat &mask, std::vector<KeyPoint> &keypoints, std::vector<float> &descriptors,
|
||||
|
||||
bool useProvidedKeypoints = false);
|
||||
|
||||
|
||||
|
||||
void releaseMemory();
|
||||
|
||||
|
||||
|
||||
// SURF parameters
|
||||
|
||||
float hessianThreshold;
|
||||
|
||||
int nOctaves;
|
||||
|
||||
int nOctaveLayers;
|
||||
|
||||
bool extended;
|
||||
|
||||
bool upright;
|
||||
|
||||
|
||||
|
||||
//! max keypoints = min(keypointsRatio * img.size().area(), 65535)
|
||||
|
||||
float keypointsRatio;
|
||||
|
||||
|
||||
|
||||
oclMat sum, mask1, maskSum, intBuffer;
|
||||
|
||||
|
||||
|
||||
oclMat det, trace;
|
||||
|
||||
|
||||
|
||||
oclMat maxPosBuffer;
|
||||
|
||||
};
|
||||
|
||||
////////////////////////feature2d_ocl/////////////////
|
||||
/****************************************************************************************\
|
||||
* Distance *
|
||||
|
130
modules/ocl/include/opencv2/ocl/private/util.hpp
Normal file
130
modules/ocl/include/opencv2/ocl/private/util.hpp
Normal file
@ -0,0 +1,130 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Peng Xiao, pengxiao@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_OCL_PRIVATE_UTIL__
|
||||
#define __OPENCV_OCL_PRIVATE_UTIL__
|
||||
|
||||
#include "opencv2/ocl/ocl.hpp"
|
||||
|
||||
#if defined __APPLE__
|
||||
#include <OpenCL/OpenCL.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
{
|
||||
enum openCLMemcpyKind
|
||||
{
|
||||
clMemcpyHostToDevice = 0,
|
||||
clMemcpyDeviceToHost,
|
||||
clMemcpyDeviceToDevice
|
||||
};
|
||||
///////////////////////////OpenCL call wrappers////////////////////////////
|
||||
void CV_EXPORTS openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
|
||||
size_t widthInBytes, size_t height);
|
||||
void CV_EXPORTS openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
|
||||
size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type);
|
||||
void CV_EXPORTS openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
|
||||
const void *src, size_t spitch,
|
||||
size_t width, size_t height, openCLMemcpyKind kind, int channels = -1);
|
||||
void CV_EXPORTS openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
|
||||
const void *src, size_t spitch,
|
||||
size_t width, size_t height, int src_offset);
|
||||
void CV_EXPORTS openCLFree(void *devPtr);
|
||||
cl_mem CV_EXPORTS openCLCreateBuffer(Context *clCxt, size_t flag, size_t size);
|
||||
void CV_EXPORTS openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size);
|
||||
cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
|
||||
const char **source, std::string kernelName);
|
||||
cl_kernel CV_EXPORTS openCLGetKernelFromSource(const Context *clCxt,
|
||||
const char **source, std::string kernelName, const char *build_options);
|
||||
void CV_EXPORTS openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
|
||||
void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
|
||||
int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
|
||||
void CV_EXPORTS openCLExecuteKernel_(Context *clCxt , const char **source, std::string kernelName,
|
||||
size_t globalThreads[3], size_t localThreads[3],
|
||||
std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, const char *build_options);
|
||||
void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
|
||||
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth);
|
||||
void CV_EXPORTS openCLExecuteKernel(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
|
||||
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels,
|
||||
int depth, const char *build_options);
|
||||
|
||||
cl_mem CV_EXPORTS load_constant(cl_context context, cl_command_queue command_queue, const void *value,
|
||||
const size_t size);
|
||||
|
||||
cl_mem CV_EXPORTS openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr);
|
||||
|
||||
int CV_EXPORTS savetofile(const Context *clcxt, cl_program &program, const char *fileName);
|
||||
|
||||
enum FLUSH_MODE
|
||||
{
|
||||
CLFINISH = 0,
|
||||
CLFLUSH,
|
||||
DISABLE
|
||||
};
|
||||
|
||||
void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
|
||||
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
|
||||
void CV_EXPORTS openCLExecuteKernel2(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
|
||||
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels,
|
||||
int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
|
||||
// bind oclMat to OpenCL image textures
|
||||
// note:
|
||||
// 1. there is no memory management. User need to explicitly release the resource
|
||||
// 2. for faster clamping, there is no buffer padding for the constructed texture
|
||||
cl_mem CV_EXPORTS bindTexture(const oclMat &mat);
|
||||
void CV_EXPORTS releaseTexture(cl_mem& texture);
|
||||
|
||||
// returns whether the current context supports image2d_t format or not
|
||||
bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());
|
||||
|
||||
}//namespace ocl
|
||||
|
||||
}//namespace cv
|
||||
|
||||
#endif //__OPENCV_OCL_PRIVATE_UTIL__
|
@ -132,7 +132,7 @@ inline int divUp(int total, int grain)
|
||||
template<typename T>
|
||||
void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString, void *_scalar)
|
||||
{
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -195,7 +195,7 @@ static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
|
||||
}
|
||||
static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString)
|
||||
{
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -272,7 +272,7 @@ typedef void (*MulDivFunc)(const oclMat &src1, const oclMat &src2, oclMat &dst,
|
||||
|
||||
void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
|
||||
{
|
||||
if((src1.clCxt -> impl -> double_support != 0) && (src1.depth() == CV_64F))
|
||||
if(src1.clCxt->supportsFeature(Context::CL_DOUBLE) && (src1.depth() == CV_64F))
|
||||
arithmetic_run<double>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar));
|
||||
else
|
||||
arithmetic_run<float>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar));
|
||||
@ -280,7 +280,7 @@ void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, doub
|
||||
void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
|
||||
{
|
||||
|
||||
if(src1.clCxt -> impl -> double_support != 0)
|
||||
if(src1.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
arithmetic_run<double>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
|
||||
else
|
||||
arithmetic_run<float>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
|
||||
@ -289,7 +289,7 @@ void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double
|
||||
template <typename WT , typename CL_WT>
|
||||
void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar)
|
||||
{
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -361,7 +361,7 @@ void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst,
|
||||
|
||||
static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, const char **kernelString, double scalar)
|
||||
{
|
||||
if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -405,7 +405,7 @@ static void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelN
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
|
||||
|
||||
if(src.clCxt -> impl -> double_support != 0)
|
||||
if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&scalar ));
|
||||
else
|
||||
{
|
||||
@ -464,7 +464,7 @@ void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, cons
|
||||
}
|
||||
void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)
|
||||
{
|
||||
if(src.clCxt -> impl -> double_support == 0)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -524,7 +524,7 @@ static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, str
|
||||
|
||||
void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int cmpOp)
|
||||
{
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
cout << "Selected device do not support double" << endl;
|
||||
return;
|
||||
@ -599,7 +599,7 @@ static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int vlen ,
|
||||
template <typename T>
|
||||
Scalar arithmetic_sum(const oclMat &src, int type = 0)
|
||||
{
|
||||
size_t groupnum = src.clCxt->impl->maxComputeUnits;
|
||||
size_t groupnum = src.clCxt->computeUnits();
|
||||
CV_Assert(groupnum != 0);
|
||||
int vlen = src.oclchannels() == 3 ? 12 : 8, dbsize = groupnum * vlen;
|
||||
Context *clCxt = src.clCxt;
|
||||
@ -627,7 +627,7 @@ Scalar arithmetic_sum(const oclMat &src, int type = 0)
|
||||
typedef Scalar (*sumFunc)(const oclMat &src, int type);
|
||||
Scalar cv::ocl::sum(const oclMat &src)
|
||||
{
|
||||
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "select device don't support double");
|
||||
}
|
||||
@ -638,13 +638,13 @@ Scalar cv::ocl::sum(const oclMat &src)
|
||||
};
|
||||
|
||||
sumFunc func;
|
||||
func = functab[src.clCxt->impl->double_support];
|
||||
func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
|
||||
return func(src, 0);
|
||||
}
|
||||
|
||||
Scalar cv::ocl::absSum(const oclMat &src)
|
||||
{
|
||||
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "select device don't support double");
|
||||
}
|
||||
@ -655,13 +655,13 @@ Scalar cv::ocl::absSum(const oclMat &src)
|
||||
};
|
||||
|
||||
sumFunc func;
|
||||
func = functab[src.clCxt->impl->double_support];
|
||||
func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
|
||||
return func(src, 1);
|
||||
}
|
||||
|
||||
Scalar cv::ocl::sqrSum(const oclMat &src)
|
||||
{
|
||||
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "select device don't support double");
|
||||
}
|
||||
@ -672,7 +672,7 @@ Scalar cv::ocl::sqrSum(const oclMat &src)
|
||||
};
|
||||
|
||||
sumFunc func;
|
||||
func = functab[src.clCxt->impl->double_support];
|
||||
func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
|
||||
return func(src, 2);
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
@ -771,7 +771,7 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl
|
||||
|
||||
template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
|
||||
{
|
||||
size_t groupnum = src.clCxt->impl->maxComputeUnits;
|
||||
size_t groupnum = src.clCxt->computeUnits();
|
||||
CV_Assert(groupnum != 0);
|
||||
groupnum = groupnum * 2;
|
||||
int vlen = 8;
|
||||
@ -810,7 +810,7 @@ typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, co
|
||||
void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
|
||||
{
|
||||
CV_Assert(src.oclchannels() == 1);
|
||||
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "select device don't support double");
|
||||
}
|
||||
@ -894,7 +894,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kernelName)
|
||||
{
|
||||
if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -943,7 +943,7 @@ static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kern
|
||||
}
|
||||
static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kernelName, bool isVertical)
|
||||
{
|
||||
if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -1123,7 +1123,7 @@ static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernel
|
||||
CV_Assert( src.type() == CV_32F || src.type() == CV_64F);
|
||||
|
||||
Context *clCxt = src.clCxt;
|
||||
if(clCxt -> impl -> double_support == 0 && src.type() == CV_64F)
|
||||
if(!clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -1164,7 +1164,7 @@ void cv::ocl::log(const oclMat &src, oclMat &dst)
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName)
|
||||
{
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -1212,7 +1212,7 @@ void cv::ocl::magnitude(const oclMat &src1, const oclMat &src2, oclMat &dst)
|
||||
|
||||
static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString)
|
||||
{
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -1276,7 +1276,7 @@ void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle , bool angle
|
||||
static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, oclMat &dst_mag, oclMat &dst_cart,
|
||||
string kernelName, bool angleInDegrees)
|
||||
{
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -1331,7 +1331,7 @@ void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat
|
||||
static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees,
|
||||
string kernelName)
|
||||
{
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -1452,7 +1452,7 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
|
||||
Point *minLoc, Point *maxLoc, const oclMat &mask)
|
||||
{
|
||||
CV_Assert(src.oclchannels() == 1);
|
||||
size_t groupnum = src.clCxt->impl->maxComputeUnits;
|
||||
size_t groupnum = src.clCxt->computeUnits();
|
||||
CV_Assert(groupnum != 0);
|
||||
int minloc = -1 , maxloc = -1;
|
||||
int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) ;
|
||||
@ -1513,7 +1513,7 @@ typedef void (*minMaxLocFunc)(const oclMat &src, double *minVal, double *maxVal,
|
||||
void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
|
||||
Point *minLoc, Point *maxLoc, const oclMat &mask)
|
||||
{
|
||||
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "select device don't support double");
|
||||
}
|
||||
@ -1524,7 +1524,7 @@ void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
|
||||
};
|
||||
|
||||
minMaxLocFunc func;
|
||||
func = functab[src.clCxt->impl->double_support];
|
||||
func = functab[(int)src.clCxt->supportsFeature(Context::CL_DOUBLE)];
|
||||
func(src, minVal, maxVal, minLoc, maxLoc, mask);
|
||||
}
|
||||
|
||||
@ -1559,8 +1559,8 @@ static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int vlen
|
||||
|
||||
int cv::ocl::countNonZero(const oclMat &src)
|
||||
{
|
||||
size_t groupnum = src.clCxt->impl->maxComputeUnits;
|
||||
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F)
|
||||
size_t groupnum = src.clCxt->computeUnits();
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "select device don't support double");
|
||||
}
|
||||
@ -1845,7 +1845,7 @@ static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst,
|
||||
|
||||
void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
|
||||
{
|
||||
if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
|
||||
{
|
||||
cout << "Selected device do not support double" << endl;
|
||||
return;
|
||||
@ -1858,7 +1858,7 @@ void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
|
||||
void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
|
||||
{
|
||||
// dst.create(src1.size(),src1.type());
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
cout << "Selected device do not support double" << endl;
|
||||
return;
|
||||
@ -1874,7 +1874,7 @@ void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, co
|
||||
|
||||
void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
|
||||
{
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
cout << "Selected device do not support double" << endl;
|
||||
return;
|
||||
@ -1889,7 +1889,7 @@ void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, co
|
||||
void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
|
||||
{
|
||||
// dst.create(src1.size(),src1.type());
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
cout << "Selected device do not support double" << endl;
|
||||
return;
|
||||
@ -1906,7 +1906,7 @@ void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, c
|
||||
|
||||
void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
|
||||
{
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
cout << "Selected device do not support double" << endl;
|
||||
return;
|
||||
@ -1920,7 +1920,7 @@ void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, c
|
||||
|
||||
void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
|
||||
{
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
cout << "Selected device do not support double" << endl;
|
||||
return;
|
||||
@ -1939,7 +1939,7 @@ void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, c
|
||||
void cv::ocl::bitwise_xor(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
|
||||
{
|
||||
|
||||
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
|
||||
{
|
||||
cout << "Selected device do not support double" << endl;
|
||||
return;
|
||||
@ -2036,7 +2036,7 @@ oclMatExpr::operator oclMat() const
|
||||
#define BLOCK_ROWS (256/TILE_DIM)
|
||||
static void transpose_run(const oclMat &src, oclMat &dst, string kernelName)
|
||||
{
|
||||
if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
@ -2135,7 +2135,7 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2,
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
|
||||
|
||||
if(src1.clCxt -> impl -> double_support != 0)
|
||||
if(src1.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
|
||||
args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
|
||||
@ -2282,7 +2282,7 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
|
||||
if(src1.clCxt -> impl -> double_support == 0)
|
||||
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
{
|
||||
float pf = p;
|
||||
args.push_back( make_pair( sizeof(cl_float), (void *)&pf ));
|
||||
@ -2294,7 +2294,7 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string
|
||||
}
|
||||
void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
|
||||
{
|
||||
if(x.clCxt -> impl -> double_support == 0 && x.type() == CV_64F)
|
||||
if(!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F)
|
||||
{
|
||||
cout << "Selected device do not support double" << endl;
|
||||
return;
|
||||
|
@ -43,9 +43,7 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include <iomanip>
|
||||
#include "precomp.hpp"
|
||||
#include "mcwutil.hpp"
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
@ -100,7 +98,7 @@ void cv::ocl::CannyBuf::create(const Size &image_size, int apperture_size)
|
||||
{
|
||||
openCLFree(counter);
|
||||
}
|
||||
counter = clCreateBuffer( Context::getContext()->impl->clContext, CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err );
|
||||
counter = clCreateBuffer( (cl_context)getoclContext(), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err );
|
||||
openCLSafeCall(err);
|
||||
}
|
||||
|
||||
@ -356,7 +354,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, in
|
||||
void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols)
|
||||
{
|
||||
unsigned int count;
|
||||
openCLSafeCall(clEnqueueReadBuffer(Context::getContext()->impl->clCmdQueue, (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueReadBuffer((cl_command_queue)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL));
|
||||
Context *clCxt = map.clCxt;
|
||||
string kernelName = "edgesHysteresisGlobal";
|
||||
vector< pair<size_t, const void *> > args;
|
||||
@ -366,7 +364,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
|
||||
int count_i[1] = {0};
|
||||
while(count > 0)
|
||||
{
|
||||
openCLSafeCall(clEnqueueWriteBuffer(Context::getContext()->impl->clCmdQueue, (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL));
|
||||
|
||||
args.clear();
|
||||
size_t globalThreads[3] = {std::min(count, 65535u) * 128, DIVUP(count, 65535), 1};
|
||||
@ -381,7 +379,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, voi
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
|
||||
|
||||
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, DISABLE);
|
||||
openCLSafeCall(clEnqueueReadBuffer(Context::getContext()->impl->clCmdQueue, (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueReadBuffer((cl_command_queue)getoclCommandQueue(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
|
||||
std::swap(st1, st2);
|
||||
}
|
||||
#undef DIVUP
|
||||
|
@ -206,7 +206,7 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
|
||||
clStridesIn[2] = is_row_dft ? clStridesIn[1] : dft_size.width * clStridesIn[1];
|
||||
clStridesOut[2] = is_row_dft ? clStridesOut[1] : dft_size.width * clStridesOut[1];
|
||||
|
||||
openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, Context::getContext()->impl->clContext, dim, clLengthsIn ) );
|
||||
openCLSafeCall( clAmdFftCreateDefaultPlan( &plHandle, (cl_context)getoclContext(), dim, clLengthsIn ) );
|
||||
|
||||
openCLSafeCall( clAmdFftSetResultLocation( plHandle, CLFFT_OUTOFPLACE ) );
|
||||
openCLSafeCall( clAmdFftSetLayout( plHandle, inLayout, outLayout ) );
|
||||
@ -220,7 +220,8 @@ cv::ocl::FftPlan::FftPlan(Size _dft_size, int _src_step, int _dst_step, int _fla
|
||||
openCLSafeCall( clAmdFftSetPlanScale ( plHandle, is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale_ ) );
|
||||
|
||||
//ready to bake
|
||||
openCLSafeCall( clAmdFftBakePlan( plHandle, 1, &(Context::getContext()->impl->clCmdQueue), NULL, NULL ) );
|
||||
cl_command_queue clq = (cl_command_queue)getoclCommandQueue();
|
||||
openCLSafeCall( clAmdFftBakePlan( plHandle, 1, &clq, NULL, NULL ) );
|
||||
}
|
||||
cv::ocl::FftPlan::~FftPlan()
|
||||
{
|
||||
@ -338,16 +339,17 @@ void cv::ocl::dft(const oclMat &src, oclMat &dst, Size dft_size, int flags)
|
||||
if (buffersize)
|
||||
{
|
||||
cl_int medstatus;
|
||||
clMedBuffer = clCreateBuffer ( src.clCxt->impl->clContext, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
|
||||
clMedBuffer = clCreateBuffer ( (cl_context)src.clCxt->oclContext(), CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
|
||||
openCLSafeCall( medstatus );
|
||||
}
|
||||
cl_command_queue clq = (cl_command_queue)src.clCxt->oclCommandQueue();
|
||||
openCLSafeCall( clAmdFftEnqueueTransform( plHandle,
|
||||
is_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD,
|
||||
1,
|
||||
&src.clCxt->impl->clCmdQueue,
|
||||
&clq,
|
||||
0, NULL, NULL,
|
||||
(cl_mem *)&src.data, (cl_mem *)&dst.data, clMedBuffer ) );
|
||||
openCLSafeCall( clFinish(src.clCxt->impl->clCmdQueue) );
|
||||
openCLSafeCall( clFinish(clq) );
|
||||
if(clMedBuffer)
|
||||
{
|
||||
openCLFree(clMedBuffer);
|
||||
|
@ -48,8 +48,7 @@
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include "mcwutil.hpp"
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
@ -1479,7 +1478,7 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
|
||||
|
||||
void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale)
|
||||
{
|
||||
if (src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F)
|
||||
if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
|
||||
return;
|
||||
|
@ -87,7 +87,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
|
||||
int offb = src2.offset;
|
||||
int offc = dst.offset;
|
||||
|
||||
|
||||
cl_command_queue clq = (cl_command_queue)src1.clCxt->oclCommandQueue();
|
||||
switch(src1.type())
|
||||
{
|
||||
case CV_32FC1:
|
||||
@ -97,11 +97,12 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
|
||||
offa /= sizeof(float);
|
||||
offb /= sizeof(float);
|
||||
offc /= sizeof(float);
|
||||
|
||||
openCLSafeCall
|
||||
(
|
||||
clAmdBlasSgemmEx(order, transA, transB, M, N, K,
|
||||
alpha, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb,
|
||||
beta, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL)
|
||||
beta, (cl_mem)dst.data, offc, ldc, 1, &clq, 0, NULL, NULL)
|
||||
);
|
||||
break;
|
||||
case CV_64FC1:
|
||||
@ -115,7 +116,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
|
||||
(
|
||||
clAmdBlasDgemmEx(order, transA, transB, M, N, K,
|
||||
alpha, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb,
|
||||
beta, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL)
|
||||
beta, (cl_mem)dst.data, offc, ldc, 1, &clq, 0, NULL, NULL)
|
||||
);
|
||||
break;
|
||||
case CV_32FC2:
|
||||
@ -132,7 +133,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
|
||||
(
|
||||
clAmdBlasCgemmEx(order, transA, transB, M, N, K,
|
||||
alpha_2, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb,
|
||||
beta_2, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL)
|
||||
beta_2, (cl_mem)dst.data, offc, ldc, 1, &clq, 0, NULL, NULL)
|
||||
);
|
||||
}
|
||||
break;
|
||||
@ -150,7 +151,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
|
||||
(
|
||||
clAmdBlasZgemmEx(order, transA, transB, M, N, K,
|
||||
alpha_2, (const cl_mem)src1.data, offa, lda, (const cl_mem)src2.data, offb, ldb,
|
||||
beta_2, (cl_mem)dst.data, offc, ldc, 1, &src1.clCxt->impl->clCmdQueue, 0, NULL, NULL)
|
||||
beta_2, (cl_mem)dst.data, offc, ldc, 1, &clq, 0, NULL, NULL)
|
||||
);
|
||||
}
|
||||
break;
|
||||
|
@ -971,7 +971,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
|
||||
size_t blocksize = 8;
|
||||
size_t localThreads[3] = { blocksize, blocksize , 1 };
|
||||
size_t globalThreads[3] = { grp_per_CU *((gsum.clCxt)->impl->maxComputeUnits) *localThreads[0],
|
||||
size_t globalThreads[3] = { grp_per_CU *((gsum.clCxt)->computeUnits()) *localThreads[0],
|
||||
localThreads[1], 1
|
||||
};
|
||||
int outputsz = 256 * globalThreads[0] / localThreads[0];
|
||||
@ -1047,21 +1047,21 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
|
||||
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
|
||||
//classifierbuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifier)*totalclassifier,NULL,&status);
|
||||
//status = clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,classifierbuffer,1,0,sizeof(GpuHidHaarClassifier)*totalclassifier,classifier,0,NULL,NULL);
|
||||
|
||||
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode));
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, nodebuffer, 1, 0,
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0,
|
||||
nodenum * sizeof(GpuHidHaarTreeNode),
|
||||
node, 0, NULL, NULL));
|
||||
candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY, 4 * sizeof(int) * outputsz);
|
||||
//openCLVerifyCall(status);
|
||||
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
|
||||
//flag = 1;
|
||||
//}
|
||||
|
||||
@ -1186,7 +1186,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
int grp_per_CU = 12;
|
||||
size_t blocksize = 8;
|
||||
size_t localThreads[3] = { blocksize, blocksize , 1 };
|
||||
size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->impl->maxComputeUnits *localThreads[0],
|
||||
size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->computeUnits() *localThreads[0],
|
||||
localThreads[1], 1
|
||||
};
|
||||
int outputsz = 256 * globalThreads[0] / localThreads[0];
|
||||
@ -1195,7 +1195,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY,
|
||||
nodenum * sizeof(GpuHidHaarTreeNode));
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, nodebuffer, 1, 0,
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0,
|
||||
nodenum * sizeof(GpuHidHaarTreeNode),
|
||||
node, 0, NULL, NULL));
|
||||
cl_mem newnodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_WRITE,
|
||||
@ -1252,16 +1252,16 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
int splitnode = stage[0].count + stage[1].count + stage[2].count;
|
||||
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
|
||||
candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, 4 * sizeof(int) * outputsz);
|
||||
//openCLVerifyCall(status);
|
||||
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
|
||||
//openCLVerifyCall(status);
|
||||
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
|
||||
pbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_int4) * loopcount);
|
||||
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL));
|
||||
correctionbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_float) * loopcount);
|
||||
openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->impl->clCmdQueue, correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
|
||||
//int argcount = 0;
|
||||
|
||||
vector<pair<size_t, const void *> > args;
|
||||
@ -1286,7 +1286,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
//openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->clCmdQueue,candidatebuffer,1,0,4*sizeof(int)*outputsz,candidate,0,NULL,NULL));
|
||||
candidate = (int *)clEnqueueMapBuffer(gsum.clCxt->impl->clCmdQueue, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int), 0, 0, 0, &status);
|
||||
candidate = (int *)clEnqueueMapBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int), 0, 0, 0, &status);
|
||||
|
||||
for(int i = 0; i < outputsz; i++)
|
||||
{
|
||||
@ -1297,7 +1297,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
|
||||
free(scaleinfo);
|
||||
free(p);
|
||||
free(correction);
|
||||
clEnqueueUnmapMemObject(gsum.clCxt->impl->clCmdQueue, candidatebuffer, candidate, 0, 0, 0);
|
||||
clEnqueueUnmapMemObject((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, candidate, 0, 0, 0);
|
||||
openCLSafeCall(clReleaseMemObject(stagebuffer));
|
||||
openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
|
||||
openCLSafeCall(clReleaseMemObject(nodebuffer));
|
||||
|
@ -44,7 +44,7 @@
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp"
|
||||
#include "mcwutil.hpp"
|
||||
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
using namespace std;
|
||||
|
@ -290,8 +290,8 @@ namespace cv
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
|
||||
float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
|
||||
|
||||
if(src.clCxt -> impl -> double_support != 0)
|
||||
|
||||
if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
{
|
||||
args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
|
||||
}
|
||||
@ -319,7 +319,7 @@ namespace cv
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
|
||||
if(src.clCxt -> impl -> double_support != 0)
|
||||
if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
{
|
||||
args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
|
||||
}
|
||||
@ -383,7 +383,7 @@ namespace cv
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
|
||||
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
|
||||
if(src.clCxt -> impl -> double_support != 0)
|
||||
if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
{
|
||||
args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d));
|
||||
args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d));
|
||||
@ -824,12 +824,12 @@ namespace cv
|
||||
string kernelName = "warpAffine" + s[interpolation];
|
||||
|
||||
|
||||
if(src.clCxt -> impl -> double_support != 0)
|
||||
if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
{
|
||||
cl_int st;
|
||||
coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st );
|
||||
coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st );
|
||||
openCLVerifyCall(st);
|
||||
openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0));
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0));
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -839,8 +839,8 @@ namespace cv
|
||||
{
|
||||
float_coeffs[m][n] = coeffs[m][n];
|
||||
}
|
||||
coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st );
|
||||
openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0));
|
||||
coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st );
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0));
|
||||
|
||||
}
|
||||
//TODO: improve this kernel
|
||||
@ -894,12 +894,12 @@ namespace cv
|
||||
string s[3] = {"NN", "Linear", "Cubic"};
|
||||
string kernelName = "warpPerspective" + s[interpolation];
|
||||
|
||||
if(src.clCxt -> impl -> double_support != 0)
|
||||
if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
{
|
||||
cl_int st;
|
||||
coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st );
|
||||
coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st );
|
||||
openCLVerifyCall(st);
|
||||
openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0));
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0));
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -908,9 +908,9 @@ namespace cv
|
||||
for(int n = 0; n < 3; n++)
|
||||
float_coeffs[m][n] = coeffs[m][n];
|
||||
|
||||
coeffs_cm = clCreateBuffer( clCxt->impl->clContext, CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st );
|
||||
coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st );
|
||||
openCLVerifyCall(st);
|
||||
openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0));
|
||||
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0));
|
||||
}
|
||||
//TODO: improve this kernel
|
||||
size_t blkSizeX = 16, blkSizeY = 16;
|
||||
@ -1018,7 +1018,7 @@ namespace cv
|
||||
void integral(const oclMat &src, oclMat &sum, oclMat &sqsum)
|
||||
{
|
||||
CV_Assert(src.type() == CV_8UC1);
|
||||
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "select device don't support double");
|
||||
}
|
||||
@ -1192,7 +1192,7 @@ namespace cv
|
||||
void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
|
||||
double k, int borderType)
|
||||
{
|
||||
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "select device don't support double");
|
||||
}
|
||||
@ -1206,7 +1206,7 @@ namespace cv
|
||||
|
||||
void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
|
||||
{
|
||||
if(src.clCxt->impl->double_support == 0 && src.depth() == CV_64F)
|
||||
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
|
||||
{
|
||||
CV_Error(CV_GpuNotSupported, "select device don't support double");
|
||||
}
|
||||
@ -1260,7 +1260,7 @@ namespace cv
|
||||
if( src.depth() != CV_8U || src.oclchannels() != 4 )
|
||||
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
|
||||
|
||||
// if(src.clCxt->impl->double_support == 0)
|
||||
// if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
// {
|
||||
// CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
|
||||
// }
|
||||
@ -1328,7 +1328,7 @@ namespace cv
|
||||
if( src.depth() != CV_8U || src.oclchannels() != 4 )
|
||||
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
|
||||
|
||||
// if(src.clCxt->impl->double_support == 0)
|
||||
// if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
|
||||
// {
|
||||
// CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
|
||||
// }
|
||||
|
@ -77,7 +77,7 @@ namespace cv
|
||||
ProgramCache *programCache = NULL;
|
||||
DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
|
||||
DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
|
||||
int gDevMemTypeValueMap[5] = {0,
|
||||
int gDevMemTypeValueMap[5] = {0,
|
||||
CL_MEM_ALLOC_HOST_PTR,
|
||||
CL_MEM_USE_HOST_PTR,
|
||||
CL_MEM_COPY_HOST_PTR,
|
||||
@ -124,26 +124,8 @@ namespace cv
|
||||
cacheSize = 0;
|
||||
}
|
||||
|
||||
////////////////////////Common OpenCL specific calls///////////////
|
||||
int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type)
|
||||
{
|
||||
rw_type = gDeviceMemRW;
|
||||
mem_type = gDeviceMemType;
|
||||
return Context::getContext()->impl->unified_memory;
|
||||
}
|
||||
|
||||
int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
|
||||
{
|
||||
if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
|
||||
mem_type == DEVICE_MEM_UHP ||
|
||||
mem_type == DEVICE_MEM_CHP )
|
||||
return -1;
|
||||
gDeviceMemRW = rw_type;
|
||||
gDeviceMemType = mem_type;
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct Info::Impl
|
||||
struct Info::Impl
|
||||
{
|
||||
cl_platform_id oclplatform;
|
||||
std::vector<cl_device_id> devices;
|
||||
@ -152,18 +134,144 @@ namespace cv
|
||||
cl_context oclcontext;
|
||||
cl_command_queue clCmdQueue;
|
||||
int devnum;
|
||||
cl_uint maxDimensions;
|
||||
size_t maxWorkGroupSize;
|
||||
size_t *maxWorkItemSizes;
|
||||
cl_uint maxDimensions; // == maxWorkItemSizes.size()
|
||||
std::vector<size_t> maxWorkItemSizes;
|
||||
cl_uint maxComputeUnits;
|
||||
char extra_options[512];
|
||||
int double_support;
|
||||
int unified_memory; //1 means integrated GPU, otherwise this value is 0
|
||||
string binpath;
|
||||
int refcounter;
|
||||
|
||||
Impl()
|
||||
{
|
||||
refcounter = 1;
|
||||
oclplatform = 0;
|
||||
oclcontext = 0;
|
||||
clCmdQueue = 0;
|
||||
devnum = -1;
|
||||
maxComputeUnits = 0;
|
||||
maxWorkGroupSize = 0;
|
||||
memset(extra_options, 0, 512);
|
||||
double_support = 0;
|
||||
unified_memory = 0;
|
||||
}
|
||||
|
||||
void setDevice(void *ctx, void *q, int devnum);
|
||||
|
||||
void release()
|
||||
{
|
||||
if(1 == CV_XADD(&refcounter, -1))
|
||||
{
|
||||
releaseResources();
|
||||
delete this;
|
||||
}
|
||||
}
|
||||
|
||||
Impl* copy()
|
||||
{
|
||||
CV_XADD(&refcounter, 1);
|
||||
return this;
|
||||
}
|
||||
|
||||
private:
|
||||
Impl(const Impl&);
|
||||
Impl& operator=(const Impl&);
|
||||
void releaseResources();
|
||||
};
|
||||
|
||||
void Info::Impl::releaseResources()
|
||||
{
|
||||
devnum = -1;
|
||||
|
||||
if(clCmdQueue)
|
||||
{
|
||||
openCLSafeCall(clReleaseCommandQueue(clCmdQueue));
|
||||
clCmdQueue = 0;
|
||||
}
|
||||
|
||||
if(oclcontext)
|
||||
{
|
||||
openCLSafeCall(clReleaseContext(oclcontext));
|
||||
oclcontext = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void Info::Impl::setDevice(void *ctx, void *q, int dnum)
|
||||
{
|
||||
if((ctx && q) || devnum != dnum)
|
||||
releaseResources();
|
||||
|
||||
CV_Assert(dnum >= 0 && dnum < (int)devices.size());
|
||||
devnum = dnum;
|
||||
if(ctx && q)
|
||||
{
|
||||
oclcontext = (cl_context)ctx;
|
||||
clCmdQueue = (cl_command_queue)q;
|
||||
clRetainContext(oclcontext);
|
||||
clRetainCommandQueue(clCmdQueue);
|
||||
}
|
||||
else
|
||||
{
|
||||
cl_int status = 0;
|
||||
cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(oclplatform), 0 };
|
||||
oclcontext = clCreateContext(cps, 1, &devices[devnum], 0, 0, &status);
|
||||
openCLVerifyCall(status);
|
||||
clCmdQueue = clCreateCommandQueue(oclcontext, devices[devnum], CL_QUEUE_PROFILING_ENABLE, &status);
|
||||
openCLVerifyCall(status);
|
||||
}
|
||||
|
||||
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&maxWorkGroupSize, 0));
|
||||
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void *)&maxDimensions, 0));
|
||||
maxWorkItemSizes.resize(maxDimensions);
|
||||
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDimensions, (void *)&maxWorkItemSizes[0], 0));
|
||||
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), (void *)&maxComputeUnits, 0));
|
||||
|
||||
cl_bool unfymem = false;
|
||||
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), (void *)&unfymem, 0));
|
||||
unified_memory = unfymem ? 1 : 0;
|
||||
|
||||
//initialize extra options for compilation. Currently only fp64 is included.
|
||||
//Assume 4KB is enough to store all possible extensions.
|
||||
const int EXT_LEN = 4096 + 1 ;
|
||||
char extends_set[EXT_LEN];
|
||||
size_t extends_size;
|
||||
openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size));
|
||||
extends_set[EXT_LEN - 1] = 0;
|
||||
size_t fp64_khr = std::string(extends_set).find("cl_khr_fp64");
|
||||
|
||||
if(fp64_khr != std::string::npos)
|
||||
{
|
||||
sprintf(extra_options, "-D DOUBLE_SUPPORT");
|
||||
double_support = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
memset(extra_options, 0, 512);
|
||||
double_support = 0;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////Common OpenCL specific calls///////////////
|
||||
int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type)
|
||||
{
|
||||
rw_type = gDeviceMemRW;
|
||||
mem_type = gDeviceMemType;
|
||||
return Context::getContext()->impl->unified_memory;
|
||||
}
|
||||
|
||||
int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
|
||||
{
|
||||
if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
|
||||
mem_type == DEVICE_MEM_UHP ||
|
||||
mem_type == DEVICE_MEM_CHP )
|
||||
return -1;
|
||||
gDeviceMemRW = rw_type;
|
||||
gDeviceMemType = mem_type;
|
||||
return 0;
|
||||
}
|
||||
|
||||
inline int divUp(int total, int grain)
|
||||
{
|
||||
return (total + grain - 1) / grain;
|
||||
@ -171,6 +279,9 @@ namespace cv
|
||||
|
||||
int getDevice(std::vector<Info> &oclinfo, int devicetype)
|
||||
{
|
||||
//TODO: cache oclinfo vector
|
||||
oclinfo.clear();
|
||||
|
||||
switch(devicetype)
|
||||
{
|
||||
case CVCL_DEVICE_TYPE_DEFAULT:
|
||||
@ -180,125 +291,62 @@ namespace cv
|
||||
case CVCL_DEVICE_TYPE_ALL:
|
||||
break;
|
||||
default:
|
||||
CV_Error(CV_GpuApiCallError, "Unkown device type");
|
||||
return 0;
|
||||
}
|
||||
int devcienums = 0;
|
||||
// Platform info
|
||||
cl_int status = 0;
|
||||
cl_uint numPlatforms;
|
||||
Info ocltmpinfo;
|
||||
openCLSafeCall(clGetPlatformIDs(0, NULL, &numPlatforms));
|
||||
CV_Assert(numPlatforms > 0);
|
||||
cl_platform_id *platforms = new cl_platform_id[numPlatforms];
|
||||
|
||||
openCLSafeCall(clGetPlatformIDs(numPlatforms, platforms, NULL));
|
||||
// Platform info
|
||||
cl_uint numPlatforms;
|
||||
openCLSafeCall(clGetPlatformIDs(0, 0, &numPlatforms));
|
||||
if(numPlatforms < 1) return 0;
|
||||
|
||||
std::vector<cl_platform_id> platforms(numPlatforms);
|
||||
openCLSafeCall(clGetPlatformIDs(numPlatforms, &platforms[0], 0));
|
||||
|
||||
char deviceName[256];
|
||||
int devcienums = 0;
|
||||
for (unsigned i = 0; i < numPlatforms; ++i)
|
||||
{
|
||||
cl_uint numsdev;
|
||||
status = clGetDeviceIDs(platforms[i], devicetype, 0, NULL, &numsdev);
|
||||
cl_int status = clGetDeviceIDs(platforms[i], devicetype, 0, NULL, &numsdev);
|
||||
if(status != CL_DEVICE_NOT_FOUND)
|
||||
{
|
||||
openCLVerifyCall(status);
|
||||
}
|
||||
|
||||
if(numsdev > 0)
|
||||
{
|
||||
devcienums += numsdev;
|
||||
cl_device_id *devices = new cl_device_id[numsdev];
|
||||
openCLSafeCall(clGetDeviceIDs(platforms[i], devicetype, numsdev, devices, NULL));
|
||||
std::vector<cl_device_id> devices(numsdev);
|
||||
openCLSafeCall(clGetDeviceIDs(platforms[i], devicetype, numsdev, &devices[0], 0));
|
||||
|
||||
Info ocltmpinfo;
|
||||
ocltmpinfo.impl->oclplatform = platforms[i];
|
||||
for(unsigned j = 0; j < numsdev; j++)
|
||||
for(unsigned j = 0; j < numsdev; ++j)
|
||||
{
|
||||
ocltmpinfo.impl->devices.push_back(devices[j]);
|
||||
openCLSafeCall(clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 256, deviceName, NULL));
|
||||
ocltmpinfo.impl->devName.push_back(std::string(deviceName));
|
||||
ocltmpinfo.DeviceName.push_back(std::string(deviceName));
|
||||
openCLSafeCall(clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, 0));
|
||||
ocltmpinfo.impl->devName.push_back(deviceName);
|
||||
ocltmpinfo.DeviceName.push_back(deviceName);
|
||||
}
|
||||
delete[] devices;
|
||||
oclinfo.push_back(ocltmpinfo);
|
||||
ocltmpinfo.release();
|
||||
}
|
||||
}
|
||||
delete[] platforms;
|
||||
if(devcienums > 0)
|
||||
{
|
||||
setDevice(oclinfo[0]);
|
||||
}
|
||||
return devcienums;
|
||||
}
|
||||
|
||||
static void fillClcontext(Info &oclinfo)
|
||||
{
|
||||
//get device information
|
||||
size_t devnum = oclinfo.impl->devnum;
|
||||
|
||||
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE,
|
||||
sizeof(size_t), (void *)&oclinfo.impl->maxWorkGroupSize, NULL));
|
||||
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
|
||||
sizeof(cl_uint), (void *)&oclinfo.impl->maxDimensions, NULL));
|
||||
oclinfo.impl->maxWorkItemSizes = new size_t[oclinfo.impl->maxDimensions];
|
||||
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_MAX_WORK_ITEM_SIZES,
|
||||
sizeof(size_t)*oclinfo.impl->maxDimensions, (void *)oclinfo.impl->maxWorkItemSizes, NULL));
|
||||
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_MAX_COMPUTE_UNITS,
|
||||
sizeof(cl_uint), (void *)&oclinfo.impl->maxComputeUnits, NULL));
|
||||
//initialize extra options for compilation. Currently only fp64 is included.
|
||||
//Assume 4KB is enough to store all possible extensions.
|
||||
|
||||
const int EXT_LEN = 4096 + 1 ;
|
||||
char extends_set[EXT_LEN];
|
||||
size_t extends_size;
|
||||
openCLSafeCall(clGetDeviceInfo(oclinfo.impl->devices[devnum], CL_DEVICE_EXTENSIONS,
|
||||
EXT_LEN, (void *)extends_set, &extends_size));
|
||||
CV_Assert(extends_size < (size_t)EXT_LEN);
|
||||
extends_set[EXT_LEN - 1] = 0;
|
||||
memset(oclinfo.impl->extra_options, 0, 512);
|
||||
oclinfo.impl->double_support = 0;
|
||||
int fp64_khr = string(extends_set).find("cl_khr_fp64");
|
||||
|
||||
if(fp64_khr >= 0 && fp64_khr < EXT_LEN)
|
||||
{
|
||||
sprintf(oclinfo.impl->extra_options , "-D DOUBLE_SUPPORT");
|
||||
oclinfo.impl -> double_support = 1;
|
||||
}
|
||||
Context::setContext(oclinfo);
|
||||
|
||||
}
|
||||
|
||||
void setDevice(Info &oclinfo, int devnum)
|
||||
{
|
||||
CV_Assert(devnum >= 0);
|
||||
cl_int status = 0;
|
||||
cl_context_properties cps[3] =
|
||||
{
|
||||
CL_CONTEXT_PLATFORM, (cl_context_properties)(oclinfo.impl->oclplatform), 0
|
||||
};
|
||||
oclinfo.impl->devnum = devnum;
|
||||
oclinfo.impl->oclcontext = clCreateContext(cps, 1, &oclinfo.impl->devices[devnum], NULL, NULL, &status);
|
||||
openCLVerifyCall(status);
|
||||
//create the command queue using the first device of the list
|
||||
oclinfo.impl->clCmdQueue = clCreateCommandQueue(oclinfo.impl->oclcontext, oclinfo.impl->devices[devnum],
|
||||
CL_QUEUE_PROFILING_ENABLE, &status);
|
||||
openCLVerifyCall(status);
|
||||
fillClcontext(oclinfo);
|
||||
oclinfo.impl->setDevice(0, 0, devnum);
|
||||
Context::setContext(oclinfo);
|
||||
}
|
||||
|
||||
void setDeviceEx(Info &oclinfo, void *ctx, void *q, int devnum)
|
||||
{
|
||||
CV_Assert(devnum >= 0);
|
||||
oclinfo.impl->devnum = devnum;
|
||||
if(ctx && q)
|
||||
{
|
||||
oclinfo.impl->oclcontext = (cl_context)ctx;
|
||||
oclinfo.impl->clCmdQueue = (cl_command_queue)q;
|
||||
clRetainContext((cl_context)ctx);
|
||||
clRetainCommandQueue((cl_command_queue)q);
|
||||
fillClcontext(oclinfo);
|
||||
}
|
||||
oclinfo.impl->setDevice(ctx, q, devnum);
|
||||
Context::setContext(oclinfo);
|
||||
}
|
||||
|
||||
void *getoclContext()
|
||||
{
|
||||
return &(Context::getContext()->impl->clContext);
|
||||
return &(Context::getContext()->impl->oclcontext);
|
||||
}
|
||||
|
||||
void *getoclCommandQueue()
|
||||
@ -316,7 +364,7 @@ namespace cv
|
||||
cl_mem openCLCreateBuffer(Context *clCxt, size_t flag , size_t size)
|
||||
{
|
||||
cl_int status;
|
||||
cl_mem buffer = clCreateBuffer(clCxt->impl->clContext, (cl_mem_flags)flag, size, NULL, &status);
|
||||
cl_mem buffer = clCreateBuffer(clCxt->impl->oclcontext, (cl_mem_flags)flag, size, NULL, &status);
|
||||
openCLVerifyCall(status);
|
||||
return buffer;
|
||||
}
|
||||
@ -331,8 +379,7 @@ namespace cv
|
||||
size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
|
||||
{
|
||||
cl_int status;
|
||||
|
||||
*dev_ptr = clCreateBuffer(clCxt->impl->clContext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
|
||||
*dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
|
||||
widthInBytes * height, 0, &status);
|
||||
openCLVerifyCall(status);
|
||||
*pitch = widthInBytes;
|
||||
@ -340,7 +387,7 @@ namespace cv
|
||||
|
||||
void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
|
||||
const void *src, size_t spitch,
|
||||
size_t width, size_t height, enum openCLMemcpyKind kind, int channels)
|
||||
size_t width, size_t height, openCLMemcpyKind kind, int channels)
|
||||
{
|
||||
size_t buffer_origin[3] = {0, 0, 0};
|
||||
size_t host_origin[3] = {0, 0, 0};
|
||||
@ -398,7 +445,7 @@ namespace cv
|
||||
void setBinpath(const char *path)
|
||||
{
|
||||
Context *clcxt = Context::getContext();
|
||||
clcxt->impl->Binpath = path;
|
||||
clcxt->impl->binpath = path;
|
||||
}
|
||||
|
||||
int savetofile(const Context*, cl_program &program, const char *fileName)
|
||||
@ -442,11 +489,11 @@ namespace cv
|
||||
|
||||
if(NULL != build_options)
|
||||
{
|
||||
src_sign << (int64)(*source) << clCxt->impl->clContext << "_" << build_options;
|
||||
src_sign << (int64)(*source) << clCxt->impl->oclcontext << "_" << build_options;
|
||||
}
|
||||
else
|
||||
{
|
||||
src_sign << (int64)(*source) << clCxt->impl->clContext;
|
||||
src_sign << (int64)(*source) << clCxt->impl->oclcontext;
|
||||
}
|
||||
srcsign = src_sign.str();
|
||||
|
||||
@ -466,24 +513,24 @@ namespace cv
|
||||
strcat(all_build_options, build_options);
|
||||
if(all_build_options != NULL)
|
||||
{
|
||||
filename = clCxt->impl->Binpath + kernelName + "_" + clCxt->impl->devName + all_build_options + ".clb";
|
||||
filename = clCxt->impl->binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + all_build_options + ".clb";
|
||||
}
|
||||
else
|
||||
{
|
||||
filename = clCxt->impl->Binpath + kernelName + "_" + clCxt->impl->devName + ".clb";
|
||||
filename = clCxt->impl->binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + ".clb";
|
||||
}
|
||||
|
||||
FILE *fp = fopen(filename.c_str(), "rb");
|
||||
if(fp == NULL || clCxt->impl->Binpath.size() == 0) //we should generate a binary file for the first time.
|
||||
if(fp == NULL || clCxt->impl->binpath.size() == 0) //we should generate a binary file for the first time.
|
||||
{
|
||||
if(fp != NULL)
|
||||
fclose(fp);
|
||||
|
||||
program = clCreateProgramWithSource(
|
||||
clCxt->impl->clContext, 1, source, NULL, &status);
|
||||
clCxt->impl->oclcontext, 1, source, NULL, &status);
|
||||
openCLVerifyCall(status);
|
||||
status = clBuildProgram(program, 1, &(clCxt->impl->devices), all_build_options, NULL, NULL);
|
||||
if(status == CL_SUCCESS && clCxt->impl->Binpath.size())
|
||||
status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
|
||||
if(status == CL_SUCCESS && clCxt->impl->binpath.size())
|
||||
savetofile(clCxt, program, filename.c_str());
|
||||
}
|
||||
else
|
||||
@ -495,15 +542,15 @@ namespace cv
|
||||
CV_Assert(1 == fread(binary, binarySize, 1, fp));
|
||||
fclose(fp);
|
||||
cl_int status = 0;
|
||||
program = clCreateProgramWithBinary(clCxt->impl->clContext,
|
||||
program = clCreateProgramWithBinary(clCxt->impl->oclcontext,
|
||||
1,
|
||||
&(clCxt->impl->devices),
|
||||
&(clCxt->impl->devices[clCxt->impl->devnum]),
|
||||
(const size_t *)&binarySize,
|
||||
(const unsigned char **)&binary,
|
||||
NULL,
|
||||
&status);
|
||||
openCLVerifyCall(status);
|
||||
status = clBuildProgram(program, 1, &(clCxt->impl->devices), all_build_options, NULL, NULL);
|
||||
status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
|
||||
delete[] binary;
|
||||
}
|
||||
|
||||
@ -515,14 +562,14 @@ namespace cv
|
||||
char *buildLog = NULL;
|
||||
size_t buildLogSize = 0;
|
||||
logStatus = clGetProgramBuildInfo(program,
|
||||
clCxt->impl->devices, CL_PROGRAM_BUILD_LOG, buildLogSize,
|
||||
clCxt->impl->devices[clCxt->impl->devnum], CL_PROGRAM_BUILD_LOG, buildLogSize,
|
||||
buildLog, &buildLogSize);
|
||||
if(logStatus != CL_SUCCESS)
|
||||
cout << "Failed to build the program and get the build info." << endl;
|
||||
buildLog = new char[buildLogSize];
|
||||
CV_DbgAssert(!!buildLog);
|
||||
memset(buildLog, 0, buildLogSize);
|
||||
openCLSafeCall(clGetProgramBuildInfo(program, clCxt->impl->devices,
|
||||
openCLSafeCall(clGetProgramBuildInfo(program, clCxt->impl->devices[clCxt->impl->devnum],
|
||||
CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL));
|
||||
cout << "\n\t\t\tBUILD LOG\n";
|
||||
cout << buildLog << endl;
|
||||
@ -544,13 +591,13 @@ namespace cv
|
||||
void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads)
|
||||
{
|
||||
size_t kernelWorkGroupSize;
|
||||
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices,
|
||||
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[clCxt->impl->devnum],
|
||||
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
|
||||
CV_Assert( (localThreads[0] <= clCxt->impl->maxWorkItemSizes[0]) &&
|
||||
(localThreads[1] <= clCxt->impl->maxWorkItemSizes[1]) &&
|
||||
(localThreads[2] <= clCxt->impl->maxWorkItemSizes[2]) &&
|
||||
((localThreads[0] * localThreads[1] * localThreads[2]) <= kernelWorkGroupSize) &&
|
||||
(localThreads[0] * localThreads[1] * localThreads[2]) <= clCxt->impl->maxWorkGroupSize);
|
||||
CV_Assert( localThreads[0] <= clCxt->impl->maxWorkItemSizes[0] );
|
||||
CV_Assert( localThreads[1] <= clCxt->impl->maxWorkItemSizes[1] );
|
||||
CV_Assert( localThreads[2] <= clCxt->impl->maxWorkItemSizes[2] );
|
||||
CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
|
||||
CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= clCxt->impl->maxWorkGroupSize );
|
||||
}
|
||||
|
||||
#ifdef PRINT_KERNEL_RUN_TIME
|
||||
@ -664,10 +711,10 @@ namespace cv
|
||||
cout << "average kernel total time: " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
double openCLExecuteKernelInterop(Context *clCxt , const char **source, string kernelName,
|
||||
size_t globalThreads[3], size_t localThreads[3],
|
||||
vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
|
||||
vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
|
||||
bool finish, bool measureKernelTime, bool cleanUp)
|
||||
|
||||
{
|
||||
@ -764,7 +811,7 @@ namespace cv
|
||||
f.read(str, fileSize);
|
||||
f.close();
|
||||
str[size] = '\0';
|
||||
|
||||
|
||||
s = str;
|
||||
delete[] str;
|
||||
return 0;
|
||||
@ -775,7 +822,7 @@ namespace cv
|
||||
|
||||
double openCLExecuteKernelInterop(Context *clCxt , const char **fileName, const int numFiles, string kernelName,
|
||||
size_t globalThreads[3], size_t localThreads[3],
|
||||
vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
|
||||
vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
|
||||
bool finish, bool measureKernelTime, bool cleanUp)
|
||||
|
||||
{
|
||||
@ -795,8 +842,8 @@ namespace cv
|
||||
delete []source;
|
||||
return kernelTime;
|
||||
}
|
||||
|
||||
cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
|
||||
|
||||
cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
|
||||
const size_t size)
|
||||
{
|
||||
int status;
|
||||
@ -815,142 +862,143 @@ namespace cv
|
||||
/////////////////////////////OpenCL initialization/////////////////
|
||||
auto_ptr<Context> Context::clCxt;
|
||||
int Context::val = 0;
|
||||
Mutex cs;
|
||||
Context *Context::getContext()
|
||||
static Mutex cs;
|
||||
Context* Context::getContext()
|
||||
{
|
||||
if(val == 0)
|
||||
if(*((volatile int*)&val) != 1)
|
||||
{
|
||||
AutoLock al(cs);
|
||||
if( NULL == clCxt.get())
|
||||
if(*((volatile int*)&val) != 1)
|
||||
{
|
||||
if( 0 == clCxt.get())
|
||||
clCxt.reset(new Context);
|
||||
|
||||
std::vector<Info> oclinfo;
|
||||
CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0);
|
||||
oclinfo[0].impl->setDevice(0, 0, 0);
|
||||
clCxt.get()->impl = oclinfo[0].impl->copy();
|
||||
|
||||
*((volatile int*)&val) = 1;
|
||||
}
|
||||
}
|
||||
return clCxt.get();
|
||||
}
|
||||
|
||||
void Context::setContext(Info &oclinfo)
|
||||
{
|
||||
AutoLock guard(cs);
|
||||
if(*((volatile int*)&val) != 1)
|
||||
{
|
||||
if( 0 == clCxt.get())
|
||||
clCxt.reset(new Context);
|
||||
|
||||
val = 1;
|
||||
return clCxt.get();
|
||||
clCxt.get()->impl = oclinfo.impl->copy();
|
||||
|
||||
*((volatile int*)&val) = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
return clCxt.get();
|
||||
clCxt.get()->impl->release();
|
||||
clCxt.get()->impl = oclinfo.impl->copy();
|
||||
}
|
||||
}
|
||||
void Context::setContext(Info &oclinfo)
|
||||
{
|
||||
Context *clcxt = getContext();
|
||||
clcxt->impl->clContext = oclinfo.impl->oclcontext;
|
||||
clcxt->impl->clCmdQueue = oclinfo.impl->clCmdQueue;
|
||||
clcxt->impl->devices = oclinfo.impl->devices[oclinfo.impl->devnum];
|
||||
clcxt->impl->devName = oclinfo.impl->devName[oclinfo.impl->devnum];
|
||||
clcxt->impl->maxDimensions = oclinfo.impl->maxDimensions;
|
||||
clcxt->impl->maxWorkGroupSize = oclinfo.impl->maxWorkGroupSize;
|
||||
for(size_t i=0; i<clcxt->impl->maxDimensions && i<4; i++)
|
||||
clcxt->impl->maxWorkItemSizes[i] = oclinfo.impl->maxWorkItemSizes[i];
|
||||
clcxt->impl->maxComputeUnits = oclinfo.impl->maxComputeUnits;
|
||||
clcxt->impl->double_support = oclinfo.impl->double_support;
|
||||
//extra options to recognize compiler options
|
||||
memcpy(clcxt->impl->extra_options, oclinfo.impl->extra_options, 512);
|
||||
cl_bool unfymem = false;
|
||||
openCLSafeCall(clGetDeviceInfo(clcxt->impl->devices, CL_DEVICE_HOST_UNIFIED_MEMORY,
|
||||
sizeof(cl_bool), (void *)&unfymem, NULL));
|
||||
if(unfymem)
|
||||
clcxt->impl->unified_memory = 1;
|
||||
}
|
||||
|
||||
Context::Context()
|
||||
{
|
||||
impl = new Impl;
|
||||
//Information of the OpenCL context
|
||||
impl->clContext = NULL;
|
||||
impl->clCmdQueue = NULL;
|
||||
impl->devices = NULL;
|
||||
impl->maxDimensions = 0;
|
||||
impl->maxWorkGroupSize = 0;
|
||||
for(int i=0; i<4; i++)
|
||||
impl->maxWorkItemSizes[i] = 0;
|
||||
impl->maxComputeUnits = 0;
|
||||
impl->double_support = 0;
|
||||
//extra options to recognize vendor specific fp64 extensions
|
||||
memset(impl->extra_options, 0, 512);
|
||||
impl->unified_memory = 0;
|
||||
impl = 0;
|
||||
programCache = ProgramCache::getProgramCache();
|
||||
}
|
||||
|
||||
Context::~Context()
|
||||
{
|
||||
delete impl;
|
||||
release();
|
||||
}
|
||||
|
||||
void Context::release()
|
||||
{
|
||||
if (impl)
|
||||
impl->release();
|
||||
programCache->releaseProgram();
|
||||
}
|
||||
|
||||
bool Context::supportsFeature(int ftype)
|
||||
{
|
||||
switch(ftype)
|
||||
{
|
||||
case CL_DOUBLE:
|
||||
return impl->double_support == 1;
|
||||
case CL_UNIFIED_MEM:
|
||||
return impl->unified_memory == 1;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
size_t Context::computeUnits()
|
||||
{
|
||||
return impl->maxComputeUnits;
|
||||
}
|
||||
|
||||
void* Context::oclContext()
|
||||
{
|
||||
return impl->oclcontext;
|
||||
}
|
||||
|
||||
void* Context::oclCommandQueue()
|
||||
{
|
||||
return impl->clCmdQueue;
|
||||
}
|
||||
|
||||
Info::Info()
|
||||
{
|
||||
impl = new Impl;
|
||||
impl->oclplatform = 0;
|
||||
impl->oclcontext = 0;
|
||||
impl->clCmdQueue = 0;
|
||||
impl->devnum = 0;
|
||||
impl->maxDimensions = 0;
|
||||
impl->maxWorkGroupSize = 0;
|
||||
impl->maxWorkItemSizes = 0;
|
||||
impl->maxComputeUnits = 0;
|
||||
impl->double_support = 0;
|
||||
//extra_options = 0;
|
||||
}
|
||||
|
||||
void Info::release()
|
||||
{
|
||||
fft_teardown();
|
||||
if(impl->oclplatform)
|
||||
{
|
||||
impl->oclplatform = 0;
|
||||
}
|
||||
if(impl->clCmdQueue)
|
||||
{
|
||||
openCLSafeCall(clReleaseCommandQueue(impl->clCmdQueue));
|
||||
}
|
||||
ProgramCache::getProgramCache()->releaseProgram();
|
||||
if(impl->oclcontext)
|
||||
{
|
||||
openCLSafeCall(clReleaseContext(impl->oclcontext));
|
||||
}
|
||||
if(impl->maxWorkItemSizes)
|
||||
{
|
||||
delete[] impl->maxWorkItemSizes;
|
||||
impl->maxWorkItemSizes = 0;
|
||||
}
|
||||
//if(extra_options)
|
||||
//{
|
||||
// delete[] extra_options;
|
||||
// extra_options = 0;
|
||||
//}
|
||||
impl->devices.clear();
|
||||
impl->devName.clear();
|
||||
impl->release();
|
||||
impl = new Impl;
|
||||
DeviceName.clear();
|
||||
}
|
||||
|
||||
Info::~Info()
|
||||
{
|
||||
release();
|
||||
delete impl;
|
||||
fft_teardown();
|
||||
impl->release();
|
||||
}
|
||||
|
||||
Info &Info::operator = (const Info &m)
|
||||
{
|
||||
impl->oclplatform = m.impl->oclplatform;
|
||||
impl->oclcontext = m.impl->oclcontext;
|
||||
impl->clCmdQueue = m.impl->clCmdQueue;
|
||||
impl->devnum = m.impl->devnum;
|
||||
impl->maxDimensions = m.impl->maxDimensions;
|
||||
impl->maxWorkGroupSize = m.impl->maxWorkGroupSize;
|
||||
impl->maxWorkItemSizes = m.impl->maxWorkItemSizes;
|
||||
impl->maxComputeUnits = m.impl->maxComputeUnits;
|
||||
impl->double_support = m.impl->double_support;
|
||||
memcpy(impl->extra_options, m.impl->extra_options, 512);
|
||||
for(size_t i = 0; i < m.impl->devices.size(); i++)
|
||||
{
|
||||
impl->devices.push_back(m.impl->devices[i]);
|
||||
impl->devName.push_back(m.impl->devName[i]);
|
||||
DeviceName.push_back(m.DeviceName[i]);
|
||||
}
|
||||
impl->release();
|
||||
impl = m.impl->copy();
|
||||
DeviceName = m.DeviceName;
|
||||
return *this;
|
||||
}
|
||||
|
||||
Info::Info(const Info &m)
|
||||
{
|
||||
impl = new Impl;
|
||||
*this = m;
|
||||
impl = m.impl->copy();
|
||||
DeviceName = m.DeviceName;
|
||||
}
|
||||
}//namespace ocl
|
||||
|
||||
}//namespace cv
|
||||
|
||||
#if defined BUILD_SHARED_LIBS && defined CVAPI_EXPORTS && defined WIN32 && !defined WINCE
|
||||
#include <windows.h>
|
||||
BOOL WINAPI DllMain( HINSTANCE, DWORD fdwReason, LPVOID );
|
||||
|
||||
BOOL WINAPI DllMain( HINSTANCE, DWORD fdwReason, LPVOID )
|
||||
{
|
||||
if( fdwReason == DLL_PROCESS_DETACH )
|
||||
{
|
||||
// application hangs if call clReleaseCommandQueue here, so release context only
|
||||
// without context release application hangs as well
|
||||
cl_context ctx = (cl_context)getoclContext();
|
||||
if(ctx)
|
||||
openCLSafeCall(clReleaseContext(ctx));
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
#endif
|
||||
|
@ -43,9 +43,7 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include <iomanip>
|
||||
#include "precomp.hpp"
|
||||
#include "mcwutil.hpp"
|
||||
|
||||
using namespace std;
|
||||
using namespace cv;
|
||||
|
@ -1,865 +0,0 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
|
||||
#define MAX_FLOAT 1e7f
|
||||
|
||||
int bit1Count(float x)
|
||||
{
|
||||
int c = 0;
|
||||
int ix = (int)x;
|
||||
|
||||
for (int i = 0 ; i < 32 ; i++)
|
||||
{
|
||||
c += ix & 0x1;
|
||||
ix >>= 1;
|
||||
}
|
||||
|
||||
return (float)c;
|
||||
}
|
||||
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
|
||||
local size: dim0 is block_size, dim1 is block_size.
|
||||
*/
|
||||
__kernel void BruteForceMatch_UnrollMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * max_desc_len;
|
||||
|
||||
int queryIdx = groupidx * block_size + lidy;
|
||||
|
||||
// load the query into local memory.
|
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
||||
{
|
||||
int loadx = lidx + i * block_size;
|
||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
}
|
||||
|
||||
float myBestDistance = MAX_FLOAT;
|
||||
int myBestTrainIdx = -1;
|
||||
|
||||
// loopUnrolledCached to find the best trainIdx and best distance.
|
||||
volatile int imgIdx = 0;
|
||||
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
myBestDistance = result;
|
||||
myBestTrainIdx = trainIdx;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
__local float *s_distance = (__local float *)(sharebuffer);
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
//find BestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
s_distance[lidx] = myBestDistance;
|
||||
s_trainIdx[lidx] = myBestTrainIdx;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
//reduce -- now all reduce implement in each threads.
|
||||
for (int k = 0 ; k < block_size; k++)
|
||||
{
|
||||
if (myBestDistance > s_distance[k])
|
||||
{
|
||||
myBestDistance = s_distance[k];
|
||||
myBestTrainIdx = s_trainIdx[k];
|
||||
}
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = myBestTrainIdx;
|
||||
bestDistance[queryIdx] = myBestDistance;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BruteForceMatch_Match(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * block_size + lidy;
|
||||
|
||||
float myBestDistance = MAX_FLOAT;
|
||||
int myBestTrainIdx = -1;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
// loop
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
//Dist dist;
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
|
||||
{
|
||||
const int loadx = lidx + i * block_size;
|
||||
//load query and train into local memory
|
||||
s_query[lidy * block_size + lidx] = 0;
|
||||
s_train[lidx * block_size + lidy] = 0;
|
||||
|
||||
if (loadx < query_cols)
|
||||
{
|
||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
//myBestImgidx = imgIdx;
|
||||
myBestDistance = result;
|
||||
myBestTrainIdx = trainIdx;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
//findBestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
s_distance[lidx] = myBestDistance;
|
||||
s_trainIdx[lidx] = myBestTrainIdx;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
//reduce -- now all reduce implement in each threads.
|
||||
for (int k = 0 ; k < block_size; k++)
|
||||
{
|
||||
if (myBestDistance > s_distance[k])
|
||||
{
|
||||
myBestDistance = s_distance[k];
|
||||
myBestTrainIdx = s_trainIdx[k];
|
||||
}
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = myBestTrainIdx;
|
||||
bestDistance[queryIdx] = myBestDistance;
|
||||
}
|
||||
}
|
||||
|
||||
//radius_unrollmatch
|
||||
__kernel void BruteForceMatch_RadiusUnrollMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
float maxDistance,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__global int *nMatches,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int bestTrainIdx_cols,
|
||||
int step,
|
||||
int ostep,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
const int groupidy = get_group_id(1);
|
||||
|
||||
const int queryIdx = groupidy * block_size + lidy;
|
||||
const int trainIdx = groupidx * block_size + lidx;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; ++i)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
|
||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||
|
||||
if (ind < bestTrainIdx_cols)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//radius_match
|
||||
__kernel void BruteForceMatch_RadiusMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
float maxDistance,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__global int *nMatches,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int bestTrainIdx_cols,
|
||||
int step,
|
||||
int ostep,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
const int groupidy = get_group_id(1);
|
||||
|
||||
const int queryIdx = groupidy * block_size + lidy;
|
||||
const int trainIdx = groupidx * block_size + lidx;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
|
||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||
|
||||
if (ind < bestTrainIdx_cols)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void BruteForceMatch_knnUnrollMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * block_size + lidy;
|
||||
local float *s_query = sharebuffer;
|
||||
local float *s_train = sharebuffer + block_size * max_desc_len;
|
||||
|
||||
// load the query into local memory.
|
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
||||
{
|
||||
int loadx = lidx + i * block_size;
|
||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
}
|
||||
|
||||
float myBestDistance1 = MAX_FLOAT;
|
||||
float myBestDistance2 = MAX_FLOAT;
|
||||
int myBestTrainIdx1 = -1;
|
||||
int myBestTrainIdx2 = -1;
|
||||
|
||||
//loopUnrolledCached
|
||||
volatile int imgIdx = 0;
|
||||
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
||||
{
|
||||
const int loadX = lidx + i * block_size;
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows)
|
||||
{
|
||||
if (result < myBestDistance1)
|
||||
{
|
||||
myBestDistance2 = myBestDistance1;
|
||||
myBestTrainIdx2 = myBestTrainIdx1;
|
||||
myBestDistance1 = result;
|
||||
myBestTrainIdx1 = trainIdx;
|
||||
}
|
||||
else if (result < myBestDistance2)
|
||||
{
|
||||
myBestDistance2 = result;
|
||||
myBestTrainIdx2 = trainIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
local float *s_distance = (local float *)sharebuffer;
|
||||
local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
// find BestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
|
||||
s_distance[lidx] = myBestDistance1;
|
||||
s_trainIdx[lidx] = myBestTrainIdx1;
|
||||
|
||||
float bestDistance1 = MAX_FLOAT;
|
||||
float bestDistance2 = MAX_FLOAT;
|
||||
int bestTrainIdx1 = -1;
|
||||
int bestTrainIdx2 = -1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance1)
|
||||
{
|
||||
bestDistance2 = bestDistance1;
|
||||
bestTrainIdx2 = bestTrainIdx1;
|
||||
|
||||
bestDistance1 = val;
|
||||
bestTrainIdx1 = s_trainIdx[i];
|
||||
}
|
||||
else if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
s_distance[lidx] = myBestDistance2;
|
||||
s_trainIdx[lidx] = myBestTrainIdx2;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
myBestDistance1 = bestDistance1;
|
||||
myBestDistance2 = bestDistance2;
|
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1;
|
||||
myBestTrainIdx2 = bestTrainIdx2;
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
|
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BruteForceMatch_knnMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * block_size + lidy;
|
||||
local float *s_query = sharebuffer;
|
||||
local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
float myBestDistance1 = MAX_FLOAT;
|
||||
float myBestDistance2 = MAX_FLOAT;
|
||||
int myBestTrainIdx1 = -1;
|
||||
int myBestTrainIdx2 = -1;
|
||||
|
||||
//loop
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
float result = 0.0f;
|
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
|
||||
{
|
||||
const int loadx = lidx + i * block_size;
|
||||
//load query and train into local memory
|
||||
s_query[lidy * block_size + lidx] = 0;
|
||||
s_train[lidx * block_size + lidy] = 0;
|
||||
|
||||
if (loadx < query_cols)
|
||||
{
|
||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
if (result < myBestDistance1)
|
||||
{
|
||||
myBestDistance2 = myBestDistance1;
|
||||
myBestTrainIdx2 = myBestTrainIdx1;
|
||||
myBestDistance1 = result;
|
||||
myBestTrainIdx1 = trainIdx;
|
||||
}
|
||||
else if (result < myBestDistance2)
|
||||
{
|
||||
myBestDistance2 = result;
|
||||
myBestTrainIdx2 = trainIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
//findBestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
|
||||
s_distance[lidx] = myBestDistance1;
|
||||
s_trainIdx[lidx] = myBestTrainIdx1;
|
||||
|
||||
float bestDistance1 = MAX_FLOAT;
|
||||
float bestDistance2 = MAX_FLOAT;
|
||||
int bestTrainIdx1 = -1;
|
||||
int bestTrainIdx2 = -1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance1)
|
||||
{
|
||||
bestDistance2 = bestDistance1;
|
||||
bestTrainIdx2 = bestTrainIdx1;
|
||||
|
||||
bestDistance1 = val;
|
||||
bestTrainIdx1 = s_trainIdx[i];
|
||||
}
|
||||
else if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
s_distance[lidx] = myBestDistance2;
|
||||
s_trainIdx[lidx] = myBestTrainIdx2;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
myBestDistance1 = bestDistance1;
|
||||
myBestDistance2 = bestDistance2;
|
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1;
|
||||
myBestTrainIdx2 = bestTrainIdx2;
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
|
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_calcDistanceUnrolled(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_calcDistance(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_findBestMatch(
|
||||
__global float *allDist,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
int k,
|
||||
int block_size
|
||||
)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
@ -190,7 +190,7 @@ void cv::ocl::oclMat::upload(const Mat &m)
|
||||
int pitch = wholeSize.width * 3 * m.elemSize1();
|
||||
int tail_padding = m.elemSize1() * 3072;
|
||||
int err;
|
||||
cl_mem temp = clCreateBuffer(clCxt->impl->clContext, CL_MEM_READ_WRITE,
|
||||
cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
|
||||
(pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
|
||||
openCLVerifyCall(err);
|
||||
|
||||
@ -242,7 +242,7 @@ void cv::ocl::oclMat::download(cv::Mat &m) const
|
||||
int pitch = wholecols * 3 * m.elemSize1();
|
||||
int tail_padding = m.elemSize1() * 3072;
|
||||
int err;
|
||||
cl_mem temp = clCreateBuffer(clCxt->impl->clContext, CL_MEM_READ_WRITE,
|
||||
cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
|
||||
(pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
|
||||
openCLVerifyCall(err);
|
||||
|
||||
@ -595,7 +595,7 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, stri
|
||||
#ifdef CL_VERSION_1_2
|
||||
if(dst.offset == 0 && dst.cols == dst.wholecols)
|
||||
{
|
||||
clEnqueueFillBuffer(dst.clCxt->impl->clCmdQueue, (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
|
||||
clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(), (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -43,17 +43,14 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "mcwutil.hpp"
|
||||
#include "precomp.hpp"
|
||||
|
||||
#if defined (HAVE_OPENCL)
|
||||
#ifndef CL_VERSION_1_2
|
||||
#define CL_VERSION_1_2 0
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
@ -94,15 +91,15 @@ namespace cv
|
||||
for(size_t i = 0; i < args.size(); i ++)
|
||||
openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
|
||||
|
||||
openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
|
||||
openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 3, NULL, globalThreads,
|
||||
localThreads, 0, NULL, NULL));
|
||||
|
||||
switch(finish_mode)
|
||||
{
|
||||
case CLFINISH:
|
||||
clFinish(clCxt->impl->clCmdQueue);
|
||||
clFinish((cl_command_queue)clCxt->oclCommandQueue());
|
||||
case CLFLUSH:
|
||||
clFlush(clCxt->impl->clCmdQueue);
|
||||
clFlush((cl_command_queue)clCxt->oclCommandQueue());
|
||||
break;
|
||||
case DISABLE:
|
||||
default:
|
||||
@ -126,7 +123,7 @@ namespace cv
|
||||
openCLExecuteKernel_2(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
|
||||
build_options, finish_mode);
|
||||
}
|
||||
|
||||
|
||||
cl_mem bindTexture(const oclMat &mat)
|
||||
{
|
||||
cl_mem texture;
|
||||
@ -177,10 +174,10 @@ namespace cv
|
||||
desc.buffer = NULL;
|
||||
desc.num_mip_levels = 0;
|
||||
desc.num_samples = 0;
|
||||
texture = clCreateImage(mat.clCxt->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
|
||||
texture = clCreateImage((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
|
||||
#else
|
||||
texture = clCreateImage2D(
|
||||
mat.clCxt->impl->clContext,
|
||||
(cl_context)mat.clCxt->oclContext(),
|
||||
CL_MEM_READ_WRITE,
|
||||
&format,
|
||||
mat.cols,
|
||||
@ -195,10 +192,10 @@ namespace cv
|
||||
cl_mem devData;
|
||||
if (mat.cols * mat.elemSize() != mat.step)
|
||||
{
|
||||
devData = clCreateBuffer(mat.clCxt->impl->clContext, CL_MEM_READ_ONLY, mat.cols * mat.rows
|
||||
devData = clCreateBuffer((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_ONLY, mat.cols * mat.rows
|
||||
* mat.elemSize(), NULL, NULL);
|
||||
const size_t regin[3] = {mat.cols * mat.elemSize(), mat.rows, 1};
|
||||
clEnqueueCopyBufferRect(mat.clCxt->impl->clCmdQueue, (cl_mem)mat.data, devData, origin, origin,
|
||||
clEnqueueCopyBufferRect((cl_command_queue)mat.clCxt->oclCommandQueue(), (cl_mem)mat.data, devData, origin, origin,
|
||||
regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
|
||||
}
|
||||
else
|
||||
@ -206,10 +203,10 @@ namespace cv
|
||||
devData = (cl_mem)mat.data;
|
||||
}
|
||||
|
||||
clEnqueueCopyBufferToImage(mat.clCxt->impl->clCmdQueue, devData, texture, 0, origin, region, 0, NULL, 0);
|
||||
clEnqueueCopyBufferToImage((cl_command_queue)mat.clCxt->oclCommandQueue(), devData, texture, 0, origin, region, 0, NULL, 0);
|
||||
if ((mat.cols * mat.elemSize() != mat.step))
|
||||
{
|
||||
clFinish(mat.clCxt->impl->clCmdQueue);
|
||||
clFinish((cl_command_queue)mat.clCxt->oclCommandQueue());
|
||||
clReleaseMemObject(devData);
|
||||
}
|
||||
|
||||
@ -234,7 +231,7 @@ namespace cv
|
||||
try
|
||||
{
|
||||
cv::ocl::openCLGetKernelFromSource(clCxt, &_kernel_string, "test_func");
|
||||
_support = true;
|
||||
//_support = true;
|
||||
}
|
||||
catch (const cv::Exception& e)
|
||||
{
|
||||
@ -254,4 +251,3 @@ namespace cv
|
||||
}//namespace ocl
|
||||
|
||||
}//namespace cv
|
||||
#endif
|
@ -1,81 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Peng Xiao, pengxiao@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef _OPENCV_MCWUTIL_
|
||||
#define _OPENCV_MCWUTIL_
|
||||
|
||||
#include "precomp.hpp"
|
||||
using namespace std;
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
{
|
||||
enum FLUSH_MODE
|
||||
{
|
||||
CLFINISH = 0,
|
||||
CLFLUSH,
|
||||
DISABLE
|
||||
};
|
||||
void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
|
||||
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
|
||||
void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
|
||||
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
|
||||
int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
|
||||
// bind oclMat to OpenCL image textures
|
||||
// note:
|
||||
// 1. there is no memory management. User need to explicitly release the resource
|
||||
// 2. for faster clamping, there is no buffer padding for the constructed texture
|
||||
cl_mem bindTexture(const oclMat &mat);
|
||||
void releaseTexture(cl_mem& texture);
|
||||
|
||||
// returns whether the current context supports image2d_t format or not
|
||||
bool support_image2d(Context *clCxt = Context::getContext());
|
||||
|
||||
}//namespace ocl
|
||||
|
||||
}//namespace cv
|
||||
|
||||
#endif //_OPENCV_MCWUTIL_
|
@ -106,7 +106,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
|
||||
|
||||
bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2;
|
||||
|
||||
if (!cv::ocl::Context::getContext()->impl->double_support && is_float)
|
||||
if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE) && is_float)
|
||||
{
|
||||
CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!");
|
||||
}
|
||||
@ -146,7 +146,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
|
||||
|
||||
cv::Mat dst(dst_a);
|
||||
a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0;
|
||||
if (!cv::ocl::Context::getContext()->impl->double_support)
|
||||
if (!cv::ocl::Context::getContext()->supportsFeature(Context::CL_DOUBLE))
|
||||
{
|
||||
for (int i = 0; i < contour->total; ++i)
|
||||
{
|
||||
|
@ -61,29 +61,29 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
{
|
||||
|
||||
x = x << 2;
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
uchar4 src1_data ,src2_data;
|
||||
uchar4 src1_data ,src2_data;
|
||||
|
||||
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
|
||||
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
|
||||
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
|
||||
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
|
||||
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
|
||||
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
|
||||
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
|
||||
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
|
||||
|
||||
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
|
||||
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
|
||||
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
|
||||
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
|
||||
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
|
||||
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
|
||||
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
|
||||
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
|
||||
@ -117,14 +117,14 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
{
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -177,14 +177,14 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
{
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -236,18 +236,18 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
{
|
||||
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
|
||||
|
||||
#define dst_align ((dst_offset >> bitOfInt) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
|
||||
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
|
||||
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
|
||||
@ -256,7 +256,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
@ -299,16 +299,16 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
{
|
||||
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
|
||||
@ -361,16 +361,16 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
{
|
||||
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
|
@ -63,8 +63,8 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -75,14 +75,14 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -113,8 +113,8 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -126,14 +126,14 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
|
||||
char4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -164,8 +164,8 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -177,14 +177,14 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -216,8 +216,8 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -229,14 +229,14 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -320,4 +320,3 @@ __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -62,7 +62,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -72,7 +72,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = ~ src1_data;
|
||||
|
||||
|
||||
/* if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
@ -102,7 +102,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -136,7 +136,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -171,7 +171,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -245,14 +245,13 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o
|
||||
{
|
||||
int src_index = mad24(y, src_step, (x << 3) + src_offset);
|
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
|
||||
|
||||
|
||||
char8 data;
|
||||
|
||||
data = *((__global char8 *)((__global char *)src + src_index));
|
||||
data = ~ data;
|
||||
|
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = data;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -63,8 +63,8 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -111,8 +111,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -148,8 +148,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -186,8 +186,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -276,4 +276,3 @@ __kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_s
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -911,4 +911,3 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1078,4 +1078,3 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -63,8 +63,8 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -76,14 +76,14 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -113,8 +113,8 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -126,14 +126,14 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
|
||||
char4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
char4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -164,8 +164,8 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -177,14 +177,14 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -216,8 +216,8 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
@ -231,14 +231,14 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
|
||||
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
@ -324,4 +324,3 @@ __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -63,31 +63,31 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
|
||||
|
||||
@ -115,29 +115,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
|
||||
@ -166,32 +166,32 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
|
||||
|
||||
@ -215,32 +215,32 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
|
||||
@ -266,22 +266,22 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
|
||||
@ -308,29 +308,29 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
|
||||
@ -359,31 +359,31 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
|
||||
|
||||
@ -410,31 +410,31 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
|
||||
|
||||
@ -463,29 +463,29 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -512,31 +512,31 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
|
||||
|
||||
@ -561,29 +561,29 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
|
||||
@ -610,29 +610,29 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
|
||||
@ -661,30 +661,30 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -715,30 +715,30 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@ -770,30 +770,30 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -821,30 +821,30 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
|
||||
|
||||
@ -870,30 +870,30 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
if(src1_index < 0)
|
||||
{
|
||||
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
|
||||
@ -921,28 +921,28 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 3)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
} uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
} uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
|
||||
|
||||
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
|
||||
@ -954,4 +954,3 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -59,29 +59,29 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
@ -111,29 +111,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
@ -163,29 +163,29 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
@ -211,30 +211,30 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
@ -260,28 +260,28 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
|
||||
@ -307,29 +307,29 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
|
||||
@ -344,7 +344,7 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/***********************************Compare LT*******************************/
|
||||
__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
|
||||
__global uchar *src2, int src2_step, int src2_offset,
|
||||
@ -359,29 +359,29 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
|
||||
@ -411,30 +411,30 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
|
||||
@ -464,29 +464,29 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -513,34 +513,34 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
|
||||
|
||||
@ -565,29 +565,29 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
|
||||
@ -614,29 +614,29 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
|
||||
@ -665,29 +665,29 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align (dst_offset & 3)
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
|
||||
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
|
||||
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
if(src1_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
uchar4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -718,29 +718,29 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
ushort4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -771,29 +771,29 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 1) & 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
|
||||
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
short4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
@ -820,29 +820,29 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
|
||||
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
|
||||
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src1_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
int4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data =convert_uchar4((src1_data <= src2_data));
|
||||
@ -868,28 +868,28 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 2)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
float4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
|
||||
@ -916,29 +916,29 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
|
||||
{
|
||||
x = x << 2;
|
||||
#define dst_align ((dst_offset >> 3)& 3)
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
|
||||
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
|
||||
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
|
||||
if(src1_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
|
||||
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
if(src2_index < 0)
|
||||
{
|
||||
double4 tmp;
|
||||
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
|
||||
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
|
||||
}
|
||||
|
||||
|
||||
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
|
||||
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
|
||||
@ -952,5 +952,3 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -455,5 +455,3 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -60,17 +60,17 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
|
||||
{
|
||||
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
|
||||
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
|
||||
@ -125,16 +125,16 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
|
||||
int y = get_global_id(1);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
|
||||
|
||||
|
||||
{
|
||||
|
||||
|
||||
x = x << 2;
|
||||
|
||||
#define dst_align ((dst_offset >> 2) & 3)
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
|
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
|
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset);
|
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
|
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
|
||||
@ -148,8 +148,8 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
|
||||
src1_data.s01234567 = src1_data.s45670123;
|
||||
if(src1_index== -2)
|
||||
src1_data.s01234567 = src1_data.s23456701;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
|
||||
|
@ -240,4 +240,3 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
|
||||
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]);
|
||||
}
|
||||
}
|
||||
|
@ -194,4 +194,3 @@ __kernel void arithm_op_minMax_mask (int cols,int invalid_cols,int offset,int el
|
||||
dst[gid + groupnum] = localmem_max[0];
|
||||
}
|
||||
}
|
||||
|
@ -203,4 +203,3 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in
|
||||
dst[gid] = localmem_sum[0];
|
||||
}
|
||||
}
|
||||
|
@ -245,4 +245,3 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,
|
||||
dst[gid*3+2] = localmem_sum3[0];
|
||||
}
|
||||
}
|
||||
|
@ -15,7 +15,7 @@
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Liu Liujun, liujun@multicorewareinc.com
|
||||
// Liu Liujun, liujun@multicorewareinc.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@ -61,7 +61,7 @@ __kernel void BlendLinear_C1_D0(
|
||||
int pos = mad24(idy,istep >> 2,idx);
|
||||
int wpos = mad24(idy,wstep >> 2,idx);
|
||||
float4 w1 = weight1[wpos], w2 = weight2[wpos];
|
||||
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
|
||||
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
|
||||
convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
|
||||
}
|
||||
}
|
||||
@ -86,7 +86,7 @@ __kernel void BlendLinear_C4_D0(
|
||||
int wpos = mad24(idy,wstep, idx);
|
||||
float w1 = weight1[wpos];
|
||||
float w2 = weight2[wpos];
|
||||
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
|
||||
dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
|
||||
convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
|
||||
}
|
||||
}
|
||||
@ -138,4 +138,3 @@ __kernel void BlendLinear_C4_D5(
|
||||
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
|
||||
}
|
||||
}
|
||||
|
865
modules/ocl/src/opencl/brute_force_match.cl
Normal file
865
modules/ocl/src/opencl/brute_force_match.cl
Normal file
@ -0,0 +1,865 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
|
||||
#define MAX_FLOAT 1e7f
|
||||
|
||||
int bit1Count(float x)
|
||||
{
|
||||
int c = 0;
|
||||
int ix = (int)x;
|
||||
|
||||
for (int i = 0 ; i < 32 ; i++)
|
||||
{
|
||||
c += ix & 0x1;
|
||||
ix >>= 1;
|
||||
}
|
||||
|
||||
return (float)c;
|
||||
}
|
||||
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
|
||||
local size: dim0 is block_size, dim1 is block_size.
|
||||
*/
|
||||
__kernel void BruteForceMatch_UnrollMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * max_desc_len;
|
||||
|
||||
int queryIdx = groupidx * block_size + lidy;
|
||||
|
||||
// load the query into local memory.
|
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
||||
{
|
||||
int loadx = lidx + i * block_size;
|
||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
}
|
||||
|
||||
float myBestDistance = MAX_FLOAT;
|
||||
int myBestTrainIdx = -1;
|
||||
|
||||
// loopUnrolledCached to find the best trainIdx and best distance.
|
||||
volatile int imgIdx = 0;
|
||||
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
myBestDistance = result;
|
||||
myBestTrainIdx = trainIdx;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
__local float *s_distance = (__local float *)(sharebuffer);
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
//find BestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
s_distance[lidx] = myBestDistance;
|
||||
s_trainIdx[lidx] = myBestTrainIdx;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
//reduce -- now all reduce implement in each threads.
|
||||
for (int k = 0 ; k < block_size; k++)
|
||||
{
|
||||
if (myBestDistance > s_distance[k])
|
||||
{
|
||||
myBestDistance = s_distance[k];
|
||||
myBestTrainIdx = s_trainIdx[k];
|
||||
}
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = myBestTrainIdx;
|
||||
bestDistance[queryIdx] = myBestDistance;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BruteForceMatch_Match(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * block_size + lidy;
|
||||
|
||||
float myBestDistance = MAX_FLOAT;
|
||||
int myBestTrainIdx = -1;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
// loop
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
//Dist dist;
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
|
||||
{
|
||||
const int loadx = lidx + i * block_size;
|
||||
//load query and train into local memory
|
||||
s_query[lidy * block_size + lidx] = 0;
|
||||
s_train[lidx * block_size + lidy] = 0;
|
||||
|
||||
if (loadx < query_cols)
|
||||
{
|
||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
//myBestImgidx = imgIdx;
|
||||
myBestDistance = result;
|
||||
myBestTrainIdx = trainIdx;
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
//findBestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
s_distance[lidx] = myBestDistance;
|
||||
s_trainIdx[lidx] = myBestTrainIdx;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
//reduce -- now all reduce implement in each threads.
|
||||
for (int k = 0 ; k < block_size; k++)
|
||||
{
|
||||
if (myBestDistance > s_distance[k])
|
||||
{
|
||||
myBestDistance = s_distance[k];
|
||||
myBestTrainIdx = s_trainIdx[k];
|
||||
}
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = myBestTrainIdx;
|
||||
bestDistance[queryIdx] = myBestDistance;
|
||||
}
|
||||
}
|
||||
|
||||
//radius_unrollmatch
|
||||
__kernel void BruteForceMatch_RadiusUnrollMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
float maxDistance,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__global int *nMatches,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int bestTrainIdx_cols,
|
||||
int step,
|
||||
int ostep,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
const int groupidy = get_group_id(1);
|
||||
|
||||
const int queryIdx = groupidy * block_size + lidy;
|
||||
const int trainIdx = groupidx * block_size + lidx;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; ++i)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
|
||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||
|
||||
if (ind < bestTrainIdx_cols)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//radius_match
|
||||
__kernel void BruteForceMatch_RadiusMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
float maxDistance,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__global int *nMatches,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int bestTrainIdx_cols,
|
||||
int step,
|
||||
int ostep,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
const int groupidy = get_group_id(1);
|
||||
|
||||
const int queryIdx = groupidy * block_size + lidy;
|
||||
const int trainIdx = groupidx * block_size + lidx;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
|
||||
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; ++j)
|
||||
{
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||
|
||||
if (ind < bestTrainIdx_cols)
|
||||
{
|
||||
//bestImgIdx = imgIdx;
|
||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__kernel void BruteForceMatch_knnUnrollMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * block_size + lidy;
|
||||
local float *s_query = sharebuffer;
|
||||
local float *s_train = sharebuffer + block_size * max_desc_len;
|
||||
|
||||
// load the query into local memory.
|
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
||||
{
|
||||
int loadx = lidx + i * block_size;
|
||||
s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
}
|
||||
|
||||
float myBestDistance1 = MAX_FLOAT;
|
||||
float myBestDistance2 = MAX_FLOAT;
|
||||
int myBestTrainIdx1 = -1;
|
||||
int myBestTrainIdx2 = -1;
|
||||
|
||||
//loopUnrolledCached
|
||||
volatile int imgIdx = 0;
|
||||
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
float result = 0;
|
||||
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
||||
{
|
||||
const int loadX = lidx + i * block_size;
|
||||
//load a block_size * block_size block into local train.
|
||||
const int loadx = lidx + i * block_size;
|
||||
s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows)
|
||||
{
|
||||
if (result < myBestDistance1)
|
||||
{
|
||||
myBestDistance2 = myBestDistance1;
|
||||
myBestTrainIdx2 = myBestTrainIdx1;
|
||||
myBestDistance1 = result;
|
||||
myBestTrainIdx1 = trainIdx;
|
||||
}
|
||||
else if (result < myBestDistance2)
|
||||
{
|
||||
myBestDistance2 = result;
|
||||
myBestTrainIdx2 = trainIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
local float *s_distance = (local float *)sharebuffer;
|
||||
local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
// find BestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
|
||||
s_distance[lidx] = myBestDistance1;
|
||||
s_trainIdx[lidx] = myBestTrainIdx1;
|
||||
|
||||
float bestDistance1 = MAX_FLOAT;
|
||||
float bestDistance2 = MAX_FLOAT;
|
||||
int bestTrainIdx1 = -1;
|
||||
int bestTrainIdx2 = -1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance1)
|
||||
{
|
||||
bestDistance2 = bestDistance1;
|
||||
bestTrainIdx2 = bestTrainIdx1;
|
||||
|
||||
bestDistance1 = val;
|
||||
bestTrainIdx1 = s_trainIdx[i];
|
||||
}
|
||||
else if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
s_distance[lidx] = myBestDistance2;
|
||||
s_trainIdx[lidx] = myBestTrainIdx2;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
myBestDistance1 = bestDistance1;
|
||||
myBestDistance2 = bestDistance2;
|
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1;
|
||||
myBestTrainIdx2 = bestTrainIdx2;
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
|
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BruteForceMatch_knnMatch(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType
|
||||
)
|
||||
{
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * block_size + lidy;
|
||||
local float *s_query = sharebuffer;
|
||||
local float *s_train = sharebuffer + block_size * block_size;
|
||||
|
||||
float myBestDistance1 = MAX_FLOAT;
|
||||
float myBestDistance2 = MAX_FLOAT;
|
||||
int myBestTrainIdx1 = -1;
|
||||
int myBestTrainIdx2 = -1;
|
||||
|
||||
//loop
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
{
|
||||
float result = 0.0f;
|
||||
|
||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
|
||||
{
|
||||
const int loadx = lidx + i * block_size;
|
||||
//load query and train into local memory
|
||||
s_query[lidy * block_size + lidx] = 0;
|
||||
s_train[lidx * block_size + lidy] = 0;
|
||||
|
||||
if (loadx < query_cols)
|
||||
{
|
||||
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||
|
||||
switch (distType)
|
||||
{
|
||||
case 0:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
case 1:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||
result += qr * qr;
|
||||
}
|
||||
|
||||
break;
|
||||
case 2:
|
||||
|
||||
for (int j = 0 ; j < block_size ; j++)
|
||||
{
|
||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
const int trainIdx = t * block_size + lidx;
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
if (result < myBestDistance1)
|
||||
{
|
||||
myBestDistance2 = myBestDistance1;
|
||||
myBestTrainIdx2 = myBestTrainIdx1;
|
||||
myBestDistance1 = result;
|
||||
myBestTrainIdx1 = trainIdx;
|
||||
}
|
||||
else if (result < myBestDistance2)
|
||||
{
|
||||
myBestDistance2 = result;
|
||||
myBestTrainIdx2 = trainIdx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__local float *s_distance = (__local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||
|
||||
//findBestMatch
|
||||
s_distance += lidy * block_size;
|
||||
s_trainIdx += lidy * block_size;
|
||||
|
||||
s_distance[lidx] = myBestDistance1;
|
||||
s_trainIdx[lidx] = myBestTrainIdx1;
|
||||
|
||||
float bestDistance1 = MAX_FLOAT;
|
||||
float bestDistance2 = MAX_FLOAT;
|
||||
int bestTrainIdx1 = -1;
|
||||
int bestTrainIdx2 = -1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance1)
|
||||
{
|
||||
bestDistance2 = bestDistance1;
|
||||
bestTrainIdx2 = bestTrainIdx1;
|
||||
|
||||
bestDistance1 = val;
|
||||
bestTrainIdx1 = s_trainIdx[i];
|
||||
}
|
||||
else if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
s_distance[lidx] = myBestDistance2;
|
||||
s_trainIdx[lidx] = myBestTrainIdx2;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (lidx == 0)
|
||||
{
|
||||
for (int i = 0 ; i < block_size ; i++)
|
||||
{
|
||||
float val = s_distance[i];
|
||||
|
||||
if (val < bestDistance2)
|
||||
{
|
||||
bestDistance2 = val;
|
||||
bestTrainIdx2 = s_trainIdx[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
myBestDistance1 = bestDistance1;
|
||||
myBestDistance2 = bestDistance2;
|
||||
|
||||
myBestTrainIdx1 = bestTrainIdx1;
|
||||
myBestTrainIdx2 = bestTrainIdx2;
|
||||
|
||||
if (queryIdx < query_rows && lidx == 0)
|
||||
{
|
||||
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
|
||||
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
|
||||
}
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_calcDistanceUnrolled(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_calcDistance(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
int train_cols,
|
||||
int step,
|
||||
int distType)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_findBestMatch(
|
||||
__global float *allDist,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
int k,
|
||||
int block_size
|
||||
)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
@ -234,4 +234,3 @@ __kernel
|
||||
map_y[y * step_y + x] = ycoo;
|
||||
}
|
||||
}
|
||||
|
@ -466,5 +466,3 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
|
||||
dst[start_addr] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -559,7 +559,3 @@ if(result)
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
@ -283,4 +283,3 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
|
||||
newnode[counter].alpha[0] = t1.alpha[0];
|
||||
newnode[counter].alpha[1] = t1.alpha[1];
|
||||
}
|
||||
|
@ -107,5 +107,3 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
|
||||
dst[gy*(dst_step >> 2)+gx] = res;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -267,4 +267,3 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist(
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user